Hyphenator.java revision 44a1df2cf3857692ca95149d4fa70017982f2211
1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.text;
18
19import android.annotation.IntRange;
20import android.annotation.NonNull;
21import android.annotation.Nullable;
22import android.system.ErrnoException;
23import android.system.Os;
24import android.system.OsConstants;
25import android.util.Log;
26
27import com.android.internal.annotations.GuardedBy;
28
29import java.io.File;
30import java.io.IOException;
31import java.io.RandomAccessFile;
32import java.util.HashMap;
33import java.util.Locale;
34
35/**
36 * Hyphenator is a wrapper class for a native implementation of automatic hyphenation,
37 * in essence finding valid hyphenation opportunities in a word.
38 *
39 * @hide
40 */
41public class Hyphenator {
42    private static String TAG = "Hyphenator";
43
44    private final static Object sLock = new Object();
45
46    @GuardedBy("sLock")
47    final static HashMap<Locale, Hyphenator> sMap = new HashMap<Locale, Hyphenator>();
48
49    private final long mNativePtr;
50    private final HyphenationData mData;
51
52    private Hyphenator(long nativePtr, HyphenationData data) {
53        mNativePtr = nativePtr;
54        mData = data;
55    }
56
57    public long getNativePtr() {
58        return mNativePtr;
59    }
60
61    public static Hyphenator get(@Nullable Locale locale) {
62        synchronized (sLock) {
63            Hyphenator result = sMap.get(locale);
64            if (result != null) {
65                return result;
66            }
67
68            // If there's a variant, fall back to language+variant only, if available
69            final String variant = locale.getVariant();
70            if (!variant.isEmpty()) {
71                final Locale languageAndVariantOnlyLocale =
72                        new Locale(locale.getLanguage(), "", variant);
73                result = sMap.get(languageAndVariantOnlyLocale);
74                if (result != null) {
75                    return putAlias(locale, result);
76                }
77            }
78
79            // Fall back to language-only, if available
80            final Locale languageOnlyLocale = new Locale(locale.getLanguage());
81            result = sMap.get(languageOnlyLocale);
82            if (result != null) {
83                return putAlias(locale, result);
84            }
85
86            // Fall back to script-only, if available
87            final String script = locale.getScript();
88            if (!script.equals("")) {
89                final Locale scriptOnlyLocale = new Locale.Builder()
90                        .setLanguage("und")
91                        .setScript(script)
92                        .build();
93                result = sMap.get(scriptOnlyLocale);
94                if (result != null) {
95                    return putAlias(locale, result);
96                }
97            }
98
99            return putEmptyAlias(locale);
100        }
101    }
102
103    private static class HyphenationData {
104        private static final String SYSTEM_HYPHENATOR_LOCATION = "/system/usr/hyphen-data";
105
106        public final int mMinPrefix, mMinSuffix;
107        public final long mDataAddress;
108
109        // Reasonable enough values for cases where we have no hyphenation patterns but may be able
110        // to do some automatic hyphenation based on characters. These values would be used very
111        // rarely.
112        private static final int DEFAULT_MIN_PREFIX = 2;
113        private static final int DEFAULT_MIN_SUFFIX = 2;
114
115        public static final HyphenationData sEmptyData =
116                new HyphenationData(DEFAULT_MIN_PREFIX, DEFAULT_MIN_SUFFIX);
117
118        // Create empty HyphenationData.
119        private HyphenationData(int minPrefix, int minSuffix) {
120            mMinPrefix = minPrefix;
121            mMinSuffix = minSuffix;
122            mDataAddress = 0;
123        }
124
125        HyphenationData(String languageTag, int minPrefix, int minSuffix) {
126            mMinPrefix = minPrefix;
127            mMinSuffix = minSuffix;
128
129            final String patternFilename = "hyph-" + languageTag.toLowerCase(Locale.US) + ".hyb";
130            final File patternFile = new File(SYSTEM_HYPHENATOR_LOCATION, patternFilename);
131            if (!patternFile.canRead()) {
132                Log.e(TAG, "hyphenation patterns for " + patternFile + " not found or unreadable");
133                mDataAddress = 0;
134            } else {
135                long address;
136                try (RandomAccessFile f = new RandomAccessFile(patternFile, "r")) {
137                    address = Os.mmap(0, f.length(), OsConstants.PROT_READ,
138                            OsConstants.MAP_SHARED, f.getFD(), 0 /* offset */);
139                } catch (IOException | ErrnoException e) {
140                    Log.e(TAG, "error loading hyphenation " + patternFile, e);
141                    address = 0;
142                }
143                mDataAddress = address;
144            }
145        }
146    }
147
148    // Do not call this method outside of init method.
149    private static Hyphenator putNewHyphenator(Locale loc, HyphenationData data) {
150        final Hyphenator hyphenator = new Hyphenator(nBuildHyphenator(
151                data.mDataAddress, loc.getLanguage(), data.mMinPrefix, data.mMinSuffix), data);
152        sMap.put(loc, hyphenator);
153        return hyphenator;
154    }
155
156    // Do not call this method outside of init method.
157    private static void loadData(String langTag, int minPrefix, int maxPrefix) {
158        final HyphenationData data = new HyphenationData(langTag, minPrefix, maxPrefix);
159        putNewHyphenator(Locale.forLanguageTag(langTag), data);
160    }
161
162    // Caller must acquire sLock before calling this method.
163    // The Hyphenator for the baseLangTag must exists.
164    private static Hyphenator addAliasByTag(String langTag, String baseLangTag) {
165        return putAlias(Locale.forLanguageTag(langTag),
166                sMap.get(Locale.forLanguageTag(baseLangTag)));
167    }
168
169    // Caller must acquire sLock before calling this method.
170    private static Hyphenator putAlias(Locale locale, Hyphenator base) {
171        return putNewHyphenator(locale, base.mData);
172    }
173
174    // Caller must acquire sLock before calling this method.
175    private static Hyphenator putEmptyAlias(Locale locale) {
176        return putNewHyphenator(locale, HyphenationData.sEmptyData);
177    }
178
179    // TODO: Confirm that these are the best values. Various sources suggest (1, 1), but
180    // that appears too small.
181    private static final int INDIC_MIN_PREFIX = 2;
182    private static final int INDIC_MIN_SUFFIX = 2;
183
184    /**
185     * Load hyphenation patterns at initialization time. We want to have patterns
186     * for all locales loaded and ready to use so we don't have to do any file IO
187     * on the UI thread when drawing text in different locales.
188     *
189     * @hide
190     */
191    public static void init() {
192        synchronized (sLock) {
193            sMap.put(null, null);
194
195            loadData("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Assamese
196            loadData("bg", 2, 2); // Bulgarian
197            loadData("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Bengali
198            loadData("cu", 1, 2); // Church Slavonic
199            loadData("cy", 2, 3); // Welsh
200            loadData("da", 2, 2); // Danish
201            loadData("de-1901", 2, 2); // German 1901 orthography
202            loadData("de-1996", 2, 2); // German 1996 orthography
203            loadData("de-CH-1901", 2, 2); // Swiss High German 1901 orthography
204            loadData("en-GB", 2, 3); // British English
205            loadData("en-US", 2, 3); // American English
206            loadData("es", 2, 2); // Spanish
207            loadData("et", 2, 3); // Estonian
208            loadData("eu", 2, 2); // Basque
209            loadData("fr", 2, 3); // French
210            loadData("ga", 2, 3); // Irish
211            loadData("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Gujarati
212            loadData("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Hindi
213            loadData("hr", 2, 2); // Croatian
214            loadData("hu", 2, 2); // Hungarian
215            // texhyphen sources say Armenian may be (1, 2); but that it needs confirmation.
216            // Going with a more conservative value of (2, 2) for now.
217            loadData("hy", 2, 2); // Armenian
218            loadData("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Kannada
219            loadData("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Malayalam
220            loadData("mn-Cyrl", 2, 2); // Mongolian in Cyrillic script
221            loadData("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Marathi
222            loadData("nb", 2, 2); // Norwegian Bokmål
223            loadData("nn", 2, 2); // Norwegian Nynorsk
224            loadData("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Oriya
225            loadData("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Punjabi
226            loadData("pt", 2, 3); // Portuguese
227            loadData("sl", 2, 2); // Slovenian
228            loadData("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Tamil
229            loadData("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Telugu
230            loadData("tk", 2, 2); // Turkmen
231            loadData("und-Ethi", 1, 1); // Any language in Ethiopic script
232
233            // English locales that fall back to en-US. The data is
234            // from CLDR. It's all English locales, minus the locales whose
235            // parent is en-001 (from supplementalData.xml, under <parentLocales>).
236            // TODO: Figure out how to get this from ICU.
237            addAliasByTag("en-AS", "en-US"); // English (American Samoa)
238            addAliasByTag("en-GU", "en-US"); // English (Guam)
239            addAliasByTag("en-MH", "en-US"); // English (Marshall Islands)
240            addAliasByTag("en-MP", "en-US"); // English (Northern Mariana Islands)
241            addAliasByTag("en-PR", "en-US"); // English (Puerto Rico)
242            addAliasByTag("en-UM", "en-US"); // English (United States Minor Outlying Islands)
243            addAliasByTag("en-VI", "en-US"); // English (Virgin Islands)
244
245            // All English locales other than those falling back to en-US are mapped to en-GB.
246            addAliasByTag("en", "en-GB");
247
248            // For German, we're assuming the 1996 (and later) orthography by default.
249            addAliasByTag("de", "de-1996");
250            // Liechtenstein uses the Swiss hyphenation rules for the 1901 orthography.
251            addAliasByTag("de-LI-1901", "de-CH-1901");
252
253            // Norwegian is very probably Norwegian Bokmål.
254            addAliasByTag("no", "nb");
255
256            // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
257            addAliasByTag("mn", "mn-Cyrl"); // Mongolian
258
259            // Fall back to Ethiopic script for languages likely to be written in Ethiopic.
260            // Data is from CLDR's likelySubtags.xml.
261            // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
262            addAliasByTag("am", "und-Ethi"); // Amharic
263            addAliasByTag("byn", "und-Ethi"); // Blin
264            addAliasByTag("gez", "und-Ethi"); // Geʻez
265            addAliasByTag("ti", "und-Ethi"); // Tigrinya
266            addAliasByTag("wal", "und-Ethi"); // Wolaytta
267        }
268    };
269
270    private static native long nBuildHyphenator(/* non-zero */ long dataAddress,
271            @NonNull String langTag, @IntRange(from = 1) int minPrefix,
272            @IntRange(from = 1) int minSuffix);
273}
274