1/*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.text;
18
19import android.annotation.Nullable;
20import android.util.Log;
21
22import com.android.internal.annotations.GuardedBy;
23
24import java.io.File;
25import java.io.IOException;
26import java.io.RandomAccessFile;
27import java.nio.ByteBuffer;
28import java.nio.MappedByteBuffer;
29import java.nio.channels.FileChannel;
30import java.util.HashMap;
31import java.util.Locale;
32
33/**
34 * Hyphenator is a wrapper class for a native implementation of automatic hyphenation,
35 * in essence finding valid hyphenation opportunities in a word.
36 *
37 * @hide
38 */
39public class Hyphenator {
40    // This class has deliberately simple lifetime management (no finalizer) because in
41    // the common case a process will use a very small number of locales.
42
43    private static String TAG = "Hyphenator";
44
45    // TODO: Confirm that these are the best values. Various sources suggest (1, 1), but
46    // that appears too small.
47    private static final int INDIC_MIN_PREFIX = 2;
48    private static final int INDIC_MIN_SUFFIX = 2;
49
50    private final static Object sLock = new Object();
51
52    @GuardedBy("sLock")
53    final static HashMap<Locale, Hyphenator> sMap = new HashMap<Locale, Hyphenator>();
54
55    // Reasonable enough values for cases where we have no hyphenation patterns but may be able to
56    // do some automatic hyphenation based on characters. These values would be used very rarely.
57    private static final int DEFAULT_MIN_PREFIX = 2;
58    private static final int DEFAULT_MIN_SUFFIX = 2;
59    final static Hyphenator sEmptyHyphenator =
60            new Hyphenator(StaticLayout.nLoadHyphenator(
61                                   null, 0, DEFAULT_MIN_PREFIX, DEFAULT_MIN_SUFFIX),
62                           null);
63
64    final private long mNativePtr;
65
66    // We retain a reference to the buffer to keep the memory mapping valid
67    @SuppressWarnings("unused")
68    final private ByteBuffer mBuffer;
69
70    private Hyphenator(long nativePtr, ByteBuffer b) {
71        mNativePtr = nativePtr;
72        mBuffer = b;
73    }
74
75    public long getNativePtr() {
76        return mNativePtr;
77    }
78
79    public static Hyphenator get(@Nullable Locale locale) {
80        synchronized (sLock) {
81            Hyphenator result = sMap.get(locale);
82            if (result != null) {
83                return result;
84            }
85
86            // If there's a variant, fall back to language+variant only, if available
87            final String variant = locale.getVariant();
88            if (!variant.isEmpty()) {
89                final Locale languageAndVariantOnlyLocale =
90                        new Locale(locale.getLanguage(), "", variant);
91                result = sMap.get(languageAndVariantOnlyLocale);
92                if (result != null) {
93                    sMap.put(locale, result);
94                    return result;
95                }
96            }
97
98            // Fall back to language-only, if available
99            final Locale languageOnlyLocale = new Locale(locale.getLanguage());
100            result = sMap.get(languageOnlyLocale);
101            if (result != null) {
102                sMap.put(locale, result);
103                return result;
104            }
105
106            // Fall back to script-only, if available
107            final String script = locale.getScript();
108            if (!script.equals("")) {
109                final Locale scriptOnlyLocale = new Locale.Builder()
110                        .setLanguage("und")
111                        .setScript(script)
112                        .build();
113                result = sMap.get(scriptOnlyLocale);
114                if (result != null) {
115                    sMap.put(locale, result);
116                    return result;
117                }
118            }
119
120            sMap.put(locale, sEmptyHyphenator);  // To remember we found nothing.
121        }
122        return sEmptyHyphenator;
123    }
124
125    private static class HyphenationData {
126        final String mLanguageTag;
127        final int mMinPrefix, mMinSuffix;
128        HyphenationData(String languageTag, int minPrefix, int minSuffix) {
129            this.mLanguageTag = languageTag;
130            this.mMinPrefix = minPrefix;
131            this.mMinSuffix = minSuffix;
132        }
133    }
134
135    private static Hyphenator loadHyphenator(HyphenationData data) {
136        String patternFilename = "hyph-" + data.mLanguageTag.toLowerCase(Locale.US) + ".hyb";
137        File patternFile = new File(getSystemHyphenatorLocation(), patternFilename);
138        if (!patternFile.canRead()) {
139            Log.e(TAG, "hyphenation patterns for " + patternFile + " not found or unreadable");
140            return null;
141        }
142        try {
143            RandomAccessFile f = new RandomAccessFile(patternFile, "r");
144            try {
145                FileChannel fc = f.getChannel();
146                MappedByteBuffer buf = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());
147                long nativePtr = StaticLayout.nLoadHyphenator(
148                        buf, 0, data.mMinPrefix, data.mMinSuffix);
149                return new Hyphenator(nativePtr, buf);
150            } finally {
151                f.close();
152            }
153        } catch (IOException e) {
154            Log.e(TAG, "error loading hyphenation " + patternFile, e);
155            return null;
156        }
157    }
158
159    private static File getSystemHyphenatorLocation() {
160        return new File("/system/usr/hyphen-data");
161    }
162
163    // This array holds pairs of language tags that are used to prefill the map from locale to
164    // hyphenation data: The hyphenation data for the first field will be prefilled from the
165    // hyphenation data for the second field.
166    //
167    // The aliases that are computable by the get() method above are not included.
168    private static final String[][] LOCALE_FALLBACK_DATA = {
169        // English locales that fall back to en-US. The data is
170        // from CLDR. It's all English locales, minus the locales whose
171        // parent is en-001 (from supplementalData.xml, under <parentLocales>).
172        // TODO: Figure out how to get this from ICU.
173        {"en-AS", "en-US"}, // English (American Samoa)
174        {"en-GU", "en-US"}, // English (Guam)
175        {"en-MH", "en-US"}, // English (Marshall Islands)
176        {"en-MP", "en-US"}, // English (Northern Mariana Islands)
177        {"en-PR", "en-US"}, // English (Puerto Rico)
178        {"en-UM", "en-US"}, // English (United States Minor Outlying Islands)
179        {"en-VI", "en-US"}, // English (Virgin Islands)
180
181        // All English locales other than those falling back to en-US are mapped to en-GB.
182        {"en", "en-GB"},
183
184        // For German, we're assuming the 1996 (and later) orthography by default.
185        {"de", "de-1996"},
186        // Liechtenstein uses the Swiss hyphenation rules for the 1901 orthography.
187        {"de-LI-1901", "de-CH-1901"},
188
189        // Norwegian is very probably Norwegian Bokmål.
190        {"no", "nb"},
191
192        // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
193        {"mn", "mn-Cyrl"}, // Mongolian
194
195        // Fall back to Ethiopic script for languages likely to be written in Ethiopic.
196        // Data is from CLDR's likelySubtags.xml.
197        // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
198        {"am", "und-Ethi"}, // Amharic
199        {"byn", "und-Ethi"}, // Blin
200        {"gez", "und-Ethi"}, // Geʻez
201        {"ti", "und-Ethi"}, // Tigrinya
202        {"wal", "und-Ethi"}, // Wolaytta
203    };
204
205    private static final HyphenationData[] AVAILABLE_LANGUAGES = {
206        new HyphenationData("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Assamese
207        new HyphenationData("bg", 2, 2), // Bulgarian
208        new HyphenationData("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Bengali
209        new HyphenationData("cu", 1, 2), // Church Slavonic
210        new HyphenationData("cy", 2, 3), // Welsh
211        new HyphenationData("da", 2, 2), // Danish
212        new HyphenationData("de-1901", 2, 2), // German 1901 orthography
213        new HyphenationData("de-1996", 2, 2), // German 1996 orthography
214        new HyphenationData("de-CH-1901", 2, 2), // Swiss High German 1901 orthography
215        new HyphenationData("en-GB", 2, 3), // British English
216        new HyphenationData("en-US", 2, 3), // American English
217        new HyphenationData("es", 2, 2), // Spanish
218        new HyphenationData("et", 2, 3), // Estonian
219        new HyphenationData("eu", 2, 2), // Basque
220        new HyphenationData("fr", 2, 3), // French
221        new HyphenationData("ga", 2, 3), // Irish
222        new HyphenationData("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Gujarati
223        new HyphenationData("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Hindi
224        new HyphenationData("hr", 2, 2), // Croatian
225        new HyphenationData("hu", 2, 2), // Hungarian
226        // texhyphen sources say Armenian may be (1, 2), but that it needs confirmation.
227        // Going with a more conservative value of (2, 2) for now.
228        new HyphenationData("hy", 2, 2), // Armenian
229        new HyphenationData("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Kannada
230        new HyphenationData("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Malayalam
231        new HyphenationData("mn-Cyrl", 2, 2), // Mongolian in Cyrillic script
232        new HyphenationData("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Marathi
233        new HyphenationData("nb", 2, 2), // Norwegian Bokmål
234        new HyphenationData("nn", 2, 2), // Norwegian Nynorsk
235        new HyphenationData("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Oriya
236        new HyphenationData("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Punjabi
237        new HyphenationData("pt", 2, 3), // Portuguese
238        new HyphenationData("sl", 2, 2), // Slovenian
239        new HyphenationData("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Tamil
240        new HyphenationData("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX), // Telugu
241        new HyphenationData("tk", 2, 2), // Turkmen
242        new HyphenationData("und-Ethi", 1, 1), // Any language in Ethiopic script
243    };
244
245    /**
246     * Load hyphenation patterns at initialization time. We want to have patterns
247     * for all locales loaded and ready to use so we don't have to do any file IO
248     * on the UI thread when drawing text in different locales.
249     *
250     * @hide
251     */
252    public static void init() {
253        sMap.put(null, null);
254
255        for (int i = 0; i < AVAILABLE_LANGUAGES.length; i++) {
256            HyphenationData data = AVAILABLE_LANGUAGES[i];
257            Hyphenator h = loadHyphenator(data);
258            if (h != null) {
259                sMap.put(Locale.forLanguageTag(data.mLanguageTag), h);
260            }
261        }
262
263        for (int i = 0; i < LOCALE_FALLBACK_DATA.length; i++) {
264            String language = LOCALE_FALLBACK_DATA[i][0];
265            String fallback = LOCALE_FALLBACK_DATA[i][1];
266            sMap.put(Locale.forLanguageTag(language), sMap.get(Locale.forLanguageTag(fallback)));
267        }
268    }
269}
270