Hyphenator.java revision 44a1df2cf3857692ca95149d4fa70017982f2211
1/* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package android.text; 18 19import android.annotation.IntRange; 20import android.annotation.NonNull; 21import android.annotation.Nullable; 22import android.system.ErrnoException; 23import android.system.Os; 24import android.system.OsConstants; 25import android.util.Log; 26 27import com.android.internal.annotations.GuardedBy; 28 29import java.io.File; 30import java.io.IOException; 31import java.io.RandomAccessFile; 32import java.util.HashMap; 33import java.util.Locale; 34 35/** 36 * Hyphenator is a wrapper class for a native implementation of automatic hyphenation, 37 * in essence finding valid hyphenation opportunities in a word. 38 * 39 * @hide 40 */ 41public class Hyphenator { 42 private static String TAG = "Hyphenator"; 43 44 private final static Object sLock = new Object(); 45 46 @GuardedBy("sLock") 47 final static HashMap<Locale, Hyphenator> sMap = new HashMap<Locale, Hyphenator>(); 48 49 private final long mNativePtr; 50 private final HyphenationData mData; 51 52 private Hyphenator(long nativePtr, HyphenationData data) { 53 mNativePtr = nativePtr; 54 mData = data; 55 } 56 57 public long getNativePtr() { 58 return mNativePtr; 59 } 60 61 public static Hyphenator get(@Nullable Locale locale) { 62 synchronized (sLock) { 63 Hyphenator result = sMap.get(locale); 64 if (result != null) { 65 return result; 66 } 67 68 // If there's a variant, fall back to language+variant only, if available 69 final String variant = locale.getVariant(); 70 if (!variant.isEmpty()) { 71 final Locale languageAndVariantOnlyLocale = 72 new Locale(locale.getLanguage(), "", variant); 73 result = sMap.get(languageAndVariantOnlyLocale); 74 if (result != null) { 75 return putAlias(locale, result); 76 } 77 } 78 79 // Fall back to language-only, if available 80 final Locale languageOnlyLocale = new Locale(locale.getLanguage()); 81 result = sMap.get(languageOnlyLocale); 82 if (result != null) { 83 return putAlias(locale, result); 84 } 85 86 // Fall back to script-only, if available 87 final String script = locale.getScript(); 88 if (!script.equals("")) { 89 final Locale scriptOnlyLocale = new Locale.Builder() 90 .setLanguage("und") 91 .setScript(script) 92 .build(); 93 result = sMap.get(scriptOnlyLocale); 94 if (result != null) { 95 return putAlias(locale, result); 96 } 97 } 98 99 return putEmptyAlias(locale); 100 } 101 } 102 103 private static class HyphenationData { 104 private static final String SYSTEM_HYPHENATOR_LOCATION = "/system/usr/hyphen-data"; 105 106 public final int mMinPrefix, mMinSuffix; 107 public final long mDataAddress; 108 109 // Reasonable enough values for cases where we have no hyphenation patterns but may be able 110 // to do some automatic hyphenation based on characters. These values would be used very 111 // rarely. 112 private static final int DEFAULT_MIN_PREFIX = 2; 113 private static final int DEFAULT_MIN_SUFFIX = 2; 114 115 public static final HyphenationData sEmptyData = 116 new HyphenationData(DEFAULT_MIN_PREFIX, DEFAULT_MIN_SUFFIX); 117 118 // Create empty HyphenationData. 119 private HyphenationData(int minPrefix, int minSuffix) { 120 mMinPrefix = minPrefix; 121 mMinSuffix = minSuffix; 122 mDataAddress = 0; 123 } 124 125 HyphenationData(String languageTag, int minPrefix, int minSuffix) { 126 mMinPrefix = minPrefix; 127 mMinSuffix = minSuffix; 128 129 final String patternFilename = "hyph-" + languageTag.toLowerCase(Locale.US) + ".hyb"; 130 final File patternFile = new File(SYSTEM_HYPHENATOR_LOCATION, patternFilename); 131 if (!patternFile.canRead()) { 132 Log.e(TAG, "hyphenation patterns for " + patternFile + " not found or unreadable"); 133 mDataAddress = 0; 134 } else { 135 long address; 136 try (RandomAccessFile f = new RandomAccessFile(patternFile, "r")) { 137 address = Os.mmap(0, f.length(), OsConstants.PROT_READ, 138 OsConstants.MAP_SHARED, f.getFD(), 0 /* offset */); 139 } catch (IOException | ErrnoException e) { 140 Log.e(TAG, "error loading hyphenation " + patternFile, e); 141 address = 0; 142 } 143 mDataAddress = address; 144 } 145 } 146 } 147 148 // Do not call this method outside of init method. 149 private static Hyphenator putNewHyphenator(Locale loc, HyphenationData data) { 150 final Hyphenator hyphenator = new Hyphenator(nBuildHyphenator( 151 data.mDataAddress, loc.getLanguage(), data.mMinPrefix, data.mMinSuffix), data); 152 sMap.put(loc, hyphenator); 153 return hyphenator; 154 } 155 156 // Do not call this method outside of init method. 157 private static void loadData(String langTag, int minPrefix, int maxPrefix) { 158 final HyphenationData data = new HyphenationData(langTag, minPrefix, maxPrefix); 159 putNewHyphenator(Locale.forLanguageTag(langTag), data); 160 } 161 162 // Caller must acquire sLock before calling this method. 163 // The Hyphenator for the baseLangTag must exists. 164 private static Hyphenator addAliasByTag(String langTag, String baseLangTag) { 165 return putAlias(Locale.forLanguageTag(langTag), 166 sMap.get(Locale.forLanguageTag(baseLangTag))); 167 } 168 169 // Caller must acquire sLock before calling this method. 170 private static Hyphenator putAlias(Locale locale, Hyphenator base) { 171 return putNewHyphenator(locale, base.mData); 172 } 173 174 // Caller must acquire sLock before calling this method. 175 private static Hyphenator putEmptyAlias(Locale locale) { 176 return putNewHyphenator(locale, HyphenationData.sEmptyData); 177 } 178 179 // TODO: Confirm that these are the best values. Various sources suggest (1, 1), but 180 // that appears too small. 181 private static final int INDIC_MIN_PREFIX = 2; 182 private static final int INDIC_MIN_SUFFIX = 2; 183 184 /** 185 * Load hyphenation patterns at initialization time. We want to have patterns 186 * for all locales loaded and ready to use so we don't have to do any file IO 187 * on the UI thread when drawing text in different locales. 188 * 189 * @hide 190 */ 191 public static void init() { 192 synchronized (sLock) { 193 sMap.put(null, null); 194 195 loadData("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Assamese 196 loadData("bg", 2, 2); // Bulgarian 197 loadData("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Bengali 198 loadData("cu", 1, 2); // Church Slavonic 199 loadData("cy", 2, 3); // Welsh 200 loadData("da", 2, 2); // Danish 201 loadData("de-1901", 2, 2); // German 1901 orthography 202 loadData("de-1996", 2, 2); // German 1996 orthography 203 loadData("de-CH-1901", 2, 2); // Swiss High German 1901 orthography 204 loadData("en-GB", 2, 3); // British English 205 loadData("en-US", 2, 3); // American English 206 loadData("es", 2, 2); // Spanish 207 loadData("et", 2, 3); // Estonian 208 loadData("eu", 2, 2); // Basque 209 loadData("fr", 2, 3); // French 210 loadData("ga", 2, 3); // Irish 211 loadData("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Gujarati 212 loadData("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Hindi 213 loadData("hr", 2, 2); // Croatian 214 loadData("hu", 2, 2); // Hungarian 215 // texhyphen sources say Armenian may be (1, 2); but that it needs confirmation. 216 // Going with a more conservative value of (2, 2) for now. 217 loadData("hy", 2, 2); // Armenian 218 loadData("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Kannada 219 loadData("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Malayalam 220 loadData("mn-Cyrl", 2, 2); // Mongolian in Cyrillic script 221 loadData("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Marathi 222 loadData("nb", 2, 2); // Norwegian Bokmål 223 loadData("nn", 2, 2); // Norwegian Nynorsk 224 loadData("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Oriya 225 loadData("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Punjabi 226 loadData("pt", 2, 3); // Portuguese 227 loadData("sl", 2, 2); // Slovenian 228 loadData("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Tamil 229 loadData("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX); // Telugu 230 loadData("tk", 2, 2); // Turkmen 231 loadData("und-Ethi", 1, 1); // Any language in Ethiopic script 232 233 // English locales that fall back to en-US. The data is 234 // from CLDR. It's all English locales, minus the locales whose 235 // parent is en-001 (from supplementalData.xml, under <parentLocales>). 236 // TODO: Figure out how to get this from ICU. 237 addAliasByTag("en-AS", "en-US"); // English (American Samoa) 238 addAliasByTag("en-GU", "en-US"); // English (Guam) 239 addAliasByTag("en-MH", "en-US"); // English (Marshall Islands) 240 addAliasByTag("en-MP", "en-US"); // English (Northern Mariana Islands) 241 addAliasByTag("en-PR", "en-US"); // English (Puerto Rico) 242 addAliasByTag("en-UM", "en-US"); // English (United States Minor Outlying Islands) 243 addAliasByTag("en-VI", "en-US"); // English (Virgin Islands) 244 245 // All English locales other than those falling back to en-US are mapped to en-GB. 246 addAliasByTag("en", "en-GB"); 247 248 // For German, we're assuming the 1996 (and later) orthography by default. 249 addAliasByTag("de", "de-1996"); 250 // Liechtenstein uses the Swiss hyphenation rules for the 1901 orthography. 251 addAliasByTag("de-LI-1901", "de-CH-1901"); 252 253 // Norwegian is very probably Norwegian Bokmål. 254 addAliasByTag("no", "nb"); 255 256 // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl. 257 addAliasByTag("mn", "mn-Cyrl"); // Mongolian 258 259 // Fall back to Ethiopic script for languages likely to be written in Ethiopic. 260 // Data is from CLDR's likelySubtags.xml. 261 // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags(). 262 addAliasByTag("am", "und-Ethi"); // Amharic 263 addAliasByTag("byn", "und-Ethi"); // Blin 264 addAliasByTag("gez", "und-Ethi"); // Geʻez 265 addAliasByTag("ti", "und-Ethi"); // Tigrinya 266 addAliasByTag("wal", "und-Ethi"); // Wolaytta 267 } 268 }; 269 270 private static native long nBuildHyphenator(/* non-zero */ long dataAddress, 271 @NonNull String langTag, @IntRange(from = 1) int minPrefix, 272 @IntRange(from = 1) int minSuffix); 273} 274