ICU.java revision de0eb683370d789e7bb25673b350b8dbf2ba5d69
1/* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17package libcore.icu; 18 19import java.util.Collections; 20import java.util.HashMap; 21import java.util.HashSet; 22import java.util.LinkedHashSet; 23import java.util.Locale; 24import java.util.Map; 25import java.util.Set; 26import libcore.util.BasicLruCache; 27 28/** 29 * Makes ICU data accessible to Java. 30 */ 31public final class ICU { 32 private static final BasicLruCache<String, String> CACHED_PATTERNS = 33 new BasicLruCache<String, String>(8); 34 35 private static Locale[] availableLocalesCache; 36 37 private static String[] isoCountries; 38 39 private static String[] isoLanguages; 40 41 /** 42 * Returns an array of two-letter ISO 639-1 language codes, either from ICU or our cache. 43 */ 44 public static String[] getISOLanguages() { 45 if (isoLanguages == null) { 46 isoLanguages = getISOLanguagesNative(); 47 } 48 return isoLanguages.clone(); 49 } 50 51 /** 52 * Returns an array of two-letter ISO 3166 country codes, either from ICU or our cache. 53 */ 54 public static String[] getISOCountries() { 55 if (isoCountries == null) { 56 isoCountries = getISOCountriesNative(); 57 } 58 return isoCountries.clone(); 59 } 60 61 public static Locale forLanguageTag(String languageTag, boolean strict) { 62 final String icuLocaleId = localeForLanguageTag(languageTag, strict); 63 if (icuLocaleId == null) { 64 // TODO: We should probably return "und" here. From what I can tell, 65 // this happens only when the language in the languageTag is bad. 66 // Investigate this a bit more. 67 return null; 68 } 69 70 return localeFromIcuLocaleId(icuLocaleId); 71 } 72 73 public static String toLanguageTag(Locale locale) { 74 return languageTagForLocale(localeIdFromLocale(locale)); 75 } 76 77 private static final int IDX_LANGUAGE = 0; 78 private static final int IDX_SCRIPT = 1; 79 private static final int IDX_REGION = 2; 80 private static final int IDX_VARIANT = 3; 81 82 /* 83 * Parse the {Language, Script, Region, Variant*} section of the ICU locale 84 * ID. This is the bit that appears before the keyword separate "@". The general 85 * structure is a series of ASCII alphanumeric strings (subtags) 86 * separated by underscores. 87 * 88 * Each subtag is interpreted according to its position in the list of subtags 89 * AND its length (groan...). The various cases are explained in comments 90 * below. 91 */ 92 private static void parseLangScriptRegionAndVariants(String string, 93 String[] outputArray) { 94 final int first = string.indexOf('_'); 95 final int second = string.indexOf('_', first + 1); 96 final int third = string.indexOf('_', second + 1); 97 98 if (first == -1) { 99 outputArray[IDX_LANGUAGE] = string; 100 } else if (second == -1) { 101 // Language and country ("ja_JP") OR 102 // Language and script ("en_Latn") OR 103 // Language and variant ("en_POSIX"). 104 105 outputArray[IDX_LANGUAGE] = string.substring(0, first); 106 final String secondString = string.substring(first + 1); 107 108 if (secondString.length() == 4) { 109 // 4 Letter ISO script code. 110 outputArray[IDX_SCRIPT] = secondString; 111 } else if (secondString.length() == 2 || secondString.length() == 3) { 112 // 2 or 3 Letter region code. 113 outputArray[IDX_REGION] = secondString; 114 } else { 115 // If we're here, the length of the second half is either 1 or greater 116 // than 5. Assume that ICU won't hand us malformed tags, and therefore 117 // assume the rest of the string is a series of variant tags. 118 outputArray[IDX_VARIANT] = secondString; 119 } 120 } else if (third == -1) { 121 // Language and country and variant ("ja_JP_TRADITIONAL") OR 122 // Language and script and variant ("en_Latn_POSIX") OR 123 // Language and script and region ("en_Latn_US"). OR 124 // Language and variant with multiple subtags ("en_POSIX_XISOP") 125 126 outputArray[IDX_LANGUAGE] = string.substring(0, first); 127 final String secondString = string.substring(first + 1, second); 128 final String thirdString = string.substring(second + 1); 129 130 if (secondString.length() == 4) { 131 // The second subtag is a script. 132 outputArray[IDX_SCRIPT] = secondString; 133 134 // The third subtag can be either a region or a variant, depending 135 // on its length. 136 if (thirdString.length() == 2 || thirdString.length() == 3 || 137 thirdString.isEmpty()) { 138 outputArray[IDX_REGION] = thirdString; 139 } else { 140 outputArray[IDX_VARIANT] = thirdString; 141 } 142 } else if (secondString.isEmpty() || 143 secondString.length() == 2 || secondString.length() == 3) { 144 // The second string is a region, and the third a variant. 145 outputArray[IDX_REGION] = secondString; 146 outputArray[IDX_VARIANT] = thirdString; 147 } else { 148 // Variant with multiple subtags. 149 outputArray[IDX_VARIANT] = string.substring(first + 1); 150 } 151 } else { 152 // Language, script, region and variant with 1 or more subtags 153 // ("en_Latn_US_POSIX") OR 154 // Language, region and variant with 2 or more subtags 155 // (en_US_POSIX_VARIANT). 156 outputArray[IDX_LANGUAGE] = string.substring(0, first); 157 final String secondString = string.substring(first + 1, second); 158 if (secondString.length() == 4) { 159 outputArray[IDX_SCRIPT] = secondString; 160 outputArray[IDX_REGION] = string.substring(second + 1, third); 161 outputArray[IDX_VARIANT] = string.substring(third + 1); 162 } else { 163 outputArray[IDX_REGION] = secondString; 164 outputArray[IDX_VARIANT] = string.substring(second + 1); 165 } 166 } 167 } 168 169 /** 170 * Returns the appropriate {@code Locale} given a {@code String} of the form returned 171 * by {@code toString}. This is very lenient, and doesn't care what's between the underscores: 172 * this method can parse strings that {@code Locale.toString} won't produce. 173 * Used to remove duplication. 174 */ 175 public static Locale localeFromIcuLocaleId(String localeId) { 176 // @ == ULOC_KEYWORD_SEPARATOR_UNICODE (uloc.h). 177 final int extensionsIndex = localeId.indexOf('@'); 178 179 Map<Character, String> extensionsMap = Collections.EMPTY_MAP; 180 Map<String, String> unicodeKeywordsMap = Collections.EMPTY_MAP; 181 Set<String> unicodeAttributeSet = Collections.EMPTY_SET; 182 183 if (extensionsIndex != -1) { 184 extensionsMap = new HashMap<Character, String>(); 185 unicodeKeywordsMap = new HashMap<String, String>(); 186 unicodeAttributeSet = new HashSet<String>(); 187 188 // ICU sends us a semi-colon (ULOC_KEYWORD_ITEM_SEPARATOR) delimited string 189 // containing all "keywords" it could parse. An ICU keyword is a key-value pair 190 // separated by an "=" (ULOC_KEYWORD_ASSIGN). 191 // 192 // Each keyword item can be one of three things : 193 // - A unicode extension attribute list: In this case the item key is "attribute" 194 // and the value is a hyphen separated list of unicode attributes. 195 // - A unicode extension keyword: In this case, the item key will be larger than 196 // 1 char in length, and the value will be the unicode extension value. 197 // - A BCP-47 extension subtag: In this case, the item key will be exactly one 198 // char in length, and the value will be a sequence of unparsed subtags that 199 // represent the extension. 200 // 201 // Note that this implies that unicode extension keywords are "promoted" to 202 // to the same namespace as the top level extension subtags and their values. 203 // There can't be any collisions in practice because the BCP-47 spec imposes 204 // restrictions on their lengths. 205 final String extensionsString = localeId.substring(extensionsIndex + 1); 206 final String[] extensions = extensionsString.split(";"); 207 for (String extension : extensions) { 208 // This is the special key for the unicode attributes 209 if (extension.startsWith("attribute=")) { 210 String unicodeAttributeValues = extension.substring("attribute=".length()); 211 for (String unicodeAttribute : unicodeAttributeValues.split("-")) { 212 unicodeAttributeSet.add(unicodeAttribute); 213 } 214 } else { 215 final int separatorIndex = extension.indexOf('='); 216 217 if (separatorIndex == 1) { 218 // This is a BCP-47 extension subtag. 219 final String value = extension.substring(2); 220 final char extensionId = extension.charAt(0); 221 222 extensionsMap.put(extensionId, value); 223 } else { 224 // This is a unicode extension keyword. 225 unicodeKeywordsMap.put(extension.substring(0, separatorIndex), 226 extension.substring(separatorIndex + 1)); 227 } 228 } 229 } 230 } 231 232 final String[] outputArray = new String[] { "", "", "", "" }; 233 if (extensionsIndex == -1) { 234 parseLangScriptRegionAndVariants(localeId, outputArray); 235 } else { 236 parseLangScriptRegionAndVariants(localeId.substring(0, extensionsIndex), 237 outputArray); 238 } 239 240 return new Locale(outputArray[IDX_LANGUAGE], outputArray[IDX_REGION], 241 outputArray[IDX_VARIANT], outputArray[IDX_SCRIPT], 242 unicodeAttributeSet, unicodeKeywordsMap, extensionsMap, false); 243 } 244 245 /** 246 * Builds an ICU locale ID from the given locale. The format is very 247 * straightforward. It is a series of subtags in BCP 47 order 248 * {@code lang[_script][_country][_variant]} followed by the keyword 249 * separator {@code @} followed by a list of keywords. Each keyword is 250 * a key value pair, and appear in the form {@code k1=v1;k2=v2;...}. 251 * 252 * In this use case, each key is an extension identifier, and each value 253 * is the value of the extension. 254 */ 255 public static String localeIdFromLocale(Locale l) { 256 StringBuilder b = new StringBuilder(16); 257 b.append(l.getLanguage()); 258 259 final boolean hasScript = !l.getScript().isEmpty(); 260 final boolean hasCountry = !l.getCountry().isEmpty(); 261 final boolean hasVariant = !l.getVariant().isEmpty(); 262 263 if (hasScript || hasCountry || hasVariant) { 264 b.append('_'); 265 if (hasScript) { 266 b.append(l.getScript()); 267 if (hasCountry || hasVariant) { 268 b.append('_'); 269 } 270 } 271 272 if (hasCountry) { 273 b.append(l.getCountry()); 274 if (hasVariant) { 275 b.append('_'); 276 } 277 } 278 279 b.append(l.getVariant()); 280 } 281 282 if (!l.getExtensionKeys().isEmpty()) { 283 b.append('@'); 284 // The private use extension ('x') must show up last in the list 285 // so we cache its value here and append it right at the end. 286 String privateUseExtensionValue = null; 287 for (char c : l.getExtensionKeys()) { 288 if (c == Locale.PRIVATE_USE_EXTENSION) { 289 privateUseExtensionValue = l.getExtension(Locale.PRIVATE_USE_EXTENSION); 290 } else { 291 b.append(c); 292 b.append('='); 293 b.append(l.getExtension(c)); 294 b.append(';'); 295 } 296 } 297 298 if (privateUseExtensionValue != null) { 299 b.append(Locale.PRIVATE_USE_EXTENSION); 300 b.append('='); 301 b.append(privateUseExtensionValue); 302 b.append(';'); 303 } 304 } 305 306 return b.toString(); 307 } 308 309 public static Locale[] localesFromStrings(String[] localeNames) { 310 // We need to remove duplicates caused by the conversion of "he" to "iw", et cetera. 311 // Java needs the obsolete code, ICU needs the modern code, but we let ICU know about 312 // both so that we never need to convert back when talking to it. 313 LinkedHashSet<Locale> set = new LinkedHashSet<Locale>(); 314 for (String localeName : localeNames) { 315 set.add(localeFromIcuLocaleId(localeName)); 316 } 317 return set.toArray(new Locale[set.size()]); 318 } 319 320 public static Locale[] getAvailableLocales() { 321 if (availableLocalesCache == null) { 322 availableLocalesCache = localesFromStrings(getAvailableLocalesNative()); 323 } 324 return availableLocalesCache.clone(); 325 } 326 327 public static Locale[] getAvailableBreakIteratorLocales() { 328 return localesFromStrings(getAvailableBreakIteratorLocalesNative()); 329 } 330 331 public static Locale[] getAvailableCalendarLocales() { 332 return localesFromStrings(getAvailableCalendarLocalesNative()); 333 } 334 335 public static Locale[] getAvailableCollatorLocales() { 336 return localesFromStrings(getAvailableCollatorLocalesNative()); 337 } 338 339 public static Locale[] getAvailableDateFormatLocales() { 340 return localesFromStrings(getAvailableDateFormatLocalesNative()); 341 } 342 343 public static Locale[] getAvailableDateFormatSymbolsLocales() { 344 return getAvailableDateFormatLocales(); 345 } 346 347 public static Locale[] getAvailableDecimalFormatSymbolsLocales() { 348 return getAvailableNumberFormatLocales(); 349 } 350 351 public static Locale[] getAvailableNumberFormatLocales() { 352 return localesFromStrings(getAvailableNumberFormatLocalesNative()); 353 } 354 355 public static String getBestDateTimePattern(String skeleton, String localeName) { 356 String key = skeleton + "\t" + localeName; 357 synchronized (CACHED_PATTERNS) { 358 String pattern = CACHED_PATTERNS.get(key); 359 if (pattern == null) { 360 pattern = getBestDateTimePatternNative(skeleton, localeName); 361 CACHED_PATTERNS.put(key, pattern); 362 } 363 return pattern; 364 } 365 } 366 367 private static native String getBestDateTimePatternNative(String skeleton, String localeName); 368 369 public static char[] getDateFormatOrder(String pattern) { 370 char[] result = new char[3]; 371 int resultIndex = 0; 372 boolean sawDay = false; 373 boolean sawMonth = false; 374 boolean sawYear = false; 375 376 for (int i = 0; i < pattern.length(); ++i) { 377 char ch = pattern.charAt(i); 378 if (ch == 'd' || ch == 'L' || ch == 'M' || ch == 'y') { 379 if (ch == 'd' && !sawDay) { 380 result[resultIndex++] = 'd'; 381 sawDay = true; 382 } else if ((ch == 'L' || ch == 'M') && !sawMonth) { 383 result[resultIndex++] = 'M'; 384 sawMonth = true; 385 } else if ((ch == 'y') && !sawYear) { 386 result[resultIndex++] = 'y'; 387 sawYear = true; 388 } 389 } else if (ch == 'G') { 390 // Ignore the era specifier, if present. 391 } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) { 392 throw new IllegalArgumentException("Bad pattern character '" + ch + "' in " + pattern); 393 } else if (ch == '\'') { 394 if (i < pattern.length() - 1 && pattern.charAt(i + 1) == '\'') { 395 ++i; 396 } else { 397 i = pattern.indexOf('\'', i + 1); 398 if (i == -1) { 399 throw new IllegalArgumentException("Bad quoting in " + pattern); 400 } 401 ++i; 402 } 403 } else { 404 // Ignore spaces and punctuation. 405 } 406 } 407 return result; 408 } 409 410 /** 411 * Returns the version of the CLDR data in use, such as "22.1.1". 412 */ 413 public static native String getCldrVersion(); 414 415 /** 416 * Returns the icu4c version in use, such as "50.1.1". 417 */ 418 public static native String getIcuVersion(); 419 420 /** 421 * Returns the Unicode version our ICU supports, such as "6.2". 422 */ 423 public static native String getUnicodeVersion(); 424 425 // --- Case mapping. 426 427 public static native String toLowerCase(String s, String localeName); 428 public static native String toUpperCase(String s, String localeName); 429 430 // --- Errors. 431 432 // Just the subset of error codes needed by CharsetDecoderICU/CharsetEncoderICU. 433 public static final int U_ZERO_ERROR = 0; 434 public static final int U_INVALID_CHAR_FOUND = 10; 435 public static final int U_TRUNCATED_CHAR_FOUND = 11; 436 public static final int U_ILLEGAL_CHAR_FOUND = 12; 437 public static final int U_BUFFER_OVERFLOW_ERROR = 15; 438 439 public static boolean U_FAILURE(int error) { 440 return error > U_ZERO_ERROR; 441 } 442 443 // --- Native methods accessing ICU's database. 444 445 private static native String[] getAvailableBreakIteratorLocalesNative(); 446 private static native String[] getAvailableCalendarLocalesNative(); 447 private static native String[] getAvailableCollatorLocalesNative(); 448 private static native String[] getAvailableDateFormatLocalesNative(); 449 private static native String[] getAvailableLocalesNative(); 450 private static native String[] getAvailableNumberFormatLocalesNative(); 451 452 public static native String[] getAvailableCurrencyCodes(); 453 public static native String getCurrencyCode(String countryCode); 454 public static native String getCurrencyDisplayName(String locale, String currencyCode); 455 public static native int getCurrencyFractionDigits(String currencyCode); 456 public static native int getCurrencyNumericCode(String currencyCode); 457 public static native String getCurrencySymbol(String locale, String currencyCode); 458 459 public static native String getDisplayCountryNative(String countryCode, String locale); 460 public static native String getDisplayLanguageNative(String languageCode, String locale); 461 public static native String getDisplayVariantNative(String variantCode, String locale); 462 public static native String getDisplayScriptNative(String variantCode, String locale); 463 464 public static native String getISO3CountryNative(String locale); 465 public static native String getISO3LanguageNative(String locale); 466 467 public static native String addLikelySubtags(String locale); 468 public static native String getScript(String locale); 469 470 private static native String[] getISOLanguagesNative(); 471 private static native String[] getISOCountriesNative(); 472 473 private static native String localeForLanguageTag(String languageTag, boolean strict); 474 public static native String languageTagForLocale(String locale); 475 476 static native boolean initLocaleDataNative(String locale, LocaleData result); 477 478 public static native void setDefaultLocale(String locale); 479 public static native String getDefaultLocale(); 480} 481