ICU.java revision de0eb683370d789e7bb25673b350b8dbf2ba5d69
1/*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package libcore.icu;
18
19import java.util.Collections;
20import java.util.HashMap;
21import java.util.HashSet;
22import java.util.LinkedHashSet;
23import java.util.Locale;
24import java.util.Map;
25import java.util.Set;
26import libcore.util.BasicLruCache;
27
28/**
29 * Makes ICU data accessible to Java.
30 */
31public final class ICU {
  // Cache used by getBestDateTimePattern(); keys are skeleton + "\t" + locale name.
  // Constructed with 8 — presumably the maximum number of cached entries (confirm
  // against BasicLruCache). Accessed under synchronized (CACHED_PATTERNS).
  private static final BasicLruCache<String, String> CACHED_PATTERNS =
      new BasicLruCache<String, String>(8);

  // Lazily populated by getAvailableLocales(); callers only ever receive clones.
  private static Locale[] availableLocalesCache;

  // Lazily populated by getISOCountries(); callers only ever receive clones.
  private static String[] isoCountries;

  // Lazily populated by getISOLanguages(); callers only ever receive clones.
  private static String[] isoLanguages;
40
41  /**
42   * Returns an array of two-letter ISO 639-1 language codes, either from ICU or our cache.
43   */
44  public static String[] getISOLanguages() {
45    if (isoLanguages == null) {
46      isoLanguages = getISOLanguagesNative();
47    }
48    return isoLanguages.clone();
49  }
50
51  /**
52   * Returns an array of two-letter ISO 3166 country codes, either from ICU or our cache.
53   */
54  public static String[] getISOCountries() {
55    if (isoCountries == null) {
56      isoCountries = getISOCountriesNative();
57    }
58    return isoCountries.clone();
59  }
60
61  public static Locale forLanguageTag(String languageTag, boolean strict) {
62    final String icuLocaleId = localeForLanguageTag(languageTag, strict);
63    if (icuLocaleId == null) {
64      // TODO: We should probably return "und" here. From what I can tell,
65      // this happens only when the language in the languageTag is bad.
66      // Investigate this a bit more.
67      return null;
68    }
69
70    return localeFromIcuLocaleId(icuLocaleId);
71  }
72
73  public static String toLanguageTag(Locale locale) {
74    return languageTagForLocale(localeIdFromLocale(locale));
75  }
76
  // Slots of the four-element String array that parseLangScriptRegionAndVariants
  // fills in and localeFromIcuLocaleId reads back.
  private static final int IDX_LANGUAGE = 0;
  private static final int IDX_SCRIPT = 1;
  private static final int IDX_REGION = 2;
  private static final int IDX_VARIANT = 3;
81
82  /*
83   * Parse the {Language, Script, Region, Variant*} section of the ICU locale
84   * ID. This is the bit that appears before the keyword separate "@". The general
85   * structure is a series of ASCII alphanumeric strings (subtags)
86   * separated by underscores.
87   *
88   * Each subtag is interpreted according to its position in the list of subtags
89   * AND its length (groan...). The various cases are explained in comments
90   * below.
91   */
92  private static void parseLangScriptRegionAndVariants(String string,
93          String[] outputArray) {
94    final int first = string.indexOf('_');
95    final int second = string.indexOf('_', first + 1);
96    final int third = string.indexOf('_', second + 1);
97
98    if (first == -1) {
99      outputArray[IDX_LANGUAGE] = string;
100    } else if (second == -1) {
101      // Language and country ("ja_JP") OR
102      // Language and script ("en_Latn") OR
103      // Language and variant ("en_POSIX").
104
105      outputArray[IDX_LANGUAGE] = string.substring(0, first);
106      final String secondString = string.substring(first + 1);
107
108      if (secondString.length() == 4) {
109          // 4 Letter ISO script code.
110          outputArray[IDX_SCRIPT] = secondString;
111      } else if (secondString.length() == 2 || secondString.length() == 3) {
112          // 2 or 3 Letter region code.
113          outputArray[IDX_REGION] = secondString;
114      } else {
115          // If we're here, the length of the second half is either 1 or greater
116          // than 5. Assume that ICU won't hand us malformed tags, and therefore
117          // assume the rest of the string is a series of variant tags.
118          outputArray[IDX_VARIANT] = secondString;
119      }
120    } else if (third == -1) {
121      // Language and country and variant ("ja_JP_TRADITIONAL") OR
122      // Language and script and variant ("en_Latn_POSIX") OR
123      // Language and script and region ("en_Latn_US"). OR
124      // Language and variant with multiple subtags ("en_POSIX_XISOP")
125
126      outputArray[IDX_LANGUAGE] = string.substring(0, first);
127      final String secondString = string.substring(first + 1, second);
128      final String thirdString = string.substring(second + 1);
129
130      if (secondString.length() == 4) {
131          // The second subtag is a script.
132          outputArray[IDX_SCRIPT] = secondString;
133
134          // The third subtag can be either a region or a variant, depending
135          // on its length.
136          if (thirdString.length() == 2 || thirdString.length() == 3 ||
137                  thirdString.isEmpty()) {
138              outputArray[IDX_REGION] = thirdString;
139          } else {
140              outputArray[IDX_VARIANT] = thirdString;
141          }
142      } else if (secondString.isEmpty() ||
143              secondString.length() == 2 || secondString.length() == 3) {
144          // The second string is a region, and the third a variant.
145          outputArray[IDX_REGION] = secondString;
146          outputArray[IDX_VARIANT] = thirdString;
147      } else {
148          // Variant with multiple subtags.
149          outputArray[IDX_VARIANT] = string.substring(first + 1);
150      }
151    } else {
152      // Language, script, region and variant with 1 or more subtags
153      // ("en_Latn_US_POSIX") OR
154      // Language, region and variant with 2 or more subtags
155      // (en_US_POSIX_VARIANT).
156      outputArray[IDX_LANGUAGE] = string.substring(0, first);
157      final String secondString = string.substring(first + 1, second);
158      if (secondString.length() == 4) {
159          outputArray[IDX_SCRIPT] = secondString;
160          outputArray[IDX_REGION] = string.substring(second + 1, third);
161          outputArray[IDX_VARIANT] = string.substring(third + 1);
162      } else {
163          outputArray[IDX_REGION] = secondString;
164          outputArray[IDX_VARIANT] = string.substring(second + 1);
165      }
166    }
167  }
168
169  /**
170   * Returns the appropriate {@code Locale} given a {@code String} of the form returned
171   * by {@code toString}. This is very lenient, and doesn't care what's between the underscores:
172   * this method can parse strings that {@code Locale.toString} won't produce.
173   * Used to remove duplication.
174   */
175  public static Locale localeFromIcuLocaleId(String localeId) {
176    // @ == ULOC_KEYWORD_SEPARATOR_UNICODE (uloc.h).
177    final int extensionsIndex = localeId.indexOf('@');
178
179    Map<Character, String> extensionsMap = Collections.EMPTY_MAP;
180    Map<String, String> unicodeKeywordsMap = Collections.EMPTY_MAP;
181    Set<String> unicodeAttributeSet = Collections.EMPTY_SET;
182
183    if (extensionsIndex != -1) {
184      extensionsMap = new HashMap<Character, String>();
185      unicodeKeywordsMap = new HashMap<String, String>();
186      unicodeAttributeSet = new HashSet<String>();
187
188      // ICU sends us a semi-colon (ULOC_KEYWORD_ITEM_SEPARATOR) delimited string
189      // containing all "keywords" it could parse. An ICU keyword is a key-value pair
190      // separated by an "=" (ULOC_KEYWORD_ASSIGN).
191      //
192      // Each keyword item can be one of three things :
193      // - A unicode extension attribute list: In this case the item key is "attribute"
194      //   and the value is a hyphen separated list of unicode attributes.
195      // - A unicode extension keyword: In this case, the item key will be larger than
196      //   1 char in length, and the value will be the unicode extension value.
197      // - A BCP-47 extension subtag: In this case, the item key will be exactly one
198      //   char in length, and the value will be a sequence of unparsed subtags that
199      //   represent the extension.
200      //
201      // Note that this implies that unicode extension keywords are "promoted" to
202      // to the same namespace as the top level extension subtags and their values.
203      // There can't be any collisions in practice because the BCP-47 spec imposes
204      // restrictions on their lengths.
205      final String extensionsString = localeId.substring(extensionsIndex + 1);
206      final String[] extensions = extensionsString.split(";");
207      for (String extension : extensions) {
208        // This is the special key for the unicode attributes
209        if (extension.startsWith("attribute=")) {
210          String unicodeAttributeValues = extension.substring("attribute=".length());
211          for (String unicodeAttribute : unicodeAttributeValues.split("-")) {
212            unicodeAttributeSet.add(unicodeAttribute);
213          }
214        } else {
215          final int separatorIndex = extension.indexOf('=');
216
217          if (separatorIndex == 1) {
218            // This is a BCP-47 extension subtag.
219            final String value = extension.substring(2);
220            final char extensionId = extension.charAt(0);
221
222            extensionsMap.put(extensionId, value);
223          } else {
224            // This is a unicode extension keyword.
225            unicodeKeywordsMap.put(extension.substring(0, separatorIndex),
226            extension.substring(separatorIndex + 1));
227          }
228        }
229      }
230    }
231
232    final String[] outputArray = new String[] { "", "", "", "" };
233    if (extensionsIndex == -1) {
234      parseLangScriptRegionAndVariants(localeId, outputArray);
235    } else {
236      parseLangScriptRegionAndVariants(localeId.substring(0, extensionsIndex),
237          outputArray);
238    }
239
240    return new Locale(outputArray[IDX_LANGUAGE], outputArray[IDX_REGION],
241        outputArray[IDX_VARIANT], outputArray[IDX_SCRIPT],
242        unicodeAttributeSet, unicodeKeywordsMap, extensionsMap, false);
243  }
244
245  /**
246   * Builds an ICU locale ID from the given locale. The format is very
247   * straightforward. It is a series of subtags in BCP 47 order
248   * {@code lang[_script][_country][_variant]} followed by the keyword
249   * separator {@code @} followed by a list of keywords. Each keyword is
250   * a key value pair, and appear in the form {@code k1=v1;k2=v2;...}.
251   *
252   * In this use case, each key is an extension identifier, and each value
253   * is the value of the extension.
254   */
255  public static String localeIdFromLocale(Locale l) {
256      StringBuilder b = new StringBuilder(16);
257      b.append(l.getLanguage());
258
259      final boolean hasScript = !l.getScript().isEmpty();
260      final boolean hasCountry = !l.getCountry().isEmpty();
261      final boolean hasVariant = !l.getVariant().isEmpty();
262
263      if (hasScript || hasCountry || hasVariant) {
264          b.append('_');
265          if (hasScript) {
266              b.append(l.getScript());
267              if (hasCountry || hasVariant) {
268                  b.append('_');
269              }
270          }
271
272          if (hasCountry) {
273              b.append(l.getCountry());
274              if (hasVariant) {
275                  b.append('_');
276              }
277          }
278
279          b.append(l.getVariant());
280      }
281
282      if (!l.getExtensionKeys().isEmpty()) {
283        b.append('@');
284        // The private use extension ('x') must show up last in the list
285        // so we cache its value here and append it right at the end.
286        String privateUseExtensionValue = null;
287        for (char c : l.getExtensionKeys()) {
288          if (c == Locale.PRIVATE_USE_EXTENSION) {
289            privateUseExtensionValue = l.getExtension(Locale.PRIVATE_USE_EXTENSION);
290          } else {
291            b.append(c);
292            b.append('=');
293            b.append(l.getExtension(c));
294            b.append(';');
295          }
296        }
297
298        if (privateUseExtensionValue != null) {
299          b.append(Locale.PRIVATE_USE_EXTENSION);
300          b.append('=');
301          b.append(privateUseExtensionValue);
302          b.append(';');
303        }
304      }
305
306      return b.toString();
307  }
308
309  public static Locale[] localesFromStrings(String[] localeNames) {
310    // We need to remove duplicates caused by the conversion of "he" to "iw", et cetera.
311    // Java needs the obsolete code, ICU needs the modern code, but we let ICU know about
312    // both so that we never need to convert back when talking to it.
313    LinkedHashSet<Locale> set = new LinkedHashSet<Locale>();
314    for (String localeName : localeNames) {
315      set.add(localeFromIcuLocaleId(localeName));
316    }
317    return set.toArray(new Locale[set.size()]);
318  }
319
320  public static Locale[] getAvailableLocales() {
321    if (availableLocalesCache == null) {
322      availableLocalesCache = localesFromStrings(getAvailableLocalesNative());
323    }
324    return availableLocalesCache.clone();
325  }
326
327  public static Locale[] getAvailableBreakIteratorLocales() {
328    return localesFromStrings(getAvailableBreakIteratorLocalesNative());
329  }
330
331  public static Locale[] getAvailableCalendarLocales() {
332    return localesFromStrings(getAvailableCalendarLocalesNative());
333  }
334
335  public static Locale[] getAvailableCollatorLocales() {
336    return localesFromStrings(getAvailableCollatorLocalesNative());
337  }
338
339  public static Locale[] getAvailableDateFormatLocales() {
340    return localesFromStrings(getAvailableDateFormatLocalesNative());
341  }
342
343  public static Locale[] getAvailableDateFormatSymbolsLocales() {
344    return getAvailableDateFormatLocales();
345  }
346
347  public static Locale[] getAvailableDecimalFormatSymbolsLocales() {
348    return getAvailableNumberFormatLocales();
349  }
350
351  public static Locale[] getAvailableNumberFormatLocales() {
352    return localesFromStrings(getAvailableNumberFormatLocalesNative());
353  }
354
355  public static String getBestDateTimePattern(String skeleton, String localeName) {
356    String key = skeleton + "\t" + localeName;
357    synchronized (CACHED_PATTERNS) {
358      String pattern = CACHED_PATTERNS.get(key);
359      if (pattern == null) {
360        pattern = getBestDateTimePatternNative(skeleton, localeName);
361        CACHED_PATTERNS.put(key, pattern);
362      }
363      return pattern;
364    }
365  }
366
367  private static native String getBestDateTimePatternNative(String skeleton, String localeName);
368
369  public static char[] getDateFormatOrder(String pattern) {
370    char[] result = new char[3];
371    int resultIndex = 0;
372    boolean sawDay = false;
373    boolean sawMonth = false;
374    boolean sawYear = false;
375
376    for (int i = 0; i < pattern.length(); ++i) {
377      char ch = pattern.charAt(i);
378      if (ch == 'd' || ch == 'L' || ch == 'M' || ch == 'y') {
379        if (ch == 'd' && !sawDay) {
380          result[resultIndex++] = 'd';
381          sawDay = true;
382        } else if ((ch == 'L' || ch == 'M') && !sawMonth) {
383          result[resultIndex++] = 'M';
384          sawMonth = true;
385        } else if ((ch == 'y') && !sawYear) {
386          result[resultIndex++] = 'y';
387          sawYear = true;
388        }
389      } else if (ch == 'G') {
390        // Ignore the era specifier, if present.
391      } else if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) {
392        throw new IllegalArgumentException("Bad pattern character '" + ch + "' in " + pattern);
393      } else if (ch == '\'') {
394        if (i < pattern.length() - 1 && pattern.charAt(i + 1) == '\'') {
395          ++i;
396        } else {
397          i = pattern.indexOf('\'', i + 1);
398          if (i == -1) {
399            throw new IllegalArgumentException("Bad quoting in " + pattern);
400          }
401          ++i;
402        }
403      } else {
404        // Ignore spaces and punctuation.
405      }
406    }
407    return result;
408  }
409
  /**
   * Returns the version of the CLDR data in use, such as "22.1.1".
   */
  public static native String getCldrVersion();

  /**
   * Returns the icu4c version in use, such as "50.1.1".
   */
  public static native String getIcuVersion();

  /**
   * Returns the Unicode version our ICU supports, such as "6.2".
   */
  public static native String getUnicodeVersion();

  // --- Case mapping.

  // Native case conversion of s. NOTE(review): presumably applies the casing rules
  // of the ICU locale named by localeName; the implementation is not visible here.
  public static native String toLowerCase(String s, String localeName);
  public static native String toUpperCase(String s, String localeName);
429
430  // --- Errors.
431
432  // Just the subset of error codes needed by CharsetDecoderICU/CharsetEncoderICU.
433  public static final int U_ZERO_ERROR = 0;
434  public static final int U_INVALID_CHAR_FOUND = 10;
435  public static final int U_TRUNCATED_CHAR_FOUND = 11;
436  public static final int U_ILLEGAL_CHAR_FOUND = 12;
437  public static final int U_BUFFER_OVERFLOW_ERROR = 15;
438
439  public static boolean U_FAILURE(int error) {
440    return error > U_ZERO_ERROR;
441  }
442
  // --- Native methods accessing ICU's database.
  //
  // The implementations live in native code that is not visible from this file;
  // the notes below describe only how this class uses the methods.

  // Each returns an array of ICU locale IDs that the matching public
  // getAvailable*Locales() method feeds to localesFromStrings().
  private static native String[] getAvailableBreakIteratorLocalesNative();
  private static native String[] getAvailableCalendarLocalesNative();
  private static native String[] getAvailableCollatorLocalesNative();
  private static native String[] getAvailableDateFormatLocalesNative();
  private static native String[] getAvailableLocalesNative();
  private static native String[] getAvailableNumberFormatLocalesNative();

  // Currency lookups. NOTE(review): the locale parameters are presumably ICU
  // locale IDs — confirm against the native implementation.
  public static native String[] getAvailableCurrencyCodes();
  public static native String getCurrencyCode(String countryCode);
  public static native String getCurrencyDisplayName(String locale, String currencyCode);
  public static native int getCurrencyFractionDigits(String currencyCode);
  public static native int getCurrencyNumericCode(String currencyCode);
  public static native String getCurrencySymbol(String locale, String currencyCode);

  // Display-name lookups.
  // NOTE(review): the first parameter of getDisplayScriptNative is named
  // variantCode, which looks like a copy-paste leftover; it presumably carries a
  // script code.
  public static native String getDisplayCountryNative(String countryCode, String locale);
  public static native String getDisplayLanguageNative(String languageCode, String locale);
  public static native String getDisplayVariantNative(String variantCode, String locale);
  public static native String getDisplayScriptNative(String variantCode, String locale);

  public static native String getISO3CountryNative(String locale);
  public static native String getISO3LanguageNative(String locale);

  public static native String addLikelySubtags(String locale);
  public static native String getScript(String locale);

  // Backing lookups for getISOLanguages() and getISOCountries(), which cache the
  // returned arrays.
  private static native String[] getISOLanguagesNative();
  private static native String[] getISOCountriesNative();

  // localeForLanguageTag backs forLanguageTag(), which treats a null return as
  // "no locale"; languageTagForLocale backs toLanguageTag() and is handed the
  // ICU locale ID built by localeIdFromLocale().
  private static native String localeForLanguageTag(String languageTag, boolean strict);
  public static native String languageTagForLocale(String locale);

  // NOTE(review): presumably fills in result for the given locale and returns
  // whether that succeeded — confirm against the caller in LocaleData.
  static native boolean initLocaleDataNative(String locale, LocaleData result);

  public static native void setDefaultLocale(String locale);
  public static native String getDefaultLocale();
480}
481