1/*
2 * Copyright (C) 2010 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License
15 */
16
17package com.android.providers.contacts;
18
19import android.icu.text.AlphabeticIndex;
20import android.icu.text.AlphabeticIndex.ImmutableIndex;
21import android.icu.text.Transliterator;
22import android.provider.ContactsContract.FullNameStyle;
23import android.provider.ContactsContract.PhoneticNameStyle;
24import android.os.LocaleList;
25import android.text.TextUtils;
26import android.util.ArraySet;
27import android.util.Log;
28
29import com.android.providers.contacts.HanziToPinyin.Token;
30import com.google.common.annotations.VisibleForTesting;
31
32import java.lang.Character.UnicodeBlock;
33import java.util.ArrayList;
34import java.util.Collections;
35import java.util.HashSet;
36import java.util.Iterator;
37import java.util.Locale;
38import java.util.Set;
39
40
41/**
42 * This utility class provides specialized handling for locale specific
43 * information: labels, name lookup keys.
44 */
45public class ContactLocaleUtils {
46    public static final String TAG = "ContactLocale";
47
48    public static final Locale LOCALE_ARABIC = new Locale("ar");
49    public static final Locale LOCALE_GREEK = new Locale("el");
50    public static final Locale LOCALE_HEBREW = new Locale("he");
51    // Serbian and Ukrainian labels are complementary supersets of Russian
52    public static final Locale LOCALE_SERBIAN = new Locale("sr");
53    public static final Locale LOCALE_UKRAINIAN = new Locale("uk");
54    public static final Locale LOCALE_THAI = new Locale("th");
55
56    // -- Note for adding locales to sDefaultLabelLocales --
57    //
58    // AlphabeticIndex.getBucketLabel() uses a binary search across
59    // the entire label set so care should be taken about growing this
60    // set too large. The following set determines for which locales
61    // we will show labels other than your primary locale. General rules
62    // of thumb for adding a locale: should be a supported locale; and
63    // should not be included if from a name it is not deterministic
64    // which way to label it (so eg Chinese cannot be added because
65    // the labeling of a Chinese character varies between Simplified,
66    // Traditional, and Japanese locales). Use English only for all
67    // Latin based alphabets. Ukrainian and Serbian are chosen for
68    // Cyrillic because their alphabets are complementary supersets
69    // of Russian.
70    private static final Locale[] sDefaultLabelLocales = new Locale[]{
71            Locale.ENGLISH,
72            Locale.JAPANESE,
73            Locale.KOREAN,
74            LOCALE_THAI,
75            LOCALE_ARABIC,
76            LOCALE_HEBREW,
77            LOCALE_GREEK,
78            LOCALE_UKRAINIAN,
79            LOCALE_SERBIAN,
80    };
81
82    /**
83     * This class is the default implementation and should be the base class
84     * for other locales.
85     *
86     * sortKey: same as name
87     * nameLookupKeys: none
88     * labels: uses ICU AlphabeticIndex for labels and extends by labeling
89     *     phone numbers "#".  Eg English labels are: [A-Z], #, " "
90     */
91    private static class ContactLocaleUtilsBase {
92        private static final String EMPTY_STRING = "";
93        private static final String NUMBER_STRING = "#";
94
95        protected final ImmutableIndex mAlphabeticIndex;
96        private final int mAlphabeticIndexBucketCount;
97        private final int mNumberBucketIndex;
98        private final boolean mUsePinyinTransliterator;
99
100        public ContactLocaleUtilsBase(LocaleSet locales) {
101            mUsePinyinTransliterator = locales.shouldPreferSimplifiedChinese();
102
103            final ArraySet<Locale> addedLocales = new ArraySet<>();
104
105            // First, add from the primary locale (which may not be the first locale in the locale
106            // list).
107            AlphabeticIndex ai = new AlphabeticIndex(locales.getPrimaryLocale())
108                    .setMaxLabelCount(300);
109            addedLocales.add(locales.getPrimaryLocale());
110
111            // Next, add all locale form the locale list.
112            final LocaleList localeList = locales.getAllLocales();
113            for (int i = 0; i < localeList.size(); i++) {
114                addLabels(ai, localeList.get(i), addedLocales);
115            }
116            // Then add the default locales.
117            for (int i = 0; i < sDefaultLabelLocales.length; i++) {
118                addLabels(ai, sDefaultLabelLocales[i], addedLocales);
119            }
120            mAlphabeticIndex = ai.buildImmutableIndex();
121            mAlphabeticIndexBucketCount = mAlphabeticIndex.getBucketCount();
122            mNumberBucketIndex = mAlphabeticIndexBucketCount - 1;
123        }
124
125        private static void addLabels(
126                AlphabeticIndex ai, Locale locale, ArraySet<Locale> addedLocales) {
127            if (addedLocales.contains(locale)) {
128                return;
129            }
130            ai.addLabels(locale);
131            addedLocales.add(locale);
132        }
133
134        public String getSortKey(String name) {
135            return name;
136        }
137
138        public int getNumberBucketIndex() {
139            return mNumberBucketIndex;
140        }
141
142        /**
143         * Returns the bucket index for the specified string. AlphabeticIndex
144         * sorts strings into buckets numbered in order from 0 to N, where the
145         * exact value of N depends on how many representative index labels are
146         * used in a particular locale. This routine adds one additional bucket
147         * for phone numbers. It attempts to detect phone numbers and shifts
148         * the bucket indexes returned by AlphabeticIndex in order to make room
149         * for the new # bucket, so the returned range becomes 0 to N+1.
150         */
151        public int getBucketIndex(String name) {
152            boolean prefixIsNumeric = false;
153            final int length = name.length();
154            int offset = 0;
155            while (offset < length) {
156                int codePoint = Character.codePointAt(name, offset);
157                // Ignore standard phone number separators and identify any
158                // string that otherwise starts with a number.
159                if (Character.isDigit(codePoint)) {
160                    prefixIsNumeric = true;
161                    break;
162                } else if (!Character.isSpaceChar(codePoint) &&
163                           codePoint != '+' && codePoint != '(' &&
164                           codePoint != ')' && codePoint != '.' &&
165                           codePoint != '-' && codePoint != '#') {
166                    break;
167                }
168                offset += Character.charCount(codePoint);
169            }
170            if (prefixIsNumeric) {
171                return mNumberBucketIndex;
172            }
173
174            /**
175             * ICU 55 AlphabeticIndex doesn't support Simplified Chinese
176             * as a secondary locale so it is necessary to use the
177             * Pinyin transliterator. We also use this for a Simplified
178             * Chinese primary locale because it gives more accurate letter
179             * buckets. b/19835686
180             */
181            if (mUsePinyinTransliterator) {
182                name = HanziToPinyin.getInstance().transliterate(name);
183            }
184            final int bucket = mAlphabeticIndex.getBucketIndex(name);
185            if (bucket < 0) {
186                return -1;
187            }
188            if (bucket >= mNumberBucketIndex) {
189                return bucket + 1;
190            }
191            return bucket;
192        }
193
194        /**
195         * Returns the number of buckets in use (one more than AlphabeticIndex
196         * uses, because this class adds a bucket for phone numbers).
197         */
198        public int getBucketCount() {
199            return mAlphabeticIndexBucketCount + 1;
200        }
201
202        /**
203         * Returns the label for the specified bucket index if a valid index,
204         * otherwise returns an empty string. '#' is returned for the phone
205         * number bucket; for all others, the AlphabeticIndex label is returned.
206         */
207        public String getBucketLabel(int bucketIndex) {
208            if (bucketIndex < 0 || bucketIndex >= getBucketCount()) {
209                return EMPTY_STRING;
210            } else if (bucketIndex == mNumberBucketIndex) {
211                return NUMBER_STRING;
212            } else if (bucketIndex > mNumberBucketIndex) {
213                --bucketIndex;
214            }
215            return mAlphabeticIndex.getBucket(bucketIndex).getLabel();
216        }
217
218        @SuppressWarnings("unused")
219        public Iterator<String> getNameLookupKeys(String name, int nameStyle) {
220            return null;
221        }
222
223        public ArrayList<String> getLabels() {
224            final int bucketCount = getBucketCount();
225            final ArrayList<String> labels = new ArrayList<String>(bucketCount);
226            for(int i = 0; i < bucketCount; ++i) {
227                labels.add(getBucketLabel(i));
228            }
229            return labels;
230        }
231    }
232
233    /**
234     * Japanese specific locale overrides.
235     *
236     * sortKey: unchanged (same as name)
237     * nameLookupKeys: unchanged (none)
238     * labels: extends default labels by labeling unlabeled CJ characters
239     *     with the Japanese character 他 ("misc"). Japanese labels are:
240     *     あ, か, さ, た, な, は, ま, や, ら, わ, 他, [A-Z], #, " "
241     */
242    private static class JapaneseContactUtils extends ContactLocaleUtilsBase {
243        // \u4ed6 is Japanese character 他 ("misc")
244        private static final String JAPANESE_MISC_LABEL = "\u4ed6";
245        private final int mMiscBucketIndex;
246
247        public JapaneseContactUtils(LocaleSet locales) {
248            super(locales);
249            // Determine which bucket AlphabeticIndex is lumping unclassified
250            // Japanese characters into by looking up the bucket index for
251            // a representative Kanji/CJK unified ideograph (\u65e5 is the
252            // character '日').
253            mMiscBucketIndex = super.getBucketIndex("\u65e5");
254        }
255
256        // Set of UnicodeBlocks for unified CJK (Chinese) characters and
257        // Japanese characters. This includes all code blocks that might
258        // contain a character used in Japanese (which is why unified CJK
259        // blocks are included but Korean Hangul and jamo are not).
260        private static final Set<Character.UnicodeBlock> CJ_BLOCKS;
261        static {
262            Set<UnicodeBlock> set = new HashSet<UnicodeBlock>();
263            set.add(UnicodeBlock.HIRAGANA);
264            set.add(UnicodeBlock.KATAKANA);
265            set.add(UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS);
266            set.add(UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS);
267            set.add(UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS);
268            set.add(UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A);
269            set.add(UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B);
270            set.add(UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION);
271            set.add(UnicodeBlock.CJK_RADICALS_SUPPLEMENT);
272            set.add(UnicodeBlock.CJK_COMPATIBILITY);
273            set.add(UnicodeBlock.CJK_COMPATIBILITY_FORMS);
274            set.add(UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS);
275            set.add(UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT);
276            CJ_BLOCKS = Collections.unmodifiableSet(set);
277        }
278
279        /**
280         * Helper routine to identify unlabeled Chinese or Japanese characters
281         * to put in a 'misc' bucket.
282         *
283         * @return true if the specified Unicode code point is Chinese or
284         *              Japanese
285         */
286        private static boolean isChineseOrJapanese(int codePoint) {
287            return CJ_BLOCKS.contains(UnicodeBlock.of(codePoint));
288        }
289
290        /**
291         * Returns the bucket index for the specified string. Adds an
292         * additional 'misc' bucket for Kanji characters to the base class set.
293         */
294        @Override
295        public int getBucketIndex(String name) {
296            final int bucketIndex = super.getBucketIndex(name);
297            if ((bucketIndex == mMiscBucketIndex &&
298                 !isChineseOrJapanese(Character.codePointAt(name, 0))) ||
299                bucketIndex > mMiscBucketIndex) {
300                return bucketIndex + 1;
301            }
302            return bucketIndex;
303        }
304
305        /**
306         * Returns the number of buckets in use (one more than the base class
307         * uses, because this class adds a bucket for Kanji).
308         */
309        @Override
310        public int getBucketCount() {
311            return super.getBucketCount() + 1;
312        }
313
314        /**
315         * Returns the label for the specified bucket index if a valid index,
316         * otherwise returns an empty string. '他' is returned for unclassified
317         * Kanji; for all others, the label determined by the base class is
318         * returned.
319         */
320        @Override
321        public String getBucketLabel(int bucketIndex) {
322            if (bucketIndex == mMiscBucketIndex) {
323                return JAPANESE_MISC_LABEL;
324            } else if (bucketIndex > mMiscBucketIndex) {
325                --bucketIndex;
326            }
327            return super.getBucketLabel(bucketIndex);
328        }
329
330        @Override
331        public Iterator<String> getNameLookupKeys(String name, int nameStyle) {
332            // Hiragana and Katakana will be positively identified as Japanese.
333            if (nameStyle == PhoneticNameStyle.JAPANESE) {
334                return getRomajiNameLookupKeys(name);
335            }
336            return null;
337        }
338
339        private static boolean mInitializedTransliterator;
340        private static Transliterator mJapaneseTransliterator;
341
342        private static Transliterator getJapaneseTransliterator() {
343            synchronized(JapaneseContactUtils.class) {
344                if (!mInitializedTransliterator) {
345                    mInitializedTransliterator = true;
346                    Transliterator t = null;
347                    try {
348                        t = Transliterator.getInstance("Hiragana-Latin; Katakana-Latin;"
349                                + " Latin-Ascii");
350                    } catch (IllegalArgumentException e) {
351                        Log.w(TAG, "Hiragana/Katakana-Latin transliterator data"
352                                + " is missing");
353                    }
354                    mJapaneseTransliterator = t;
355                }
356                return mJapaneseTransliterator;
357            }
358        }
359
360        public static Iterator<String> getRomajiNameLookupKeys(String name) {
361            final Transliterator t = getJapaneseTransliterator();
362            if (t == null) {
363                return null;
364            }
365            final String romajiName = t.transliterate(name);
366            if (TextUtils.isEmpty(romajiName) ||
367                    TextUtils.equals(name, romajiName)) {
368                return null;
369            }
370            final HashSet<String> keys = new HashSet<String>();
371            keys.add(romajiName);
372            return keys.iterator();
373        }
374    }
375
376    /**
377     * Simplified Chinese specific locale overrides. Uses ICU Transliterator
378     * for generating pinyin transliteration.
379     *
380     * sortKey: unchanged (same as name)
381     * nameLookupKeys: adds additional name lookup keys
382     *     - Chinese character's pinyin and pinyin's initial character.
383     *     - Latin word and initial character.
384     * labels: unchanged
385     *     Simplified Chinese labels are the same as English: [A-Z], #, " "
386     */
387    private static class SimplifiedChineseContactUtils
388        extends ContactLocaleUtilsBase {
389        public SimplifiedChineseContactUtils(LocaleSet locales) {
390            super(locales);
391        }
392
393        @Override
394        public Iterator<String> getNameLookupKeys(String name, int nameStyle) {
395            if (nameStyle != FullNameStyle.JAPANESE &&
396                    nameStyle != FullNameStyle.KOREAN) {
397                return getPinyinNameLookupKeys(name);
398            }
399            return null;
400        }
401
402        public static Iterator<String> getPinyinNameLookupKeys(String name) {
403            // TODO : Reduce the object allocation.
404            HashSet<String> keys = new HashSet<String>();
405            ArrayList<Token> tokens = HanziToPinyin.getInstance().getTokens(name);
406            final int tokenCount = tokens.size();
407            final StringBuilder keyPinyin = new StringBuilder();
408            final StringBuilder keyInitial = new StringBuilder();
409            // There is no space among the Chinese Characters, the variant name
410            // lookup key wouldn't work for Chinese. The keyOriginal is used to
411            // build the lookup keys for itself.
412            final StringBuilder keyOriginal = new StringBuilder();
413            for (int i = tokenCount - 1; i >= 0; i--) {
414                final Token token = tokens.get(i);
415                if (Token.UNKNOWN == token.type) {
416                    continue;
417                }
418                if (Token.PINYIN == token.type) {
419                    keyPinyin.insert(0, token.target);
420                    keyInitial.insert(0, token.target.charAt(0));
421                } else if (Token.LATIN == token.type) {
422                    // Avoid adding space at the end of String.
423                    if (keyPinyin.length() > 0) {
424                        keyPinyin.insert(0, ' ');
425                    }
426                    if (keyOriginal.length() > 0) {
427                        keyOriginal.insert(0, ' ');
428                    }
429                    keyPinyin.insert(0, token.source);
430                    keyInitial.insert(0, token.source.charAt(0));
431                }
432                keyOriginal.insert(0, token.source);
433                keys.add(keyOriginal.toString());
434                keys.add(keyPinyin.toString());
435                keys.add(keyInitial.toString());
436            }
437            return keys.iterator();
438        }
439    }
440
441    private static ContactLocaleUtils sSingleton;
442
443    private final LocaleSet mLocales;
444    private final ContactLocaleUtilsBase mUtils;
445
446    private ContactLocaleUtils(LocaleSet locales) {
447        if (locales == null) {
448            mLocales = LocaleSet.newDefault();
449        } else {
450            mLocales = locales;
451        }
452        if (mLocales.shouldPreferJapanese()) {
453            mUtils = new JapaneseContactUtils(mLocales);
454        } else if (mLocales.shouldPreferSimplifiedChinese()) {
455            mUtils = new SimplifiedChineseContactUtils(mLocales);
456        } else {
457            mUtils = new ContactLocaleUtilsBase(mLocales);
458        }
459        Log.i(TAG, "AddressBook Labels [" + mLocales.toString() + "]: "
460                + getLabels().toString());
461    }
462
463    public boolean isLocale(LocaleSet locales) {
464        return mLocales.equals(locales);
465    }
466
467    public static synchronized ContactLocaleUtils getInstance() {
468        if (sSingleton == null) {
469            sSingleton = new ContactLocaleUtils(LocaleSet.newDefault());
470        }
471        return sSingleton;
472    }
473
474    @VisibleForTesting
475    public static synchronized void setLocaleForTest(Locale... locales) {
476        setLocales(LocaleSet.newForTest(locales));
477    }
478
479    public static synchronized void setLocales(LocaleSet locales) {
480        if (sSingleton == null || !sSingleton.isLocale(locales)) {
481            sSingleton = new ContactLocaleUtils(locales);
482        }
483    }
484
485    public String getSortKey(String name, int nameStyle) {
486        return mUtils.getSortKey(name);
487    }
488
489    public int getBucketIndex(String name) {
490        return mUtils.getBucketIndex(name);
491    }
492
493    public int getNumberBucketIndex() {
494        return mUtils.getNumberBucketIndex();
495    }
496
497    public int getBucketCount() {
498        return mUtils.getBucketCount();
499    }
500
501    public String getBucketLabel(int bucketIndex) {
502        return mUtils.getBucketLabel(bucketIndex);
503    }
504
505    public String getLabel(String name) {
506        return getBucketLabel(getBucketIndex(name));
507    }
508
509    public ArrayList<String> getLabels() {
510        return mUtils.getLabels();
511    }
512
513    /**
514     *  Determine which utility should be used for generating NameLookupKey.
515     *  (ie, whether we generate Romaji or Pinyin lookup keys or not)
516     *
517     *  Hiragana and Katakana are tagged as JAPANESE; Kanji is unclassified
518     *  and tagged as CJK. For Hiragana/Katakana names, generate Romaji
519     *  lookup keys when not in a Chinese or Korean locale.
520     *
521     *  Otherwise, use the default behavior of that locale:
522     *  a. For Japan, generate Romaji lookup keys for Hiragana/Katakana.
523     *  b. For Simplified Chinese locale, generate Pinyin lookup keys.
524     */
525    public Iterator<String> getNameLookupKeys(String name, int nameStyle) {
526        if (!mLocales.isPrimaryLocaleCJK()) {
527            if (mLocales.shouldPreferSimplifiedChinese()) {
528                if (nameStyle == FullNameStyle.CHINESE ||
529                        nameStyle == FullNameStyle.CJK) {
530                    return SimplifiedChineseContactUtils.getPinyinNameLookupKeys(name);
531                }
532            } else {
533                if (nameStyle == FullNameStyle.JAPANESE) {
534                    return JapaneseContactUtils.getRomajiNameLookupKeys(name);
535                }
536            }
537        }
538        return mUtils.getNameLookupKeys(name, nameStyle);
539    }
540
541}
542