1/*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.android.providers.contacts;
18
19import android.icu.text.Transliterator;
20import android.text.TextUtils;
21import android.util.Log;
22
23import java.util.ArrayList;
24import java.util.Locale;
25
26
27/**
28 * An object to convert Chinese character to its corresponding pinyin string.
29 * For characters with multiple possible pinyin string, only one is selected
30 * according to ICU Transliterator class. Polyphone is not supported in this
31 * implementation.
32 */
33public class HanziToPinyin {
34    private static final String TAG = "HanziToPinyin";
35
36    private static HanziToPinyin sInstance;
37    private Transliterator mPinyinTransliterator;
38    private Transliterator mAsciiTransliterator;
39
40    public static class Token {
41        /**
42         * Separator between target string for each source char
43         */
44        public static final String SEPARATOR = " ";
45
46        public static final int LATIN = 1;
47        public static final int PINYIN = 2;
48        public static final int UNKNOWN = 3;
49
50        public Token() {
51        }
52
53        public Token(int type, String source, String target) {
54            this.type = type;
55            this.source = source;
56            this.target = target;
57        }
58
59        /**
60         * Type of this token, ASCII, PINYIN or UNKNOWN.
61         */
62        public int type;
63        /**
64         * Original string before translation.
65         */
66        public String source;
67        /**
68         * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
69         * original string in source.
70         */
71        public String target;
72    }
73
74    private HanziToPinyin() {
75        try {
76            mPinyinTransliterator = Transliterator.getInstance(
77                    "Han-Latin/Names; Latin-Ascii; Any-Upper");
78            mAsciiTransliterator = Transliterator.getInstance("Latin-Ascii");
79        } catch (IllegalArgumentException e) {
80            Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
81                  + " HanziToPinyin is disabled");
82        }
83    }
84
85    public boolean hasChineseTransliterator() {
86        return mPinyinTransliterator != null;
87    }
88
89    public static HanziToPinyin getInstance() {
90        synchronized (HanziToPinyin.class) {
91            if (sInstance == null) {
92                sInstance = new HanziToPinyin();
93            }
94            return sInstance;
95        }
96    }
97
98    private void tokenize(char character, Token token) {
99        token.source = Character.toString(character);
100
101        // ASCII
102        if (character < 128) {
103            token.type = Token.LATIN;
104            token.target = token.source;
105            return;
106        }
107
108        // Extended Latin. Transcode these to ASCII equivalents
109        if (character < 0x250 || (0x1e00 <= character && character < 0x1eff)) {
110            token.type = Token.LATIN;
111            token.target = mAsciiTransliterator == null ? token.source :
112                mAsciiTransliterator.transliterate(token.source);
113            return;
114        }
115
116        token.type = Token.PINYIN;
117        token.target = mPinyinTransliterator.transliterate(token.source);
118        if (TextUtils.isEmpty(token.target) ||
119            TextUtils.equals(token.source, token.target)) {
120            token.type = Token.UNKNOWN;
121            token.target = token.source;
122        }
123    }
124
125    public String transliterate(final String input) {
126        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
127            return null;
128        }
129        return mPinyinTransliterator.transliterate(input);
130    }
131
132    /**
133     * Convert the input to a array of tokens. The sequence of ASCII or Unknown characters without
134     * space will be put into a Token, One Hanzi character which has pinyin will be treated as a
135     * Token. If there is no Chinese transliterator, the empty token array is returned.
136     */
137    public ArrayList<Token> getTokens(final String input) {
138        ArrayList<Token> tokens = new ArrayList<Token>();
139        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
140            // return empty tokens.
141            return tokens;
142        }
143
144        final int inputLength = input.length();
145        final StringBuilder sb = new StringBuilder();
146        int tokenType = Token.LATIN;
147        Token token = new Token();
148
149        // Go through the input, create a new token when
150        // a. Token type changed
151        // b. Get the Pinyin of current charater.
152        // c. current character is space.
153        for (int i = 0; i < inputLength; i++) {
154            final char character = input.charAt(i);
155            if (Character.isSpaceChar(character)) {
156                if (sb.length() > 0) {
157                    addToken(sb, tokens, tokenType);
158                }
159            } else {
160                tokenize(character, token);
161                if (token.type == Token.PINYIN) {
162                    if (sb.length() > 0) {
163                        addToken(sb, tokens, tokenType);
164                    }
165                    tokens.add(token);
166                    token = new Token();
167                } else {
168                    if (tokenType != token.type && sb.length() > 0) {
169                        addToken(sb, tokens, tokenType);
170                    }
171                    sb.append(token.target);
172                }
173                tokenType = token.type;
174            }
175        }
176        if (sb.length() > 0) {
177            addToken(sb, tokens, tokenType);
178        }
179        return tokens;
180    }
181
182    private void addToken(
183            final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
184        String str = sb.toString();
185        tokens.add(new Token(tokenType, str, str));
186        sb.setLength(0);
187    }
188}
189