/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17package com.android.providers.contacts;
18
19import android.text.TextUtils;
20import android.util.Log;
21
22import java.util.ArrayList;
23import java.util.Locale;
24
25import libcore.icu.Transliterator;
26
27/**
28 * An object to convert Chinese character to its corresponding pinyin string.
29 * For characters with multiple possible pinyin string, only one is selected
30 * according to ICU Transliterator class. Polyphone is not supported in this
31 * implementation.
32 */
33public class HanziToPinyin {
34    private static final String TAG = "HanziToPinyin";
35
36    private static HanziToPinyin sInstance;
37    private Transliterator mPinyinTransliterator;
38    private Transliterator mAsciiTransliterator;
39
40    public static class Token {
41        /**
42         * Separator between target string for each source char
43         */
44        public static final String SEPARATOR = " ";
45
46        public static final int LATIN = 1;
47        public static final int PINYIN = 2;
48        public static final int UNKNOWN = 3;
49
50        public Token() {
51        }
52
53        public Token(int type, String source, String target) {
54            this.type = type;
55            this.source = source;
56            this.target = target;
57        }
58
59        /**
60         * Type of this token, ASCII, PINYIN or UNKNOWN.
61         */
62        public int type;
63        /**
64         * Original string before translation.
65         */
66        public String source;
67        /**
68         * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
69         * original string in source.
70         */
71        public String target;
72    }
73
74    private HanziToPinyin() {
75        try {
76            mPinyinTransliterator = new Transliterator("Han-Latin/Names; Latin-Ascii; Any-Upper");
77            mAsciiTransliterator = new Transliterator("Latin-Ascii");
78        } catch (RuntimeException e) {
79            Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
80                  + " HanziToPinyin is disabled");
81        }
82    }
83
84    public boolean hasChineseTransliterator() {
85        return mPinyinTransliterator != null;
86    }
87
88    public static HanziToPinyin getInstance() {
89        synchronized (HanziToPinyin.class) {
90            if (sInstance == null) {
91                sInstance = new HanziToPinyin();
92            }
93            return sInstance;
94        }
95    }
96
97    private void tokenize(char character, Token token) {
98        token.source = Character.toString(character);
99
100        // ASCII
101        if (character < 128) {
102            token.type = Token.LATIN;
103            token.target = token.source;
104            return;
105        }
106
107        // Extended Latin. Transcode these to ASCII equivalents
108        if (character < 0x250 || (0x1e00 <= character && character < 0x1eff)) {
109            token.type = Token.LATIN;
110            token.target = mAsciiTransliterator == null ? token.source :
111                mAsciiTransliterator.transliterate(token.source);
112            return;
113        }
114
115        token.type = Token.PINYIN;
116        token.target = mPinyinTransliterator.transliterate(token.source);
117        if (TextUtils.isEmpty(token.target) ||
118            TextUtils.equals(token.source, token.target)) {
119            token.type = Token.UNKNOWN;
120            token.target = token.source;
121        }
122    }
123
124    /**
125     * Convert the input to a array of tokens. The sequence of ASCII or Unknown characters without
126     * space will be put into a Token, One Hanzi character which has pinyin will be treated as a
127     * Token. If there is no Chinese transliterator, the empty token array is returned.
128     */
129    public ArrayList<Token> get(final String input) {
130        ArrayList<Token> tokens = new ArrayList<Token>();
131        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
132            // return empty tokens.
133            return tokens;
134        }
135
136        final int inputLength = input.length();
137        final StringBuilder sb = new StringBuilder();
138        int tokenType = Token.LATIN;
139        Token token = new Token();
140
141        // Go through the input, create a new token when
142        // a. Token type changed
143        // b. Get the Pinyin of current charater.
144        // c. current character is space.
145        for (int i = 0; i < inputLength; i++) {
146            final char character = input.charAt(i);
147            if (Character.isSpaceChar(character)) {
148                if (sb.length() > 0) {
149                    addToken(sb, tokens, tokenType);
150                }
151            } else {
152                tokenize(character, token);
153                if (token.type == Token.PINYIN) {
154                    if (sb.length() > 0) {
155                        addToken(sb, tokens, tokenType);
156                    }
157                    tokens.add(token);
158                    token = new Token();
159                } else {
160                    if (tokenType != token.type && sb.length() > 0) {
161                        addToken(sb, tokens, tokenType);
162                    }
163                    sb.append(token.target);
164                }
165                tokenType = token.type;
166            }
167        }
168        if (sb.length() > 0) {
169            addToken(sb, tokens, tokenType);
170        }
171        return tokens;
172    }
173
174    private void addToken(
175            final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
176        String str = sb.toString();
177        tokens.add(new Token(tokenType, str, str));
178        sb.setLength(0);
179    }
180}
181