/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.providers.contacts;

import android.icu.text.Transliterator;
import android.text.TextUtils;
import android.util.Log;

import java.util.ArrayList;
import java.util.Locale;

/**
 * Converts Chinese characters to their corresponding pinyin strings.
 * For characters with several possible pinyin readings, only one is selected,
 * as chosen by the ICU {@link Transliterator}. Polyphones are not supported by
 * this implementation.
 */
public class HanziToPinyin {
    private static final String TAG = "HanziToPinyin";

    private static HanziToPinyin sInstance;

    /** Han to upper-case ASCII pinyin; {@code null} when the ICU data is unavailable. */
    private final Transliterator mPinyinTransliterator;
    /** Extended Latin to plain ASCII; {@code null} when it could not be created. */
    private final Transliterator mAsciiTransliterator;

    public static class Token {
        /**
         * Separator between target string for each source char
         */
        public static final String SEPARATOR = " ";

        public static final int LATIN = 1;
        public static final int PINYIN = 2;
        public static final int UNKNOWN = 3;

        public Token() {
        }

        public Token(int type, String source, String target) {
            this.type = type;
            this.source = source;
            this.target = target;
        }

        /**
         * Type of this token: {@link #LATIN}, {@link #PINYIN} or {@link #UNKNOWN}.
         */
        public int type;
        /**
         * Original string before translation.
         */
        public String source;
        /**
         * Translated string of source. For Han, target is the corresponding pinyin. Otherwise
         * target is the original string in source (extended Latin is transcoded to ASCII when
         * the ASCII transliterator is available).
         */
        public String target;
    }

    private HanziToPinyin() {
        Transliterator pinyin = null;
        Transliterator ascii = null;
        try {
            pinyin = Transliterator.getInstance("Han-Latin/Names; Latin-Ascii; Any-Upper");
            ascii = Transliterator.getInstance("Latin-Ascii");
        } catch (IllegalArgumentException e) {
            // Attach the exception so the log shows which transliterator ID could not be loaded.
            Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
                    + " HanziToPinyin is disabled", e);
        }
        mPinyinTransliterator = pinyin;
        mAsciiTransliterator = ascii;
    }

    /**
     * Returns whether the Han-to-pinyin transliterator was created successfully. When this is
     * {@code false}, {@link #transliterate} returns {@code null} and {@link #getTokens} returns
     * an empty list.
     */
    public boolean hasChineseTransliterator() {
        return mPinyinTransliterator != null;
    }

    /**
     * Returns the shared instance, creating it on first use.
     */
    public static HanziToPinyin getInstance() {
        synchronized (HanziToPinyin.class) {
            if (sInstance == null) {
                sInstance = new HanziToPinyin();
            }
            return sInstance;
        }
    }

    /**
     * Classifies a single UTF-16 code unit and fills in {@code token} accordingly.
     * Must only be called when {@link #hasChineseTransliterator()} is {@code true}.
     *
     * @param character the code unit to classify
     * @param token     receives the type, source and target for {@code character}
     */
    private void tokenize(char character, Token token) {
        token.source = Character.toString(character);

        // ASCII
        if (character < 128) {
            token.type = Token.LATIN;
            token.target = token.source;
            return;
        }

        // Extended Latin. Transcode these to ASCII equivalents.
        // Covers U+0080..U+024F and Latin Extended Additional U+1E00..U+1EFF; the upper bound
        // is inclusive so that U+1EFF is not mistaken for a pinyin candidate.
        if (character < 0x250 || (0x1e00 <= character && character <= 0x1eff)) {
            token.type = Token.LATIN;
            token.target = mAsciiTransliterator == null
                    ? token.source
                    : mAsciiTransliterator.transliterate(token.source);
            return;
        }

        token.type = Token.PINYIN;
        token.target = mPinyinTransliterator.transliterate(token.source);
        // An empty or unchanged result means the transliterator had no reading for it.
        if (TextUtils.isEmpty(token.target) || TextUtils.equals(token.source, token.target)) {
            token.type = Token.UNKNOWN;
            token.target = token.source;
        }
    }

    /**
     * Transliterates the whole input with the pinyin transliterator.
     *
     * @param input the string to convert
     * @return the transliterated string, or {@code null} if there is no Chinese transliterator
     *         or {@code input} is null or empty
     */
    public String transliterate(final String input) {
        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
            return null;
        }
        return mPinyinTransliterator.transliterate(input);
    }

    /**
     * Converts the input to a list of tokens. A sequence of Latin or unknown characters without
     * a space is put into a single Token; each Hanzi character which has pinyin is treated as
     * its own Token. If there is no Chinese transliterator, or the input is null or empty, an
     * empty token list is returned.
     *
     * @param input the string to tokenize
     * @return the tokens in input order; never {@code null}
     */
    public ArrayList<Token> getTokens(final String input) {
        final ArrayList<Token> tokens = new ArrayList<Token>();
        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
            // return empty tokens.
            return tokens;
        }

        final int inputLength = input.length();
        // Accumulates the current run of LATIN or UNKNOWN characters.
        final StringBuilder sb = new StringBuilder();
        // Type of the run held in sb; only meaningful while sb is non-empty.
        int tokenType = Token.LATIN;
        Token token = new Token();

        // Go through the input, create a new token when
        // a. Token type changed
        // b. Get the Pinyin of current character.
        // c. current character is space.
        for (int i = 0; i < inputLength; i++) {
            final char character = input.charAt(i);
            if (Character.isSpaceChar(character)) {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                continue;
            }

            tokenize(character, token);
            if (token.type == Token.PINYIN) {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                // The token object itself is stored, so a fresh scratch token is needed.
                tokens.add(token);
                token = new Token();
            } else {
                if (tokenType != token.type && sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                sb.append(token.target);
                tokenType = token.type;
            }
        }
        if (sb.length() > 0) {
            addToken(sb, tokens, tokenType);
        }
        return tokens;
    }

    /**
     * Appends the buffered run in {@code sb} to {@code tokens} as one token whose source and
     * target are both the buffered text, then clears the buffer.
     */
    private void addToken(
            final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
        String str = sb.toString();
        tokens.add(new Token(tokenType, str, str));
        sb.setLength(0);
    }
}