/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
1671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
1771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawapackage com.android.providers.contacts;
1871340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
19599753c83f4d0b4acb7c07f493b1f2accd78778eRayhaan Jaufeerallyimport android.icu.text.Transliterator;
2071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawaimport android.text.TextUtils;
2171340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawaimport android.util.Log;
2271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
2371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawaimport java.util.ArrayList;
2471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawaimport java.util.Locale;
2571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
2679b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner
2771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa/**
2879b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner * An object to convert Chinese character to its corresponding pinyin string.
2979b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner * For characters with multiple possible pinyin string, only one is selected
3079b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner * according to ICU Transliterator class. Polyphone is not supported in this
3179b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner * implementation.
3271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa */
3371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawapublic class HanziToPinyin {
3471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    private static final String TAG = "HanziToPinyin";
3571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
3671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    private static HanziToPinyin sInstance;
370f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner    private Transliterator mPinyinTransliterator;
380f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner    private Transliterator mAsciiTransliterator;
3971340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
4071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    public static class Token {
4171340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        /**
4271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa         * Separator between target string for each source char
4371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa         */
4471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        public static final String SEPARATOR = " ";
4571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
4671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        public static final int LATIN = 1;
4771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        public static final int PINYIN = 2;
4871340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        public static final int UNKNOWN = 3;
4971340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
5071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        public Token() {
5171340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        }
5271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
5371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        public Token(int type, String source, String target) {
5471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            this.type = type;
5571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            this.source = source;
5671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            this.target = target;
5771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        }
5871340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
5971340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        /**
6071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa         * Type of this token, ASCII, PINYIN or UNKNOWN.
6171340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa         */
6271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        public int type;
6371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        /**
6471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa         * Original string before translation.
6571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa         */
6671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        public String source;
6771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        /**
6871340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa         * Translated string of source. For Han, target is corresponding Pinyin. Otherwise target is
6971340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa         * original string in source.
7071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa         */
7171340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        public String target;
7271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    }
7371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
7479b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner    private HanziToPinyin() {
7579b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner        try {
76599753c83f4d0b4acb7c07f493b1f2accd78778eRayhaan Jaufeerally            mPinyinTransliterator = Transliterator.getInstance(
77599753c83f4d0b4acb7c07f493b1f2accd78778eRayhaan Jaufeerally                    "Han-Latin/Names; Latin-Ascii; Any-Upper");
78599753c83f4d0b4acb7c07f493b1f2accd78778eRayhaan Jaufeerally            mAsciiTransliterator = Transliterator.getInstance("Latin-Ascii");
79599753c83f4d0b4acb7c07f493b1f2accd78778eRayhaan Jaufeerally        } catch (IllegalArgumentException e) {
8079b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner            Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
8179b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner                  + " HanziToPinyin is disabled");
8279b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner        }
8379b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner    }
8479b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner
8579b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner    public boolean hasChineseTransliterator() {
8679b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner        return mPinyinTransliterator != null;
8771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    }
8871340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
8971340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    public static HanziToPinyin getInstance() {
9071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        synchronized (HanziToPinyin.class) {
9179b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner            if (sInstance == null) {
9279b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner                sInstance = new HanziToPinyin();
9371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            }
9471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            return sInstance;
9571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        }
9671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    }
9771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
980f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner    private void tokenize(char character, Token token) {
9979b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner        token.source = Character.toString(character);
10079b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner
1010f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner        // ASCII
1020f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner        if (character < 128) {
10371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            token.type = Token.LATIN;
10479b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner            token.target = token.source;
1050f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner            return;
1060f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner        }
1070f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner
1080f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner        // Extended Latin. Transcode these to ASCII equivalents
1090f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner        if (character < 0x250 || (0x1e00 <= character && character < 0x1eff)) {
1100f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner            token.type = Token.LATIN;
1110f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner            token.target = mAsciiTransliterator == null ? token.source :
1120f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner                mAsciiTransliterator.transliterate(token.source);
1130f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner            return;
11471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        }
11571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
11671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        token.type = Token.PINYIN;
11779b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner        token.target = mPinyinTransliterator.transliterate(token.source);
11879b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner        if (TextUtils.isEmpty(token.target) ||
11979b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner            TextUtils.equals(token.source, token.target)) {
12084ae70d111070cc512790f0b2a573347fa9daca5Xiaotao Duan            token.type = Token.UNKNOWN;
12184ae70d111070cc512790f0b2a573347fa9daca5Xiaotao Duan            token.target = token.source;
12284ae70d111070cc512790f0b2a573347fa9daca5Xiaotao Duan        }
12371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    }
12471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
125d4dbd063cf88e70b045607aa865b2fdb2329bf45Jay Shrauner    public String transliterate(final String input) {
126d4dbd063cf88e70b045607aa865b2fdb2329bf45Jay Shrauner        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
127d4dbd063cf88e70b045607aa865b2fdb2329bf45Jay Shrauner            return null;
128d4dbd063cf88e70b045607aa865b2fdb2329bf45Jay Shrauner        }
129d4dbd063cf88e70b045607aa865b2fdb2329bf45Jay Shrauner        return mPinyinTransliterator.transliterate(input);
130d4dbd063cf88e70b045607aa865b2fdb2329bf45Jay Shrauner    }
131d4dbd063cf88e70b045607aa865b2fdb2329bf45Jay Shrauner
13271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    /**
13371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa     * Convert the input to a array of tokens. The sequence of ASCII or Unknown characters without
13471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa     * space will be put into a Token, One Hanzi character which has pinyin will be treated as a
13579b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner     * Token. If there is no Chinese transliterator, the empty token array is returned.
13671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa     */
137d4dbd063cf88e70b045607aa865b2fdb2329bf45Jay Shrauner    public ArrayList<Token> getTokens(final String input) {
13871340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        ArrayList<Token> tokens = new ArrayList<Token>();
13979b591543a37f256fcc93dc8b47017afc9c8dbcaJay Shrauner        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
14071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            // return empty tokens.
14171340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            return tokens;
14271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        }
1430f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner
14471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        final int inputLength = input.length();
14571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        final StringBuilder sb = new StringBuilder();
14671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        int tokenType = Token.LATIN;
1470f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner        Token token = new Token();
1480f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner
14971340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        // Go through the input, create a new token when
15071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        // a. Token type changed
15171340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        // b. Get the Pinyin of current charater.
15271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        // c. current character is space.
15371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        for (int i = 0; i < inputLength; i++) {
15471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            final char character = input.charAt(i);
1550f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner            if (Character.isSpaceChar(character)) {
15671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                if (sb.length() > 0) {
15771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                    addToken(sb, tokens, tokenType);
15871340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                }
15971340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            } else {
1600f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner                tokenize(character, token);
1610f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner                if (token.type == Token.PINYIN) {
16271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                    if (sb.length() > 0) {
16371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                        addToken(sb, tokens, tokenType);
16471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                    }
1650f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner                    tokens.add(token);
1660f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner                    token = new Token();
16771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                } else {
1680f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner                    if (tokenType != token.type && sb.length() > 0) {
16971340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                        addToken(sb, tokens, tokenType);
17071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                    }
1710f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner                    sb.append(token.target);
17271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa                }
1730f4b7a9bfe4b2079a7c5bb22b4114b5672639b05Jay Shrauner                tokenType = token.type;
17471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            }
17571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        }
17671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        if (sb.length() > 0) {
17771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            addToken(sb, tokens, tokenType);
17871340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        }
17971340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        return tokens;
18071340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    }
18171340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa
18271340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    private void addToken(
18371340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa            final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
18471340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        String str = sb.toString();
18571340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        tokens.add(new Token(tokenType, str, str));
18671340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa        sb.setLength(0);
18771340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa    }
18871340347b4862d4b1368a5d69d1667e2245952e4Daisuke Miyakawa}
189