/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.android.providers.contacts;

import android.text.TextUtils;
import android.util.Log;

import java.util.ArrayList;
import java.util.Locale;

import libcore.icu.Transliterator;

/**
 * Converts Chinese characters to their corresponding pinyin strings.
 * For characters with several possible pinyin readings, only one is selected,
 * as decided by the ICU {@link Transliterator}. Polyphones are not supported
 * by this implementation.
 */
public class HanziToPinyin {
    private static final String TAG = "HanziToPinyin";

    /** Lazily created singleton; guarded by the {@code HanziToPinyin.class} monitor. */
    private static HanziToPinyin sInstance;

    /** Transliterator for non-Latin characters; {@code null} if it could not be created. */
    private Transliterator mPinyinTransliterator;

    /** Transliterator for extended Latin characters; {@code null} if it could not be created. */
    private Transliterator mAsciiTransliterator;

    /**
     * One unit of tokenized output: either a single character that was transliterated
     * to pinyin, or a run of consecutive LATIN or UNKNOWN characters.
     */
    public static class Token {
        /**
         * Separator between target string for each source char.
         */
        public static final String SEPARATOR = " ";

        public static final int LATIN = 1;
        public static final int PINYIN = 2;
        public static final int UNKNOWN = 3;

        /** Creates an empty token; {@code type} is 0 and both strings are {@code null}. */
        public Token() {
        }

        /**
         * Creates a fully populated token.
         *
         * @param type one of {@link #LATIN}, {@link #PINYIN} or {@link #UNKNOWN}
         * @param source the original text
         * @param target the translated text
         */
        public Token(int type, String source, String target) {
            this.type = type;
            this.source = source;
            this.target = target;
        }

        /**
         * Type of this token: {@link #LATIN}, {@link #PINYIN} or {@link #UNKNOWN}.
         */
        public int type;

        /**
         * Original string before translation.
         */
        public String source;

        /**
         * Translated string of source. For Han, target is the corresponding pinyin.
         * Otherwise target is the original string in source.
         */
        public String target;
    }

    /**
     * Creates both transliterators. If creation throws, the failure is logged and
     * whichever transliterator was not yet assigned stays {@code null}.
     */
    private HanziToPinyin() {
        try {
            mPinyinTransliterator = new Transliterator("Han-Latin/Names; Latin-Ascii; Any-Upper");
            mAsciiTransliterator = new Transliterator("Latin-Ascii");
        } catch (RuntimeException e) {
            // Pass the exception along so the actual cause is visible in the log.
            Log.w(TAG, "Han-Latin/Names transliterator data is missing,"
                    + " HanziToPinyin is disabled", e);
        }
    }

    /**
     * @return {@code true} if the pinyin transliterator was created successfully.
     */
    public boolean hasChineseTransliterator() {
        return mPinyinTransliterator != null;
    }

    /**
     * Returns the shared instance, creating it on first use.
     */
    public static HanziToPinyin getInstance() {
        synchronized (HanziToPinyin.class) {
            if (sInstance == null) {
                sInstance = new HanziToPinyin();
            }
            return sInstance;
        }
    }

    /**
     * Classifies a single character and fills in {@code token} with the result.
     * Callers must make sure {@link #hasChineseTransliterator()} is {@code true}
     * before calling this with a character outside the Latin ranges.
     *
     * @param character the character to classify
     * @param token receives the source, target and type for {@code character}
     */
    private void tokenize(char character, Token token) {
        token.source = Character.toString(character);

        // ASCII
        if (character < 128) {
            token.type = Token.LATIN;
            token.target = token.source;
            return;
        }

        // Extended Latin. Transcode these to ASCII equivalents.
        // The second range is the Latin Extended Additional block, U+1E00..U+1EFF;
        // the upper bound is inclusive so that U+1EFF is not sent to the Han path.
        if (character < 0x250 || (0x1e00 <= character && character <= 0x1eff)) {
            token.type = Token.LATIN;
            token.target = mAsciiTransliterator == null ? token.source :
                    mAsciiTransliterator.transliterate(token.source);
            return;
        }

        token.type = Token.PINYIN;
        token.target = mPinyinTransliterator.transliterate(token.source);
        // An empty or unchanged result means the transliterator had no reading for it.
        if (TextUtils.isEmpty(token.target) ||
                TextUtils.equals(token.source, token.target)) {
            token.type = Token.UNKNOWN;
            token.target = token.source;
        }
    }

    /**
     * Converts the input to a list of tokens. A run of consecutive LATIN characters,
     * or of consecutive UNKNOWN characters, is merged into one token; a space character
     * or a change of type ends the run. Each character that has a pinyin reading becomes
     * a token of its own. Space characters themselves are not emitted.
     *
     * <p>The input is processed one {@code char} (UTF-16 code unit) at a time.
     *
     * @param input the text to tokenize; may be {@code null} or empty
     * @return the tokens in input order; empty if there is no Chinese transliterator
     *         or the input is empty
     */
    public ArrayList<Token> get(final String input) {
        final ArrayList<Token> tokens = new ArrayList<Token>();
        if (!hasChineseTransliterator() || TextUtils.isEmpty(input)) {
            // return empty tokens.
            return tokens;
        }

        final int inputLength = input.length();
        final StringBuilder sb = new StringBuilder();
        int tokenType = Token.LATIN;
        Token token = new Token();

        // Go through the input, create a new token when
        // a. Token type changed
        // b. Get the Pinyin of current character.
        // c. current character is space.
        for (int i = 0; i < inputLength; i++) {
            final char character = input.charAt(i);
            if (Character.isSpaceChar(character)) {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                continue;
            }

            tokenize(character, token);
            if (token.type == Token.PINYIN) {
                if (sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                tokens.add(token);
                // The buffer is empty here, so this value is only compared against
                // the next character's type and never used to label a merged token.
                tokenType = Token.PINYIN;
                // The token object was handed to the list; start a fresh scratch token.
                token = new Token();
            } else {
                if (tokenType != token.type && sb.length() > 0) {
                    addToken(sb, tokens, tokenType);
                }
                sb.append(token.target);
                tokenType = token.type;
            }
        }
        if (sb.length() > 0) {
            addToken(sb, tokens, tokenType);
        }
        return tokens;
    }

    /**
     * Appends the buffered text to {@code tokens} as a single token and clears the buffer.
     * The buffered text is used as both source and target of the new token.
     *
     * @param sb buffer holding the accumulated target text; emptied by this call
     * @param tokens list that receives the new token
     * @param tokenType type to assign to the new token
     */
    private void addToken(
            final StringBuilder sb, final ArrayList<Token> tokens, final int tokenType) {
        String str = sb.toString();
        tokens.add(new Token(tokenType, str, str));
        sb.setLength(0);
    }
}