1/* GENERATED SOURCE. DO NOT MODIFY. */ 2/* 3 ******************************************************************************* 4 * Copyright (C) 2012-2014, International Business Machines Corporation and * 5 * others. All Rights Reserved. * 6 ******************************************************************************* 7 */ 8package android.icu.text; 9 10import static android.icu.impl.CharacterIteration.DONE32; 11import static android.icu.impl.CharacterIteration.current32; 12import static android.icu.impl.CharacterIteration.next32; 13 14import java.io.IOException; 15import java.text.CharacterIterator; 16 17import android.icu.impl.Assert; 18 19class CjkBreakEngine extends DictionaryBreakEngine { 20 private static final UnicodeSet fHangulWordSet = new UnicodeSet(); 21 private static final UnicodeSet fHanWordSet = new UnicodeSet(); 22 private static final UnicodeSet fKatakanaWordSet = new UnicodeSet(); 23 private static final UnicodeSet fHiraganaWordSet = new UnicodeSet(); 24 static { 25 fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]"); 26 fHanWordSet.applyPattern("[:Han:]"); 27 fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]"); 28 fHiraganaWordSet.applyPattern("[:Hiragana:]"); 29 30 // freeze them all 31 fHangulWordSet.freeze(); 32 fHanWordSet.freeze(); 33 fKatakanaWordSet.freeze(); 34 fHiraganaWordSet.freeze(); 35 } 36 37 private DictionaryMatcher fDictionary = null; 38 39 public CjkBreakEngine(boolean korean) throws IOException { 40 super(BreakIterator.KIND_WORD); 41 fDictionary = DictionaryData.loadDictionaryFor("Hira"); 42 if (korean) { 43 setCharacters(fHangulWordSet); 44 } else { //Chinese and Japanese 45 UnicodeSet cjSet = new UnicodeSet(); 46 cjSet = new UnicodeSet(); 47 cjSet.addAll(fHanWordSet); 48 cjSet.addAll(fKatakanaWordSet); 49 cjSet.addAll(fHiraganaWordSet); 50 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 51 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 52 setCharacters(cjSet); 53 } 54 } 55 56 public boolean equals(Object obj) { 57 if (obj instanceof CjkBreakEngine) { 58 CjkBreakEngine other = (CjkBreakEngine)obj; 59 return this.fSet.equals(other.fSet); 60 } 61 return false; 62 } 63 64 public int hashCode() { 65 return getClass().hashCode(); 66 } 67 68 private static final int kMaxKatakanaLength = 8; 69 private static final int kMaxKatakanaGroupLength = 20; 70 private static final int maxSnlp = 255; 71 private static final int kint32max = Integer.MAX_VALUE; 72 private static int getKatakanaCost(int wordlength) { 73 int katakanaCost[] = new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 }; 74 return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength]; 75 } 76 77 private static boolean isKatakana(int value) { 78 return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) || 79 (value >= 0xFF66 && value <= 0xFF9F); 80 } 81 82 public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos, 83 DequeI foundBreaks) { 84 if (startPos >= endPos) { 85 return 0; 86 } 87 88 inText.setIndex(startPos); 89 90 int inputLength = endPos - startPos; 91 int[] charPositions = new int[inputLength + 1]; 92 StringBuffer s = new StringBuffer(""); 93 inText.setIndex(startPos); 94 while (inText.getIndex() < endPos) { 95 s.append(inText.current()); 96 inText.next(); 97 } 98 String prenormstr = s.toString(); 99 boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES || 100 Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0); 101 CharacterIterator text; 102 int numChars = 0; 103 if (isNormalized) { 104 text = new java.text.StringCharacterIterator(prenormstr); 105 int index = 0; 106 charPositions[0] = 0; 107 while (index < prenormstr.length()) { 108 int codepoint = prenormstr.codePointAt(index); 109 index += Character.charCount(codepoint); 110 numChars++; 111 charPositions[numChars] = index; 112 } 113 } else { 114 String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC); 115 text = new java.text.StringCharacterIterator(normStr); 116 charPositions = new int[normStr.length() + 1]; 117 Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0); 118 int index = 0; 119 charPositions[0] = 0; 120 while (index < normalizer.endIndex()) { 121 normalizer.next(); 122 numChars++; 123 index = normalizer.getIndex(); 124 charPositions[numChars] = index; 125 } 126 } 127 128 // From here on out, do the algorithm. Note that our indices 129 // refer to indices within the normalized string. 130 int[] bestSnlp = new int[numChars + 1]; 131 bestSnlp[0] = 0; 132 for (int i = 1; i <= numChars; i++) { 133 bestSnlp[i] = kint32max; 134 } 135 136 int[] prev = new int[numChars + 1]; 137 for (int i = 0; i <= numChars; i++) { 138 prev[i] = -1; 139 } 140 141 final int maxWordSize = 20; 142 int values[] = new int[numChars]; 143 int lengths[] = new int[numChars]; 144 // dynamic programming to find the best segmentation 145 boolean is_prev_katakana = false; 146 for (int i = 0; i < numChars; i++) { 147 text.setIndex(i); 148 if (bestSnlp[i] == kint32max) { 149 continue; 150 } 151 152 int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i); 153 int[] count_ = new int[1]; 154 fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values); 155 int count = count_[0]; 156 157 // if there are no single character matches found in the dictionary 158 // starting with this character, treat character as a 1-character word 159 // with the highest value possible (i.e. the least likely to occur). 160 // Exclude Korean characters from this treatment, as they should be 161 // left together by default. 162 if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) { 163 values[count] = maxSnlp; 164 lengths[count] = 1; 165 count++; 166 } 167 168 for (int j = 0; j < count; j++) { 169 int newSnlp = bestSnlp[i] + values[j]; 170 if (newSnlp < bestSnlp[lengths[j] + i]) { 171 bestSnlp[lengths[j] + i] = newSnlp; 172 prev[lengths[j] + i] = i; 173 } 174 } 175 176 // In Japanese, single-character Katakana words are pretty rare. 177 // So we apply the following heuristic to Katakana: any continuous 178 // run of Katakana characters is considered a candidate word with 179 // a default cost specified in the katakanaCost table according 180 // to its length. 181 text.setIndex(i); 182 boolean is_katakana = isKatakana(current32(text)); 183 if (!is_prev_katakana && is_katakana) { 184 int j = i + 1; 185 next32(text); 186 while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) { 187 next32(text); 188 ++j; 189 } 190 191 if ((j - i) < kMaxKatakanaGroupLength) { 192 int newSnlp = bestSnlp[i] + getKatakanaCost(j - i); 193 if (newSnlp < bestSnlp[j]) { 194 bestSnlp[j] = newSnlp; 195 prev[j] = i; 196 } 197 } 198 } 199 is_prev_katakana = is_katakana; 200 } 201 202 int t_boundary[] = new int[numChars + 1]; 203 int numBreaks = 0; 204 if (bestSnlp[numChars] == kint32max) { 205 t_boundary[numBreaks] = numChars; 206 numBreaks++; 207 } else { 208 for (int i = numChars; i > 0; i = prev[i]) { 209 t_boundary[numBreaks] = i; 210 numBreaks++; 211 } 212 Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0); 213 } 214 215 if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) { 216 t_boundary[numBreaks++] = 0; 217 } 218 219 int correctedNumBreaks = 0; 220 for (int i = numBreaks - 1; i >= 0; i--) { 221 int pos = charPositions[t_boundary[i]] + startPos; 222 if (!(foundBreaks.contains(pos) || pos == startPos)) { 223 foundBreaks.push(charPositions[t_boundary[i]] + startPos); 224 correctedNumBreaks++; 225 } 226 } 227 228 if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) { 229 foundBreaks.pop(); 230 correctedNumBreaks--; 231 } 232 if (!foundBreaks.isEmpty()) 233 inText.setIndex(foundBreaks.peek()); 234 return correctedNumBreaks; 235 } 236} 237