1/* GENERATED SOURCE. DO NOT MODIFY. */
2/*
3 *******************************************************************************
4 * Copyright (C) 2012-2014, International Business Machines Corporation and         *
5 * others. All Rights Reserved.                                                *
6 *******************************************************************************
7 */
8package android.icu.text;
9
10import static android.icu.impl.CharacterIteration.DONE32;
11import static android.icu.impl.CharacterIteration.current32;
12import static android.icu.impl.CharacterIteration.next32;
13
14import java.io.IOException;
15import java.text.CharacterIterator;
16
17import android.icu.impl.Assert;
18
19class CjkBreakEngine extends DictionaryBreakEngine {
20    private static final UnicodeSet fHangulWordSet = new UnicodeSet();
21    private static final UnicodeSet fHanWordSet = new UnicodeSet();
22    private static final UnicodeSet fKatakanaWordSet = new UnicodeSet();
23    private static final UnicodeSet fHiraganaWordSet = new UnicodeSet();
24    static {
25        fHangulWordSet.applyPattern("[\\uac00-\\ud7a3]");
26        fHanWordSet.applyPattern("[:Han:]");
27        fKatakanaWordSet.applyPattern("[[:Katakana:]\\uff9e\\uff9f]");
28        fHiraganaWordSet.applyPattern("[:Hiragana:]");
29
30        // freeze them all
31        fHangulWordSet.freeze();
32        fHanWordSet.freeze();
33        fKatakanaWordSet.freeze();
34        fHiraganaWordSet.freeze();
35    }
36
37    private DictionaryMatcher fDictionary = null;
38
39    public CjkBreakEngine(boolean korean) throws IOException {
40        super(BreakIterator.KIND_WORD);
41        fDictionary = DictionaryData.loadDictionaryFor("Hira");
42        if (korean) {
43            setCharacters(fHangulWordSet);
44        } else { //Chinese and Japanese
45            UnicodeSet cjSet = new UnicodeSet();
46            cjSet = new UnicodeSet();
47            cjSet.addAll(fHanWordSet);
48            cjSet.addAll(fKatakanaWordSet);
49            cjSet.addAll(fHiraganaWordSet);
50            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
51            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
52            setCharacters(cjSet);
53        }
54    }
55
56    public boolean equals(Object obj) {
57        if (obj instanceof CjkBreakEngine) {
58            CjkBreakEngine other = (CjkBreakEngine)obj;
59            return this.fSet.equals(other.fSet);
60        }
61        return false;
62    }
63
64    public int hashCode() {
65        return getClass().hashCode();
66    }
67
68    private static final int kMaxKatakanaLength = 8;
69    private static final int kMaxKatakanaGroupLength = 20;
70    private static final int maxSnlp = 255;
71    private static final int kint32max = Integer.MAX_VALUE;
72    private static int getKatakanaCost(int wordlength) {
73        int katakanaCost[] =  new int[] { 8192, 984, 408, 240, 204, 252, 300, 372, 480 };
74        return (wordlength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordlength];
75    }
76
77    private static boolean isKatakana(int value) {
78        return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
79                (value >= 0xFF66 && value <= 0xFF9F);
80    }
81
82    public int divideUpDictionaryRange(CharacterIterator inText, int startPos, int endPos,
83            DequeI foundBreaks) {
84        if (startPos >= endPos) {
85            return 0;
86        }
87
88        inText.setIndex(startPos);
89
90        int inputLength = endPos - startPos;
91        int[] charPositions = new int[inputLength + 1];
92        StringBuffer s = new StringBuffer("");
93        inText.setIndex(startPos);
94        while (inText.getIndex() < endPos) {
95            s.append(inText.current());
96            inText.next();
97        }
98        String prenormstr = s.toString();
99        boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
100                               Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
101        CharacterIterator text;
102        int numChars = 0;
103        if (isNormalized) {
104            text = new java.text.StringCharacterIterator(prenormstr);
105            int index = 0;
106            charPositions[0] = 0;
107            while (index < prenormstr.length()) {
108                int codepoint = prenormstr.codePointAt(index);
109                index += Character.charCount(codepoint);
110                numChars++;
111                charPositions[numChars] = index;
112            }
113        } else {
114            String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
115            text = new java.text.StringCharacterIterator(normStr);
116            charPositions = new int[normStr.length() + 1];
117            Normalizer normalizer = new Normalizer(prenormstr, Normalizer.NFKC, 0);
118            int index = 0;
119            charPositions[0] = 0;
120            while (index < normalizer.endIndex()) {
121                normalizer.next();
122                numChars++;
123                index = normalizer.getIndex();
124                charPositions[numChars] = index;
125            }
126        }
127
128        // From here on out, do the algorithm. Note that our indices
129        // refer to indices within the normalized string.
130        int[] bestSnlp = new int[numChars + 1];
131        bestSnlp[0] = 0;
132        for (int i = 1; i <= numChars; i++) {
133            bestSnlp[i] = kint32max;
134        }
135
136        int[] prev = new int[numChars + 1];
137        for (int i = 0; i <= numChars; i++) {
138            prev[i] = -1;
139        }
140
141        final int maxWordSize = 20;
142        int values[] = new int[numChars];
143        int lengths[] = new int[numChars];
144        // dynamic programming to find the best segmentation
145        boolean is_prev_katakana = false;
146        for (int i = 0; i < numChars; i++) {
147            text.setIndex(i);
148            if (bestSnlp[i] == kint32max) {
149                continue;
150            }
151
152            int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
153            int[] count_ = new int[1];
154            fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
155            int count = count_[0];
156
157            // if there are no single character matches found in the dictionary
158            // starting with this character, treat character as a 1-character word
159            // with the highest value possible (i.e. the least likely to occur).
160            // Exclude Korean characters from this treatment, as they should be
161            // left together by default.
162            if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
163                values[count] = maxSnlp;
164                lengths[count] = 1;
165                count++;
166            }
167
168            for (int j = 0; j < count; j++) {
169                int newSnlp = bestSnlp[i] + values[j];
170                if (newSnlp < bestSnlp[lengths[j] + i]) {
171                    bestSnlp[lengths[j] + i] = newSnlp;
172                    prev[lengths[j] + i] = i;
173                }
174            }
175
176            // In Japanese, single-character Katakana words are pretty rare.
177            // So we apply the following heuristic to Katakana: any continuous
178            // run of Katakana characters is considered a candidate word with
179            // a default cost specified in the katakanaCost table according
180            // to its length.
181            text.setIndex(i);
182            boolean is_katakana = isKatakana(current32(text));
183            if (!is_prev_katakana && is_katakana) {
184                int j = i + 1;
185                next32(text);
186                while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
187                    next32(text);
188                    ++j;
189                }
190
191                if ((j - i) < kMaxKatakanaGroupLength) {
192                    int newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
193                    if (newSnlp < bestSnlp[j]) {
194                        bestSnlp[j] = newSnlp;
195                        prev[j] = i;
196                    }
197                }
198            }
199            is_prev_katakana = is_katakana;
200        }
201
202        int t_boundary[] = new int[numChars + 1];
203        int numBreaks = 0;
204        if (bestSnlp[numChars] == kint32max) {
205            t_boundary[numBreaks] = numChars;
206            numBreaks++;
207        } else {
208            for (int i = numChars; i > 0; i = prev[i]) {
209                t_boundary[numBreaks] = i;
210                numBreaks++;
211            }
212            Assert.assrt(prev[t_boundary[numBreaks - 1]] == 0);
213        }
214
215        if (foundBreaks.size() == 0 || foundBreaks.peek() < startPos) {
216            t_boundary[numBreaks++] = 0;
217        }
218
219        int correctedNumBreaks = 0;
220        for (int i = numBreaks - 1; i >= 0; i--) {
221            int pos = charPositions[t_boundary[i]] + startPos;
222            if (!(foundBreaks.contains(pos) || pos == startPos)) {
223                foundBreaks.push(charPositions[t_boundary[i]] + startPos);
224                correctedNumBreaks++;
225            }
226        }
227
228        if (!foundBreaks.isEmpty() && foundBreaks.peek() == endPos) {
229            foundBreaks.pop();
230            correctedNumBreaks--;
231        }
232        if (!foundBreaks.isEmpty())
233            inText.setIndex(foundBreaks.peek());
234        return correctedNumBreaks;
235    }
236}
237