1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 * Copyright (C) 2014, International Business Machines Corporation and         *
7 * others. All Rights Reserved.                                                *
8 *******************************************************************************
9 */
10package android.icu.text;
11
12import java.io.IOException;
13import java.text.CharacterIterator;
14
15import android.icu.lang.UCharacter;
16import android.icu.lang.UProperty;
17import android.icu.lang.UScript;
18
19class ThaiBreakEngine extends DictionaryBreakEngine {
20
21    // Constants for ThaiBreakIterator
22    // How many words in a row are "good enough"?
23    private static final byte THAI_LOOKAHEAD = 3;
24    // Will not combine a non-word with a preceding dictionary word longer than this
25    private static final byte THAI_ROOT_COMBINE_THRESHOLD = 3;
26    // Will not combine a non-word that shares at least this much prefix with a
27    // dictionary word with a preceding word
28    private static final byte THAI_PREFIX_COMBINE_THRESHOLD = 3;
29    // Ellision character
30    private static final char THAI_PAIYANNOI = 0x0E2F;
31    // Repeat character
32    private static final char THAI_MAIYAMOK = 0x0E46;
33    // Minimum word size
34    private static final byte THAI_MIN_WORD = 2;
35    // Minimum number of characters for two words
36    private static final byte THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
37
38    private DictionaryMatcher fDictionary;
39    private static UnicodeSet fThaiWordSet;
40    private static UnicodeSet fEndWordSet;
41    private static UnicodeSet fBeginWordSet;
42    private static UnicodeSet fSuffixSet;
43    private static UnicodeSet fMarkSet;
44
45    static {
46        // Initialize UnicodeSets
47        fThaiWordSet = new UnicodeSet();
48        fMarkSet = new UnicodeSet();
49        fBeginWordSet = new UnicodeSet();
50        fSuffixSet = new UnicodeSet();
51
52        fThaiWordSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]]");
53        fThaiWordSet.compact();
54
55        fMarkSet.applyPattern("[[:Thai:]&[:LineBreak=SA:]&[:M:]]");
56        fMarkSet.add(0x0020);
57        fEndWordSet = new UnicodeSet(fThaiWordSet);
58        fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
59        fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
60        fBeginWordSet.add(0x0E01, 0x0E2E); //KO KAI through HO NOKHUK
61        fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
62        fSuffixSet.add(THAI_PAIYANNOI);
63        fSuffixSet.add(THAI_MAIYAMOK);
64
65        // Compact for caching
66        fMarkSet.compact();
67        fEndWordSet.compact();
68        fBeginWordSet.compact();
69        fSuffixSet.compact();
70
71        // Freeze the static UnicodeSet
72        fThaiWordSet.freeze();
73        fMarkSet.freeze();
74        fEndWordSet.freeze();
75        fBeginWordSet.freeze();
76        fSuffixSet.freeze();
77    }
78
79    public ThaiBreakEngine() throws IOException {
80        super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
81        setCharacters(fThaiWordSet);
82        // Initialize dictionary
83        fDictionary = DictionaryData.loadDictionaryFor("Thai");
84    }
85
86    public boolean equals(Object obj) {
87        // Normally is a singleton, but it's possible to have duplicates
88        //   during initialization. All are equivalent.
89        return obj instanceof ThaiBreakEngine;
90    }
91
92    public int hashCode() {
93        return getClass().hashCode();
94    }
95
96    public boolean handles(int c, int breakType) {
97        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
98            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
99            return (script == UScript.THAI);
100        }
101        return false;
102    }
103
104    public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
105            DequeI foundBreaks) {
106
107        if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
108            return 0;  // Not enough characters for word
109        }
110        int wordsFound = 0;
111        int wordLength;
112        PossibleWord words[] = new PossibleWord[THAI_LOOKAHEAD];
113        for (int i = 0; i < THAI_LOOKAHEAD; i++) {
114            words[i] = new PossibleWord();
115        }
116
117        int uc;
118        fIter.setIndex(rangeStart);
119        int current;
120        while ((current = fIter.getIndex()) < rangeEnd) {
121            wordLength = 0;
122
123            //Look for candidate words at the current position
124            int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
125
126            // If we found exactly one, use that
127            if (candidates == 1) {
128                wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
129                wordsFound += 1;
130            }
131
132            // If there was more than one, see which one can take us forward the most words
133            else if (candidates > 1) {
134                // If we're already at the end of the range, we're done
135                if (fIter.getIndex() < rangeEnd) {
136                  foundBest:
137                    do {
138                        int wordsMatched = 1;
139                        if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
140                            if (wordsMatched < 2) {
141                                // Followed by another dictionary word; mark first word as a good candidate
142                                words[wordsFound%THAI_LOOKAHEAD].markCurrent();
143                                wordsMatched = 2;
144                            }
145
146                            // If we're already at the end of the range, we're done
147                            if (fIter.getIndex() >= rangeEnd) {
148                                break foundBest;
149                            }
150
151                            // See if any of the possible second words is followed by a third word
152                            do {
153                                // If we find a third word, stop right away
154                                if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
155                                    words[wordsFound%THAI_LOOKAHEAD].markCurrent();
156                                    break foundBest;
157                                }
158                            } while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(fIter));
159                        }
160                    }
161                    while (words[wordsFound%THAI_LOOKAHEAD].backUp(fIter));
162                    // foundBest: end of loop
163                }
164                wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(fIter);
165                wordsFound += 1;
166            }
167
168            // We come here after having either found a word or not. We look ahead to the
169            // next word. If it's not a dictionary word, we will combine it with the word we
170            // just found (if there is one), but only if the preceding word does not exceed
171            // the threshold.
172            // The text iterator should now be positioned at the end of the word we found.
173            if (fIter.getIndex() < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
174                // If it is a dictionary word, do nothing. If it isn't, then if there is
175                // no preceding word, or the non-word shares less than the minimum threshold
176                // of characters with a dictionary word, then scan to resynchronize
177                if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
178                        (wordLength == 0 ||
179                                words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
180                    // Look for a plausible word boundary
181                    int remaining = rangeEnd - (current + wordLength);
182                    int pc = fIter.current();
183                    int chars = 0;
184                    for (;;) {
185                        fIter.next();
186                        uc = fIter.current();
187                        chars += 1;
188                        if (--remaining <= 0) {
189                            break;
190                        }
191                        if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
192                            // Maybe. See if it's in the dictionary.
193                            // Note: In the original Apple code, checked that the next
194                            // two characters after uc were not 0x0E4C THANTHAKHAT before
195                            // checking the dictionary. That is just a performance filter,
196                            // but it's not clear it's faster than checking the trie
197                            int candidate = words[(wordsFound + 1) %THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
198                            fIter.setIndex(current + wordLength + chars);
199                            if (candidate > 0) {
200                                break;
201                            }
202                        }
203                        pc = uc;
204                    }
205
206                    // Bump the word count if there wasn't already one
207                    if (wordLength <= 0) {
208                        wordsFound += 1;
209                    }
210
211                    // Update the length with the passed-over characters
212                    wordLength += chars;
213                } else {
214                    // Backup to where we were for next iteration
215                    fIter.setIndex(current+wordLength);
216                }
217            }
218
219            // Never stop before a combining mark.
220            int currPos;
221            while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
222                fIter.next();
223                wordLength += fIter.getIndex() - currPos;
224            }
225
226            // Look ahead for possible suffixes if a dictionary word does not follow.
227            // We do this in code rather than using a rule so that the heuristic
228            // resynch continues to function. For example, one of the suffix characters
229            // could be a typo in the middle of a word.
230            if (fIter.getIndex() < rangeEnd && wordLength > 0) {
231                if (words[wordsFound%THAI_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
232                        fSuffixSet.contains(uc = fIter.current())) {
233                    if (uc == THAI_PAIYANNOI) {
234                        if (!fSuffixSet.contains(fIter.previous())) {
235                            // Skip over previous end and PAIYANNOI
236                            fIter.next();
237                            fIter.next();
238                            wordLength += 1;
239                            uc = fIter.current();
240                        } else {
241                            // Restore prior position
242                            fIter.next();
243                        }
244                    }
245                    if (uc == THAI_MAIYAMOK) {
246                        if (fIter.previous() != THAI_MAIYAMOK) {
247                            // Skip over previous end and MAIYAMOK
248                            fIter.next();
249                            fIter.next();
250                            wordLength += 1;
251                        } else {
252                            // restore prior position
253                            fIter.next();
254                        }
255                    }
256                } else {
257                    fIter.setIndex(current + wordLength);
258                }
259            }
260
261            // Did we find a word on this iteration? If so, push it on the break stack
262            if (wordLength > 0) {
263                foundBreaks.push(Integer.valueOf(current + wordLength));
264            }
265        }
266
267        // Don't return a break for the end of the dictionary range if there is one there
268        if (foundBreaks.peek() >= rangeEnd) {
269            foundBreaks.pop();
270            wordsFound -= 1;
271        }
272
273        return wordsFound;
274    }
275
276}
277