1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 * Copyright (C) 2014, International Business Machines Corporation and         *
7 * others. All Rights Reserved.                                                *
8 *******************************************************************************
9 */
10package android.icu.text;
11
12import java.io.IOException;
13import java.text.CharacterIterator;
14
15import android.icu.lang.UCharacter;
16import android.icu.lang.UProperty;
17import android.icu.lang.UScript;
18
19class KhmerBreakEngine extends DictionaryBreakEngine {
20
21    // Constants for KhmerBreakIterator
22    // How many words in a row are "good enough"?
23    private static final byte KHMER_LOOKAHEAD = 3;
24    // Will not combine a non-word with a preceding dictionary word longer than this
25    private static final byte KHMER_ROOT_COMBINE_THRESHOLD = 3;
26    // Will not combine a non-word that shares at least this much prefix with a
27    // dictionary word with a preceding word
28    private static final byte KHMER_PREFIX_COMBINE_THRESHOLD = 3;
29    // Minimum word size
30    private static final byte KHMER_MIN_WORD = 2;
31    // Minimum number of characters for two words
32    private static final byte KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
33
34
35    private DictionaryMatcher fDictionary;
36    private static UnicodeSet fKhmerWordSet;
37    private static UnicodeSet fEndWordSet;
38    private static UnicodeSet fBeginWordSet;
39    private static UnicodeSet fMarkSet;
40
41    static {
42        // Initialize UnicodeSets
43        fKhmerWordSet = new UnicodeSet();
44        fMarkSet = new UnicodeSet();
45        fBeginWordSet = new UnicodeSet();
46
47        fKhmerWordSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]]");
48        fKhmerWordSet.compact();
49
50        fMarkSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
51        fMarkSet.add(0x0020);
52        fEndWordSet = new UnicodeSet(fKhmerWordSet);
53        fBeginWordSet.add(0x1780, 0x17B3);
54        fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
55
56        // Compact for caching
57        fMarkSet.compact();
58        fEndWordSet.compact();
59        fBeginWordSet.compact();
60
61        // Freeze the static UnicodeSet
62        fKhmerWordSet.freeze();
63        fMarkSet.freeze();
64        fEndWordSet.freeze();
65        fBeginWordSet.freeze();
66    }
67
68    public KhmerBreakEngine() throws IOException {
69        super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
70        setCharacters(fKhmerWordSet);
71        // Initialize dictionary
72        fDictionary = DictionaryData.loadDictionaryFor("Khmr");
73    }
74
75    public boolean equals(Object obj) {
76        // Normally is a singleton, but it's possible to have duplicates
77        //   during initialization. All are equivalent.
78        return obj instanceof KhmerBreakEngine;
79    }
80
81    public int hashCode() {
82        return getClass().hashCode();
83    }
84
85    public boolean handles(int c, int breakType) {
86        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
87            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
88            return (script == UScript.KHMER);
89        }
90        return false;
91    }
92
93    public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
94            DequeI foundBreaks) {
95
96        if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
97            return 0;  // Not enough characters for word
98        }
99        int wordsFound = 0;
100        int wordLength;
101        int current;
102        PossibleWord words[] = new PossibleWord[KHMER_LOOKAHEAD];
103        for (int i = 0; i < KHMER_LOOKAHEAD; i++) {
104            words[i] = new PossibleWord();
105        }
106        int uc;
107
108        fIter.setIndex(rangeStart);
109
110        while ((current = fIter.getIndex()) < rangeEnd) {
111            wordLength = 0;
112
113            //Look for candidate words at the current position
114            int candidates = words[wordsFound % KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
115
116            // If we found exactly one, use that
117            if (candidates == 1) {
118                wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(fIter);
119                wordsFound += 1;
120            }
121
122            // If there was more than one, see which one can take us forward the most words
123            else if (candidates > 1) {
124                boolean foundBest = false;
125                // If we're already at the end of the range, we're done
126                if (fIter.getIndex() < rangeEnd) {
127                    do {
128                        int wordsMatched = 1;
129                        if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
130                            if (wordsMatched < 2) {
131                                // Followed by another dictionary word; mark first word as a good candidate
132                                words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
133                                wordsMatched = 2;
134                            }
135
136                            // If we're already at the end of the range, we're done
137                            if (fIter.getIndex() >= rangeEnd) {
138                                break;
139                            }
140
141                            // See if any of the possible second words is followed by a third word
142                            do {
143                                // If we find a third word, stop right away
144                                if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
145                                    words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
146                                    foundBest = true;
147                                    break;
148                                }
149                            } while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(fIter));
150                        }
151                    } while (words[wordsFound%KHMER_LOOKAHEAD].backUp(fIter) && !foundBest);
152                }
153                wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(fIter);
154                wordsFound += 1;
155            }
156
157            // We come here after having either found a word or not. We look ahead to the
158            // next word. If it's not a dictionary word, we will combine it with the word we
159            // just found (if there is one), but only if the preceding word does not exceed
160            // the threshold.
161            // The text iterator should now be positioned at the end of the word we found.
162            if (fIter.getIndex() < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
163                // If it is a dictionary word, do nothing. If it isn't, then if there is
164                // no preceding word, or the non-word shares less than the minimum threshold
165                // of characters with a dictionary word, then scan to resynchronize
166                if (words[wordsFound%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
167                        (wordLength == 0 ||
168                                words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
169                    // Look for a plausible word boundary
170                    int remaining = rangeEnd - (current + wordLength);
171                    int pc = fIter.current();
172                    int chars = 0;
173                    for (;;) {
174                        fIter.next();
175                        uc = fIter.current();
176                        chars += 1;
177                        if (--remaining <= 0) {
178                            break;
179                        }
180                        if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
181                            // Maybe. See if it's in the dictionary.
182                            int candidate = words[(wordsFound + 1) %KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
183                            fIter.setIndex(current + wordLength + chars);
184                            if (candidate > 0) {
185                                break;
186                            }
187                        }
188                        pc = uc;
189                    }
190
191                    // Bump the word count if there wasn't already one
192                    if (wordLength <= 0) {
193                        wordsFound += 1;
194                    }
195
196                    // Update the length with the passed-over characters
197                    wordLength += chars;
198                } else {
199                    // Backup to where we were for next iteration
200                    fIter.setIndex(current+wordLength);
201                }
202            }
203
204            // Never stop before a combining mark.
205            int currPos;
206            while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
207                fIter.next();
208                wordLength += fIter.getIndex() - currPos;
209            }
210
211            // Look ahead for possible suffixes if a dictionary word does not follow.
212            // We do this in code rather than using a rule so that the heuristic
213            // resynch continues to function. For example, one of the suffix characters
214            // could be a typo in the middle of a word.
215            // NOT CURRENTLY APPLICABLE TO KHMER
216
217            // Did we find a word on this iteration? If so, push it on the break stack
218            if (wordLength > 0) {
219                foundBreaks.push(Integer.valueOf(current + wordLength));
220            }
221        }
222
223        // Don't return a break for the end of the dictionary range if there is one there
224        if (foundBreaks.peek() >= rangeEnd) {
225            foundBreaks.pop();
226            wordsFound -= 1;
227        }
228
229        return wordsFound;
230    }
231
232}
233