1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2014, International Business Machines Corporation and         *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9package com.ibm.icu.text;
10
11import java.io.IOException;
12import java.text.CharacterIterator;
13
14import com.ibm.icu.lang.UCharacter;
15import com.ibm.icu.lang.UProperty;
16import com.ibm.icu.lang.UScript;
17
18class LaoBreakEngine extends DictionaryBreakEngine {
19
20    // Constants for LaoBreakIterator
21    // How many words in a row are "good enough"?
22    private static final byte LAO_LOOKAHEAD = 3;
23    // Will not combine a non-word with a preceding dictionary word longer than this
24    private static final byte LAO_ROOT_COMBINE_THRESHOLD = 3;
25    // Will not combine a non-word that shares at least this much prefix with a
26    // dictionary word with a preceding word
27    private static final byte LAO_PREFIX_COMBINE_THRESHOLD = 3;
28    // Minimum word size
29    private static final byte LAO_MIN_WORD = 2;
30
31    private DictionaryMatcher fDictionary;
32    private static UnicodeSet fLaoWordSet;
33    private static UnicodeSet fEndWordSet;
34    private static UnicodeSet fBeginWordSet;
35    private static UnicodeSet fMarkSet;
36
37    static {
38        // Initialize UnicodeSets
39        fLaoWordSet = new UnicodeSet();
40        fMarkSet = new UnicodeSet();
41        fBeginWordSet = new UnicodeSet();
42
43        fLaoWordSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]]");
44        fLaoWordSet.compact();
45
46        fMarkSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
47        fMarkSet.add(0x0020);
48        fEndWordSet = new UnicodeSet(fLaoWordSet);
49        fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
50        fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
51        fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
52        fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels
53
54        // Compact for caching
55        fMarkSet.compact();
56        fEndWordSet.compact();
57        fBeginWordSet.compact();
58
59        // Freeze the static UnicodeSet
60        fLaoWordSet.freeze();
61        fMarkSet.freeze();
62        fEndWordSet.freeze();
63        fBeginWordSet.freeze();
64    }
65
66    public LaoBreakEngine() throws IOException {
67        super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
68        setCharacters(fLaoWordSet);
69        // Initialize dictionary
70        fDictionary = DictionaryData.loadDictionaryFor("Laoo");
71    }
72
73    public boolean equals(Object obj) {
74        // Normally is a singleton, but it's possible to have duplicates
75        //   during initialization. All are equivalent.
76        return obj instanceof LaoBreakEngine;
77    }
78
79    public int hashCode() {
80        return getClass().hashCode();
81    }
82
83    public boolean handles(int c, int breakType) {
84        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
85            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
86            return (script == UScript.LAO);
87        }
88        return false;
89    }
90
91    public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
92            DequeI foundBreaks) {
93
94
95        if ((rangeEnd - rangeStart) < LAO_MIN_WORD) {
96            return 0;  // Not enough characters for word
97        }
98        int wordsFound = 0;
99        int wordLength;
100        int current;
101        PossibleWord words[] = new PossibleWord[LAO_LOOKAHEAD];
102        for (int i = 0; i < LAO_LOOKAHEAD; i++) {
103            words[i] = new PossibleWord();
104        }
105        int uc;
106
107        fIter.setIndex(rangeStart);
108        while ((current = fIter.getIndex()) < rangeEnd) {
109            wordLength = 0;
110
111            //Look for candidate words at the current position
112            int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
113
114            // If we found exactly one, use that
115            if (candidates == 1) {
116                wordLength = words[wordsFound%LAO_LOOKAHEAD].acceptMarked(fIter);
117                wordsFound += 1;
118            }
119
120            // If there was more than one, see which one can take us forward the most words
121            else if (candidates > 1) {
122                boolean foundBest = false;
123                // If we're already at the end of the range, we're done
124                if (fIter.getIndex() < rangeEnd) {
125                    do {
126                        int wordsMatched = 1;
127                        if (words[(wordsFound+1)%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
128                            if (wordsMatched < 2) {
129                                // Followed by another dictionary word; mark first word as a good candidate
130                                words[wordsFound%LAO_LOOKAHEAD].markCurrent();
131                                wordsMatched = 2;
132                            }
133
134                            // If we're already at the end of the range, we're done
135                            if (fIter.getIndex() >= rangeEnd) {
136                                break;
137                            }
138
139                            // See if any of the possible second words is followed by a third word
140                            do {
141                                // If we find a third word, stop right away
142                                if (words[(wordsFound+2)%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
143                                    words[wordsFound%LAO_LOOKAHEAD].markCurrent();
144                                    foundBest = true;
145                                    break;
146                                }
147                            } while (words[(wordsFound+1)%LAO_LOOKAHEAD].backUp(fIter));
148                        }
149                    } while (words[wordsFound%LAO_LOOKAHEAD].backUp(fIter) && !foundBest);
150                }
151                wordLength = words[wordsFound%LAO_LOOKAHEAD].acceptMarked(fIter);
152                wordsFound += 1;
153            }
154
155            // We come here after having either found a word or not. We look ahead to the
156            // next word. If it's not a dictionary word, we will combine it with the word we
157            // just found (if there is one), but only if the preceding word does not exceed
158            // the threshold.
159            // The text iterator should now be positioned at the end of the word we found.
160            if (fIter.getIndex() < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
161                // If it is a dictionary word, do nothing. If it isn't, then if there is
162                // no preceding word, or the non-word shares less than the minimum threshold
163                // of characters with a dictionary word, then scan to resynchronize
164                if (words[wordsFound%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
165                        (wordLength == 0 ||
166                                words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
167                    // Look for a plausible word boundary
168                    int remaining = rangeEnd - (current + wordLength);
169                    int pc = fIter.current();
170                    int chars = 0;
171                    for (;;) {
172                        fIter.next();
173                        uc = fIter.current();
174                        chars += 1;
175                        if (--remaining <= 0) {
176                            break;
177                        }
178                        if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
179                            // Maybe. See if it's in the dictionary.
180                            int candidate = words[(wordsFound + 1) %LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
181                            fIter.setIndex(current + wordLength + chars);
182                            if (candidate > 0) {
183                                break;
184                            }
185                        }
186                        pc = uc;
187                    }
188
189                    // Bump the word count if there wasn't already one
190                    if (wordLength <= 0) {
191                        wordsFound += 1;
192                    }
193
194                    // Update the length with the passed-over characters
195                    wordLength += chars;
196                } else {
197                    // Backup to where we were for next iteration
198                    fIter.setIndex(current+wordLength);
199                }
200            }
201
202            // Never stop before a combining mark.
203            int currPos;
204            while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
205                fIter.next();
206                wordLength += fIter.getIndex() - currPos;
207            }
208
209            // Look ahead for possible suffixes if a dictionary word does not follow.
210            // We do this in code rather than using a rule so that the heuristic
211            // resynch continues to function. For example, one of the suffix characters
212            // could be a typo in the middle of a word.
213            // NOT CURRENTLY APPLICABLE TO LAO
214
215            // Did we find a word on this iteration? If so, push it on the break stack
216            if (wordLength > 0) {
217                foundBreaks.push(Integer.valueOf(current + wordLength));
218            }
219        }
220
221        // Don't return a break for the end of the dictionary range if there is one there
222        if (foundBreaks.peek() >= rangeEnd) {
223            foundBreaks.pop();
224            wordsFound -= 1;
225        }
226
227        return wordsFound;
228    }
229
230}
231