1/*
2 *******************************************************************************
3 * Copyright (C) 2014, International Business Machines Corporation and         *
4 * others. All Rights Reserved.                                                *
5 *******************************************************************************
6 */
7package com.ibm.icu.text;
8
9import java.io.IOException;
10import java.text.CharacterIterator;
11
12import com.ibm.icu.lang.UCharacter;
13import com.ibm.icu.lang.UProperty;
14import com.ibm.icu.lang.UScript;
15
16class LaoBreakEngine extends DictionaryBreakEngine {
17
18    // Constants for LaoBreakIterator
19    // How many words in a row are "good enough"?
20    private static final byte LAO_LOOKAHEAD = 3;
21    // Will not combine a non-word with a preceding dictionary word longer than this
22    private static final byte LAO_ROOT_COMBINE_THRESHOLD = 3;
23    // Will not combine a non-word that shares at least this much prefix with a
24    // dictionary word with a preceding word
25    private static final byte LAO_PREFIX_COMBINE_THRESHOLD = 3;
26    // Minimum word size
27    private static final byte LAO_MIN_WORD = 2;
28
29    private DictionaryMatcher fDictionary;
30    private static UnicodeSet fLaoWordSet;
31    private static UnicodeSet fEndWordSet;
32    private static UnicodeSet fBeginWordSet;
33    private static UnicodeSet fMarkSet;
34
35    static {
36        // Initialize UnicodeSets
37        fLaoWordSet = new UnicodeSet();
38        fMarkSet = new UnicodeSet();
39        fBeginWordSet = new UnicodeSet();
40
41        fLaoWordSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]]");
42        fLaoWordSet.compact();
43
44        fMarkSet.applyPattern("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]");
45        fMarkSet.add(0x0020);
46        fEndWordSet = new UnicodeSet(fLaoWordSet);
47        fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
48        fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
49        fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
50        fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels
51
52        // Compact for caching
53        fMarkSet.compact();
54        fEndWordSet.compact();
55        fBeginWordSet.compact();
56
57        // Freeze the static UnicodeSet
58        fLaoWordSet.freeze();
59        fMarkSet.freeze();
60        fEndWordSet.freeze();
61        fBeginWordSet.freeze();
62    }
63
64    public LaoBreakEngine() throws IOException {
65        super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
66        setCharacters(fLaoWordSet);
67        // Initialize dictionary
68        fDictionary = DictionaryData.loadDictionaryFor("Laoo");
69    }
70
71    public boolean equals(Object obj) {
72        // Normally is a singleton, but it's possible to have duplicates
73        //   during initialization. All are equivalent.
74        return obj instanceof LaoBreakEngine;
75    }
76
77    public int hashCode() {
78        return getClass().hashCode();
79    }
80
81    public boolean handles(int c, int breakType) {
82        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
83            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
84            return (script == UScript.LAO);
85        }
86        return false;
87    }
88
89    public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
90            DequeI foundBreaks) {
91
92
93        if ((rangeEnd - rangeStart) < LAO_MIN_WORD) {
94            return 0;  // Not enough characters for word
95        }
96        int wordsFound = 0;
97        int wordLength;
98        int current;
99        PossibleWord words[] = new PossibleWord[LAO_LOOKAHEAD];
100        for (int i = 0; i < LAO_LOOKAHEAD; i++) {
101            words[i] = new PossibleWord();
102        }
103        int uc;
104
105        fIter.setIndex(rangeStart);
106        while ((current = fIter.getIndex()) < rangeEnd) {
107            wordLength = 0;
108
109            //Look for candidate words at the current position
110            int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
111
112            // If we found exactly one, use that
113            if (candidates == 1) {
114                wordLength = words[wordsFound%LAO_LOOKAHEAD].acceptMarked(fIter);
115                wordsFound += 1;
116            }
117
118            // If there was more than one, see which one can take us forward the most words
119            else if (candidates > 1) {
120                boolean foundBest = false;
121                // If we're already at the end of the range, we're done
122                if (fIter.getIndex() < rangeEnd) {
123                    do {
124                        int wordsMatched = 1;
125                        if (words[(wordsFound+1)%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
126                            if (wordsMatched < 2) {
127                                // Followed by another dictionary word; mark first word as a good candidate
128                                words[wordsFound%LAO_LOOKAHEAD].markCurrent();
129                                wordsMatched = 2;
130                            }
131
132                            // If we're already at the end of the range, we're done
133                            if (fIter.getIndex() >= rangeEnd) {
134                                break;
135                            }
136
137                            // See if any of the possible second words is followed by a third word
138                            do {
139                                // If we find a third word, stop right away
140                                if (words[(wordsFound+2)%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
141                                    words[wordsFound%LAO_LOOKAHEAD].markCurrent();
142                                    foundBest = true;
143                                    break;
144                                }
145                            } while (words[(wordsFound+1)%LAO_LOOKAHEAD].backUp(fIter));
146                        }
147                    } while (words[wordsFound%LAO_LOOKAHEAD].backUp(fIter) && !foundBest);
148                }
149                wordLength = words[wordsFound%LAO_LOOKAHEAD].acceptMarked(fIter);
150                wordsFound += 1;
151            }
152
153            // We come here after having either found a word or not. We look ahead to the
154            // next word. If it's not a dictionary word, we will combine it with the word we
155            // just found (if there is one), but only if the preceding word does not exceed
156            // the threshold.
157            // The text iterator should now be positioned at the end of the word we found.
158            if (fIter.getIndex() < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
159                // If it is a dictionary word, do nothing. If it isn't, then if there is
160                // no preceding word, or the non-word shares less than the minimum threshold
161                // of characters with a dictionary word, then scan to resynchronize
162                if (words[wordsFound%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
163                        (wordLength == 0 ||
164                                words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
165                    // Look for a plausible word boundary
166                    int remaining = rangeEnd - (current + wordLength);
167                    int pc = fIter.current();
168                    int chars = 0;
169                    for (;;) {
170                        fIter.next();
171                        uc = fIter.current();
172                        chars += 1;
173                        if (--remaining <= 0) {
174                            break;
175                        }
176                        if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
177                            // Maybe. See if it's in the dictionary.
178                            int candidate = words[(wordsFound + 1) %LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
179                            fIter.setIndex(current + wordLength + chars);
180                            if (candidate > 0) {
181                                break;
182                            }
183                        }
184                        pc = uc;
185                    }
186
187                    // Bump the word count if there wasn't already one
188                    if (wordLength <= 0) {
189                        wordsFound += 1;
190                    }
191
192                    // Update the length with the passed-over characters
193                    wordLength += chars;
194                } else {
195                    // Backup to where we were for next iteration
196                    fIter.setIndex(current+wordLength);
197                }
198            }
199
200            // Never stop before a combining mark.
201            int currPos;
202            while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
203                fIter.next();
204                wordLength += fIter.getIndex() - currPos;
205            }
206
207            // Look ahead for possible suffixes if a dictionary word does not follow.
208            // We do this in code rather than using a rule so that the heuristic
209            // resynch continues to function. For example, one of the suffix characters
210            // could be a typo in the middle of a word.
211            // NOT CURRENTLY APPLICABLE TO LAO
212
213            // Did we find a word on this iteration? If so, push it on the break stack
214            if (wordLength > 0) {
215                foundBreaks.push(Integer.valueOf(current + wordLength));
216            }
217        }
218
219        // Don't return a break for the end of the dictionary range if there is one there
220        if (foundBreaks.peek() >= rangeEnd) {
221            foundBreaks.pop();
222            wordsFound -= 1;
223        }
224
225        return wordsFound;
226    }
227
228}
229