1/*
2 *******************************************************************************
3 * Copyright (C) 2014, International Business Machines Corporation and         *
4 * others. All Rights Reserved.                                                *
5 *******************************************************************************
6 */
7package com.ibm.icu.text;
8
9import java.io.IOException;
10import java.text.CharacterIterator;
11
12import com.ibm.icu.lang.UCharacter;
13import com.ibm.icu.lang.UProperty;
14import com.ibm.icu.lang.UScript;
15
16class BurmeseBreakEngine extends DictionaryBreakEngine {
17
18    // Constants for BurmeseBreakIterator
19    // How many words in a row are "good enough"?
20    private static final byte BURMESE_LOOKAHEAD = 3;
21    // Will not combine a non-word with a preceding dictionary word longer than this
22    private static final byte BURMESE_ROOT_COMBINE_THRESHOLD = 3;
23    // Will not combine a non-word that shares at least this much prefix with a
24    // dictionary word with a preceding word
25    private static final byte BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
26    // Minimum word size
27    private static final byte BURMESE_MIN_WORD = 2;
28
29    private DictionaryMatcher fDictionary;
30    private static UnicodeSet fBurmeseWordSet;
31    private static UnicodeSet fEndWordSet;
32    private static UnicodeSet fBeginWordSet;
33    private static UnicodeSet fMarkSet;
34
35    static {
36        // Initialize UnicodeSets
37        fBurmeseWordSet = new UnicodeSet();
38        fMarkSet = new UnicodeSet();
39        fBeginWordSet = new UnicodeSet();
40
41        fBurmeseWordSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]]");
42        fBurmeseWordSet.compact();
43
44        fMarkSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
45        fMarkSet.add(0x0020);
46        fEndWordSet = new UnicodeSet(fBurmeseWordSet);
47        fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
48
49        // Compact for caching
50        fMarkSet.compact();
51        fEndWordSet.compact();
52        fBeginWordSet.compact();
53
54        // Freeze the static UnicodeSet
55        fBurmeseWordSet.freeze();
56        fMarkSet.freeze();
57        fEndWordSet.freeze();
58        fBeginWordSet.freeze();
59    }
60
61    public BurmeseBreakEngine() throws IOException {
62        super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
63        setCharacters(fBurmeseWordSet);
64        // Initialize dictionary
65        fDictionary = DictionaryData.loadDictionaryFor("Mymr");
66    }
67
68    public boolean equals(Object obj) {
69        // Normally is a singleton, but it's possible to have duplicates
70        //   during initialization. All are equivalent.
71        return obj instanceof BurmeseBreakEngine;
72    }
73
74    public int hashCode() {
75        return getClass().hashCode();
76    }
77
78    public boolean handles(int c, int breakType) {
79        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
80            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
81            return (script == UScript.MYANMAR);
82        }
83        return false;
84    }
85
86    public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
87            DequeI foundBreaks) {
88
89
90        if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD) {
91            return 0;  // Not enough characters for word
92        }
93        int wordsFound = 0;
94        int wordLength;
95        int current;
96        PossibleWord words[] = new PossibleWord[BURMESE_LOOKAHEAD];
97        for (int i = 0; i < BURMESE_LOOKAHEAD; i++) {
98            words[i] = new PossibleWord();
99        }
100        int uc;
101
102        fIter.setIndex(rangeStart);
103        while ((current = fIter.getIndex()) < rangeEnd) {
104            wordLength = 0;
105
106            //Look for candidate words at the current position
107            int candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
108
109            // If we found exactly one, use that
110            if (candidates == 1) {
111                wordLength = words[wordsFound%BURMESE_LOOKAHEAD].acceptMarked(fIter);
112                wordsFound += 1;
113            }
114
115            // If there was more than one, see which one can take us forward the most words
116            else if (candidates > 1) {
117                boolean foundBest = false;
118                // If we're already at the end of the range, we're done
119                if (fIter.getIndex() < rangeEnd) {
120                    do {
121                        int wordsMatched = 1;
122                        if (words[(wordsFound+1)%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
123                            if (wordsMatched < 2) {
124                                // Followed by another dictionary word; mark first word as a good candidate
125                                words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
126                                wordsMatched = 2;
127                            }
128
129                            // If we're already at the end of the range, we're done
130                            if (fIter.getIndex() >= rangeEnd) {
131                                break;
132                            }
133
134                            // See if any of the possible second words is followed by a third word
135                            do {
136                                // If we find a third word, stop right away
137                                if (words[(wordsFound+2)%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
138                                    words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
139                                    foundBest = true;
140                                    break;
141                                }
142                            } while (words[(wordsFound+1)%BURMESE_LOOKAHEAD].backUp(fIter));
143                        }
144                    } while (words[wordsFound%BURMESE_LOOKAHEAD].backUp(fIter) && !foundBest);
145                }
146                wordLength = words[wordsFound%BURMESE_LOOKAHEAD].acceptMarked(fIter);
147                wordsFound += 1;
148            }
149
150            // We come here after having either found a word or not. We look ahead to the
151            // next word. If it's not a dictionary word, we will combine it with the word we
152            // just found (if there is one), but only if the preceding word does not exceed
153            // the threshold.
154            // The text iterator should now be positioned at the end of the word we found.
155            if (fIter.getIndex() < rangeEnd && wordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
156                // If it is a dictionary word, do nothing. If it isn't, then if there is
157                // no preceding word, or the non-word shares less than the minimum threshold
158                // of characters with a dictionary word, then scan to resynchronize
159                if (words[wordsFound%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
160                        (wordLength == 0 ||
161                                words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
162                    // Look for a plausible word boundary
163                    int remaining = rangeEnd - (current + wordLength);
164                    int pc = fIter.current();
165                    int chars = 0;
166                    for (;;) {
167                        fIter.next();
168                        uc = fIter.current();
169                        chars += 1;
170                        if (--remaining <= 0) {
171                            break;
172                        }
173                        if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
174                            // Maybe. See if it's in the dictionary.
175                            int candidate = words[(wordsFound + 1) %BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
176                            fIter.setIndex(current + wordLength + chars);
177                            if (candidate > 0) {
178                                break;
179                            }
180                        }
181                        pc = uc;
182                    }
183
184                    // Bump the word count if there wasn't already one
185                    if (wordLength <= 0) {
186                        wordsFound += 1;
187                    }
188
189                    // Update the length with the passed-over characters
190                    wordLength += chars;
191                } else {
192                    // Backup to where we were for next iteration
193                    fIter.setIndex(current+wordLength);
194                }
195            }
196
197            // Never stop before a combining mark.
198            int currPos;
199            while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
200                fIter.next();
201                wordLength += fIter.getIndex() - currPos;
202            }
203
204            // Look ahead for possible suffixes if a dictionary word does not follow.
205            // We do this in code rather than using a rule so that the heuristic
206            // resynch continues to function. For example, one of the suffix characters
207            // could be a typo in the middle of a word.
208            // NOT CURRENTLY APPLICABLE TO BURMESE
209
210            // Did we find a word on this iteration? If so, push it on the break stack
211            if (wordLength > 0) {
212                foundBreaks.push(Integer.valueOf(current + wordLength));
213            }
214        }
215
216        // Don't return a break for the end of the dictionary range if there is one there
217        if (foundBreaks.peek() >= rangeEnd) {
218            foundBreaks.pop();
219            wordsFound -= 1;
220        }
221
222        return wordsFound;
223    }
224
225}
226