1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2014, International Business Machines Corporation and         *
6 * others. All Rights Reserved.                                                *
7 *******************************************************************************
8 */
9package com.ibm.icu.text;
10
11import java.io.IOException;
12import java.text.CharacterIterator;
13
14import com.ibm.icu.lang.UCharacter;
15import com.ibm.icu.lang.UProperty;
16import com.ibm.icu.lang.UScript;
17
18class BurmeseBreakEngine extends DictionaryBreakEngine {
19
20    // Constants for BurmeseBreakIterator
21    // How many words in a row are "good enough"?
22    private static final byte BURMESE_LOOKAHEAD = 3;
23    // Will not combine a non-word with a preceding dictionary word longer than this
24    private static final byte BURMESE_ROOT_COMBINE_THRESHOLD = 3;
25    // Will not combine a non-word that shares at least this much prefix with a
26    // dictionary word with a preceding word
27    private static final byte BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
28    // Minimum word size
29    private static final byte BURMESE_MIN_WORD = 2;
30
31    private DictionaryMatcher fDictionary;
32    private static UnicodeSet fBurmeseWordSet;
33    private static UnicodeSet fEndWordSet;
34    private static UnicodeSet fBeginWordSet;
35    private static UnicodeSet fMarkSet;
36
37    static {
38        // Initialize UnicodeSets
39        fBurmeseWordSet = new UnicodeSet();
40        fMarkSet = new UnicodeSet();
41        fBeginWordSet = new UnicodeSet();
42
43        fBurmeseWordSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]]");
44        fBurmeseWordSet.compact();
45
46        fMarkSet.applyPattern("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]");
47        fMarkSet.add(0x0020);
48        fEndWordSet = new UnicodeSet(fBurmeseWordSet);
49        fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
50
51        // Compact for caching
52        fMarkSet.compact();
53        fEndWordSet.compact();
54        fBeginWordSet.compact();
55
56        // Freeze the static UnicodeSet
57        fBurmeseWordSet.freeze();
58        fMarkSet.freeze();
59        fEndWordSet.freeze();
60        fBeginWordSet.freeze();
61    }
62
63    public BurmeseBreakEngine() throws IOException {
64        super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
65        setCharacters(fBurmeseWordSet);
66        // Initialize dictionary
67        fDictionary = DictionaryData.loadDictionaryFor("Mymr");
68    }
69
70    @Override
71    public boolean equals(Object obj) {
72        // Normally is a singleton, but it's possible to have duplicates
73        //   during initialization. All are equivalent.
74        return obj instanceof BurmeseBreakEngine;
75    }
76
77    @Override
78    public int hashCode() {
79        return getClass().hashCode();
80    }
81
82    @Override
83    public boolean handles(int c, int breakType) {
84        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
85            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
86            return (script == UScript.MYANMAR);
87        }
88        return false;
89    }
90
91    @Override
92    public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
93            DequeI foundBreaks) {
94
95
96        if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD) {
97            return 0;  // Not enough characters for word
98        }
99        int wordsFound = 0;
100        int wordLength;
101        int current;
102        PossibleWord words[] = new PossibleWord[BURMESE_LOOKAHEAD];
103        for (int i = 0; i < BURMESE_LOOKAHEAD; i++) {
104            words[i] = new PossibleWord();
105        }
106        int uc;
107
108        fIter.setIndex(rangeStart);
109        while ((current = fIter.getIndex()) < rangeEnd) {
110            wordLength = 0;
111
112            //Look for candidate words at the current position
113            int candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
114
115            // If we found exactly one, use that
116            if (candidates == 1) {
117                wordLength = words[wordsFound%BURMESE_LOOKAHEAD].acceptMarked(fIter);
118                wordsFound += 1;
119            }
120
121            // If there was more than one, see which one can take us forward the most words
122            else if (candidates > 1) {
123                boolean foundBest = false;
124                // If we're already at the end of the range, we're done
125                if (fIter.getIndex() < rangeEnd) {
126                    do {
127                        int wordsMatched = 1;
128                        if (words[(wordsFound+1)%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
129                            if (wordsMatched < 2) {
130                                // Followed by another dictionary word; mark first word as a good candidate
131                                words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
132                                wordsMatched = 2;
133                            }
134
135                            // If we're already at the end of the range, we're done
136                            if (fIter.getIndex() >= rangeEnd) {
137                                break;
138                            }
139
140                            // See if any of the possible second words is followed by a third word
141                            do {
142                                // If we find a third word, stop right away
143                                if (words[(wordsFound+2)%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
144                                    words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
145                                    foundBest = true;
146                                    break;
147                                }
148                            } while (words[(wordsFound+1)%BURMESE_LOOKAHEAD].backUp(fIter));
149                        }
150                    } while (words[wordsFound%BURMESE_LOOKAHEAD].backUp(fIter) && !foundBest);
151                }
152                wordLength = words[wordsFound%BURMESE_LOOKAHEAD].acceptMarked(fIter);
153                wordsFound += 1;
154            }
155
156            // We come here after having either found a word or not. We look ahead to the
157            // next word. If it's not a dictionary word, we will combine it with the word we
158            // just found (if there is one), but only if the preceding word does not exceed
159            // the threshold.
160            // The text iterator should now be positioned at the end of the word we found.
161            if (fIter.getIndex() < rangeEnd && wordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
162                // If it is a dictionary word, do nothing. If it isn't, then if there is
163                // no preceding word, or the non-word shares less than the minimum threshold
164                // of characters with a dictionary word, then scan to resynchronize
165                if (words[wordsFound%BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
166                        (wordLength == 0 ||
167                                words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
168                    // Look for a plausible word boundary
169                    int remaining = rangeEnd - (current + wordLength);
170                    int pc = fIter.current();
171                    int chars = 0;
172                    for (;;) {
173                        fIter.next();
174                        uc = fIter.current();
175                        chars += 1;
176                        if (--remaining <= 0) {
177                            break;
178                        }
179                        if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
180                            // Maybe. See if it's in the dictionary.
181                            int candidate = words[(wordsFound + 1) %BURMESE_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
182                            fIter.setIndex(current + wordLength + chars);
183                            if (candidate > 0) {
184                                break;
185                            }
186                        }
187                        pc = uc;
188                    }
189
190                    // Bump the word count if there wasn't already one
191                    if (wordLength <= 0) {
192                        wordsFound += 1;
193                    }
194
195                    // Update the length with the passed-over characters
196                    wordLength += chars;
197                } else {
198                    // Backup to where we were for next iteration
199                    fIter.setIndex(current+wordLength);
200                }
201            }
202
203            // Never stop before a combining mark.
204            int currPos;
205            while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
206                fIter.next();
207                wordLength += fIter.getIndex() - currPos;
208            }
209
210            // Look ahead for possible suffixes if a dictionary word does not follow.
211            // We do this in code rather than using a rule so that the heuristic
212            // resynch continues to function. For example, one of the suffix characters
213            // could be a typo in the middle of a word.
214            // NOT CURRENTLY APPLICABLE TO BURMESE
215
216            // Did we find a word on this iteration? If so, push it on the break stack
217            if (wordLength > 0) {
218                foundBreaks.push(Integer.valueOf(current + wordLength));
219            }
220        }
221
222        // Don't return a break for the end of the dictionary range if there is one there
223        if (foundBreaks.peek() >= rangeEnd) {
224            foundBreaks.pop();
225            wordsFound -= 1;
226        }
227
228        return wordsFound;
229    }
230
231}
232