// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2014, International Business Machines Corporation and         *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.io.IOException;
import java.text.CharacterIterator;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;

187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertclass KhmerBreakEngine extends DictionaryBreakEngine {
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Constants for KhmerBreakIterator
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // How many words in a row are "good enough"?
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final byte KHMER_LOOKAHEAD = 3;
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Will not combine a non-word with a preceding dictionary word longer than this
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final byte KHMER_ROOT_COMBINE_THRESHOLD = 3;
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Will not combine a non-word that shares at least this much prefix with a
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // dictionary word with a preceding word
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final byte KHMER_PREFIX_COMBINE_THRESHOLD = 3;
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Minimum word size
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final byte KHMER_MIN_WORD = 2;
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Minimum number of characters for two words
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final byte KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private DictionaryMatcher fDictionary;
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static UnicodeSet fKhmerWordSet;
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static UnicodeSet fEndWordSet;
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static UnicodeSet fBeginWordSet;
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static UnicodeSet fMarkSet;
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    static {
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Initialize UnicodeSets
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fKhmerWordSet = new UnicodeSet();
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fMarkSet = new UnicodeSet();
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fBeginWordSet = new UnicodeSet();
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fKhmerWordSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]]");
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fKhmerWordSet.compact();
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fMarkSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]");
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fMarkSet.add(0x0020);
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fEndWordSet = new UnicodeSet(fKhmerWordSet);
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fBeginWordSet.add(0x1780, 0x17B3);
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Compact for caching
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fMarkSet.compact();
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fEndWordSet.compact();
587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fBeginWordSet.compact();
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Freeze the static UnicodeSet
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fKhmerWordSet.freeze();
627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fMarkSet.freeze();
637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fEndWordSet.freeze();
647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fBeginWordSet.freeze();
657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public KhmerBreakEngine() throws IOException {
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        setCharacters(fKhmerWordSet);
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Initialize dictionary
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fDictionary = DictionaryData.loadDictionaryFor("Khmr");
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public boolean equals(Object obj) {
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Normally is a singleton, but it's possible to have duplicates
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //   during initialization. All are equivalent.
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return obj instanceof KhmerBreakEngine;
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public int hashCode() {
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return getClass().hashCode();
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public boolean handles(int c, int breakType) {
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return (script == UScript.KHMER);
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return false;
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            DequeI foundBreaks) {
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return 0;  // Not enough characters for word
977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int wordsFound = 0;
997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int wordLength;
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int current;
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        PossibleWord words[] = new PossibleWord[KHMER_LOOKAHEAD];
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for (int i = 0; i < KHMER_LOOKAHEAD; i++) {
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            words[i] = new PossibleWord();
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int uc;
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fIter.setIndex(rangeStart);
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while ((current = fIter.getIndex()) < rangeEnd) {
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            wordLength = 0;
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            //Look for candidate words at the current position
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int candidates = words[wordsFound % KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // If we found exactly one, use that
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (candidates == 1) {
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(fIter);
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                wordsFound += 1;
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // If there was more than one, see which one can take us forward the most words
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            else if (candidates > 1) {
1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                boolean foundBest = false;
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // If we're already at the end of the range, we're done
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (fIter.getIndex() < rangeEnd) {
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    do {
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        int wordsMatched = 1;
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            if (wordsMatched < 2) {
1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                // Followed by another dictionary word; mark first word as a good candidate
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                wordsMatched = 2;
1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            }
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            // If we're already at the end of the range, we're done
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            if (fIter.getIndex() >= rangeEnd) {
1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                break;
1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            }
1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            // See if any of the possible second words is followed by a third word
1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            do {
1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                // If we find a third word, stop right away
1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) {
1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                    words[wordsFound%KHMER_LOOKAHEAD].markCurrent();
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                    foundBest = true;
1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                    break;
1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                }
1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            } while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(fIter));
1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        }
1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    } while (words[wordsFound%KHMER_LOOKAHEAD].backUp(fIter) && !foundBest);
1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(fIter);
1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                wordsFound += 1;
1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // We come here after having either found a word or not. We look ahead to the
1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // next word. If it's not a dictionary word, we will combine it with the word we
1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // just found (if there is one), but only if the preceding word does not exceed
1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // the threshold.
1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // The text iterator should now be positioned at the end of the word we found.
1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (fIter.getIndex() < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // If it is a dictionary word, do nothing. If it isn't, then if there is
1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // no preceding word, or the non-word shares less than the minimum threshold
1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // of characters with a dictionary word, then scan to resynchronize
1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if (words[wordsFound%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        (wordLength == 0 ||
1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Look for a plausible word boundary
1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    int remaining = rangeEnd - (current + wordLength);
1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    int pc = fIter.current();
1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    int chars = 0;
1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    for (;;) {
1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        fIter.next();
1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        uc = fIter.current();
1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        chars += 1;
1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        if (--remaining <= 0) {
1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            break;
1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        }
1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            // Maybe. See if it's in the dictionary.
1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            int candidate = words[(wordsFound + 1) %KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd);
1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            fIter.setIndex(current + wordLength + chars);
1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            if (candidate > 0) {
1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                                break;
1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            }
1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        }
1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        pc = uc;
1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Bump the word count if there wasn't already one
1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if (wordLength <= 0) {
1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        wordsFound += 1;
1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Update the length with the passed-over characters
1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    wordLength += chars;
1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Backup to where we were for next iteration
1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    fIter.setIndex(current+wordLength);
2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Never stop before a combining mark.
2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int currPos;
2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) {
2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                fIter.next();
2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                wordLength += fIter.getIndex() - currPos;
2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Look ahead for possible suffixes if a dictionary word does not follow.
2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // We do this in code rather than using a rule so that the heuristic
2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // resynch continues to function. For example, one of the suffix characters
2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // could be a typo in the middle of a word.
2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // NOT CURRENTLY APPLICABLE TO KHMER
2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Did we find a word on this iteration? If so, push it on the break stack
2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if (wordLength > 0) {
2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                foundBreaks.push(Integer.valueOf(current + wordLength));
2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Don't return a break for the end of the dictionary range if there is one there
2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if (foundBreaks.peek() >= rangeEnd) {
2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            foundBreaks.pop();
2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            wordsFound -= 1;
2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return wordsFound;
2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
232