12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others. 22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/* 47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Copyright (C) 2014, International Business Machines Corporation and * 67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * others. All Rights Reserved. * 77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert ******************************************************************************* 87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */ 97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.text; 107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.io.IOException; 127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport java.text.CharacterIterator; 137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UCharacter; 157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UProperty; 167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.lang.UScript; 177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertclass KhmerBreakEngine extends DictionaryBreakEngine { 197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Constants for KhmerBreakIterator 217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // How many words in a row are "good enough"? 227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final byte KHMER_LOOKAHEAD = 3; 237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Will not combine a non-word with a preceding dictionary word longer than this 247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final byte KHMER_ROOT_COMBINE_THRESHOLD = 3; 257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Will not combine a non-word that shares at least this much prefix with a 267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // dictionary word with a preceding word 277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final byte KHMER_PREFIX_COMBINE_THRESHOLD = 3; 287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Minimum word size 297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final byte KHMER_MIN_WORD = 2; 307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Minimum number of characters for two words 317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static final byte KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; 327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private DictionaryMatcher fDictionary; 357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static UnicodeSet fKhmerWordSet; 367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static UnicodeSet fEndWordSet; 377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static UnicodeSet fBeginWordSet; 387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert private static UnicodeSet fMarkSet; 397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert static { 417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Initialize UnicodeSets 427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fKhmerWordSet = new UnicodeSet(); 437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fMarkSet = new UnicodeSet(); 447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fBeginWordSet = new UnicodeSet(); 457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fKhmerWordSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]]"); 477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fKhmerWordSet.compact(); 487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fMarkSet.applyPattern("[[:Khmer:]&[:LineBreak=SA:]&[:M:]]"); 507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fMarkSet.add(0x0020); 517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fEndWordSet = new UnicodeSet(fKhmerWordSet); 527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fBeginWordSet.add(0x1780, 0x17B3); 537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters 547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Compact for caching 567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fMarkSet.compact(); 577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fEndWordSet.compact(); 587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fBeginWordSet.compact(); 597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Freeze the static UnicodeSet 617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fKhmerWordSet.freeze(); 627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fMarkSet.freeze(); 637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fEndWordSet.freeze(); 647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fBeginWordSet.freeze(); 657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public KhmerBreakEngine() throws IOException { 687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE); 697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert setCharacters(fKhmerWordSet); 707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Initialize dictionary 717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fDictionary = DictionaryData.loadDictionaryFor("Khmr"); 727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean equals(Object obj) { 757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Normally is a singleton, but it's possible to have duplicates 767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // during initialization. All are equivalent. 777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return obj instanceof KhmerBreakEngine; 787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int hashCode() { 817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return getClass().hashCode(); 827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public boolean handles(int c, int breakType) { 857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) { 867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return (script == UScript.KHMER); 887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return false; 907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd, 937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert DequeI foundBreaks) { 947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { 967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return 0; // Not enough characters for word 977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int wordsFound = 0; 997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int wordLength; 1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int current; 1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert PossibleWord words[] = new PossibleWord[KHMER_LOOKAHEAD]; 1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (int i = 0; i < KHMER_LOOKAHEAD; i++) { 1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert words[i] = new PossibleWord(); 1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int uc; 1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fIter.setIndex(rangeStart); 1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while ((current = fIter.getIndex()) < rangeEnd) { 1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordLength = 0; 1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert //Look for candidate words at the current position 1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int candidates = words[wordsFound % KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd); 1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we found exactly one, use that 1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (candidates == 1) { 1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(fIter); 1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordsFound += 1; 1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If there was more than one, see which one can take us forward the most words 1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert else if (candidates > 1) { 1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert boolean foundBest = false; 1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we're already at the end of the range, we're done 1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fIter.getIndex() < rangeEnd) { 1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int wordsMatched = 1; 1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) { 1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (wordsMatched < 2) { 1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Followed by another dictionary word; mark first word as a good candidate 1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert words[wordsFound%KHMER_LOOKAHEAD].markCurrent(); 1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordsMatched = 2; 1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we're already at the end of the range, we're done 1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fIter.getIndex() >= rangeEnd) { 1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // See if any of the possible second words is followed by a third word 1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert do { 1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If we find a third word, stop right away 1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) > 0) { 1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert words[wordsFound%KHMER_LOOKAHEAD].markCurrent(); 1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert foundBest = true; 1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(fIter)); 1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } while (words[wordsFound%KHMER_LOOKAHEAD].backUp(fIter) && !foundBest); 1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(fIter); 1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordsFound += 1; 1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We come here after having either found a word or not. We look ahead to the 1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // next word. If it's not a dictionary word, we will combine it with the word we 1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // just found (if there is one), but only if the preceding word does not exceed 1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // the threshold. 1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // The text iterator should now be positioned at the end of the word we found. 1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fIter.getIndex() < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) { 1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // If it is a dictionary word, do nothing. If it isn't, then if there is 1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // no preceding word, or the non-word shares less than the minimum threshold 1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // of characters with a dictionary word, then scan to resynchronize 1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (words[wordsFound%KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 && 1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert (wordLength == 0 || 1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { 1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look for a plausible word boundary 1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int remaining = rangeEnd - (current + wordLength); 1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int pc = fIter.current(); 1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int chars = 0; 1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert for (;;) { 1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fIter.next(); 1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert uc = fIter.current(); 1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert chars += 1; 1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (--remaining <= 0) { 1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Maybe. See if it's in the dictionary. 1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int candidate = words[(wordsFound + 1) %KHMER_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd); 1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fIter.setIndex(current + wordLength + chars); 1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (candidate > 0) { 1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert break; 1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert pc = uc; 1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Bump the word count if there wasn't already one 1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (wordLength <= 0) { 1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordsFound += 1; 1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Update the length with the passed-over characters 1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordLength += chars; 1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } else { 1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Backup to where we were for next iteration 1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fIter.setIndex(current+wordLength); 2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Never stop before a combining mark. 2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert int currPos; 2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert while ((currPos = fIter.getIndex()) < rangeEnd && fMarkSet.contains(fIter.current())) { 2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert fIter.next(); 2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordLength += fIter.getIndex() - currPos; 2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Look ahead for possible suffixes if a dictionary word does not follow. 2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // We do this in code rather than using a rule so that the heuristic 2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // resynch continues to function. For example, one of the suffix characters 2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // could be a typo in the middle of a word. 2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // NOT CURRENTLY APPLICABLE TO KHMER 2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Did we find a word on this iteration? If so, push it on the break stack 2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (wordLength > 0) { 2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert foundBreaks.push(Integer.valueOf(current + wordLength)); 2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert // Don't return a break for the end of the dictionary range if there is one there 2237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert if (foundBreaks.peek() >= rangeEnd) { 2247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert foundBreaks.pop(); 2257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert wordsFound -= 1; 2267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert return wordsFound; 2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert } 2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert 2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert} 232