1ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/** 2ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************* 3b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * Copyright (C) 2006-2008,2011, International Business Machines Corporation * 4b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho * and others. All Rights Reserved. * 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ******************************************************************************* 6ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "brkeng.h" 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "dictbe.h" 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/chariter.h" 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ubrk.h" 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "uvector.h" 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "triedict.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ****************************************************************** 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*DictionaryBreakEngine::DictionaryBreakEngine() { 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fTypes = 0; 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}*/ 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fTypes = breakTypes; 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::~DictionaryBreakEngine() { 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruUBool 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && fSet.contains(c)); 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::findBreaks( UText *text, 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t startPos, 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t endPos, 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool reverse, 48ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t breakType, 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UStack &foundBreaks ) const { 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t result = 0; 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Find the span of characters included in the set. 53ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t start = (int32_t)utext_getNativeIndex(text); 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t current; 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeStart; 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeEnd; 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 c = utext_current32(text); 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (reverse) { 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool isDict = fSet.contains(c); 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = utext_previous32(text); 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru isDict = fSet.contains(c); 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1); 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rangeEnd = start + 1; 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); // TODO: recast loop for postincrement 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru c = utext_current32(text); 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rangeStart = start; 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru rangeEnd = current; 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); 77ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, current); 78ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 79ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return result; 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruvoid 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSet = set; 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Compact for caching 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSet.compact(); 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/*void 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruDictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) { 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fTypes = breakTypes; 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}*/ 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru/* 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ****************************************************************** 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru */ 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Helper class for improving readability of the Thai word break 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// algorithm. The implementation is completely inline. 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// List size, limited by the maximum number of words in the dictionary 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// that form a nested sequence. 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define POSSIBLE_WORD_LIST_MAX 20 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruclass PossibleWord { 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru private: 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // list of word candidate lengths, in increasing length order 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t lengths[POSSIBLE_WORD_LIST_MAX]; 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int count; // Count of candidates 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t prefix; // The longest match with a dictionary word 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t offset; // Offset in the text of these candidates 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int mark; // The preferred candidate's offset 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int current; // The candidate we're currently looking at 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru public: 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru PossibleWord(); 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ~PossibleWord(); 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Fill the list of candidates if needed, select the longest, and return the number found 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ); 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Select the currently marked candidate, point after it in the text, and invalidate self 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t acceptMarked( UText *text ); 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Back up from the current candidate to the next shorter one; return TRUE if that exists 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and point the text after it 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UBool backUp( UText *text ); 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Return the longest prefix this candidate location shares with a dictionary word 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t longestPrefix(); 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Mark the current candidate as the one we like 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru void markCurrent(); 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru}; 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::PossibleWord() { 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offset = -1; 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::~PossibleWord() { 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) { 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t start = (int32_t)utext_getNativeIndex(text); 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (start != offset) { 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru offset = start; 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Dictionary leaves text after longest prefix, not longest word. Back up. 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (count <= 0) { 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, start); 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (count > 0) { 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, start+lengths[count-1]); 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru current = count-1; 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru mark = current; 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return count; 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::acceptMarked( UText *text ) { 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, offset + lengths[mark]); 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return lengths[mark]; 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline UBool 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::backUp( UText *text ) { 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (current > 0) { 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, offset + lengths[--current]); 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return TRUE; 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return FALSE; 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline int32_t 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::longestPrefix() { 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return prefix; 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruinline void 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruPossibleWord::markCurrent() { 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru mark = current; 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// How many words in a row are "good enough"? 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_LOOKAHEAD 3 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Will not combine a non-word with a preceding dictionary word longer than this 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_ROOT_COMBINE_THRESHOLD 3 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Will not combine a non-word that shares at least this much prefix with a 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// dictionary word, with a preceding word 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_PREFIX_COMBINE_THRESHOLD 3 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Ellision character 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_PAIYANNOI 0x0E2F 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Repeat character 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MAIYAMOK 0x0E46 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Minimum word size 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MIN_WORD 2 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Minimum number of characters for two words 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fDictionary(adoptDictionary) 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_SUCCESS(status)) { 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru setCharacters(fThaiWordSet); 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 22385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho fMarkSet.add(0x0020); 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEndWordSet = fThaiWordSet; 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSuffixSet.add(THAI_PAIYANNOI); 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSuffixSet.add(THAI_MAIYAMOK); 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Compact for caching. 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fMarkSet.compact(); 234ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fEndWordSet.compact(); 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fBeginWordSet.compact(); 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSuffixSet.compact(); 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::~ThaiBreakEngine() { 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fDictionary; 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queruint32_t 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruThaiBreakEngine::divideUpDictionaryRange( UText *text, 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeStart, 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rangeEnd, 247ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UStack &foundBreaks ) const { 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return 0; // Not enough characters for two words 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uint32_t wordsFound = 0; 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t wordLength; 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t current; 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode status = U_ZERO_ERROR; 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru PossibleWord words[THAI_LOOKAHEAD]; 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 uc; 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, rangeStart); 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength = 0; 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look for candidate words at the current position 265ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 266ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If we found exactly one, use that 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (candidates == 1) { 269ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); 270ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsFound += 1; 271ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 272ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If there was more than one, see which one can take us forward the most words 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else if (candidates > 1) { 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If we're already at the end of the range, we're done 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru goto foundBest; 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int wordsMatched = 1; 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (wordsMatched < 2) { 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Followed by another dictionary word; mark first word as a good candidate 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsMatched = 2; 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If we're already at the end of the range, we're done 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru goto foundBest; 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 292ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 293ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // See if any of the possible second words is followed by a third word 294ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru do { 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // If we find a third word, stop right away 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru goto foundBest; 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text)); 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while (words[wordsFound%THAI_LOOKAHEAD].backUp(text)); 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QuerufoundBest: 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsFound += 1; 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // We come here after having either found a word or not. We look ahead to the 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // next word. If it's not a dictionary word, we will combine it withe the word we 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // just found (if there is one), but only if the preceding word does not exceed 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the threshold. 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // The text iterator should now be positioned at the end of the word we found. 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) { 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // if it is a dictionary word, do nothing. If it isn't, then if there is 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // no preceding word, or the non-word shares less than the minimum threshold 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // of characters with a dictionary word, then scan to resynchronize 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && (wordLength == 0 321ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 322ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look for a plausible word boundary 323ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru //TODO: This section will need a rework for UText. 324ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t remaining = rangeEnd - (current+wordLength); 325ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UChar32 pc = utext_current32(text); 326ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t chars = 0; 327ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (;;) { 328ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 329ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uc = utext_current32(text); 330ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // TODO: Here we're counting on the fact that the SA languages are all 331ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // in the BMP. This should get fixed with the UText rework. 332ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru chars += 1; 333ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (--remaining <= 0) { 334ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 335ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 336ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 337ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Maybe. See if it's in the dictionary. 338ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // NOTE: In the original Apple code, checked that the next 339ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // two characters after uc were not 0x0E4C THANTHAKHAT before 340ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // checking the dictionary. That is just a performance filter, 341ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // but it's not clear it's faster than checking the trie. 342ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 343ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, current+wordLength+chars); 344ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (candidates > 0) { 345ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 346ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 347ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 348ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru pc = uc; 349ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 350ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 351ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Bump the word count if there wasn't already one 352ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (wordLength <= 0) { 353ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsFound += 1; 354ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 355ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 356ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Update the length with the passed-over characters 357ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength += chars; 358ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 359ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 360ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Back up to where we were for next iteration 361ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, current+wordLength); 362ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 363ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 364ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 365ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Never stop before a combining mark. 366ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t currPos; 367ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 368ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 369ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 370ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 371ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 372ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Look ahead for possible suffixes if a dictionary word does not follow. 373ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // We do this in code rather than using a rule so that the heuristic 374ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // resynch continues to function. For example, one of the suffix characters 375ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // could be a typo in the middle of a word. 376ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 377ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 378ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru && fSuffixSet.contains(uc = utext_current32(text))) { 379ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (uc == THAI_PAIYANNOI) { 380ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (!fSuffixSet.contains(utext_previous32(text))) { 381ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Skip over previous end and PAIYANNOI 382ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 383ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 384ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength += 1; // Add PAIYANNOI to word 385ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uc = utext_current32(text); // Fetch next character 386ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 387ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 388ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Restore prior position 389ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 390ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 391ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 392ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (uc == THAI_MAIYAMOK) { 393ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (utext_previous32(text) != THAI_MAIYAMOK) { 394ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Skip over previous end and MAIYAMOK 395ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 396ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 397ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordLength += 1; // Add MAIYAMOK to word 398ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 399ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 400ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Restore prior position 401ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_next32(text); 402ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 403ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 404ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 405ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else { 406ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru utext_setNativeIndex(text, current+wordLength); 407ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 408ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 409b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 410b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Did we find a word on this iteration? If so, push it on the break stack 411b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (wordLength > 0) { 412b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho foundBreaks.push((current+wordLength), status); 413b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 414b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 415b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 416b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Don't return a break for the end of the dictionary range if there is one there. 417b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (foundBreaks.peeki() >= rangeEnd) { 418b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho (void) foundBreaks.popi(); 419b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsFound -= 1; 420b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 421b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 422b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return wordsFound; 423b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 424b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 425b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// How many words in a row are "good enough"? 426b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_LOOKAHEAD 3 427b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 428b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Will not combine a non-word with a preceding dictionary word longer than this 429b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_ROOT_COMBINE_THRESHOLD 3 430b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 431b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Will not combine a non-word that shares at least this much prefix with a 432b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// dictionary word, with a preceding word 433b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_PREFIX_COMBINE_THRESHOLD 3 434b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 435b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Minimum word size 436b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_MIN_WORD 2 437b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 438b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// Minimum number of characters for two words 439b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2) 440b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 441b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::KhmerBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) 442b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 443b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fDictionary(adoptDictionary) 444b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho{ 445b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); 446b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (U_SUCCESS(status)) { 447b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho setCharacters(fKhmerWordSet); 448b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 449b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); 450b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fMarkSet.add(0x0020); 451b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fEndWordSet = fKhmerWordSet; 452b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fBeginWordSet.add(0x1780, 0x17B3); 453b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels 454b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word 455b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word 456b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters 457b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels 458b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 459b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 460b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 461b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 462b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fSuffixSet.add(THAI_PAIYANNOI); 463b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fSuffixSet.add(THAI_MAIYAMOK); 464b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 465b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Compact for caching. 466b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fMarkSet.compact(); 467b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fEndWordSet.compact(); 468b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho fBeginWordSet.compact(); 469b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// fSuffixSet.compact(); 470b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 471b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 472b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::~KhmerBreakEngine() { 473b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho delete fDictionary; 474b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho} 475b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 476b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoint32_t 477b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehoKhmerBreakEngine::divideUpDictionaryRange( UText *text, 478b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t rangeStart, 479b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t rangeEnd, 480b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UStack &foundBreaks ) const { 481b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { 482b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho return 0; // Not enough characters for two words 483b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 484b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 485b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho uint32_t wordsFound = 0; 486b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t wordLength; 487b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t current; 488b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UErrorCode status = U_ZERO_ERROR; 489b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho PossibleWord words[KHMER_LOOKAHEAD]; 490b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 uc; 491b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 492b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_setNativeIndex(text, rangeStart); 493b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 494b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 495b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordLength = 0; 496b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 497b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look for candidate words at the current position 498b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 499b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 500b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we found exactly one, use that 501b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (candidates == 1) { 502b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); 503b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsFound += 1; 504b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 505b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 506b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If there was more than one, see which one can take us forward the most words 507b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho else if (candidates > 1) { 508b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we're already at the end of the range, we're done 509b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 510b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho goto foundBest; 511b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 512b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho do { 513b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int wordsMatched = 1; 514b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 515b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (wordsMatched < 2) { 516b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Followed by another dictionary word; mark first word as a good candidate 517b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho words[wordsFound%KHMER_LOOKAHEAD].markCurrent(); 518b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsMatched = 2; 519b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 520b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 521b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we're already at the end of the range, we're done 522b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 523b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho goto foundBest; 524b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 525b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 526b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // See if any of the possible second words is followed by a third word 527b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho do { 528b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // If we find a third word, stop right away 529b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (words[(wordsFound+2)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 530b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho words[wordsFound%KHMER_LOOKAHEAD].markCurrent(); 531b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho goto foundBest; 532b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 533b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 534b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while (words[(wordsFound+1)%KHMER_LOOKAHEAD].backUp(text)); 535b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 536b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 537b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while (words[wordsFound%KHMER_LOOKAHEAD].backUp(text)); 538b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2clairehofoundBest: 539b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text); 540b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsFound += 1; 541b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 542b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 543b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // We come here after having either found a word or not. We look ahead to the 544b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // next word. If it's not a dictionary word, we will combine it with the word we 545b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // just found (if there is one), but only if the preceding word does not exceed 546b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // the threshold. 547b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // The text iterator should now be positioned at the end of the word we found. 548b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) { 549b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // if it is a dictionary word, do nothing. If it isn't, then if there is 550b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // no preceding word, or the non-word shares less than the minimum threshold 551b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // of characters with a dictionary word, then scan to resynchronize 552b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 553b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho && (wordLength == 0 554b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho || words[wordsFound%KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { 555b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look for a plausible word boundary 556b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho //TODO: This section will need a rework for UText. 557b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t remaining = rangeEnd - (current+wordLength); 558b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho UChar32 pc = utext_current32(text); 559b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t chars = 0; 560b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho for (;;) { 561b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_next32(text); 562b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho uc = utext_current32(text); 563b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // TODO: Here we're counting on the fact that the SA languages are all 564b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // in the BMP. This should get fixed with the UText rework. 565b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho chars += 1; 566b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (--remaining <= 0) { 567b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 568b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 569b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 570b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Maybe. See if it's in the dictionary. 571b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int candidates = words[(wordsFound+1)%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 572b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_setNativeIndex(text, current+wordLength+chars); 573b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (candidates > 0) { 574b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho break; 575b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 576b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 577b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho pc = uc; 578b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 579b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 580b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Bump the word count if there wasn't already one 581b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho if (wordLength <= 0) { 582b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordsFound += 1; 583b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 584b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 585b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Update the length with the passed-over characters 586b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordLength += chars; 587b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 588b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho else { 589b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Back up to where we were for next iteration 590b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_setNativeIndex(text, current+wordLength); 591b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 592b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 593b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 594b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Never stop before a combining mark. 595b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho int32_t currPos; 596b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 597b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho utext_next32(text); 598b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 599b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho } 600b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 601b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // Look ahead for possible suffixes if a dictionary word does not follow. 602b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // We do this in code rather than using a rule so that the heuristic 603b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // resynch continues to function. For example, one of the suffix characters 604b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho // could be a typo in the middle of a word. 605b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 606b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 607b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// && fSuffixSet.contains(uc = utext_current32(text))) { 608b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (uc == KHMER_PAIYANNOI) { 609b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (!fSuffixSet.contains(utext_previous32(text))) { 610b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// // Skip over previous end and PAIYANNOI 611b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 612b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 613b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// wordLength += 1; // Add PAIYANNOI to word 614b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// uc = utext_current32(text); // Fetch next character 615b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 616b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// else { 617b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// // Restore prior position 618b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 619b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 620b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 621b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (uc == KHMER_MAIYAMOK) { 622b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// if (utext_previous32(text) != KHMER_MAIYAMOK) { 623b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// // Skip over previous end and MAIYAMOK 624b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 625b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 626b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// wordLength += 1; // Add MAIYAMOK to word 627b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 628b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// else { 629b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// // Restore prior position 630b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_next32(text); 631b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 632b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 633b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 634b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// else { 635b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// utext_setNativeIndex(text, current+wordLength); 636b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 637b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho// } 638b26ce3a7367e4ed2ee7ddddcdc3f3d3377a455c2claireho 639ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Did we find a word on this iteration? If so, push it on the break stack 640ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (wordLength > 0) { 641ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru foundBreaks.push((current+wordLength), status); 642ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 643ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 644ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 645ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Don't return a break for the end of the dictionary range if there is one there. 646ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (foundBreaks.peeki() >= rangeEnd) { 647ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru (void) foundBreaks.popi(); 648ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru wordsFound -= 1; 649ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 650ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 651ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return wordsFound; 652ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 653ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 654ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 655ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 656ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 657