1/** 2 ******************************************************************************* 3 * Copyright (C) 2006-2008, International Business Machines Corporation and others. * 4 * All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 8#include "unicode/utypes.h" 9 10#if !UCONFIG_NO_BREAK_ITERATION 11 12#include "brkeng.h" 13#include "dictbe.h" 14#include "unicode/uniset.h" 15#include "unicode/chariter.h" 16#include "unicode/ubrk.h" 17#include "uvector.h" 18#include "triedict.h" 19 20U_NAMESPACE_BEGIN 21 22/* 23 ****************************************************************** 24 */ 25 26/*DictionaryBreakEngine::DictionaryBreakEngine() { 27 fTypes = 0; 28}*/ 29 30DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { 31 fTypes = breakTypes; 32} 33 34DictionaryBreakEngine::~DictionaryBreakEngine() { 35} 36 37UBool 38DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { 39 return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) 40 && fSet.contains(c)); 41} 42 43int32_t 44DictionaryBreakEngine::findBreaks( UText *text, 45 int32_t startPos, 46 int32_t endPos, 47 UBool reverse, 48 int32_t breakType, 49 UStack &foundBreaks ) const { 50 int32_t result = 0; 51 52 // Find the span of characters included in the set. 53 int32_t start = (int32_t)utext_getNativeIndex(text); 54 int32_t current; 55 int32_t rangeStart; 56 int32_t rangeEnd; 57 UChar32 c = utext_current32(text); 58 if (reverse) { 59 UBool isDict = fSet.contains(c); 60 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { 61 c = utext_previous32(text); 62 isDict = fSet.contains(c); 63 } 64 rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1); 65 rangeEnd = start + 1; 66 } 67 else { 68 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { 69 utext_next32(text); // TODO: recast loop for postincrement 70 c = utext_current32(text); 71 } 72 rangeStart = start; 73 rangeEnd = current; 74 } 75 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { 76 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); 77 utext_setNativeIndex(text, current); 78 } 79 80 return result; 81} 82 83void 84DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { 85 fSet = set; 86 // Compact for caching 87 fSet.compact(); 88} 89 90/*void 91DictionaryBreakEngine::setBreakTypes( uint32_t breakTypes ) { 92 fTypes = breakTypes; 93}*/ 94 95/* 96 ****************************************************************** 97 */ 98 99 100// Helper class for improving readability of the Thai word break 101// algorithm. The implementation is completely inline. 102 103// List size, limited by the maximum number of words in the dictionary 104// that form a nested sequence. 105#define POSSIBLE_WORD_LIST_MAX 20 106 107class PossibleWord { 108 private: 109 // list of word candidate lengths, in increasing length order 110 int32_t lengths[POSSIBLE_WORD_LIST_MAX]; 111 int count; // Count of candidates 112 int32_t prefix; // The longest match with a dictionary word 113 int32_t offset; // Offset in the text of these candidates 114 int mark; // The preferred candidate's offset 115 int current; // The candidate we're currently looking at 116 117 public: 118 PossibleWord(); 119 ~PossibleWord(); 120 121 // Fill the list of candidates if needed, select the longest, and return the number found 122 int candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ); 123 124 // Select the currently marked candidate, point after it in the text, and invalidate self 125 int32_t acceptMarked( UText *text ); 126 127 // Back up from the current candidate to the next shorter one; return TRUE if that exists 128 // and point the text after it 129 UBool backUp( UText *text ); 130 131 // Return the longest prefix this candidate location shares with a dictionary word 132 int32_t longestPrefix(); 133 134 // Mark the current candidate as the one we like 135 void markCurrent(); 136}; 137 138inline 139PossibleWord::PossibleWord() { 140 offset = -1; 141} 142 143inline 144PossibleWord::~PossibleWord() { 145} 146 147inline int 148PossibleWord::candidates( UText *text, const TrieWordDictionary *dict, int32_t rangeEnd ) { 149 // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 150 int32_t start = (int32_t)utext_getNativeIndex(text); 151 if (start != offset) { 152 offset = start; 153 prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0])); 154 // Dictionary leaves text after longest prefix, not longest word. Back up. 155 if (count <= 0) { 156 utext_setNativeIndex(text, start); 157 } 158 } 159 if (count > 0) { 160 utext_setNativeIndex(text, start+lengths[count-1]); 161 } 162 current = count-1; 163 mark = current; 164 return count; 165} 166 167inline int32_t 168PossibleWord::acceptMarked( UText *text ) { 169 utext_setNativeIndex(text, offset + lengths[mark]); 170 return lengths[mark]; 171} 172 173inline UBool 174PossibleWord::backUp( UText *text ) { 175 if (current > 0) { 176 utext_setNativeIndex(text, offset + lengths[--current]); 177 return TRUE; 178 } 179 return FALSE; 180} 181 182inline int32_t 183PossibleWord::longestPrefix() { 184 return prefix; 185} 186 187inline void 188PossibleWord::markCurrent() { 189 mark = current; 190} 191 192// How many words in a row are "good enough"? 193#define THAI_LOOKAHEAD 3 194 195// Will not combine a non-word with a preceding dictionary word longer than this 196#define THAI_ROOT_COMBINE_THRESHOLD 3 197 198// Will not combine a non-word that shares at least this much prefix with a 199// dictionary word, with a preceding word 200#define THAI_PREFIX_COMBINE_THRESHOLD 3 201 202// Ellision character 203#define THAI_PAIYANNOI 0x0E2F 204 205// Repeat character 206#define THAI_MAIYAMOK 0x0E46 207 208// Minimum word size 209#define THAI_MIN_WORD 2 210 211// Minimum number of characters for two words 212#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2) 213 214ThaiBreakEngine::ThaiBreakEngine(const TrieWordDictionary *adoptDictionary, UErrorCode &status) 215 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 216 fDictionary(adoptDictionary) 217{ 218 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); 219 if (U_SUCCESS(status)) { 220 setCharacters(fThaiWordSet); 221 } 222 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 223 fMarkSet.add(0x0020); 224 fEndWordSet = fThaiWordSet; 225 fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 226 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 227 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 228 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 229 fSuffixSet.add(THAI_PAIYANNOI); 230 fSuffixSet.add(THAI_MAIYAMOK); 231 232 // Compact for caching. 233 fMarkSet.compact(); 234 fEndWordSet.compact(); 235 fBeginWordSet.compact(); 236 fSuffixSet.compact(); 237} 238 239ThaiBreakEngine::~ThaiBreakEngine() { 240 delete fDictionary; 241} 242 243int32_t 244ThaiBreakEngine::divideUpDictionaryRange( UText *text, 245 int32_t rangeStart, 246 int32_t rangeEnd, 247 UStack &foundBreaks ) const { 248 if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) { 249 return 0; // Not enough characters for two words 250 } 251 252 uint32_t wordsFound = 0; 253 int32_t wordLength; 254 int32_t current; 255 UErrorCode status = U_ZERO_ERROR; 256 PossibleWord words[THAI_LOOKAHEAD]; 257 UChar32 uc; 258 259 utext_setNativeIndex(text, rangeStart); 260 261 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 262 wordLength = 0; 263 264 // Look for candidate words at the current position 265 int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 266 267 // If we found exactly one, use that 268 if (candidates == 1) { 269 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); 270 wordsFound += 1; 271 } 272 273 // If there was more than one, see which one can take us forward the most words 274 else if (candidates > 1) { 275 // If we're already at the end of the range, we're done 276 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 277 goto foundBest; 278 } 279 do { 280 int wordsMatched = 1; 281 if (words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 282 if (wordsMatched < 2) { 283 // Followed by another dictionary word; mark first word as a good candidate 284 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 285 wordsMatched = 2; 286 } 287 288 // If we're already at the end of the range, we're done 289 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 290 goto foundBest; 291 } 292 293 // See if any of the possible second words is followed by a third word 294 do { 295 // If we find a third word, stop right away 296 if (words[(wordsFound+2)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 297 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 298 goto foundBest; 299 } 300 } 301 while (words[(wordsFound+1)%THAI_LOOKAHEAD].backUp(text)); 302 } 303 } 304 while (words[wordsFound%THAI_LOOKAHEAD].backUp(text)); 305foundBest: 306 wordLength = words[wordsFound%THAI_LOOKAHEAD].acceptMarked(text); 307 wordsFound += 1; 308 } 309 310 // We come here after having either found a word or not. We look ahead to the 311 // next word. If it's not a dictionary word, we will combine it withe the word we 312 // just found (if there is one), but only if the preceding word does not exceed 313 // the threshold. 314 // The text iterator should now be positioned at the end of the word we found. 315 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) { 316 // if it is a dictionary word, do nothing. If it isn't, then if there is 317 // no preceding word, or the non-word shares less than the minimum threshold 318 // of characters with a dictionary word, then scan to resynchronize 319 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 320 && (wordLength == 0 321 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 322 // Look for a plausible word boundary 323 //TODO: This section will need a rework for UText. 324 int32_t remaining = rangeEnd - (current+wordLength); 325 UChar32 pc = utext_current32(text); 326 int32_t chars = 0; 327 for (;;) { 328 utext_next32(text); 329 uc = utext_current32(text); 330 // TODO: Here we're counting on the fact that the SA languages are all 331 // in the BMP. This should get fixed with the UText rework. 332 chars += 1; 333 if (--remaining <= 0) { 334 break; 335 } 336 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 337 // Maybe. See if it's in the dictionary. 338 // NOTE: In the original Apple code, checked that the next 339 // two characters after uc were not 0x0E4C THANTHAKHAT before 340 // checking the dictionary. That is just a performance filter, 341 // but it's not clear it's faster than checking the trie. 342 int candidates = words[(wordsFound+1)%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 343 utext_setNativeIndex(text, current+wordLength+chars); 344 if (candidates > 0) { 345 break; 346 } 347 } 348 pc = uc; 349 } 350 351 // Bump the word count if there wasn't already one 352 if (wordLength <= 0) { 353 wordsFound += 1; 354 } 355 356 // Update the length with the passed-over characters 357 wordLength += chars; 358 } 359 else { 360 // Back up to where we were for next iteration 361 utext_setNativeIndex(text, current+wordLength); 362 } 363 } 364 365 // Never stop before a combining mark. 366 int32_t currPos; 367 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 368 utext_next32(text); 369 wordLength += (int32_t)utext_getNativeIndex(text) - currPos; 370 } 371 372 // Look ahead for possible suffixes if a dictionary word does not follow. 373 // We do this in code rather than using a rule so that the heuristic 374 // resynch continues to function. For example, one of the suffix characters 375 // could be a typo in the middle of a word. 376 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { 377 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 378 && fSuffixSet.contains(uc = utext_current32(text))) { 379 if (uc == THAI_PAIYANNOI) { 380 if (!fSuffixSet.contains(utext_previous32(text))) { 381 // Skip over previous end and PAIYANNOI 382 utext_next32(text); 383 utext_next32(text); 384 wordLength += 1; // Add PAIYANNOI to word 385 uc = utext_current32(text); // Fetch next character 386 } 387 else { 388 // Restore prior position 389 utext_next32(text); 390 } 391 } 392 if (uc == THAI_MAIYAMOK) { 393 if (utext_previous32(text) != THAI_MAIYAMOK) { 394 // Skip over previous end and MAIYAMOK 395 utext_next32(text); 396 utext_next32(text); 397 wordLength += 1; // Add MAIYAMOK to word 398 } 399 else { 400 // Restore prior position 401 utext_next32(text); 402 } 403 } 404 } 405 else { 406 utext_setNativeIndex(text, current+wordLength); 407 } 408 } 409 410 // Did we find a word on this iteration? If so, push it on the break stack 411 if (wordLength > 0) { 412 foundBreaks.push((current+wordLength), status); 413 } 414 } 415 416 // Don't return a break for the end of the dictionary range if there is one there. 417 if (foundBreaks.peeki() >= rangeEnd) { 418 (void) foundBreaks.popi(); 419 wordsFound -= 1; 420 } 421 422 return wordsFound; 423} 424 425U_NAMESPACE_END 426 427#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 428