1/** 2 ******************************************************************************* 3 * Copyright (C) 2006-2014, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8#include "unicode/utypes.h" 9 10#if !UCONFIG_NO_BREAK_ITERATION 11 12#include "brkeng.h" 13#include "dictbe.h" 14#include "unicode/uniset.h" 15#include "unicode/chariter.h" 16#include "unicode/ubrk.h" 17#include "uvectr32.h" 18#include "uvector.h" 19#include "uassert.h" 20#include "unicode/normlzr.h" 21#include "cmemory.h" 22#include "dictionarydata.h" 23 24U_NAMESPACE_BEGIN 25 26/* 27 ****************************************************************** 28 */ 29 30DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) { 31 fTypes = breakTypes; 32} 33 34DictionaryBreakEngine::~DictionaryBreakEngine() { 35} 36 37UBool 38DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { 39 return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) 40 && fSet.contains(c)); 41} 42 43int32_t 44DictionaryBreakEngine::findBreaks( UText *text, 45 int32_t startPos, 46 int32_t endPos, 47 UBool reverse, 48 int32_t breakType, 49 UStack &foundBreaks ) const { 50 int32_t result = 0; 51 52 // Find the span of characters included in the set. 53 // The span to break begins at the current position in the text, and 54 // extends towards the start or end of the text, depending on 'reverse'. 
55 56 int32_t start = (int32_t)utext_getNativeIndex(text); 57 int32_t current; 58 int32_t rangeStart; 59 int32_t rangeEnd; 60 UChar32 c = utext_current32(text); 61 if (reverse) { 62 UBool isDict = fSet.contains(c); 63 while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) { 64 c = utext_previous32(text); 65 isDict = fSet.contains(c); 66 } 67 if (current < startPos) { 68 rangeStart = startPos; 69 } else { 70 rangeStart = current; 71 if (!isDict) { 72 utext_next32(text); 73 rangeStart = utext_getNativeIndex(text); 74 } 75 } 76 // rangeEnd = start + 1; 77 utext_setNativeIndex(text, start); 78 utext_next32(text); 79 rangeEnd = utext_getNativeIndex(text); 80 } 81 else { 82 while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { 83 utext_next32(text); // TODO: recast loop for postincrement 84 c = utext_current32(text); 85 } 86 rangeStart = start; 87 rangeEnd = current; 88 } 89 if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) { 90 result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks); 91 utext_setNativeIndex(text, current); 92 } 93 94 return result; 95} 96 97void 98DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { 99 fSet = set; 100 // Compact for caching 101 fSet.compact(); 102} 103 104/* 105 ****************************************************************** 106 * PossibleWord 107 */ 108 109// Helper class for improving readability of the Thai/Lao/Khmer word break 110// algorithm. The implementation is completely inline. 111 112// List size, limited by the maximum number of words in the dictionary 113// that form a nested sequence. 114static const int32_t POSSIBLE_WORD_LIST_MAX = 20; 115 116class PossibleWord { 117private: 118 // list of word candidate lengths, in increasing length order 119 // TODO: bytes would be sufficient for word lengths. 
120 int32_t count; // Count of candidates 121 int32_t prefix; // The longest match with a dictionary word 122 int32_t offset; // Offset in the text of these candidates 123 int32_t mark; // The preferred candidate's offset 124 int32_t current; // The candidate we're currently looking at 125 int32_t cuLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code units. 126 int32_t cpLengths[POSSIBLE_WORD_LIST_MAX]; // Word Lengths, in code points. 127 128public: 129 PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}; 130 ~PossibleWord() {}; 131 132 // Fill the list of candidates if needed, select the longest, and return the number found 133 int32_t candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ); 134 135 // Select the currently marked candidate, point after it in the text, and invalidate self 136 int32_t acceptMarked( UText *text ); 137 138 // Back up from the current candidate to the next shorter one; return TRUE if that exists 139 // and point the text after it 140 UBool backUp( UText *text ); 141 142 // Return the longest prefix this candidate location shares with a dictionary word 143 // Return value is in code points. 144 int32_t longestPrefix() { return prefix; }; 145 146 // Mark the current candidate as the one we like 147 void markCurrent() { mark = current; }; 148 149 // Get length in code points of the marked word. 150 int32_t markedCPLength() { return cpLengths[mark]; }; 151}; 152 153 154int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { 155 // TODO: If getIndex is too slow, use offset < 0 and add discardAll() 156 int32_t start = (int32_t)utext_getNativeIndex(text); 157 if (start != offset) { 158 offset = start; 159 count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix); 160 // Dictionary leaves text after longest prefix, not longest word. Back up. 
161 if (count <= 0) { 162 utext_setNativeIndex(text, start); 163 } 164 } 165 if (count > 0) { 166 utext_setNativeIndex(text, start+cuLengths[count-1]); 167 } 168 current = count-1; 169 mark = current; 170 return count; 171} 172 173int32_t 174PossibleWord::acceptMarked( UText *text ) { 175 utext_setNativeIndex(text, offset + cuLengths[mark]); 176 return cuLengths[mark]; 177} 178 179 180UBool 181PossibleWord::backUp( UText *text ) { 182 if (current > 0) { 183 utext_setNativeIndex(text, offset + cuLengths[--current]); 184 return TRUE; 185 } 186 return FALSE; 187} 188 189/* 190 ****************************************************************** 191 * ThaiBreakEngine 192 */ 193 194// How many words in a row are "good enough"? 195static const int32_t THAI_LOOKAHEAD = 3; 196 197// Will not combine a non-word with a preceding dictionary word longer than this 198static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3; 199 200// Will not combine a non-word that shares at least this much prefix with a 201// dictionary word, with a preceding word 202static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3; 203 204// Ellision character 205static const int32_t THAI_PAIYANNOI = 0x0E2F; 206 207// Repeat character 208static const int32_t THAI_MAIYAMOK = 0x0E46; 209 210// Minimum word size 211static const int32_t THAI_MIN_WORD = 2; 212 213// Minimum number of characters for two words 214static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2; 215 216ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 217 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 218 fDictionary(adoptDictionary) 219{ 220 fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); 221 if (U_SUCCESS(status)) { 222 setCharacters(fThaiWordSet); 223 } 224 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); 225 fMarkSet.add(0x0020); 226 fEndWordSet = fThaiWordSet; 227 fEndWordSet.remove(0x0E31); // MAI 
HAN-AKAT 228 fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 229 fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 230 fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 231 fSuffixSet.add(THAI_PAIYANNOI); 232 fSuffixSet.add(THAI_MAIYAMOK); 233 234 // Compact for caching. 235 fMarkSet.compact(); 236 fEndWordSet.compact(); 237 fBeginWordSet.compact(); 238 fSuffixSet.compact(); 239} 240 241ThaiBreakEngine::~ThaiBreakEngine() { 242 delete fDictionary; 243} 244 245int32_t 246ThaiBreakEngine::divideUpDictionaryRange( UText *text, 247 int32_t rangeStart, 248 int32_t rangeEnd, 249 UStack &foundBreaks ) const { 250 utext_setNativeIndex(text, rangeStart); 251 utext_moveIndex32(text, THAI_MIN_WORD_SPAN); 252 if (utext_getNativeIndex(text) >= rangeEnd) { 253 return 0; // Not enough characters for two words 254 } 255 utext_setNativeIndex(text, rangeStart); 256 257 258 uint32_t wordsFound = 0; 259 int32_t cpWordLength = 0; // Word Length in Code Points. 
260 int32_t cuWordLength = 0; // Word length in code units (UText native indexing) 261 int32_t current; 262 UErrorCode status = U_ZERO_ERROR; 263 PossibleWord words[THAI_LOOKAHEAD]; 264 265 utext_setNativeIndex(text, rangeStart); 266 267 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 268 cpWordLength = 0; 269 cuWordLength = 0; 270 271 // Look for candidate words at the current position 272 int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 273 274 // If we found exactly one, use that 275 if (candidates == 1) { 276 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 277 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength(); 278 wordsFound += 1; 279 } 280 // If there was more than one, see which one can take us forward the most words 281 else if (candidates > 1) { 282 // If we're already at the end of the range, we're done 283 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 284 goto foundBest; 285 } 286 do { 287 int32_t wordsMatched = 1; 288 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 289 if (wordsMatched < 2) { 290 // Followed by another dictionary word; mark first word as a good candidate 291 words[wordsFound%THAI_LOOKAHEAD].markCurrent(); 292 wordsMatched = 2; 293 } 294 295 // If we're already at the end of the range, we're done 296 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 297 goto foundBest; 298 } 299 300 // See if any of the possible second words is followed by a third word 301 do { 302 // If we find a third word, stop right away 303 if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 304 words[wordsFound % THAI_LOOKAHEAD].markCurrent(); 305 goto foundBest; 306 } 307 } 308 while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text)); 309 } 310 } 311 while (words[wordsFound % THAI_LOOKAHEAD].backUp(text)); 312foundBest: 313 // Set UText 
position to after the accepted word. 314 cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text); 315 cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength(); 316 wordsFound += 1; 317 } 318 319 // We come here after having either found a word or not. We look ahead to the 320 // next word. If it's not a dictionary word, we will combine it with the word we 321 // just found (if there is one), but only if the preceding word does not exceed 322 // the threshold. 323 // The text iterator should now be positioned at the end of the word we found. 324 325 UChar32 uc = 0; 326 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) { 327 // if it is a dictionary word, do nothing. If it isn't, then if there is 328 // no preceding word, or the non-word shares less than the minimum threshold 329 // of characters with a dictionary word, then scan to resynchronize 330 if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 331 && (cuWordLength == 0 332 || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { 333 // Look for a plausible word boundary 334 int32_t remaining = rangeEnd - (current+cuWordLength); 335 UChar32 pc; 336 int32_t chars = 0; 337 for (;;) { 338 int32_t pcIndex = utext_getNativeIndex(text); 339 pc = utext_next32(text); 340 int32_t pcSize = utext_getNativeIndex(text) - pcIndex; 341 chars += pcSize; 342 remaining -= pcSize; 343 if (remaining <= 0) { 344 break; 345 } 346 uc = utext_current32(text); 347 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 348 // Maybe. See if it's in the dictionary. 349 // NOTE: In the original Apple code, checked that the next 350 // two characters after uc were not 0x0E4C THANTHAKHAT before 351 // checking the dictionary. That is just a performance filter, 352 // but it's not clear it's faster than checking the trie. 
353 int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 354 utext_setNativeIndex(text, current + cuWordLength + chars); 355 if (candidates > 0) { 356 break; 357 } 358 } 359 } 360 361 // Bump the word count if there wasn't already one 362 if (cuWordLength <= 0) { 363 wordsFound += 1; 364 } 365 366 // Update the length with the passed-over characters 367 cuWordLength += chars; 368 } 369 else { 370 // Back up to where we were for next iteration 371 utext_setNativeIndex(text, current+cuWordLength); 372 } 373 } 374 375 // Never stop before a combining mark. 376 int32_t currPos; 377 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 378 utext_next32(text); 379 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; 380 } 381 382 // Look ahead for possible suffixes if a dictionary word does not follow. 383 // We do this in code rather than using a rule so that the heuristic 384 // resynch continues to function. For example, one of the suffix characters 385 // could be a typo in the middle of a word. 
386 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) { 387 if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 388 && fSuffixSet.contains(uc = utext_current32(text))) { 389 if (uc == THAI_PAIYANNOI) { 390 if (!fSuffixSet.contains(utext_previous32(text))) { 391 // Skip over previous end and PAIYANNOI 392 utext_next32(text); 393 int32_t paiyannoiIndex = utext_getNativeIndex(text); 394 utext_next32(text); 395 cuWordLength += utext_getNativeIndex(text) - paiyannoiIndex; // Add PAIYANNOI to word 396 uc = utext_current32(text); // Fetch next character 397 } 398 else { 399 // Restore prior position 400 utext_next32(text); 401 } 402 } 403 if (uc == THAI_MAIYAMOK) { 404 if (utext_previous32(text) != THAI_MAIYAMOK) { 405 // Skip over previous end and MAIYAMOK 406 utext_next32(text); 407 int32_t maiyamokIndex = utext_getNativeIndex(text); 408 utext_next32(text); 409 cuWordLength += utext_getNativeIndex(text) - maiyamokIndex; // Add MAIYAMOK to word 410 } 411 else { 412 // Restore prior position 413 utext_next32(text); 414 } 415 } 416 } 417 else { 418 utext_setNativeIndex(text, current+cuWordLength); 419 } 420 } 421 422 // Did we find a word on this iteration? If so, push it on the break stack 423 if (cuWordLength > 0) { 424 foundBreaks.push((current+cuWordLength), status); 425 } 426 } 427 428 // Don't return a break for the end of the dictionary range if there is one there. 429 if (foundBreaks.peeki() >= rangeEnd) { 430 (void) foundBreaks.popi(); 431 wordsFound -= 1; 432 } 433 434 return wordsFound; 435} 436 437/* 438 ****************************************************************** 439 * LaoBreakEngine 440 */ 441 442// How many words in a row are "good enough"? 
443static const int32_t LAO_LOOKAHEAD = 3; 444 445// Will not combine a non-word with a preceding dictionary word longer than this 446static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3; 447 448// Will not combine a non-word that shares at least this much prefix with a 449// dictionary word, with a preceding word 450static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3; 451 452// Minimum word size 453static const int32_t LAO_MIN_WORD = 2; 454 455// Minimum number of characters for two words 456static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2; 457 458LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 459 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 460 fDictionary(adoptDictionary) 461{ 462 fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); 463 if (U_SUCCESS(status)) { 464 setCharacters(fLaoWordSet); 465 } 466 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); 467 fMarkSet.add(0x0020); 468 fEndWordSet = fLaoWordSet; 469 fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels 470 fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters) 471 fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent) 472 fBeginWordSet.add(0x0EC0, 0x0EC4); // prefix vowels 473 474 // Compact for caching. 
475 fMarkSet.compact(); 476 fEndWordSet.compact(); 477 fBeginWordSet.compact(); 478} 479 480LaoBreakEngine::~LaoBreakEngine() { 481 delete fDictionary; 482} 483 484int32_t 485LaoBreakEngine::divideUpDictionaryRange( UText *text, 486 int32_t rangeStart, 487 int32_t rangeEnd, 488 UStack &foundBreaks ) const { 489 if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { 490 return 0; // Not enough characters for two words 491 } 492 493 uint32_t wordsFound = 0; 494 int32_t cpWordLength = 0; 495 int32_t cuWordLength = 0; 496 int32_t current; 497 UErrorCode status = U_ZERO_ERROR; 498 PossibleWord words[LAO_LOOKAHEAD]; 499 500 utext_setNativeIndex(text, rangeStart); 501 502 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 503 cuWordLength = 0; 504 cpWordLength = 0; 505 506 // Look for candidate words at the current position 507 int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 508 509 // If we found exactly one, use that 510 if (candidates == 1) { 511 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 512 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength(); 513 wordsFound += 1; 514 } 515 // If there was more than one, see which one can take us forward the most words 516 else if (candidates > 1) { 517 // If we're already at the end of the range, we're done 518 if (utext_getNativeIndex(text) >= rangeEnd) { 519 goto foundBest; 520 } 521 do { 522 int32_t wordsMatched = 1; 523 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 524 if (wordsMatched < 2) { 525 // Followed by another dictionary word; mark first word as a good candidate 526 words[wordsFound%LAO_LOOKAHEAD].markCurrent(); 527 wordsMatched = 2; 528 } 529 530 // If we're already at the end of the range, we're done 531 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 532 goto foundBest; 533 } 534 535 // See if any of the possible second words is followed by a third 
word 536 do { 537 // If we find a third word, stop right away 538 if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 539 words[wordsFound % LAO_LOOKAHEAD].markCurrent(); 540 goto foundBest; 541 } 542 } 543 while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text)); 544 } 545 } 546 while (words[wordsFound % LAO_LOOKAHEAD].backUp(text)); 547foundBest: 548 cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text); 549 cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength(); 550 wordsFound += 1; 551 } 552 553 // We come here after having either found a word or not. We look ahead to the 554 // next word. If it's not a dictionary word, we will combine it withe the word we 555 // just found (if there is one), but only if the preceding word does not exceed 556 // the threshold. 557 // The text iterator should now be positioned at the end of the word we found. 558 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) { 559 // if it is a dictionary word, do nothing. If it isn't, then if there is 560 // no preceding word, or the non-word shares less than the minimum threshold 561 // of characters with a dictionary word, then scan to resynchronize 562 if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 563 && (cuWordLength == 0 564 || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) { 565 // Look for a plausible word boundary 566 int32_t remaining = rangeEnd - (current + cuWordLength); 567 UChar32 pc; 568 UChar32 uc; 569 int32_t chars = 0; 570 for (;;) { 571 int32_t pcIndex = utext_getNativeIndex(text); 572 pc = utext_next32(text); 573 int32_t pcSize = utext_getNativeIndex(text) - pcIndex; 574 chars += pcSize; 575 remaining -= pcSize; 576 if (remaining <= 0) { 577 break; 578 } 579 uc = utext_current32(text); 580 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 581 // Maybe. 
See if it's in the dictionary. 582 // TODO: this looks iffy; compare with old code. 583 int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 584 utext_setNativeIndex(text, current + cuWordLength + chars); 585 if (candidates > 0) { 586 break; 587 } 588 } 589 } 590 591 // Bump the word count if there wasn't already one 592 if (cuWordLength <= 0) { 593 wordsFound += 1; 594 } 595 596 // Update the length with the passed-over characters 597 cuWordLength += chars; 598 } 599 else { 600 // Back up to where we were for next iteration 601 utext_setNativeIndex(text, current + cuWordLength); 602 } 603 } 604 605 // Never stop before a combining mark. 606 int32_t currPos; 607 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 608 utext_next32(text); 609 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; 610 } 611 612 // Look ahead for possible suffixes if a dictionary word does not follow. 613 // We do this in code rather than using a rule so that the heuristic 614 // resynch continues to function. For example, one of the suffix characters 615 // could be a typo in the middle of a word. 616 // NOT CURRENTLY APPLICABLE TO LAO 617 618 // Did we find a word on this iteration? If so, push it on the break stack 619 if (cuWordLength > 0) { 620 foundBreaks.push((current+cuWordLength), status); 621 } 622 } 623 624 // Don't return a break for the end of the dictionary range if there is one there. 625 if (foundBreaks.peeki() >= rangeEnd) { 626 (void) foundBreaks.popi(); 627 wordsFound -= 1; 628 } 629 630 return wordsFound; 631} 632 633/* 634 ****************************************************************** 635 * BurmeseBreakEngine 636 */ 637 638// How many words in a row are "good enough"? 
639static const int32_t BURMESE_LOOKAHEAD = 3; 640 641// Will not combine a non-word with a preceding dictionary word longer than this 642static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3; 643 644// Will not combine a non-word that shares at least this much prefix with a 645// dictionary word, with a preceding word 646static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3; 647 648// Minimum word size 649static const int32_t BURMESE_MIN_WORD = 2; 650 651// Minimum number of characters for two words 652static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2; 653 654BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 655 : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)), 656 fDictionary(adoptDictionary) 657{ 658 fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status); 659 if (U_SUCCESS(status)) { 660 setCharacters(fBurmeseWordSet); 661 } 662 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); 663 fMarkSet.add(0x0020); 664 fEndWordSet = fBurmeseWordSet; 665 fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels 666 667 // Compact for caching. 
668 fMarkSet.compact(); 669 fEndWordSet.compact(); 670 fBeginWordSet.compact(); 671} 672 673BurmeseBreakEngine::~BurmeseBreakEngine() { 674 delete fDictionary; 675} 676 677int32_t 678BurmeseBreakEngine::divideUpDictionaryRange( UText *text, 679 int32_t rangeStart, 680 int32_t rangeEnd, 681 UStack &foundBreaks ) const { 682 if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { 683 return 0; // Not enough characters for two words 684 } 685 686 uint32_t wordsFound = 0; 687 int32_t cpWordLength = 0; 688 int32_t cuWordLength = 0; 689 int32_t current; 690 UErrorCode status = U_ZERO_ERROR; 691 PossibleWord words[BURMESE_LOOKAHEAD]; 692 693 utext_setNativeIndex(text, rangeStart); 694 695 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 696 cuWordLength = 0; 697 cpWordLength = 0; 698 699 // Look for candidate words at the current position 700 int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 701 702 // If we found exactly one, use that 703 if (candidates == 1) { 704 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text); 705 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength(); 706 wordsFound += 1; 707 } 708 // If there was more than one, see which one can take us forward the most words 709 else if (candidates > 1) { 710 // If we're already at the end of the range, we're done 711 if (utext_getNativeIndex(text) >= rangeEnd) { 712 goto foundBest; 713 } 714 do { 715 int32_t wordsMatched = 1; 716 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { 717 if (wordsMatched < 2) { 718 // Followed by another dictionary word; mark first word as a good candidate 719 words[wordsFound%BURMESE_LOOKAHEAD].markCurrent(); 720 wordsMatched = 2; 721 } 722 723 // If we're already at the end of the range, we're done 724 if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { 725 goto foundBest; 726 } 727 728 // See if any of the 
possible second words is followed by a third word 729 do { 730 // If we find a third word, stop right away 731 if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { 732 words[wordsFound % BURMESE_LOOKAHEAD].markCurrent(); 733 goto foundBest; 734 } 735 } 736 while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text)); 737 } 738 } 739 while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text)); 740foundBest: 741 cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text); 742 cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength(); 743 wordsFound += 1; 744 } 745 746 // We come here after having either found a word or not. We look ahead to the 747 // next word. If it's not a dictionary word, we will combine it withe the word we 748 // just found (if there is one), but only if the preceding word does not exceed 749 // the threshold. 750 // The text iterator should now be positioned at the end of the word we found. 751 if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) { 752 // if it is a dictionary word, do nothing. 
If it isn't, then if there is 753 // no preceding word, or the non-word shares less than the minimum threshold 754 // of characters with a dictionary word, then scan to resynchronize 755 if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0 756 && (cuWordLength == 0 757 || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) { 758 // Look for a plausible word boundary 759 int32_t remaining = rangeEnd - (current + cuWordLength); 760 UChar32 pc; 761 UChar32 uc; 762 int32_t chars = 0; 763 for (;;) { 764 int32_t pcIndex = utext_getNativeIndex(text); 765 pc = utext_next32(text); 766 int32_t pcSize = utext_getNativeIndex(text) - pcIndex; 767 chars += pcSize; 768 remaining -= pcSize; 769 if (remaining <= 0) { 770 break; 771 } 772 uc = utext_current32(text); 773 if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { 774 // Maybe. See if it's in the dictionary. 775 // TODO: this looks iffy; compare with old code. 776 int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 777 utext_setNativeIndex(text, current + cuWordLength + chars); 778 if (candidates > 0) { 779 break; 780 } 781 } 782 } 783 784 // Bump the word count if there wasn't already one 785 if (cuWordLength <= 0) { 786 wordsFound += 1; 787 } 788 789 // Update the length with the passed-over characters 790 cuWordLength += chars; 791 } 792 else { 793 // Back up to where we were for next iteration 794 utext_setNativeIndex(text, current + cuWordLength); 795 } 796 } 797 798 // Never stop before a combining mark. 799 int32_t currPos; 800 while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { 801 utext_next32(text); 802 cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos; 803 } 804 805 // Look ahead for possible suffixes if a dictionary word does not follow. 
806 // We do this in code rather than using a rule so that the heuristic 807 // resynch continues to function. For example, one of the suffix characters 808 // could be a typo in the middle of a word. 809 // NOT CURRENTLY APPLICABLE TO BURMESE 810 811 // Did we find a word on this iteration? If so, push it on the break stack 812 if (cuWordLength > 0) { 813 foundBreaks.push((current+cuWordLength), status); 814 } 815 } 816 817 // Don't return a break for the end of the dictionary range if there is one there. 818 if (foundBreaks.peeki() >= rangeEnd) { 819 (void) foundBreaks.popi(); 820 wordsFound -= 1; 821 } 822 823 return wordsFound; 824} 825 826/* 827 ****************************************************************** 828 * KhmerBreakEngine 829 */ 830 831// How many words in a row are "good enough"? 832static const int32_t KHMER_LOOKAHEAD = 3; 833 834// Will not combine a non-word with a preceding dictionary word longer than this 835static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3; 836 837// Will not combine a non-word that shares at least this much prefix with a 838// dictionary word, with a preceding word 839static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3; 840 841// Minimum word size 842static const int32_t KHMER_MIN_WORD = 2; 843 844// Minimum number of characters for two words 845static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2; 846 847KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status) 848 : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)), 849 fDictionary(adoptDictionary) 850{ 851 fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); 852 if (U_SUCCESS(status)) { 853 setCharacters(fKhmerWordSet); 854 } 855 fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); 856 fMarkSet.add(0x0020); 857 fEndWordSet = fKhmerWordSet; 858 fBeginWordSet.add(0x1780, 0x17B3); 859 //fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels 
860 //fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word 861 //fEndWordSet.remove(0x17B2); // Khmer independent vowel that can't end a word 862 fEndWordSet.remove(0x17D2); // KHMER SIGN COENG that combines some following characters 863 //fEndWordSet.remove(0x17B6, 0x17C5); // Remove dependent vowels 864// fEndWordSet.remove(0x0E31); // MAI HAN-AKAT 865// fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 866// fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK 867// fBeginWordSet.add(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI 868// fSuffixSet.add(THAI_PAIYANNOI); 869// fSuffixSet.add(THAI_MAIYAMOK); 870 871 // Compact for caching. 872 fMarkSet.compact(); 873 fEndWordSet.compact(); 874 fBeginWordSet.compact(); 875// fSuffixSet.compact(); 876} 877 878KhmerBreakEngine::~KhmerBreakEngine() { 879 delete fDictionary; 880} 881 882int32_t 883KhmerBreakEngine::divideUpDictionaryRange( UText *text, 884 int32_t rangeStart, 885 int32_t rangeEnd, 886 UStack &foundBreaks ) const { 887 if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { 888 return 0; // Not enough characters for two words 889 } 890 891 uint32_t wordsFound = 0; 892 int32_t cpWordLength = 0; 893 int32_t cuWordLength = 0; 894 int32_t current; 895 UErrorCode status = U_ZERO_ERROR; 896 PossibleWord words[KHMER_LOOKAHEAD]; 897 898 utext_setNativeIndex(text, rangeStart); 899 900 while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { 901 cuWordLength = 0; 902 cpWordLength = 0; 903 904 // Look for candidate words at the current position 905 int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd); 906 907 // If we found exactly one, use that 908 if (candidates == 1) { 909 cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text); 910 cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength(); 911 wordsFound += 1; 912 } 913 914 // If there was more than 
        // If there was more than one candidate, see which one can take us
        // forward the most words.
        else if (candidates > 1) {
            // If we're already at the end of the range, we're done
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                goto foundBest;
            }
            do {
                int32_t wordsMatched = 1;
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
                    if (wordsMatched < 2) {
                        // Followed by another dictionary word; mark first word as a good candidate
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                        wordsMatched = 2;
                    }

                    // If we're already at the end of the range, we're done
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
                        goto foundBest;
                    }

                    // See if any of the possible second words is followed by a third word
                    do {
                        // If we find a third word, stop right away
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
                            goto foundBest;
                        }
                    }
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
                }
            }
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
foundBest:
            // Accept the best-marked candidate; the iterator is left at its end.
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
            wordsFound += 1;
        }

        // We come here after having either found a word or not. We look ahead to the
        // next word. If it's not a dictionary word, we will combine it with the word we
        // just found (if there is one), but only if the preceding word does not exceed
        // the threshold.
        // The text iterator should now be positioned at the end of the word we found.
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
            // If it is a dictionary word, do nothing. If it isn't, then if there is
            // no preceding word, or the non-word shares less than the minimum threshold
            // of characters with a dictionary word, then scan to resynchronize
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
                  && (cuWordLength == 0
                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
                // Look for a plausible word boundary
                int32_t remaining = rangeEnd - (current+cuWordLength);
                UChar32 pc;
                UChar32 uc;
                int32_t chars = 0;
                for (;;) {
                    // Advance one code point; pcSize is its size in native index units.
                    int32_t pcIndex = utext_getNativeIndex(text);
                    pc = utext_next32(text);
                    int32_t pcSize = utext_getNativeIndex(text) - pcIndex;
                    chars += pcSize;
                    remaining -= pcSize;
                    if (remaining <= 0) {
                        break;
                    }
                    uc = utext_current32(text);
                    // A plausible boundary: previous char may end a word and next may begin one.
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
                        // Maybe. See if it's in the dictionary.
                        int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
                        utext_setNativeIndex(text, current+cuWordLength+chars);
                        if (candidates > 0) {
                            break;
                        }
                    }
                }

                // Bump the word count if there wasn't already one
                if (cuWordLength <= 0) {
                    wordsFound += 1;
                }

                // Update the length with the passed-over characters
                cuWordLength += chars;
            }
            else {
                // Back up to where we were for next iteration
                utext_setNativeIndex(text, current+cuWordLength);
            }
        }

        // Never stop before a combining mark.
        int32_t currPos;
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
            utext_next32(text);
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
        }

        // Look ahead for possible suffixes if a dictionary word does not follow.
        // We do this in code rather than using a rule so that the heuristic
        // resynch continues to function. For example, one of the suffix characters
        // could be a typo in the middle of a word.
        // (Disabled: Khmer suffix handling, retained from the Thai engine, below.)
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
//                if (uc == KHMER_PAIYANNOI) {
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
//                        // Skip over previous end and PAIYANNOI
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add PAIYANNOI to word
//                        uc = utext_current32(text);     // Fetch next character
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//                if (uc == KHMER_MAIYAMOK) {
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
//                        // Skip over previous end and MAIYAMOK
//                        utext_next32(text);
//                        utext_next32(text);
//                        wordLength += 1;            // Add MAIYAMOK to word
//                    }
//                    else {
//                        // Restore prior position
//                        utext_next32(text);
//                    }
//                }
//            }
//            else {
//                utext_setNativeIndex(text, current+wordLength);
//            }
//        }

        // Did we find a word on this iteration? If so, push it on the break stack
        if (cuWordLength > 0) {
            foundBreaks.push((current+cuWordLength), status);
        }
    }

    // Don't return a break for the end of the dictionary range if there is one there.
1055 if (foundBreaks.peeki() >= rangeEnd) { 1056 (void) foundBreaks.popi(); 1057 wordsFound -= 1; 1058 } 1059 1060 return wordsFound; 1061} 1062 1063#if !UCONFIG_NO_NORMALIZATION 1064/* 1065 ****************************************************************** 1066 * CjkBreakEngine 1067 */ 1068static const uint32_t kuint32max = 0xFFFFFFFF; 1069CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status) 1070: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) { 1071 // Korean dictionary only includes Hangul syllables 1072 fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); 1073 fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); 1074 fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); 1075 fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); 1076 nfkcNorm2 = Normalizer2::getNFKCInstance(status); 1077 1078 if (U_SUCCESS(status)) { 1079 // handle Korean and Japanese/Chinese using different dictionaries 1080 if (type == kKorean) { 1081 setCharacters(fHangulWordSet); 1082 } else { //Chinese and Japanese 1083 UnicodeSet cjSet; 1084 cjSet.addAll(fHanWordSet); 1085 cjSet.addAll(fKatakanaWordSet); 1086 cjSet.addAll(fHiraganaWordSet); 1087 cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 1088 cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK 1089 setCharacters(cjSet); 1090 } 1091 } 1092} 1093 1094CjkBreakEngine::~CjkBreakEngine(){ 1095 delete fDictionary; 1096} 1097 1098// The katakanaCost values below are based on the length frequencies of all 1099// katakana phrases in the dictionary 1100static const int32_t kMaxKatakanaLength = 8; 1101static const int32_t kMaxKatakanaGroupLength = 20; 1102static const uint32_t maxSnlp = 255; 1103 1104static inline uint32_t getKatakanaCost(int32_t wordLength){ 1105 //TODO: fill array with actual values from dictionary! 
    // Index 0 is unused; lengths 1..8 get table costs, longer runs get 8192.
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
}

// True for Katakana code points (fullwidth excluding the middle dot, plus
// the halfwidth forms).
// NOTE(review): the parameter is uint16_t but callers pass a UChar32 from
// char32At(); supplementary code points get truncated here. They are outside
// the Katakana blocks anyway, but confirm the narrowing is intentional.
static inline bool isKatakana(uint16_t value) {
    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
            (value >= 0xFF66u && value <= 0xFF9fu);
}


// Function for accessing internal utext flags.
//   Replicates an internal UText function.

static inline int32_t utext_i32_flag(int32_t bitIndex) {
    return (int32_t)1 << bitIndex;
}


/*
 * Divide up a range of CJK dictionary characters using dynamic programming
 * over dictionary word costs (plus a katakana-run heuristic for Japanese).
 *
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks Output of C array of int32_t break positions, or 0
 * @return The number of breaks found
 */
int32_t
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
        int32_t rangeStart,
        int32_t rangeEnd,
        UStack &foundBreaks ) const {
    if (rangeStart >= rangeEnd) {
        return 0;
    }

    // UnicodeString version of input UText, NFKC normalized if necessary.
    UnicodeString *inString;

    // inputMap[inStringIndex] = corresponding native index from UText inText.
    // If NULL then mapping is 1:1
    UVector32 *inputMap = NULL;

    UErrorCode status = U_ZERO_ERROR;


    // if UText has the input string as one contiguous UTF-16 chunk
    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
         inText->chunkNativeStart <= rangeStart &&
         inText->chunkNativeLimit >= rangeEnd &&
         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {

        // Input UText is in one contiguous UTF-16 chunk.
        // Use Read-only aliasing UnicodeString constructor on it.
        inString = new UnicodeString(FALSE,
                                     inText->chunkContents + rangeStart - inText->chunkNativeStart,
                                     rangeEnd - rangeStart);
    } else {
        // Copy the text from the original inText (UText) to inString (UnicodeString).
        // Create a map from UnicodeString indices -> UText offsets.
        utext_setNativeIndex(inText, rangeStart);
        int32_t limit = rangeEnd;
        U_ASSERT(limit <= utext_nativeLength(inText));
        if (limit > utext_nativeLength(inText)) {
            limit = utext_nativeLength(inText);
        }
        inString = new UnicodeString;
        inputMap = new UVector32(status);
        while (utext_getNativeIndex(inText) < limit) {
            int32_t nativePosition = utext_getNativeIndex(inText);
            UChar32 c = utext_next32(inText);
            U_ASSERT(c != U_SENTINEL);
            inString->append(c);
            // A supplementary char appends two UTF-16 units; map both of them
            // back to the same native start position.
            while (inputMap->size() < inString->length()) {
                inputMap->addElement(nativePosition, status);
            }
        }
        inputMap->addElement(limit, status);
    }


    if (!nfkcNorm2->isNormalized(*inString, status)) {
        UnicodeString *normalizedInput = new UnicodeString();
        // normalizedMap[normalizedInput position] == original UText position.
1189 UVector32 *normalizedMap = new UVector32(status); 1190 if (U_FAILURE(status)) { 1191 return 0; 1192 } 1193 1194 UnicodeString fragment; 1195 UnicodeString normalizedFragment; 1196 for (int32_t srcI = 0; srcI < inString->length();) { // Once per normalization chunk 1197 fragment.remove(); 1198 int32_t fragmentStartI = srcI; 1199 UChar32 c = inString->char32At(srcI); 1200 for (;;) { 1201 fragment.append(c); 1202 srcI = inString->moveIndex32(srcI, 1); 1203 if (srcI == inString->length()) { 1204 break; 1205 } 1206 c = inString->char32At(srcI); 1207 if (nfkcNorm2->hasBoundaryBefore(c)) { 1208 break; 1209 } 1210 } 1211 nfkcNorm2->normalize(fragment, normalizedFragment, status); 1212 normalizedInput->append(normalizedFragment); 1213 1214 // Map every position in the normalized chunk to the start of the chunk 1215 // in the original input. 1216 int32_t fragmentOriginalStart = inputMap? inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart; 1217 while (normalizedMap->size() < normalizedInput->length()) { 1218 normalizedMap->addElement(fragmentOriginalStart, status); 1219 if (U_FAILURE(status)) { 1220 break; 1221 } 1222 } 1223 } 1224 U_ASSERT(normalizedMap->size() == normalizedInput->length()); 1225 int32_t nativeEnd = inputMap? inputMap->elementAti(inString->length()) : inString->length()+rangeStart; 1226 normalizedMap->addElement(nativeEnd, status); 1227 1228 delete inputMap; 1229 inputMap = normalizedMap; 1230 delete inString; 1231 inString = normalizedInput; 1232 } 1233 1234 int32_t numCodePts = inString->countChar32(); 1235 if (numCodePts != inString->length()) { 1236 // There are supplementary characters in the input. 1237 // The dictionary will produce boundary positions in terms of code point indexes, 1238 // not in terms of code unit string indexes. 1239 // Use the inputMap mechanism to take care of this in addition to indexing differences 1240 // from normalization and/or UTF-8 input. 
1241 UBool hadExistingMap = (inputMap != NULL); 1242 if (!hadExistingMap) { 1243 inputMap = new UVector32(status); 1244 } 1245 int32_t cpIdx = 0; 1246 for (int32_t cuIdx = 0; ; cuIdx = inString->moveIndex32(cuIdx, 1)) { 1247 U_ASSERT(cuIdx >= cpIdx); 1248 if (hadExistingMap) { 1249 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx); 1250 } else { 1251 inputMap->addElement(cuIdx+rangeStart, status); 1252 } 1253 cpIdx++; 1254 if (cuIdx == inString->length()) { 1255 break; 1256 } 1257 } 1258 } 1259 1260 // bestSnlp[i] is the snlp of the best segmentation of the first i 1261 // code points in the range to be matched. 1262 UVector32 bestSnlp(numCodePts + 1, status); 1263 bestSnlp.addElement(0, status); 1264 for(int32_t i = 1; i <= numCodePts; i++) { 1265 bestSnlp.addElement(kuint32max, status); 1266 } 1267 1268 1269 // prev[i] is the index of the last CJK code point in the previous word in 1270 // the best segmentation of the first i characters. 1271 UVector32 prev(numCodePts + 1, status); 1272 for(int32_t i = 0; i <= numCodePts; i++){ 1273 prev.addElement(-1, status); 1274 } 1275 1276 const int32_t maxWordSize = 20; 1277 UVector32 values(numCodePts, status); 1278 values.setSize(numCodePts); 1279 UVector32 lengths(numCodePts, status); 1280 lengths.setSize(numCodePts); 1281 1282 UText fu = UTEXT_INITIALIZER; 1283 utext_openUnicodeString(&fu, inString, &status); 1284 1285 // Dynamic programming to find the best segmentation. 1286 1287 // In outer loop, i is the code point index, 1288 // ix is the corresponding string (code unit) index. 1289 // They differ when the string contains supplementary characters. 
1290 int32_t ix = 0; 1291 for (int32_t i = 0; i < numCodePts; ++i, ix = inString->moveIndex32(ix, 1)) { 1292 if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) { 1293 continue; 1294 } 1295 1296 int32_t count; 1297 utext_setNativeIndex(&fu, ix); 1298 count = fDictionary->matches(&fu, maxWordSize, numCodePts, 1299 NULL, lengths.getBuffer(), values.getBuffer(), NULL); 1300 // Note: lengths is filled with code point lengths 1301 // The NULL parameter is the ignored code unit lengths. 1302 1303 // if there are no single character matches found in the dictionary 1304 // starting with this charcter, treat character as a 1-character word 1305 // with the highest value possible, i.e. the least likely to occur. 1306 // Exclude Korean characters from this treatment, as they should be left 1307 // together by default. 1308 if ((count == 0 || lengths.elementAti(0) != 1) && 1309 !fHangulWordSet.contains(inString->char32At(ix))) { 1310 values.setElementAt(maxSnlp, count); // 255 1311 lengths.setElementAt(1, count++); 1312 } 1313 1314 for (int32_t j = 0; j < count; j++) { 1315 uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j); 1316 int32_t ln_j_i = lengths.elementAti(j) + i; 1317 if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) { 1318 bestSnlp.setElementAt(newSnlp, ln_j_i); 1319 prev.setElementAt(i, ln_j_i); 1320 } 1321 } 1322 1323 // In Japanese, 1324 // Katakana word in single character is pretty rare. So we apply 1325 // the following heuristic to Katakana: any continuous run of Katakana 1326 // characters is considered a candidate word with a default cost 1327 // specified in the katakanaCost table according to its length. 
1328 1329 bool is_prev_katakana = false; 1330 bool is_katakana = isKatakana(inString->char32At(ix)); 1331 int32_t katakanaRunLength = 1; 1332 if (!is_prev_katakana && is_katakana) { 1333 int32_t j = inString->moveIndex32(ix, 1); 1334 // Find the end of the continuous run of Katakana characters 1335 while (j < inString->length() && katakanaRunLength < kMaxKatakanaGroupLength && 1336 isKatakana(inString->char32At(j))) { 1337 j = inString->moveIndex32(j, 1); 1338 katakanaRunLength++; 1339 } 1340 if (katakanaRunLength < kMaxKatakanaGroupLength) { 1341 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength); 1342 if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) { 1343 bestSnlp.setElementAt(newSnlp, j); 1344 prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i; 1345 } 1346 } 1347 } 1348 is_prev_katakana = is_katakana; 1349 } 1350 utext_close(&fu); 1351 1352 // Start pushing the optimal offset index into t_boundary (t for tentative). 1353 // prev[numCodePts] is guaranteed to be meaningful. 1354 // We'll first push in the reverse order, i.e., 1355 // t_boundary[0] = numCodePts, and afterwards do a swap. 1356 UVector32 t_boundary(numCodePts+1, status); 1357 1358 int32_t numBreaks = 0; 1359 // No segmentation found, set boundary to end of range 1360 if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { 1361 t_boundary.addElement(numCodePts, status); 1362 numBreaks++; 1363 } else { 1364 for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) { 1365 t_boundary.addElement(i, status); 1366 numBreaks++; 1367 } 1368 U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0); 1369 } 1370 1371 // Add a break for the start of the dictionary range if there is not one 1372 // there already. 
1373 if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { 1374 t_boundary.addElement(0, status); 1375 numBreaks++; 1376 } 1377 1378 // Now that we're done, convert positions in t_boundary[] (indices in 1379 // the normalized input string) back to indices in the original input UText 1380 // while reversing t_boundary and pushing values to foundBreaks. 1381 for (int32_t i = numBreaks-1; i >= 0; i--) { 1382 int32_t cpPos = t_boundary.elementAti(i); 1383 int32_t utextPos = inputMap ? inputMap->elementAti(cpPos) : cpPos + rangeStart; 1384 // Boundaries are added to foundBreaks output in ascending order. 1385 U_ASSERT(foundBreaks.size() == 0 ||foundBreaks.peeki() < utextPos); 1386 foundBreaks.push(utextPos, status); 1387 } 1388 1389 delete inString; 1390 delete inputMap; 1391 return numBreaks; 1392} 1393#endif 1394 1395U_NAMESPACE_END 1396 1397#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1398 1399