1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4*************************************************************************** 5* Copyright (C) 1999-2016 International Business Machines Corporation 6* and others. All rights reserved. 7*************************************************************************** 8*/ 9// 10// file: rbbi.cpp Contains the implementation of the rule based break iterator 11// runtime engine and the API implementation for 12// class RuleBasedBreakIterator 13// 14 15#include "utypeinfo.h" // for 'typeid' to work 16 17#include "unicode/utypes.h" 18 19#if !UCONFIG_NO_BREAK_ITERATION 20 21#include "unicode/rbbi.h" 22#include "unicode/schriter.h" 23#include "unicode/uchriter.h" 24#include "unicode/uclean.h" 25#include "unicode/udata.h" 26 27#include "brkeng.h" 28#include "ucln_cmn.h" 29#include "cmemory.h" 30#include "cstring.h" 31#include "rbbidata.h" 32#include "rbbi_cache.h" 33#include "rbbirb.h" 34#include "uassert.h" 35#include "umutex.h" 36#include "uvectr32.h" 37 38// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 39#if U_LOCAL_SERVICE_HOOK 40#include "localsvc.h" 41#endif 42 43#ifdef RBBI_DEBUG 44static UBool gTrace = FALSE; 45#endif 46 47U_NAMESPACE_BEGIN 48 49// The state number of the starting state 50constexpr int32_t START_STATE = 1; 51 52// The state-transition value indicating "stop" 53constexpr int32_t STOP_STATE = 0; 54 55 56UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 57 58 59//======================================================================= 60// constructors 61//======================================================================= 62 63/** 64 * Constructs a RuleBasedBreakIterator that uses the already-created 65 * tables object that is passed in as a parameter. 66 */ 67RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) { 68 init(status); 69 fData = new RBBIDataWrapper(data, status); // status checked in constructor 70 if (U_FAILURE(status)) {return;} 71 if(fData == 0) { 72 status = U_MEMORY_ALLOCATION_ERROR; 73 return; 74 } 75} 76 77// 78// Construct from precompiled binary rules (tables). This constructor is public API, 79// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). 80// 81RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, 82 uint32_t ruleLength, 83 UErrorCode &status) { 84 init(status); 85 if (U_FAILURE(status)) { 86 return; 87 } 88 if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { 89 status = U_ILLEGAL_ARGUMENT_ERROR; 90 return; 91 } 92 const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; 93 if (data->fLength > ruleLength) { 94 status = U_ILLEGAL_ARGUMENT_ERROR; 95 return; 96 } 97 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); 98 if (U_FAILURE(status)) {return;} 99 if(fData == 0) { 100 status = U_MEMORY_ALLOCATION_ERROR; 101 return; 102 } 103} 104 105 106//------------------------------------------------------------------------------- 107// 108// Constructor from a UDataMemory handle to precompiled break rules 109// stored in an ICU data file. 110// 111//------------------------------------------------------------------------------- 112RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 113{ 114 init(status); 115 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 116 if (U_FAILURE(status)) {return;} 117 if(fData == 0) { 118 status = U_MEMORY_ALLOCATION_ERROR; 119 return; 120 } 121} 122 123 124 125//------------------------------------------------------------------------------- 126// 127// Constructor from a set of rules supplied as a string. 128// 129//------------------------------------------------------------------------------- 130RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 131 UParseError &parseError, 132 UErrorCode &status) 133{ 134 init(status); 135 if (U_FAILURE(status)) {return;} 136 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 137 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 138 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 139 // creates and returns a complete RBBI. From here, in a constructor, we 140 // can't just return the object created by the builder factory, hence 141 // the assignment of the factory created object to "this". 142 if (U_SUCCESS(status)) { 143 *this = *bi; 144 delete bi; 145 } 146} 147 148 149//------------------------------------------------------------------------------- 150// 151// Default Constructor. Create an empty shell that can be set up later. 152// Used when creating a RuleBasedBreakIterator from a set 153// of rules. 154//------------------------------------------------------------------------------- 155RuleBasedBreakIterator::RuleBasedBreakIterator() { 156 UErrorCode status = U_ZERO_ERROR; 157 init(status); 158} 159 160 161//------------------------------------------------------------------------------- 162// 163// Copy constructor. Will produce a break iterator with the same behavior, 164// and which iterates over the same text, as the one passed in. 165// 166//------------------------------------------------------------------------------- 167RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 168: BreakIterator(other) 169{ 170 UErrorCode status = U_ZERO_ERROR; 171 this->init(status); 172 *this = other; 173} 174 175 176/** 177 * Destructor 178 */ 179RuleBasedBreakIterator::~RuleBasedBreakIterator() { 180 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 181 // fCharIter was adopted from the outside. 182 delete fCharIter; 183 } 184 fCharIter = NULL; 185 delete fSCharIter; 186 fSCharIter = NULL; 187 delete fDCharIter; 188 fDCharIter = NULL; 189 190 utext_close(fText); 191 192 if (fData != NULL) { 193 fData->removeReference(); 194 fData = NULL; 195 } 196 delete fBreakCache; 197 fBreakCache = NULL; 198 199 delete fDictionaryCache; 200 fDictionaryCache = NULL; 201 202 delete fLanguageBreakEngines; 203 fLanguageBreakEngines = NULL; 204 205 delete fUnhandledBreakEngine; 206 fUnhandledBreakEngine = NULL; 207} 208 209/** 210 * Assignment operator. Sets this iterator to have the same behavior, 211 * and iterate over the same text, as the one passed in. 212 */ 213RuleBasedBreakIterator& 214RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 215 if (this == &that) { 216 return *this; 217 } 218 BreakIterator::operator=(that); 219 220 fBreakType = that.fBreakType; 221 if (fLanguageBreakEngines != NULL) { 222 delete fLanguageBreakEngines; 223 fLanguageBreakEngines = NULL; // Just rebuild for now 224 } 225 // TODO: clone fLanguageBreakEngines from "that" 226 UErrorCode status = U_ZERO_ERROR; 227 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 228 229 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 230 delete fCharIter; 231 } 232 fCharIter = NULL; 233 234 if (that.fCharIter != NULL ) { 235 // This is a little bit tricky - it will intially appear that 236 // this->fCharIter is adopted, even if that->fCharIter was 237 // not adopted. That's ok. 238 fCharIter = that.fCharIter->clone(); 239 } 240 241 if (fData != NULL) { 242 fData->removeReference(); 243 fData = NULL; 244 } 245 if (that.fData != NULL) { 246 fData = that.fData->addReference(); 247 } 248 249 fPosition = that.fPosition; 250 fRuleStatusIndex = that.fRuleStatusIndex; 251 fDone = that.fDone; 252 253 // TODO: both the dictionary and the main cache need to be copied. 254 // Current position could be within a dictionary range. Trying to continue 255 // the iteration without the caches present would go to the rules, with 256 // the assumption that the current position is on a rule boundary. 257 fBreakCache->reset(fPosition, fRuleStatusIndex); 258 fDictionaryCache->reset(); 259 260 return *this; 261} 262 263 264 265//----------------------------------------------------------------------------- 266// 267// init() Shared initialization routine. Used by all the constructors. 268// Initializes all fields, leaving the object in a consistent state. 269// 270//----------------------------------------------------------------------------- 271void RuleBasedBreakIterator::init(UErrorCode &status) { 272 fText = NULL; 273 fCharIter = NULL; 274 fSCharIter = NULL; 275 fDCharIter = NULL; 276 fData = NULL; 277 fPosition = 0; 278 fRuleStatusIndex = 0; 279 fDone = false; 280 fDictionaryCharCount = 0; 281 fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable 282 // dictionary behavior for Break Iterators that are 283 // built from rules. Even better would be the ability to 284 // declare the type in the rules. 285 286 fLanguageBreakEngines = NULL; 287 fUnhandledBreakEngine = NULL; 288 fBreakCache = NULL; 289 fDictionaryCache = NULL; 290 291 if (U_FAILURE(status)) { 292 return; 293 } 294 295 fText = utext_openUChars(NULL, NULL, 0, &status); 296 fDictionaryCache = new DictionaryCache(this, status); 297 fBreakCache = new BreakCache(this, status); 298 if (U_SUCCESS(status) && (fText == NULL || fDictionaryCache == NULL || fBreakCache == NULL)) { 299 status = U_MEMORY_ALLOCATION_ERROR; 300 } 301 302#ifdef RBBI_DEBUG 303 static UBool debugInitDone = FALSE; 304 if (debugInitDone == FALSE) { 305 char *debugEnv = getenv("U_RBBIDEBUG"); 306 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 307 gTrace = TRUE; 308 } 309 debugInitDone = TRUE; 310 } 311#endif 312} 313 314 315 316//----------------------------------------------------------------------------- 317// 318// clone - Returns a newly-constructed RuleBasedBreakIterator with the same 319// behavior, and iterating over the same text, as this one. 320// Virtual function: does the right thing with subclasses. 321// 322//----------------------------------------------------------------------------- 323BreakIterator* 324RuleBasedBreakIterator::clone(void) const { 325 return new RuleBasedBreakIterator(*this); 326} 327 328/** 329 * Equality operator. Returns TRUE if both BreakIterators are of the 330 * same class, have the same behavior, and iterate over the same text. 331 */ 332UBool 333RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 334 if (typeid(*this) != typeid(that)) { 335 return FALSE; 336 } 337 if (this == &that) { 338 return TRUE; 339 } 340 341 // The base class BreakIterator carries no state that participates in equality, 342 // and does not implement an equality function that would otherwise be 343 // checked at this point. 344 345 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 346 347 if (!utext_equals(fText, that2.fText)) { 348 // The two break iterators are operating on different text, 349 // or have a different iteration position. 350 // Note that fText's position is always the same as the break iterator's position. 351 return FALSE; 352 }; 353 354 if (!(fPosition == that2.fPosition && 355 fRuleStatusIndex == that2.fRuleStatusIndex && 356 fDone == that2.fDone)) { 357 return FALSE; 358 } 359 360 if (that2.fData == fData || 361 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 362 // The two break iterators are using the same rules. 363 return TRUE; 364 } 365 return FALSE; 366} 367 368/** 369 * Compute a hash code for this BreakIterator 370 * @return A hash code 371 */ 372int32_t 373RuleBasedBreakIterator::hashCode(void) const { 374 int32_t hash = 0; 375 if (fData != NULL) { 376 hash = fData->hashCode(); 377 } 378 return hash; 379} 380 381 382void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 383 if (U_FAILURE(status)) { 384 return; 385 } 386 fBreakCache->reset(); 387 fDictionaryCache->reset(); 388 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 389 390 // Set up a dummy CharacterIterator to be returned if anyone 391 // calls getText(). With input from UText, there is no reasonable 392 // way to return a characterIterator over the actual input text. 393 // Return one over an empty string instead - this is the closest 394 // we can come to signaling a failure. 395 // (GetText() is obsolete, this failure is sort of OK) 396 if (fDCharIter == NULL) { 397 static const UChar c = 0; 398 fDCharIter = new UCharCharacterIterator(&c, 0); 399 if (fDCharIter == NULL) { 400 status = U_MEMORY_ALLOCATION_ERROR; 401 return; 402 } 403 } 404 405 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 406 // existing fCharIter was adopted from the outside. Delete it now. 407 delete fCharIter; 408 } 409 fCharIter = fDCharIter; 410 411 this->first(); 412} 413 414 415UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 416 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 417 return result; 418} 419 420 421//======================================================================= 422// BreakIterator overrides 423//======================================================================= 424 425/** 426 * Return a CharacterIterator over the text being analyzed. 427 */ 428CharacterIterator& 429RuleBasedBreakIterator::getText() const { 430 return *fCharIter; 431} 432 433/** 434 * Set the iterator to analyze a new piece of text. This function resets 435 * the current iteration position to the beginning of the text. 436 * @param newText An iterator over the text to analyze. 437 */ 438void 439RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 440 // If we are holding a CharacterIterator adopted from a 441 // previous call to this function, delete it now. 442 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 443 delete fCharIter; 444 } 445 446 fCharIter = newText; 447 UErrorCode status = U_ZERO_ERROR; 448 fBreakCache->reset(); 449 fDictionaryCache->reset(); 450 if (newText==NULL || newText->startIndex() != 0) { 451 // startIndex !=0 wants to be an error, but there's no way to report it. 452 // Make the iterator text be an empty string. 453 fText = utext_openUChars(fText, NULL, 0, &status); 454 } else { 455 fText = utext_openCharacterIterator(fText, newText, &status); 456 } 457 this->first(); 458} 459 460/** 461 * Set the iterator to analyze a new piece of text. This function resets 462 * the current iteration position to the beginning of the text. 463 * @param newText An iterator over the text to analyze. 464 */ 465void 466RuleBasedBreakIterator::setText(const UnicodeString& newText) { 467 UErrorCode status = U_ZERO_ERROR; 468 fBreakCache->reset(); 469 fDictionaryCache->reset(); 470 fText = utext_openConstUnicodeString(fText, &newText, &status); 471 472 // Set up a character iterator on the string. 473 // Needed in case someone calls getText(). 474 // Can not, unfortunately, do this lazily on the (probably never) 475 // call to getText(), because getText is const. 476 if (fSCharIter == NULL) { 477 fSCharIter = new StringCharacterIterator(newText); 478 } else { 479 fSCharIter->setText(newText); 480 } 481 482 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 483 // old fCharIter was adopted from the outside. Delete it. 484 delete fCharIter; 485 } 486 fCharIter = fSCharIter; 487 488 this->first(); 489} 490 491 492/** 493 * Provide a new UText for the input text. Must reference text with contents identical 494 * to the original. 495 * Intended for use with text data originating in Java (garbage collected) environments 496 * where the data may be moved in memory at arbitrary times. 497 */ 498RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { 499 if (U_FAILURE(status)) { 500 return *this; 501 } 502 if (input == NULL) { 503 status = U_ILLEGAL_ARGUMENT_ERROR; 504 return *this; 505 } 506 int64_t pos = utext_getNativeIndex(fText); 507 // Shallow read-only clone of the new UText into the existing input UText 508 fText = utext_clone(fText, input, FALSE, TRUE, &status); 509 if (U_FAILURE(status)) { 510 return *this; 511 } 512 utext_setNativeIndex(fText, pos); 513 if (utext_getNativeIndex(fText) != pos) { 514 // Sanity check. The new input utext is supposed to have the exact same 515 // contents as the old. If we can't set to the same position, it doesn't. 516 // The contents underlying the old utext might be invalid at this point, 517 // so it's not safe to check directly. 518 status = U_ILLEGAL_ARGUMENT_ERROR; 519 } 520 return *this; 521} 522 523 524/** 525 * Sets the current iteration position to the beginning of the text, position zero. 526 * @return The new iterator position, which is zero. 527 */ 528int32_t RuleBasedBreakIterator::first(void) { 529 UErrorCode status = U_ZERO_ERROR; 530 if (!fBreakCache->seek(0)) { 531 fBreakCache->populateNear(0, status); 532 } 533 fBreakCache->current(); 534 U_ASSERT(fPosition == 0); 535 return 0; 536} 537 538/** 539 * Sets the current iteration position to the end of the text. 540 * @return The text's past-the-end offset. 541 */ 542int32_t RuleBasedBreakIterator::last(void) { 543 int32_t endPos = (int32_t)utext_nativeLength(fText); 544 UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 545 (void)endShouldBeBoundary; 546 U_ASSERT(endShouldBeBoundary); 547 U_ASSERT(fPosition == endPos); 548 return endPos; 549} 550 551/** 552 * Advances the iterator either forward or backward the specified number of steps. 553 * Negative values move backward, and positive values move forward. This is 554 * equivalent to repeatedly calling next() or previous(). 555 * @param n The number of steps to move. The sign indicates the direction 556 * (negative is backwards, and positive is forwards). 557 * @return The character offset of the boundary position n boundaries away from 558 * the current one. 559 */ 560int32_t RuleBasedBreakIterator::next(int32_t n) { 561 int32_t result = 0; 562 if (n > 0) { 563 for (; n > 0 && result != UBRK_DONE; --n) { 564 result = next(); 565 } 566 } else if (n < 0) { 567 for (; n < 0 && result != UBRK_DONE; ++n) { 568 result = previous(); 569 } 570 } else { 571 result = current(); 572 } 573 return result; 574} 575 576/** 577 * Advances the iterator to the next boundary position. 578 * @return The position of the first boundary after this one. 579 */ 580int32_t RuleBasedBreakIterator::next(void) { 581 fBreakCache->next(); 582 return fDone ? UBRK_DONE : fPosition; 583} 584 585/** 586 * Move the iterator backwards, to the boundary preceding the current one. 587 * 588 * Starts from the current position within fText. 589 * Starting position need not be on a boundary. 590 * 591 * @return The position of the boundary position immediately preceding the starting position. 592 */ 593int32_t RuleBasedBreakIterator::previous(void) { 594 UErrorCode status = U_ZERO_ERROR; 595 fBreakCache->previous(status); 596 return fDone ? UBRK_DONE : fPosition; 597} 598 599/** 600 * Sets the iterator to refer to the first boundary position following 601 * the specified position. 602 * @param startPos The position from which to begin searching for a break position. 603 * @return The position of the first break after the current position. 604 */ 605int32_t RuleBasedBreakIterator::following(int32_t startPos) { 606 // if the supplied position is before the beginning, return the 607 // text's starting offset 608 if (startPos < 0) { 609 return first(); 610 } 611 612 // Move requested offset to a code point start. It might be on a trail surrogate, 613 // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text. 614 utext_setNativeIndex(fText, startPos); 615 startPos = (int32_t)utext_getNativeIndex(fText); 616 617 UErrorCode status = U_ZERO_ERROR; 618 fBreakCache->following(startPos, status); 619 return fDone ? UBRK_DONE : fPosition; 620} 621 622/** 623 * Sets the iterator to refer to the last boundary position before the 624 * specified position. 625 * @param offset The position to begin searching for a break from. 626 * @return The position of the last boundary before the starting position. 627 */ 628int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 629 if (fText == NULL || offset > utext_nativeLength(fText)) { 630 return last(); 631 } 632 633 // Move requested offset to a code point start. It might be on a trail surrogate, 634 // or on a trail byte if the input is UTF-8. 635 636 utext_setNativeIndex(fText, offset); 637 int32_t adjustedOffset = utext_getNativeIndex(fText); 638 639 UErrorCode status = U_ZERO_ERROR; 640 fBreakCache->preceding(adjustedOffset, status); 641 return fDone ? UBRK_DONE : fPosition; 642} 643 644/** 645 * Returns true if the specfied position is a boundary position. As a side 646 * effect, leaves the iterator pointing to the first boundary position at 647 * or after "offset". 648 * 649 * @param offset the offset to check. 650 * @return True if "offset" is a boundary position. 651 */ 652UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 653 // out-of-range indexes are never boundary positions 654 if (offset < 0) { 655 first(); // For side effects on current position, tag values. 656 return FALSE; 657 } 658 659 // Adjust offset to be on a code point boundary and not beyond the end of the text. 660 // Note that isBoundary() is always be false for offsets that are not on code point boundaries. 661 // But we still need the side effect of leaving iteration at the following boundary. 662 663 utext_setNativeIndex(fText, offset); 664 int32_t adjustedOffset = utext_getNativeIndex(fText); 665 666 bool result = false; 667 UErrorCode status = U_ZERO_ERROR; 668 if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) { 669 result = (fBreakCache->current() == offset); 670 } 671 672 if (result && adjustedOffset < offset && utext_char32At(fText, offset) == U_SENTINEL) { 673 // Original offset is beyond the end of the text. Return FALSE, it's not a boundary, 674 // but the iteration position remains set to the end of the text, which is a boundary. 675 return FALSE; 676 } 677 if (!result) { 678 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 679 // Cache->seek(), above, left us on the preceding boundary, so advance one. 680 next(); 681 } 682 return result; 683} 684 685 686/** 687 * Returns the current iteration position. 688 * @return The current iteration position. 689 */ 690int32_t RuleBasedBreakIterator::current(void) const { 691 return fPosition; 692} 693 694 695//======================================================================= 696// implementation 697//======================================================================= 698 699// 700// RBBIRunMode - the state machine runs an extra iteration at the beginning and end 701// of user text. A variable with this enum type keeps track of where we 702// are. The state machine only fetches user input while in the RUN mode. 703// 704enum RBBIRunMode { 705 RBBI_START, // state machine processing is before first char of input 706 RBBI_RUN, // state machine processing is in the user text 707 RBBI_END // state machine processing is after end of user text. 708}; 709 710 711// Map from look-ahead break states (corresponds to rules) to boundary positions. 712// Allows multiple lookahead break rules to be in flight at the same time. 713// 714// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers 715// in the state table be sequential, then we can just index an array. And the 716// table could also tell us in advance how big that array needs to be. 717// 718// Before ICU 57 there was just a single simple variable for a look-ahead match that 719// was in progress. Two rules at once did not work. 720 721static const int32_t kMaxLookaheads = 8; 722struct LookAheadResults { 723 int32_t fUsedSlotLimit; 724 int32_t fPositions[8]; 725 int16_t fKeys[8]; 726 727 LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}; 728 729 int32_t getPosition(int16_t key) { 730 for (int32_t i=0; i<fUsedSlotLimit; ++i) { 731 if (fKeys[i] == key) { 732 return fPositions[i]; 733 } 734 } 735 U_ASSERT(FALSE); 736 return -1; 737 } 738 739 void setPosition(int16_t key, int32_t position) { 740 int32_t i; 741 for (i=0; i<fUsedSlotLimit; ++i) { 742 if (fKeys[i] == key) { 743 fPositions[i] = position; 744 return; 745 } 746 } 747 if (i >= kMaxLookaheads) { 748 U_ASSERT(FALSE); 749 i = kMaxLookaheads - 1; 750 } 751 fKeys[i] = key; 752 fPositions[i] = position; 753 U_ASSERT(fUsedSlotLimit == i); 754 fUsedSlotLimit = i + 1; 755 } 756}; 757 758 759//----------------------------------------------------------------------------------- 760// 761// handleNext() 762// Run the state machine to find a boundary 763// 764//----------------------------------------------------------------------------------- 765int32_t RuleBasedBreakIterator::handleNext() { 766 int32_t state; 767 uint16_t category = 0; 768 RBBIRunMode mode; 769 770 RBBIStateTableRow *row; 771 UChar32 c; 772 LookAheadResults lookAheadMatches; 773 int32_t result = 0; 774 int32_t initialPosition = 0; 775 const RBBIStateTable *statetable = fData->fForwardTable; 776 const char *tableData = statetable->fTableData; 777 uint32_t tableRowLen = statetable->fRowLen; 778 #ifdef RBBI_DEBUG 779 if (gTrace) { 780 RBBIDebugPuts("Handle Next pos char state category"); 781 } 782 #endif 783 784 // handleNext alway sets the break tag value. 785 // Set the default for it. 786 fRuleStatusIndex = 0; 787 788 fDictionaryCharCount = 0; 789 790 // if we're already at the end of the text, return DONE. 791 initialPosition = fPosition; 792 UTEXT_SETNATIVEINDEX(fText, initialPosition); 793 result = initialPosition; 794 c = UTEXT_NEXT32(fText); 795 if (c==U_SENTINEL) { 796 fDone = TRUE; 797 return UBRK_DONE; 798 } 799 800 // Set the initial state for the state machine 801 state = START_STATE; 802 row = (RBBIStateTableRow *) 803 //(statetable->fTableData + (statetable->fRowLen * state)); 804 (tableData + tableRowLen * state); 805 806 807 mode = RBBI_RUN; 808 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 809 category = 2; 810 mode = RBBI_START; 811 } 812 813 814 // loop until we reach the end of the text or transition to state 0 815 // 816 for (;;) { 817 if (c == U_SENTINEL) { 818 // Reached end of input string. 819 if (mode == RBBI_END) { 820 // We have already run the loop one last time with the 821 // character set to the psueudo {eof} value. Now it is time 822 // to unconditionally bail out. 823 break; 824 } 825 // Run the loop one last time with the fake end-of-input character category. 826 mode = RBBI_END; 827 category = 1; 828 } 829 830 // 831 // Get the char category. An incoming category of 1 or 2 means that 832 // we are preset for doing the beginning or end of input, and 833 // that we shouldn't get a category from an actual text input character. 834 // 835 if (mode == RBBI_RUN) { 836 // look up the current character's character category, which tells us 837 // which column in the state table to look at. 838 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 839 // not the size of the character going in, which is a UChar32. 840 // 841 category = UTRIE2_GET16(fData->fTrie, c); 842 843 // Check the dictionary bit in the character's category. 844 // Counter is only used by dictionary based iteration. 845 // Chars that need to be handled by a dictionary have a flag bit set 846 // in their category values. 847 // 848 if ((category & 0x4000) != 0) { 849 fDictionaryCharCount++; 850 // And off the dictionary flag bit. 851 category &= ~0x4000; 852 } 853 } 854 855 #ifdef RBBI_DEBUG 856 if (gTrace) { 857 RBBIDebugPrintf(" %4ld ", utext_getNativeIndex(fText)); 858 if (0x20<=c && c<0x7f) { 859 RBBIDebugPrintf("\"%c\" ", c); 860 } else { 861 RBBIDebugPrintf("%5x ", c); 862 } 863 RBBIDebugPrintf("%3d %3d\n", state, category); 864 } 865 #endif 866 867 // State Transition - move machine to its next state 868 // 869 870 // Note: fNextState is defined as uint16_t[2], but we are casting 871 // a generated RBBI table to RBBIStateTableRow and some tables 872 // actually have more than 2 categories. 873 U_ASSERT(category<fData->fHeader->fCatCount); 874 state = row->fNextState[category]; /*Not accessing beyond memory*/ 875 row = (RBBIStateTableRow *) 876 // (statetable->fTableData + (statetable->fRowLen * state)); 877 (tableData + tableRowLen * state); 878 879 880 if (row->fAccepting == -1) { 881 // Match found, common case. 882 if (mode != RBBI_START) { 883 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 884 } 885 fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 886 } 887 888 int16_t completedRule = row->fAccepting; 889 if (completedRule > 0) { 890 // Lookahead match is completed. 891 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); 892 if (lookaheadResult >= 0) { 893 fRuleStatusIndex = row->fTagIdx; 894 fPosition = lookaheadResult; 895 return lookaheadResult; 896 } 897 } 898 int16_t rule = row->fLookAhead; 899 if (rule != 0) { 900 // At the position of a '/' in a look-ahead match. Record it. 901 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 902 lookAheadMatches.setPosition(rule, pos); 903 } 904 905 if (state == STOP_STATE) { 906 // This is the normal exit from the lookup state machine. 907 // We have advanced through the string until it is certain that no 908 // longer match is possible, no matter what characters follow. 909 break; 910 } 911 912 // Advance to the next character. 913 // If this is a beginning-of-input loop iteration, don't advance 914 // the input position. The next iteration will be processing the 915 // first real input character. 916 if (mode == RBBI_RUN) { 917 c = UTEXT_NEXT32(fText); 918 } else { 919 if (mode == RBBI_START) { 920 mode = RBBI_RUN; 921 } 922 } 923 } 924 925 // The state machine is done. Check whether it found a match... 926 927 // If the iterator failed to advance in the match engine, force it ahead by one. 928 // (This really indicates a defect in the break rules. They should always match 929 // at least one character.) 930 if (result == initialPosition) { 931 utext_setNativeIndex(fText, initialPosition); 932 utext_next32(fText); 933 result = (int32_t)utext_getNativeIndex(fText); 934 fRuleStatusIndex = 0; 935 } 936 937 // Leave the iterator at our result position. 938 fPosition = result; 939 #ifdef RBBI_DEBUG 940 if (gTrace) { 941 RBBIDebugPrintf("result = %d\n\n", result); 942 } 943 #endif 944 return result; 945} 946 947 948 949//----------------------------------------------------------------------------------- 950// 951// handlePrevious() 952// 953// Iterate backwards using the safe reverse rules. 954// The logic of this function is very similar to handleNext(), above. 955// 956//----------------------------------------------------------------------------------- 957int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) { 958 int32_t state; 959 uint16_t category = 0; 960 RBBIRunMode mode; 961 RBBIStateTableRow *row; 962 UChar32 c; 963 LookAheadResults lookAheadMatches; 964 int32_t result = 0; 965 int32_t initialPosition = 0; 966 967 const RBBIStateTable *stateTable = fData->fSafeRevTable; 968 UTEXT_SETNATIVEINDEX(fText, fromPosition); 969 #ifdef RBBI_DEBUG 970 if (gTrace) { 971 RBBIDebugPuts("Handle Previous pos char state category"); 972 } 973 #endif 974 975 // if we're already at the start of the text, return DONE. 976 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 977 return BreakIterator::DONE; 978 } 979 980 // Set up the starting char. 981 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 982 result = initialPosition; 983 c = UTEXT_PREVIOUS32(fText); 984 985 // Set the initial state for the state machine 986 state = START_STATE; 987 row = (RBBIStateTableRow *) 988 (stateTable->fTableData + (stateTable->fRowLen * state)); 989 category = 3; 990 mode = RBBI_RUN; 991 if (stateTable->fFlags & RBBI_BOF_REQUIRED) { 992 category = 2; 993 mode = RBBI_START; 994 } 995 996 997 // loop until we reach the start of the text or transition to state 0 998 // 999 for (;;) { 1000 if (c == U_SENTINEL) { 1001 // Reached end of input string. 1002 if (mode == RBBI_END) { 1003 // We have already run the loop one last time with the 1004 // character set to the psueudo {eof} value. Now it is time 1005 // to unconditionally bail out. 1006 break; 1007 } 1008 // Run the loop one last time with the fake end-of-input character category. 1009 mode = RBBI_END; 1010 category = 1; 1011 } 1012 1013 // 1014 // Get the char category. An incoming category of 1 or 2 means that 1015 // we are preset for doing the beginning or end of input, and 1016 // that we shouldn't get a category from an actual text input character. 1017 // 1018 if (mode == RBBI_RUN) { 1019 // look up the current character's character category, which tells us 1020 // which column in the state table to look at. 1021 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1022 // not the size of the character going in, which is a UChar32. 1023 // 1024 // And off the dictionary flag bit. For reverse iteration it is not used. 1025 category = UTRIE2_GET16(fData->fTrie, c); 1026 category &= ~0x4000; 1027 } 1028 1029 #ifdef RBBI_DEBUG 1030 if (gTrace) { 1031 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1032 if (0x20<=c && c<0x7f) { 1033 RBBIDebugPrintf("\"%c\" ", c); 1034 } else { 1035 RBBIDebugPrintf("%5x ", c); 1036 } 1037 RBBIDebugPrintf("%3d %3d\n", state, category); 1038 } 1039 #endif 1040 1041 // State Transition - move machine to its next state 1042 // 1043 1044 // Note: fNextState is defined as uint16_t[2], but we are casting 1045 // a generated RBBI table to RBBIStateTableRow and some tables 1046 // actually have more than 2 categories. 1047 U_ASSERT(category<fData->fHeader->fCatCount); 1048 state = row->fNextState[category]; /*Not accessing beyond memory*/ 1049 row = (RBBIStateTableRow *) 1050 (stateTable->fTableData + (stateTable->fRowLen * state)); 1051 1052 if (row->fAccepting == -1) { 1053 // Match found, common case. 1054 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1055 } 1056 1057 int16_t completedRule = row->fAccepting; 1058 if (completedRule > 0) { 1059 // Lookahead match is completed. 1060 int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); 1061 if (lookaheadResult >= 0) { 1062 UTEXT_SETNATIVEINDEX(fText, lookaheadResult); 1063 return lookaheadResult; 1064 } 1065 } 1066 int16_t rule = row->fLookAhead; 1067 if (rule != 0) { 1068 // At the position of a '/' in a look-ahead match. Record it. 1069 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1070 lookAheadMatches.setPosition(rule, pos); 1071 } 1072 1073 if (state == STOP_STATE) { 1074 // This is the normal exit from the lookup state machine. 1075 // We have advanced through the string until it is certain that no 1076 // longer match is possible, no matter what characters follow. 1077 break; 1078 } 1079 1080 // Move (backwards) to the next character to process. 1081 // If this is a beginning-of-input loop iteration, don't advance 1082 // the input position. The next iteration will be processing the 1083 // first real input character. 1084 if (mode == RBBI_RUN) { 1085 c = UTEXT_PREVIOUS32(fText); 1086 } else { 1087 if (mode == RBBI_START) { 1088 mode = RBBI_RUN; 1089 } 1090 } 1091 } 1092 1093 // The state machine is done. Check whether it found a match... 1094 1095 // If the iterator failed to advance in the match engine, force it ahead by one. 1096 // (This really indicates a defect in the break rules. They should always match 1097 // at least one character.) 1098 if (result == initialPosition) { 1099 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1100 UTEXT_PREVIOUS32(fText); 1101 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1102 } 1103 1104 #ifdef RBBI_DEBUG 1105 if (gTrace) { 1106 RBBIDebugPrintf("result = %d\n\n", result); 1107 } 1108 #endif 1109 return result; 1110} 1111 1112 1113//------------------------------------------------------------------------------- 1114// 1115// getRuleStatus() Return the break rule tag associated with the current 1116// iterator position. If the iterator arrived at its current 1117// position by iterating forwards, the value will have been 1118// cached by the handleNext() function. 1119// 1120//------------------------------------------------------------------------------- 1121 1122int32_t RuleBasedBreakIterator::getRuleStatus() const { 1123 1124 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1125 // (the number of status values.) 1126 // This function returns the last (largest) of the array of status values. 1127 int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex]; 1128 int32_t tagVal = fData->fRuleStatusTable[idx]; 1129 1130 return tagVal; 1131} 1132 1133 1134int32_t RuleBasedBreakIterator::getRuleStatusVec( 1135 int32_t *fillInVec, int32_t capacity, UErrorCode &status) { 1136 if (U_FAILURE(status)) { 1137 return 0; 1138 } 1139 1140 int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex]; 1141 int32_t numValsToCopy = numVals; 1142 if (numVals > capacity) { 1143 status = U_BUFFER_OVERFLOW_ERROR; 1144 numValsToCopy = capacity; 1145 } 1146 int i; 1147 for (i=0; i<numValsToCopy; i++) { 1148 fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1]; 1149 } 1150 return numVals; 1151} 1152 1153 1154 1155//------------------------------------------------------------------------------- 1156// 1157// getBinaryRules Access to the compiled form of the rules, 1158// for use by build system tools that save the data 1159// for standard iterator types. 1160// 1161//------------------------------------------------------------------------------- 1162const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1163 const uint8_t *retPtr = NULL; 1164 length = 0; 1165 1166 if (fData != NULL) { 1167 retPtr = (const uint8_t *)fData->fHeader; 1168 length = fData->fHeader->fLength; 1169 } 1170 return retPtr; 1171} 1172 1173 1174BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, 1175 int32_t &bufferSize, 1176 UErrorCode &status) 1177{ 1178 if (U_FAILURE(status)){ 1179 return NULL; 1180 } 1181 1182 if (bufferSize == 0) { 1183 bufferSize = 1; // preflighting for deprecated functionality 1184 return NULL; 1185 } 1186 1187 BreakIterator *clonedBI = clone(); 1188 if (clonedBI == NULL) { 1189 status = U_MEMORY_ALLOCATION_ERROR; 1190 } else { 1191 status = U_SAFECLONE_ALLOCATED_WARNING; 1192 } 1193 return (RuleBasedBreakIterator *)clonedBI; 1194} 1195 1196U_NAMESPACE_END 1197 1198 1199static icu::UStack *gLanguageBreakFactories = nullptr; 1200static const icu::UnicodeString *gEmptyString = nullptr; 1201static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; 1202static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER; 1203 1204/** 1205 * Release all static memory held by breakiterator. 1206 */ 1207U_CDECL_BEGIN 1208static UBool U_CALLCONV rbbi_cleanup(void) { 1209 delete gLanguageBreakFactories; 1210 gLanguageBreakFactories = nullptr; 1211 delete gEmptyString; 1212 gEmptyString = nullptr; 1213 gLanguageBreakFactoriesInitOnce.reset(); 1214 gRBBIInitOnce.reset(); 1215 return TRUE; 1216} 1217U_CDECL_END 1218 1219U_CDECL_BEGIN 1220static void U_CALLCONV _deleteFactory(void *obj) { 1221 delete (icu::LanguageBreakFactory *) obj; 1222} 1223U_CDECL_END 1224U_NAMESPACE_BEGIN 1225 1226static void U_CALLCONV rbbiInit() { 1227 gEmptyString = new UnicodeString(); 1228 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); 1229} 1230 1231static void U_CALLCONV initLanguageFactories() { 1232 UErrorCode status = U_ZERO_ERROR; 1233 U_ASSERT(gLanguageBreakFactories == NULL); 1234 gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); 1235 if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { 1236 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1237 gLanguageBreakFactories->push(builtIn, status); 1238#ifdef U_LOCAL_SERVICE_HOOK 1239 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1240 if (extra != NULL) { 1241 gLanguageBreakFactories->push(extra, status); 1242 } 1243#endif 1244 } 1245 ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); 1246} 1247 1248 1249static const LanguageBreakEngine* 1250getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1251{ 1252 umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); 1253 if (gLanguageBreakFactories == NULL) { 1254 return NULL; 1255 } 1256 1257 int32_t i = gLanguageBreakFactories->size(); 1258 const LanguageBreakEngine *lbe = NULL; 1259 while (--i >= 0) { 1260 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1261 lbe = factory->getEngineFor(c, breakType); 1262 if (lbe != NULL) { 1263 break; 1264 } 1265 } 1266 return lbe; 1267} 1268 1269 1270//------------------------------------------------------------------------------- 1271// 1272// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1273// the character c. 1274// 1275//------------------------------------------------------------------------------- 1276const LanguageBreakEngine * 1277RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1278 const LanguageBreakEngine *lbe = NULL; 1279 UErrorCode status = U_ZERO_ERROR; 1280 1281 if (fLanguageBreakEngines == NULL) { 1282 fLanguageBreakEngines = new UStack(status); 1283 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1284 delete fLanguageBreakEngines; 1285 fLanguageBreakEngines = 0; 1286 return NULL; 1287 } 1288 } 1289 1290 int32_t i = fLanguageBreakEngines->size(); 1291 while (--i >= 0) { 1292 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1293 if (lbe->handles(c, fBreakType)) { 1294 return lbe; 1295 } 1296 } 1297 1298 // No existing dictionary took the character. See if a factory wants to 1299 // give us a new LanguageBreakEngine for this character. 1300 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1301 1302 // If we got one, use it and push it on our stack. 1303 if (lbe != NULL) { 1304 fLanguageBreakEngines->push((void *)lbe, status); 1305 // Even if we can't remember it, we can keep looking it up, so 1306 // return it even if the push fails. 1307 return lbe; 1308 } 1309 1310 // No engine is forthcoming for this character. Add it to the 1311 // reject set. Create the reject break engine if needed. 1312 if (fUnhandledBreakEngine == NULL) { 1313 fUnhandledBreakEngine = new UnhandledEngine(status); 1314 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1315 status = U_MEMORY_ALLOCATION_ERROR; 1316 } 1317 // Put it last so that scripts for which we have an engine get tried 1318 // first. 1319 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1320 // If we can't insert it, or creation failed, get rid of it 1321 if (U_FAILURE(status)) { 1322 delete fUnhandledBreakEngine; 1323 fUnhandledBreakEngine = 0; 1324 return NULL; 1325 } 1326 } 1327 1328 // Tell the reject engine about the character; at its discretion, it may 1329 // add more than just the one character. 1330 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1331 1332 return fUnhandledBreakEngine; 1333} 1334 1335 1336 1337/*int32_t RuleBasedBreakIterator::getBreakType() const { 1338 return fBreakType; 1339}*/ 1340 1341void RuleBasedBreakIterator::setBreakType(int32_t type) { 1342 fBreakType = type; 1343} 1344 1345void RuleBasedBreakIterator::dumpCache() { 1346 fBreakCache->dumpCache(); 1347} 1348 1349/** 1350 * Returns the description used to create this iterator 1351 */ 1352 1353const UnicodeString& 1354RuleBasedBreakIterator::getRules() const { 1355 if (fData != NULL) { 1356 return fData->getRuleSourceString(); 1357 } else { 1358 umtx_initOnce(gRBBIInitOnce, &rbbiInit); 1359 return *gEmptyString; 1360 } 1361} 1362 1363U_NAMESPACE_END 1364 1365#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1366