1/* 2*************************************************************************** 3* Copyright (C) 1999-2008 International Business Machines Corporation * 4* and others. All rights reserved. * 5*************************************************************************** 6*/ 7// 8// file: rbbi.c Contains the implementation of the rule based break iterator 9// runtime engine and the API implementation for 10// class RuleBasedBreakIterator 11// 12 13#include "unicode/utypes.h" 14 15#if !UCONFIG_NO_BREAK_ITERATION 16 17#include "unicode/rbbi.h" 18#include "unicode/schriter.h" 19#include "unicode/uchriter.h" 20#include "unicode/udata.h" 21#include "unicode/uclean.h" 22#include "rbbidata.h" 23#include "rbbirb.h" 24#include "cmemory.h" 25#include "cstring.h" 26#include "umutex.h" 27#include "ucln_cmn.h" 28#include "brkeng.h" 29 30#include "uassert.h" 31#include "uvector.h" 32 33// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. 34#if U_LOCAL_SERVICE_HOOK 35#include "localsvc.h" 36#endif 37 38#ifdef RBBI_DEBUG 39static UBool fTrace = FALSE; 40#endif 41 42U_NAMESPACE_BEGIN 43 44// The state number of the starting state 45#define START_STATE 1 46 47// The state-transition value indicating "stop" 48#define STOP_STATE 0 49 50 51UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) 52 53 54//======================================================================= 55// constructors 56//======================================================================= 57 58/** 59 * Constructs a RuleBasedBreakIterator that uses the already-created 60 * tables object that is passed in as a parameter. 61 */ 62RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) 63{ 64 init(); 65 fData = new RBBIDataWrapper(data, status); // status checked in constructor 66 if (U_FAILURE(status)) {return;} 67 if(fData == 0) { 68 status = U_MEMORY_ALLOCATION_ERROR; 69 return; 70 } 71} 72 73/** 74 * Same as above but does not adopt memory 75 */ 76RuleBasedBreakIterator::RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt, UErrorCode &status) 77{ 78 init(); 79 fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); // status checked in constructor 80 if (U_FAILURE(status)) {return;} 81 if(fData == 0) { 82 status = U_MEMORY_ALLOCATION_ERROR; 83 return; 84 } 85} 86 87//------------------------------------------------------------------------------- 88// 89// Constructor from a UDataMemory handle to precompiled break rules 90// stored in an ICU data file. 91// 92//------------------------------------------------------------------------------- 93RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) 94{ 95 init(); 96 fData = new RBBIDataWrapper(udm, status); // status checked in constructor 97 if (U_FAILURE(status)) {return;} 98 if(fData == 0) { 99 status = U_MEMORY_ALLOCATION_ERROR; 100 return; 101 } 102} 103 104 105 106//------------------------------------------------------------------------------- 107// 108// Constructor from a set of rules supplied as a string. 109// 110//------------------------------------------------------------------------------- 111RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, 112 UParseError &parseError, 113 UErrorCode &status) 114{ 115 init(); 116 if (U_FAILURE(status)) {return;} 117 RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) 118 RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); 119 // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that 120 // creates and returns a complete RBBI. From here, in a constructor, we 121 // can't just return the object created by the builder factory, hence 122 // the assignment of the factory created object to "this". 123 if (U_SUCCESS(status)) { 124 *this = *bi; 125 delete bi; 126 } 127} 128 129 130//------------------------------------------------------------------------------- 131// 132// Default Constructor. Create an empty shell that can be set up later. 133// Used when creating a RuleBasedBreakIterator from a set 134// of rules. 135//------------------------------------------------------------------------------- 136RuleBasedBreakIterator::RuleBasedBreakIterator() { 137 init(); 138} 139 140 141//------------------------------------------------------------------------------- 142// 143// Copy constructor. Will produce a break iterator with the same behavior, 144// and which iterates over the same text, as the one passed in. 145// 146//------------------------------------------------------------------------------- 147RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) 148: BreakIterator(other) 149{ 150 this->init(); 151 *this = other; 152} 153 154 155/** 156 * Destructor 157 */ 158RuleBasedBreakIterator::~RuleBasedBreakIterator() { 159 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 160 // fCharIter was adopted from the outside. 161 delete fCharIter; 162 } 163 fCharIter = NULL; 164 delete fSCharIter; 165 fCharIter = NULL; 166 delete fDCharIter; 167 fDCharIter = NULL; 168 169 utext_close(fText); 170 171 if (fData != NULL) { 172 fData->removeReference(); 173 fData = NULL; 174 } 175 if (fCachedBreakPositions) { 176 uprv_free(fCachedBreakPositions); 177 fCachedBreakPositions = NULL; 178 } 179 if (fLanguageBreakEngines) { 180 delete fLanguageBreakEngines; 181 fLanguageBreakEngines = NULL; 182 } 183 if (fUnhandledBreakEngine) { 184 delete fUnhandledBreakEngine; 185 fUnhandledBreakEngine = NULL; 186 } 187} 188 189/** 190 * Assignment operator. Sets this iterator to have the same behavior, 191 * and iterate over the same text, as the one passed in. 192 */ 193RuleBasedBreakIterator& 194RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { 195 if (this == &that) { 196 return *this; 197 } 198 reset(); // Delete break cache information 199 fBreakType = that.fBreakType; 200 if (fLanguageBreakEngines != NULL) { 201 delete fLanguageBreakEngines; 202 fLanguageBreakEngines = NULL; // Just rebuild for now 203 } 204 // TODO: clone fLanguageBreakEngines from "that" 205 UErrorCode status = U_ZERO_ERROR; 206 fText = utext_clone(fText, that.fText, FALSE, TRUE, &status); 207 208 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 209 delete fCharIter; 210 } 211 fCharIter = NULL; 212 213 if (that.fCharIter != NULL ) { 214 // This is a little bit tricky - it will intially appear that 215 // this->fCharIter is adopted, even if that->fCharIter was 216 // not adopted. That's ok. 217 fCharIter = that.fCharIter->clone(); 218 } 219 220 if (fData != NULL) { 221 fData->removeReference(); 222 fData = NULL; 223 } 224 if (that.fData != NULL) { 225 fData = that.fData->addReference(); 226 } 227 228 return *this; 229} 230 231 232 233//----------------------------------------------------------------------------- 234// 235// init() Shared initialization routine. Used by all the constructors. 236// Initializes all fields, leaving the object in a consistent state. 237// 238//----------------------------------------------------------------------------- 239void RuleBasedBreakIterator::init() { 240 UErrorCode status = U_ZERO_ERROR; 241 fBufferClone = FALSE; 242 fText = utext_openUChars(NULL, NULL, 0, &status); 243 fCharIter = NULL; 244 fSCharIter = NULL; 245 fDCharIter = NULL; 246 fData = NULL; 247 fLastRuleStatusIndex = 0; 248 fLastStatusIndexValid = TRUE; 249 fDictionaryCharCount = 0; 250 fBreakType = -1; 251 252 fCachedBreakPositions = NULL; 253 fLanguageBreakEngines = NULL; 254 fUnhandledBreakEngine = NULL; 255 fNumCachedBreakPositions = 0; 256 fPositionInCache = 0; 257 258#ifdef RBBI_DEBUG 259 static UBool debugInitDone = FALSE; 260 if (debugInitDone == FALSE) { 261 char *debugEnv = getenv("U_RBBIDEBUG"); 262 if (debugEnv && uprv_strstr(debugEnv, "trace")) { 263 fTrace = TRUE; 264 } 265 debugInitDone = TRUE; 266 } 267#endif 268} 269 270 271 272//----------------------------------------------------------------------------- 273// 274// clone - Returns a newly-constructed RuleBasedBreakIterator with the same 275// behavior, and iterating over the same text, as this one. 276// Virtual function: does the right thing with subclasses. 277// 278//----------------------------------------------------------------------------- 279BreakIterator* 280RuleBasedBreakIterator::clone(void) const { 281 return new RuleBasedBreakIterator(*this); 282} 283 284/** 285 * Equality operator. Returns TRUE if both BreakIterators are of the 286 * same class, have the same behavior, and iterate over the same text. 287 */ 288UBool 289RuleBasedBreakIterator::operator==(const BreakIterator& that) const { 290 if (that.getDynamicClassID() != getDynamicClassID()) { 291 return FALSE; 292 } 293 294 const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; 295 296 if (!utext_equals(fText, that2.fText)) { 297 // The two break iterators are operating on different text, 298 // or have a different interation position. 299 return FALSE; 300 }; 301 302 // TODO: need a check for when in a dictionary region at different offsets. 303 304 if (that2.fData == fData || 305 (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { 306 // The two break iterators are using the same rules. 307 return TRUE; 308 } 309 return FALSE; 310} 311 312/** 313 * Compute a hash code for this BreakIterator 314 * @return A hash code 315 */ 316int32_t 317RuleBasedBreakIterator::hashCode(void) const { 318 int32_t hash = 0; 319 if (fData != NULL) { 320 hash = fData->hashCode(); 321 } 322 return hash; 323} 324 325 326void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { 327 if (U_FAILURE(status)) { 328 return; 329 } 330 reset(); 331 fText = utext_clone(fText, ut, FALSE, TRUE, &status); 332 333 // Set up a dummy CharacterIterator to be returned if anyone 334 // calls getText(). With input from UText, there is no reasonable 335 // way to return a characterIterator over the actual input text. 336 // Return one over an empty string instead - this is the closest 337 // we can come to signaling a failure. 338 // (GetText() is obsolete, this failure is sort of OK) 339 if (fDCharIter == NULL) { 340 static const UChar c = 0; 341 fDCharIter = new UCharCharacterIterator(&c, 0); 342 if (fDCharIter == NULL) { 343 status = U_MEMORY_ALLOCATION_ERROR; 344 return; 345 } 346 } 347 348 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 349 // existing fCharIter was adopted from the outside. Delete it now. 350 delete fCharIter; 351 } 352 fCharIter = fDCharIter; 353 354 this->first(); 355} 356 357 358UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { 359 UText *result = utext_clone(fillIn, fText, FALSE, TRUE, &status); 360 return result; 361} 362 363 364 365/** 366 * Returns the description used to create this iterator 367 */ 368const UnicodeString& 369RuleBasedBreakIterator::getRules() const { 370 if (fData != NULL) { 371 return fData->getRuleSourceString(); 372 } else { 373 static const UnicodeString *s; 374 if (s == NULL) { 375 // TODO: something more elegant here. 376 // perhaps API should return the string by value. 377 // Note: thread unsafe init & leak are semi-ok, better than 378 // what was before. Sould be cleaned up, though. 379 s = new UnicodeString; 380 } 381 return *s; 382 } 383} 384 385//======================================================================= 386// BreakIterator overrides 387//======================================================================= 388 389/** 390 * Return a CharacterIterator over the text being analyzed. 391 */ 392CharacterIterator& 393RuleBasedBreakIterator::getText() const { 394 return *fCharIter; 395} 396 397/** 398 * Set the iterator to analyze a new piece of text. This function resets 399 * the current iteration position to the beginning of the text. 400 * @param newText An iterator over the text to analyze. 401 */ 402void 403RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { 404 // If we are holding a CharacterIterator adopted from a 405 // previous call to this function, delete it now. 406 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 407 delete fCharIter; 408 } 409 410 fCharIter = newText; 411 UErrorCode status = U_ZERO_ERROR; 412 reset(); 413 if (newText==NULL || newText->startIndex() != 0) { 414 // startIndex !=0 wants to be an error, but there's no way to report it. 415 // Make the iterator text be an empty string. 416 fText = utext_openUChars(fText, NULL, 0, &status); 417 } else { 418 fText = utext_openCharacterIterator(fText, newText, &status); 419 } 420 this->first(); 421} 422 423/** 424 * Set the iterator to analyze a new piece of text. This function resets 425 * the current iteration position to the beginning of the text. 426 * @param newText An iterator over the text to analyze. 427 */ 428void 429RuleBasedBreakIterator::setText(const UnicodeString& newText) { 430 UErrorCode status = U_ZERO_ERROR; 431 reset(); 432 fText = utext_openConstUnicodeString(fText, &newText, &status); 433 434 // Set up a character iterator on the string. 435 // Needed in case someone calls getText(). 436 // Can not, unfortunately, do this lazily on the (probably never) 437 // call to getText(), because getText is const. 438 if (fSCharIter == NULL) { 439 fSCharIter = new StringCharacterIterator(newText); 440 } else { 441 fSCharIter->setText(newText); 442 } 443 444 if (fCharIter!=fSCharIter && fCharIter!=fDCharIter) { 445 // old fCharIter was adopted from the outside. Delete it. 446 delete fCharIter; 447 } 448 fCharIter = fSCharIter; 449 450 this->first(); 451} 452 453 454 455/** 456 * Sets the current iteration position to the beginning of the text. 457 * @return The offset of the beginning of the text. 458 */ 459int32_t RuleBasedBreakIterator::first(void) { 460 reset(); 461 fLastRuleStatusIndex = 0; 462 fLastStatusIndexValid = TRUE; 463 //if (fText == NULL) 464 // return BreakIterator::DONE; 465 466 utext_setNativeIndex(fText, 0); 467 return 0; 468} 469 470/** 471 * Sets the current iteration position to the end of the text. 472 * @return The text's past-the-end offset. 473 */ 474int32_t RuleBasedBreakIterator::last(void) { 475 reset(); 476 if (fText == NULL) { 477 fLastRuleStatusIndex = 0; 478 fLastStatusIndexValid = TRUE; 479 return BreakIterator::DONE; 480 } 481 482 fLastStatusIndexValid = FALSE; 483 int32_t pos = (int32_t)utext_nativeLength(fText); 484 utext_setNativeIndex(fText, pos); 485 return pos; 486} 487 488/** 489 * Advances the iterator either forward or backward the specified number of steps. 490 * Negative values move backward, and positive values move forward. This is 491 * equivalent to repeatedly calling next() or previous(). 492 * @param n The number of steps to move. The sign indicates the direction 493 * (negative is backwards, and positive is forwards). 494 * @return The character offset of the boundary position n boundaries away from 495 * the current one. 496 */ 497int32_t RuleBasedBreakIterator::next(int32_t n) { 498 int32_t result = current(); 499 while (n > 0) { 500 result = next(); 501 --n; 502 } 503 while (n < 0) { 504 result = previous(); 505 ++n; 506 } 507 return result; 508} 509 510/** 511 * Advances the iterator to the next boundary position. 512 * @return The position of the first boundary after this one. 513 */ 514int32_t RuleBasedBreakIterator::next(void) { 515 // if we have cached break positions and we're still in the range 516 // covered by them, just move one step forward in the cache 517 if (fCachedBreakPositions != NULL) { 518 if (fPositionInCache < fNumCachedBreakPositions - 1) { 519 ++fPositionInCache; 520 int32_t pos = fCachedBreakPositions[fPositionInCache]; 521 utext_setNativeIndex(fText, pos); 522 return pos; 523 } 524 else { 525 reset(); 526 } 527 } 528 529 int32_t startPos = current(); 530 int32_t result = handleNext(fData->fForwardTable); 531 if (fDictionaryCharCount > 0) { 532 result = checkDictionary(startPos, result, FALSE); 533 } 534 return result; 535} 536 537/** 538 * Advances the iterator backwards, to the last boundary preceding this one. 539 * @return The position of the last boundary position preceding this one. 540 */ 541int32_t RuleBasedBreakIterator::previous(void) { 542 int32_t result; 543 int32_t startPos; 544 545 // if we have cached break positions and we're still in the range 546 // covered by them, just move one step backward in the cache 547 if (fCachedBreakPositions != NULL) { 548 if (fPositionInCache > 0) { 549 --fPositionInCache; 550 // If we're at the beginning of the cache, need to reevaluate the 551 // rule status 552 if (fPositionInCache <= 0) { 553 fLastStatusIndexValid = FALSE; 554 } 555 int32_t pos = fCachedBreakPositions[fPositionInCache]; 556 utext_setNativeIndex(fText, pos); 557 return pos; 558 } 559 else { 560 reset(); 561 } 562 } 563 564 // if we're already sitting at the beginning of the text, return DONE 565 if (fText == NULL || (startPos = current()) == 0) { 566 fLastRuleStatusIndex = 0; 567 fLastStatusIndexValid = TRUE; 568 return BreakIterator::DONE; 569 } 570 571 if (fData->fSafeRevTable != NULL || fData->fSafeFwdTable != NULL) { 572 result = handlePrevious(fData->fReverseTable); 573 if (fDictionaryCharCount > 0) { 574 result = checkDictionary(result, startPos, TRUE); 575 } 576 return result; 577 } 578 579 // old rule syntax 580 // set things up. handlePrevious() will back us up to some valid 581 // break position before the current position (we back our internal 582 // iterator up one step to prevent handlePrevious() from returning 583 // the current position), but not necessarily the last one before 584 585 // where we started 586 587 int32_t start = current(); 588 589 UTEXT_PREVIOUS32(fText); 590 int32_t lastResult = handlePrevious(fData->fReverseTable); 591 if (lastResult == UBRK_DONE) { 592 lastResult = 0; 593 utext_setNativeIndex(fText, 0); 594 } 595 result = lastResult; 596 int32_t lastTag = 0; 597 UBool breakTagValid = FALSE; 598 599 // iterate forward from the known break position until we pass our 600 // starting point. The last break position before the starting 601 // point is our return value 602 603 for (;;) { 604 result = next(); 605 if (result == BreakIterator::DONE || result >= start) { 606 break; 607 } 608 lastResult = result; 609 lastTag = fLastRuleStatusIndex; 610 breakTagValid = TRUE; 611 } 612 613 // fLastBreakTag wants to have the value for section of text preceding 614 // the result position that we are to return (in lastResult.) If 615 // the backwards rules overshot and the above loop had to do two or more 616 // next()s to move up to the desired return position, we will have a valid 617 // tag value. But, if handlePrevious() took us to exactly the correct result positon, 618 // we wont have a tag value for that position, which is only set by handleNext(). 619 620 // set the current iteration position to be the last break position 621 // before where we started, and then return that value 622 utext_setNativeIndex(fText, lastResult); 623 fLastRuleStatusIndex = lastTag; // for use by getRuleStatus() 624 fLastStatusIndexValid = breakTagValid; 625 626 // No need to check the dictionary; it will have been handled by 627 // next() 628 629 return lastResult; 630} 631 632/** 633 * Sets the iterator to refer to the first boundary position following 634 * the specified position. 635 * @offset The position from which to begin searching for a break position. 636 * @return The position of the first break after the current position. 637 */ 638int32_t RuleBasedBreakIterator::following(int32_t offset) { 639 // if we have cached break positions and offset is in the range 640 // covered by them, use them 641 // TODO: could use binary search 642 // TODO: what if offset is outside range, but break is not? 643 if (fCachedBreakPositions != NULL) { 644 if (offset >= fCachedBreakPositions[0] 645 && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 646 fPositionInCache = 0; 647 // We are guaranteed not to leave the array due to range test above 648 while (offset >= fCachedBreakPositions[fPositionInCache]) { 649 ++fPositionInCache; 650 } 651 int32_t pos = fCachedBreakPositions[fPositionInCache]; 652 utext_setNativeIndex(fText, pos); 653 return pos; 654 } 655 else { 656 reset(); 657 } 658 } 659 660 // if the offset passed in is already past the end of the text, 661 // just return DONE; if it's before the beginning, return the 662 // text's starting offset 663 fLastRuleStatusIndex = 0; 664 fLastStatusIndexValid = TRUE; 665 if (fText == NULL || offset >= utext_nativeLength(fText)) { 666 last(); 667 return next(); 668 } 669 else if (offset < 0) { 670 return first(); 671 } 672 673 // otherwise, set our internal iteration position (temporarily) 674 // to the position passed in. If this is the _beginning_ position, 675 // then we can just use next() to get our return value 676 677 int32_t result = 0; 678 679 if (fData->fSafeRevTable != NULL) { 680 // new rule syntax 681 utext_setNativeIndex(fText, offset); 682 // move forward one codepoint to prepare for moving back to a 683 // safe point. 684 // this handles offset being between a supplementary character 685 UTEXT_NEXT32(fText); 686 // handlePrevious will move most of the time to < 1 boundary away 687 handlePrevious(fData->fSafeRevTable); 688 int32_t result = next(); 689 while (result <= offset) { 690 result = next(); 691 } 692 return result; 693 } 694 if (fData->fSafeFwdTable != NULL) { 695 // backup plan if forward safe table is not available 696 utext_setNativeIndex(fText, offset); 697 UTEXT_PREVIOUS32(fText); 698 // handle next will give result >= offset 699 handleNext(fData->fSafeFwdTable); 700 // previous will give result 0 or 1 boundary away from offset, 701 // most of the time 702 // we have to 703 int32_t oldresult = previous(); 704 while (oldresult > offset) { 705 int32_t result = previous(); 706 if (result <= offset) { 707 return oldresult; 708 } 709 oldresult = result; 710 } 711 int32_t result = next(); 712 if (result <= offset) { 713 return next(); 714 } 715 return result; 716 } 717 // otherwise, we have to sync up first. Use handlePrevious() to back 718 // up to a known break position before the specified position (if 719 // we can determine that the specified position is a break position, 720 // we don't back up at all). This may or may not be the last break 721 // position at or before our starting position. Advance forward 722 // from here until we've passed the starting position. The position 723 // we stop on will be the first break position after the specified one. 724 // old rule syntax 725 726 utext_setNativeIndex(fText, offset); 727 if (offset==0 || 728 offset==1 && utext_getNativeIndex(fText)==0) { 729 return next(); 730 } 731 result = previous(); 732 733 while (result != BreakIterator::DONE && result <= offset) { 734 result = next(); 735 } 736 737 return result; 738} 739 740/** 741 * Sets the iterator to refer to the last boundary position before the 742 * specified position. 743 * @offset The position to begin searching for a break from. 744 * @return The position of the last boundary before the starting position. 745 */ 746int32_t RuleBasedBreakIterator::preceding(int32_t offset) { 747 // if we have cached break positions and offset is in the range 748 // covered by them, use them 749 if (fCachedBreakPositions != NULL) { 750 // TODO: binary search? 751 // TODO: What if offset is outside range, but break is not? 752 if (offset > fCachedBreakPositions[0] 753 && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { 754 fPositionInCache = 0; 755 while (fPositionInCache < fNumCachedBreakPositions 756 && offset > fCachedBreakPositions[fPositionInCache]) 757 ++fPositionInCache; 758 --fPositionInCache; 759 // If we're at the beginning of the cache, need to reevaluate the 760 // rule status 761 if (fPositionInCache <= 0) { 762 fLastStatusIndexValid = FALSE; 763 } 764 utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); 765 return fCachedBreakPositions[fPositionInCache]; 766 } 767 else { 768 reset(); 769 } 770 } 771 772 // if the offset passed in is already past the end of the text, 773 // just return DONE; if it's before the beginning, return the 774 // text's starting offset 775 if (fText == NULL || offset > utext_nativeLength(fText)) { 776 // return BreakIterator::DONE; 777 return last(); 778 } 779 else if (offset < 0) { 780 return first(); 781 } 782 783 // if we start by updating the current iteration position to the 784 // position specified by the caller, we can just use previous() 785 // to carry out this operation 786 787 if (fData->fSafeFwdTable != NULL) { 788 // new rule syntax 789 utext_setNativeIndex(fText, offset); 790 int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 791 if (newOffset != offset) { 792 // Will come here if specified offset was not a code point boundary AND 793 // the underlying implmentation is using UText, which snaps any non-code-point-boundary 794 // indices to the containing code point. 795 // For breakitereator::preceding only, these non-code-point indices need to be moved 796 // up to refer to the following codepoint. 797 UTEXT_NEXT32(fText); 798 offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); 799 } 800 801 // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, 802 // rather than adjusting the position unconditionally? 803 // (Change would interact with safe rules.) 804 // TODO: change RBBI behavior for off-boundary indices to match that of UText? 805 // affects only preceding(), seems cleaner, but is slightly different. 806 UTEXT_PREVIOUS32(fText); 807 handleNext(fData->fSafeFwdTable); 808 int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 809 while (result >= offset) { 810 result = previous(); 811 } 812 return result; 813 } 814 if (fData->fSafeRevTable != NULL) { 815 // backup plan if forward safe table is not available 816 // TODO: check whether this path can be discarded 817 // It's probably OK to say that rules must supply both safe tables 818 // if they use safe tables at all. We have certainly never described 819 // to anyone how to work with just one safe table. 820 utext_setNativeIndex(fText, offset); 821 UTEXT_NEXT32(fText); 822 823 // handle previous will give result <= offset 824 handlePrevious(fData->fSafeRevTable); 825 826 // next will give result 0 or 1 boundary away from offset, 827 // most of the time 828 // we have to 829 int32_t oldresult = next(); 830 while (oldresult < offset) { 831 int32_t result = next(); 832 if (result >= offset) { 833 return oldresult; 834 } 835 oldresult = result; 836 } 837 int32_t result = previous(); 838 if (result >= offset) { 839 return previous(); 840 } 841 return result; 842 } 843 844 // old rule syntax 845 utext_setNativeIndex(fText, offset); 846 return previous(); 847} 848 849/** 850 * Returns true if the specfied position is a boundary position. As a side 851 * effect, leaves the iterator pointing to the first boundary position at 852 * or after "offset". 853 * @param offset the offset to check. 854 * @return True if "offset" is a boundary position. 855 */ 856UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { 857 // the beginning index of the iterator is always a boundary position by definition 858 if (offset == 0) { 859 first(); // For side effects on current position, tag values. 860 return TRUE; 861 } 862 863 if (offset == (int32_t)utext_nativeLength(fText)) { 864 last(); // For side effects on current position, tag values. 865 return TRUE; 866 } 867 868 // out-of-range indexes are never boundary positions 869 if (offset < 0) { 870 first(); // For side effects on current position, tag values. 871 return FALSE; 872 } 873 874 if (offset > utext_nativeLength(fText)) { 875 last(); // For side effects on current position, tag values. 876 return FALSE; 877 } 878 879 // otherwise, we can use following() on the position before the specified 880 // one and return true if the position we get back is the one the user 881 // specified 882 utext_previous32From(fText, offset); 883 int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); 884 UBool result = following(backOne) == offset; 885 return result; 886} 887 888/** 889 * Returns the current iteration position. 890 * @return The current iteration position. 891 */ 892int32_t RuleBasedBreakIterator::current(void) const { 893 int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(fText); 894 return pos; 895} 896 897//======================================================================= 898// implementation 899//======================================================================= 900 901// 902// RBBIRunMode - the state machine runs an extra iteration at the beginning and end 903// of user text. A variable with this enum type keeps track of where we 904// are. The state machine only fetches user input while in the RUN mode. 905// 906enum RBBIRunMode { 907 RBBI_START, // state machine processing is before first char of input 908 RBBI_RUN, // state machine processing is in the user text 909 RBBI_END // state machine processing is after end of user text. 910}; 911 912 913//----------------------------------------------------------------------------------- 914// 915// handleNext(stateTable) 916// This method is the actual implementation of the rbbi next() method. 917// This method initializes the state machine to state 1 918// and advances through the text character by character until we reach the end 919// of the text or the state machine transitions to state 0. We update our return 920// value every time the state machine passes through an accepting state. 921// 922//----------------------------------------------------------------------------------- 923int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { 924 int32_t state; 925 int16_t category = 0; 926 RBBIRunMode mode; 927 928 RBBIStateTableRow *row; 929 UChar32 c; 930 int32_t lookaheadStatus = 0; 931 int32_t lookaheadTagIdx = 0; 932 int32_t result = 0; 933 int32_t initialPosition = 0; 934 int32_t lookaheadResult = 0; 935 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 936 const char *tableData = statetable->fTableData; 937 uint32_t tableRowLen = statetable->fRowLen; 938 939 #ifdef RBBI_DEBUG 940 if (fTrace) { 941 RBBIDebugPuts("Handle Next pos char state category"); 942 } 943 #endif 944 945 // No matter what, handleNext alway correctly sets the break tag value. 946 fLastStatusIndexValid = TRUE; 947 fLastRuleStatusIndex = 0; 948 949 // if we're already at the end of the text, return DONE. 950 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 951 result = initialPosition; 952 c = UTEXT_NEXT32(fText); 953 if (fData == NULL || c==U_SENTINEL) { 954 return BreakIterator::DONE; 955 } 956 957 // Set the initial state for the state machine 958 state = START_STATE; 959 row = (RBBIStateTableRow *) 960 //(statetable->fTableData + (statetable->fRowLen * state)); 961 (tableData + tableRowLen * state); 962 963 964 mode = RBBI_RUN; 965 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 966 category = 2; 967 mode = RBBI_START; 968 } 969 970 971 // loop until we reach the end of the text or transition to state 0 972 // 973 for (;;) { 974 if (c == U_SENTINEL) { 975 // Reached end of input string. 976 if (mode == RBBI_END) { 977 // We have already run the loop one last time with the 978 // character set to the psueudo {eof} value. Now it is time 979 // to unconditionally bail out. 980 if (lookaheadResult > result) { 981 // We ran off the end of the string with a pending look-ahead match. 982 // Treat this as if the look-ahead condition had been met, and return 983 // the match at the / position from the look-ahead rule. 984 result = lookaheadResult; 985 fLastRuleStatusIndex = lookaheadTagIdx; 986 lookaheadStatus = 0; 987 } 988 break; 989 } 990 // Run the loop one last time with the fake end-of-input character category. 991 mode = RBBI_END; 992 category = 1; 993 } 994 995 // 996 // Get the char category. An incoming category of 1 or 2 means that 997 // we are preset for doing the beginning or end of input, and 998 // that we shouldn't get a category from an actual text input character. 999 // 1000 if (mode == RBBI_RUN) { 1001 // look up the current character's character category, which tells us 1002 // which column in the state table to look at. 1003 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1004 // not the size of the character going in, which is a UChar32. 1005 // 1006 UTRIE_GET16(&fData->fTrie, c, category); 1007 1008 // Check the dictionary bit in the character's category. 1009 // Counter is only used by dictionary based iterators (subclasses). 1010 // Chars that need to be handled by a dictionary have a flag bit set 1011 // in their category values. 1012 // 1013 if ((category & 0x4000) != 0) { 1014 fDictionaryCharCount++; 1015 // And off the dictionary flag bit. 1016 category &= ~0x4000; 1017 } 1018 } 1019 1020 #ifdef RBBI_DEBUG 1021 if (fTrace) { 1022 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText)); 1023 if (0x20<=c && c<0x7f) { 1024 RBBIDebugPrintf("\"%c\" ", c); 1025 } else { 1026 RBBIDebugPrintf("%5x ", c); 1027 } 1028 RBBIDebugPrintf("%3d %3d\n", state, category); 1029 } 1030 #endif 1031 1032 // State Transition - move machine to its next state 1033 // 1034 state = row->fNextState[category]; 1035 row = (RBBIStateTableRow *) 1036 // (statetable->fTableData + (statetable->fRowLen * state)); 1037 (tableData + tableRowLen * state); 1038 1039 1040 if (row->fAccepting == -1) { 1041 // Match found, common case. 1042 if (mode != RBBI_START) { 1043 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1044 } 1045 fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 1046 } 1047 1048 if (row->fLookAhead != 0) { 1049 if (lookaheadStatus != 0 1050 && row->fAccepting == lookaheadStatus) { 1051 // Lookahead match is completed. 1052 result = lookaheadResult; 1053 fLastRuleStatusIndex = lookaheadTagIdx; 1054 lookaheadStatus = 0; 1055 // TODO: make a standalone hard break in a rule work. 1056 if (lookAheadHardBreak) { 1057 UTEXT_SETNATIVEINDEX(fText, result); 1058 return result; 1059 } 1060 // Look-ahead completed, but other rules may match further. Continue on 1061 // TODO: junk this feature? I don't think it's used anywhwere. 1062 goto continueOn; 1063 } 1064 1065 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1066 lookaheadResult = r; 1067 lookaheadStatus = row->fLookAhead; 1068 lookaheadTagIdx = row->fTagIdx; 1069 goto continueOn; 1070 } 1071 1072 1073 if (row->fAccepting != 0) { 1074 // Because this is an accepting state, any in-progress look-ahead match 1075 // is no longer relavant. Clear out the pending lookahead status. 1076 lookaheadStatus = 0; // clear out any pending look-ahead match. 1077 } 1078 1079continueOn: 1080 if (state == STOP_STATE) { 1081 // This is the normal exit from the lookup state machine. 1082 // We have advanced through the string until it is certain that no 1083 // longer match is possible, no matter what characters follow. 1084 break; 1085 } 1086 1087 // Advance to the next character. 1088 // If this is a beginning-of-input loop iteration, don't advance 1089 // the input position. The next iteration will be processing the 1090 // first real input character. 1091 if (mode == RBBI_RUN) { 1092 c = UTEXT_NEXT32(fText); 1093 } else { 1094 if (mode == RBBI_START) { 1095 mode = RBBI_RUN; 1096 } 1097 } 1098 1099 1100 } 1101 1102 // The state machine is done. Check whether it found a match... 1103 1104 // If the iterator failed to advance in the match engine, force it ahead by one. 1105 // (This really indicates a defect in the break rules. They should always match 1106 // at least one character.) 1107 if (result == initialPosition) { 1108 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1109 UTEXT_NEXT32(fText); 1110 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1111 } 1112 1113 // Leave the iterator at our result position. 1114 UTEXT_SETNATIVEINDEX(fText, result); 1115 #ifdef RBBI_DEBUG 1116 if (fTrace) { 1117 RBBIDebugPrintf("result = %d\n\n", result); 1118 } 1119 #endif 1120 return result; 1121} 1122 1123 1124 1125//----------------------------------------------------------------------------------- 1126// 1127// handlePrevious() 1128// 1129// Iterate backwards, according to the logic of the reverse rules. 1130// This version handles the exact style backwards rules. 1131// 1132// The logic of this function is very similar to handleNext(), above. 1133// 1134//----------------------------------------------------------------------------------- 1135int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { 1136 int32_t state; 1137 int16_t category = 0; 1138 RBBIRunMode mode; 1139 RBBIStateTableRow *row; 1140 UChar32 c; 1141 int32_t lookaheadStatus = 0; 1142 int32_t result = 0; 1143 int32_t initialPosition = 0; 1144 int32_t lookaheadResult = 0; 1145 UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; 1146 1147 #ifdef RBBI_DEBUG 1148 if (fTrace) { 1149 RBBIDebugPuts("Handle Previous pos char state category"); 1150 } 1151 #endif 1152 1153 // handlePrevious() never gets the rule status. 1154 // Flag the status as invalid; if the user ever asks for status, we will need 1155 // to back up, then re-find the break position using handleNext(), which does 1156 // get the status value. 1157 fLastStatusIndexValid = FALSE; 1158 fLastRuleStatusIndex = 0; 1159 1160 // if we're already at the start of the text, return DONE. 1161 if (fText == NULL || fData == NULL || UTEXT_GETNATIVEINDEX(fText)==0) { 1162 return BreakIterator::DONE; 1163 } 1164 1165 // Set up the starting char. 1166 initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1167 result = initialPosition; 1168 c = UTEXT_PREVIOUS32(fText); 1169 1170 // Set the initial state for the state machine 1171 state = START_STATE; 1172 row = (RBBIStateTableRow *) 1173 (statetable->fTableData + (statetable->fRowLen * state)); 1174 category = 3; 1175 mode = RBBI_RUN; 1176 if (statetable->fFlags & RBBI_BOF_REQUIRED) { 1177 category = 2; 1178 mode = RBBI_START; 1179 } 1180 1181 1182 // loop until we reach the start of the text or transition to state 0 1183 // 1184 for (;;) { 1185 if (c == U_SENTINEL) { 1186 // Reached end of input string. 1187 if (mode == RBBI_END || 1188 *(int32_t *)fData->fHeader->fFormatVersion == 1 ) { 1189 // We have already run the loop one last time with the 1190 // character set to the psueudo {eof} value. Now it is time 1191 // to unconditionally bail out. 1192 // (Or we have an old format binary rule file that does not support {eof}.) 1193 if (lookaheadResult < result) { 1194 // We ran off the end of the string with a pending look-ahead match. 1195 // Treat this as if the look-ahead condition had been met, and return 1196 // the match at the / position from the look-ahead rule. 1197 result = lookaheadResult; 1198 lookaheadStatus = 0; 1199 } else if (result == initialPosition) { 1200 // Ran off start, no match found. 1201 // move one index one (towards the start, since we are doing a previous()) 1202 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1203 UTEXT_PREVIOUS32(fText); // TODO: shouldn't be necessary. We're already at beginning. Check. 1204 } 1205 break; 1206 } 1207 // Run the loop one last time with the fake end-of-input character category. 1208 mode = RBBI_END; 1209 category = 1; 1210 } 1211 1212 // 1213 // Get the char category. An incoming category of 1 or 2 means that 1214 // we are preset for doing the beginning or end of input, and 1215 // that we shouldn't get a category from an actual text input character. 1216 // 1217 if (mode == RBBI_RUN) { 1218 // look up the current character's character category, which tells us 1219 // which column in the state table to look at. 1220 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 1221 // not the size of the character going in, which is a UChar32. 1222 // 1223 UTRIE_GET16(&fData->fTrie, c, category); 1224 1225 // Check the dictionary bit in the character's category. 1226 // Counter is only used by dictionary based iterators (subclasses). 1227 // Chars that need to be handled by a dictionary have a flag bit set 1228 // in their category values. 1229 // 1230 if ((category & 0x4000) != 0) { 1231 fDictionaryCharCount++; 1232 // And off the dictionary flag bit. 1233 category &= ~0x4000; 1234 } 1235 } 1236 1237 #ifdef RBBI_DEBUG 1238 if (fTrace) { 1239 RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(fText)); 1240 if (0x20<=c && c<0x7f) { 1241 RBBIDebugPrintf("\"%c\" ", c); 1242 } else { 1243 RBBIDebugPrintf("%5x ", c); 1244 } 1245 RBBIDebugPrintf("%3d %3d\n", state, category); 1246 } 1247 #endif 1248 1249 // State Transition - move machine to its next state 1250 // 1251 state = row->fNextState[category]; 1252 row = (RBBIStateTableRow *) 1253 (statetable->fTableData + (statetable->fRowLen * state)); 1254 1255 if (row->fAccepting == -1) { 1256 // Match found, common case. 1257 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1258 } 1259 1260 if (row->fLookAhead != 0) { 1261 if (lookaheadStatus != 0 1262 && row->fAccepting == lookaheadStatus) { 1263 // Lookahead match is completed. 1264 result = lookaheadResult; 1265 lookaheadStatus = 0; 1266 // TODO: make a standalone hard break in a rule work. 1267 if (lookAheadHardBreak) { 1268 UTEXT_SETNATIVEINDEX(fText, result); 1269 return result; 1270 } 1271 // Look-ahead completed, but other rules may match further. Continue on 1272 // TODO: junk this feature? I don't think it's used anywhwere. 1273 goto continueOn; 1274 } 1275 1276 int32_t r = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1277 lookaheadResult = r; 1278 lookaheadStatus = row->fLookAhead; 1279 goto continueOn; 1280 } 1281 1282 1283 if (row->fAccepting != 0) { 1284 // Because this is an accepting state, any in-progress look-ahead match 1285 // is no longer relavant. Clear out the pending lookahead status. 1286 lookaheadStatus = 0; 1287 } 1288 1289continueOn: 1290 if (state == STOP_STATE) { 1291 // This is the normal exit from the lookup state machine. 1292 // We have advanced through the string until it is certain that no 1293 // longer match is possible, no matter what characters follow. 1294 break; 1295 } 1296 1297 // Move (backwards) to the next character to process. 1298 // If this is a beginning-of-input loop iteration, don't advance 1299 // the input position. The next iteration will be processing the 1300 // first real input character. 1301 if (mode == RBBI_RUN) { 1302 c = UTEXT_PREVIOUS32(fText); 1303 } else { 1304 if (mode == RBBI_START) { 1305 mode = RBBI_RUN; 1306 } 1307 } 1308 } 1309 1310 // The state machine is done. Check whether it found a match... 1311 1312 // If the iterator failed to advance in the match engine, force it ahead by one. 1313 // (This really indicates a defect in the break rules. They should always match 1314 // at least one character.) 1315 if (result == initialPosition) { 1316 UTEXT_SETNATIVEINDEX(fText, initialPosition); 1317 UTEXT_PREVIOUS32(fText); 1318 result = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1319 } 1320 1321 // Leave the iterator at our result position. 1322 UTEXT_SETNATIVEINDEX(fText, result); 1323 #ifdef RBBI_DEBUG 1324 if (fTrace) { 1325 RBBIDebugPrintf("result = %d\n\n", result); 1326 } 1327 #endif 1328 return result; 1329} 1330 1331 1332void 1333RuleBasedBreakIterator::reset() 1334{ 1335 if (fCachedBreakPositions) { 1336 uprv_free(fCachedBreakPositions); 1337 } 1338 fCachedBreakPositions = NULL; 1339 fNumCachedBreakPositions = 0; 1340 fDictionaryCharCount = 0; 1341 fPositionInCache = 0; 1342} 1343 1344 1345 1346//------------------------------------------------------------------------------- 1347// 1348// getRuleStatus() Return the break rule tag associated with the current 1349// iterator position. If the iterator arrived at its current 1350// position by iterating forwards, the value will have been 1351// cached by the handleNext() function. 1352// 1353// If no cached status value is available, the status is 1354// found by doing a previous() followed by a next(), which 1355// leaves the iterator where it started, and computes the 1356// status while doing the next(). 1357// 1358//------------------------------------------------------------------------------- 1359void RuleBasedBreakIterator::makeRuleStatusValid() { 1360 if (fLastStatusIndexValid == FALSE) { 1361 // No cached status is available. 1362 if (fText == NULL || current() == 0) { 1363 // At start of text, or there is no text. Status is always zero. 1364 fLastRuleStatusIndex = 0; 1365 fLastStatusIndexValid = TRUE; 1366 } else { 1367 // Not at start of text. Find status the tedious way. 1368 int32_t pa = current(); 1369 previous(); 1370 if (fNumCachedBreakPositions > 0) { 1371 reset(); // Blow off the dictionary cache 1372 } 1373 int32_t pb = next(); 1374 if (pa != pb) { 1375 // note: the if (pa != pb) test is here only to eliminate warnings for 1376 // unused local variables on gcc. Logically, it isn't needed. 1377 U_ASSERT(pa == pb); 1378 } 1379 } 1380 } 1381 U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx); 1382} 1383 1384 1385int32_t RuleBasedBreakIterator::getRuleStatus() const { 1386 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1387 nonConstThis->makeRuleStatusValid(); 1388 1389 // fLastRuleStatusIndex indexes to the start of the appropriate status record 1390 // (the number of status values.) 1391 // This function returns the last (largest) of the array of status values. 1392 int32_t idx = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex]; 1393 int32_t tagVal = fData->fRuleStatusTable[idx]; 1394 1395 return tagVal; 1396} 1397 1398 1399 1400 1401int32_t RuleBasedBreakIterator::getRuleStatusVec( 1402 int32_t *fillInVec, int32_t capacity, UErrorCode &status) 1403{ 1404 if (U_FAILURE(status)) { 1405 return 0; 1406 } 1407 1408 RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; 1409 nonConstThis->makeRuleStatusValid(); 1410 int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex]; 1411 int32_t numValsToCopy = numVals; 1412 if (numVals > capacity) { 1413 status = U_BUFFER_OVERFLOW_ERROR; 1414 numValsToCopy = capacity; 1415 } 1416 int i; 1417 for (i=0; i<numValsToCopy; i++) { 1418 fillInVec[i] = fData->fRuleStatusTable[fLastRuleStatusIndex + i + 1]; 1419 } 1420 return numVals; 1421} 1422 1423 1424 1425//------------------------------------------------------------------------------- 1426// 1427// getBinaryRules Access to the compiled form of the rules, 1428// for use by build system tools that save the data 1429// for standard iterator types. 1430// 1431//------------------------------------------------------------------------------- 1432const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { 1433 const uint8_t *retPtr = NULL; 1434 length = 0; 1435 1436 if (fData != NULL) { 1437 retPtr = (const uint8_t *)fData->fHeader; 1438 length = fData->fHeader->fLength; 1439 } 1440 return retPtr; 1441} 1442 1443 1444 1445 1446//------------------------------------------------------------------------------- 1447// 1448// BufferClone TODO: In my (Andy) opinion, this function should be deprecated. 1449// Saving one heap allocation isn't worth the trouble. 1450// Cloning shouldn't be done in tight loops, and 1451// making the clone copy involves other heap operations anyway. 1452// And the application code for correctly dealing with buffer 1453// size problems and the eventual object destruction is ugly. 1454// 1455//------------------------------------------------------------------------------- 1456BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, 1457 int32_t &bufferSize, 1458 UErrorCode &status) 1459{ 1460 if (U_FAILURE(status)){ 1461 return NULL; 1462 } 1463 1464 // 1465 // If user buffer size is zero this is a preflight operation to 1466 // obtain the needed buffer size, allowing for worst case misalignment. 1467 // 1468 if (bufferSize == 0) { 1469 bufferSize = sizeof(RuleBasedBreakIterator) + U_ALIGNMENT_OFFSET_UP(0); 1470 return NULL; 1471 } 1472 1473 1474 // 1475 // Check the alignment and size of the user supplied buffer. 1476 // Allocate heap memory if the user supplied memory is insufficient. 1477 // 1478 char *buf = (char *)stackBuffer; 1479 uint32_t s = bufferSize; 1480 1481 if (stackBuffer == NULL) { 1482 s = 0; // Ignore size, force allocation if user didn't give us a buffer. 1483 } 1484 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 1485 uint32_t offsetUp = (uint32_t)U_ALIGNMENT_OFFSET_UP(buf); 1486 s -= offsetUp; 1487 buf += offsetUp; 1488 } 1489 if (s < sizeof(RuleBasedBreakIterator)) { 1490 // Not enough room in the caller-supplied buffer. 1491 // Do a plain-vanilla heap based clone and return that, along with 1492 // a warning that the clone was allocated. 1493 RuleBasedBreakIterator *clonedBI = new RuleBasedBreakIterator(*this); 1494 if (clonedBI == 0) { 1495 status = U_MEMORY_ALLOCATION_ERROR; 1496 } else { 1497 status = U_SAFECLONE_ALLOCATED_WARNING; 1498 } 1499 return clonedBI; 1500 } 1501 1502 // 1503 // Clone the source BI into the caller-supplied buffer. 1504 // TODO: using an overloaded operator new to directly initialize the 1505 // copy in the user's buffer would be better, but it doesn't seem 1506 // to get along with namespaces. Investigate why. 1507 // 1508 // The memcpy is only safe with an empty (default constructed) 1509 // break iterator. Use on others can screw up reference counts 1510 // to data. memcpy-ing objects is not really a good idea... 1511 // 1512 RuleBasedBreakIterator localIter; // Empty break iterator, source for memcpy 1513 RuleBasedBreakIterator *clone = (RuleBasedBreakIterator *)buf; 1514 uprv_memcpy(clone, &localIter, sizeof(RuleBasedBreakIterator)); // init C++ gorp, BreakIterator base class part 1515 clone->init(); // Init RuleBasedBreakIterator part, (user default constructor) 1516 *clone = *this; // clone = the real BI we want. 1517 clone->fBufferClone = TRUE; // Flag to prevent deleting storage on close (From C code) 1518 1519 return clone; 1520} 1521 1522 1523//------------------------------------------------------------------------------- 1524// 1525// isDictionaryChar Return true if the category lookup for this char 1526// indicates that it is in the set of dictionary lookup 1527// chars. 1528// 1529// This function is intended for use by dictionary based 1530// break iterators. 1531// 1532//------------------------------------------------------------------------------- 1533/*UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) { 1534 if (fData == NULL) { 1535 return FALSE; 1536 } 1537 uint16_t category; 1538 UTRIE_GET16(&fData->fTrie, c, category); 1539 return (category & 0x4000) != 0; 1540}*/ 1541 1542 1543//------------------------------------------------------------------------------- 1544// 1545// checkDictionary This function handles all processing of characters in 1546// the "dictionary" set. It will determine the appropriate 1547// course of action, and possibly set up a cache in the 1548// process. 1549// 1550//------------------------------------------------------------------------------- 1551int32_t RuleBasedBreakIterator::checkDictionary(int32_t startPos, 1552 int32_t endPos, 1553 UBool reverse) { 1554 // Reset the old break cache first. 1555 uint32_t dictionaryCount = fDictionaryCharCount; 1556 reset(); 1557 1558 if (dictionaryCount <= 1 || (endPos - startPos) <= 1) { 1559 return (reverse ? startPos : endPos); 1560 } 1561 1562 // Starting from the starting point, scan towards the proposed result, 1563 // looking for the first dictionary character (which may be the one 1564 // we're on, if we're starting in the middle of a range). 1565 utext_setNativeIndex(fText, reverse ? endPos : startPos); 1566 if (reverse) { 1567 UTEXT_PREVIOUS32(fText); 1568 } 1569 1570 int32_t rangeStart = startPos; 1571 int32_t rangeEnd = endPos; 1572 1573 uint16_t category; 1574 int32_t current; 1575 UErrorCode status = U_ZERO_ERROR; 1576 UStack breaks(status); 1577 int32_t foundBreakCount = 0; 1578 UChar32 c = utext_current32(fText); 1579 1580 UTRIE_GET16(&fData->fTrie, c, category); 1581 1582 // Is the character we're starting on a dictionary character? If so, we 1583 // need to back up to include the entire run; otherwise the results of 1584 // the break algorithm will differ depending on where we start. Since 1585 // the result is cached and there is typically a non-dictionary break 1586 // within a small number of words, there should be little performance impact. 1587 if (category & 0x4000) { 1588 if (reverse) { 1589 do { 1590 utext_next32(fText); // TODO: recast to work directly with postincrement. 1591 c = utext_current32(fText); 1592 UTRIE_GET16(&fData->fTrie, c, category); 1593 } while (c != U_SENTINEL && (category & 0x4000)); 1594 // Back up to the last dictionary character 1595 rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText); 1596 if (c == U_SENTINEL) { 1597 // c = fText->last32(); 1598 // TODO: why was this if needed? 1599 c = UTEXT_PREVIOUS32(fText); 1600 } 1601 else { 1602 c = UTEXT_PREVIOUS32(fText); 1603 } 1604 } 1605 else { 1606 do { 1607 c = UTEXT_PREVIOUS32(fText); 1608 UTRIE_GET16(&fData->fTrie, c, category); 1609 } 1610 while (c != U_SENTINEL && (category & 0x4000)); 1611 // Back up to the last dictionary character 1612 if (c == U_SENTINEL) { 1613 // c = fText->first32(); 1614 c = utext_current32(fText); 1615 } 1616 else { 1617 utext_next32(fText); 1618 c = utext_current32(fText); 1619 } 1620 rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);; 1621 } 1622 UTRIE_GET16(&fData->fTrie, c, category); 1623 } 1624 1625 // Loop through the text, looking for ranges of dictionary characters. 1626 // For each span, find the appropriate break engine, and ask it to find 1627 // any breaks within the span. 1628 // Note: we always do this in the forward direction, so that the break 1629 // cache is built in the right order. 1630 if (reverse) { 1631 utext_setNativeIndex(fText, rangeStart); 1632 c = utext_current32(fText); 1633 UTRIE_GET16(&fData->fTrie, c, category); 1634 } 1635 while(U_SUCCESS(status)) { 1636 while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) { 1637 utext_next32(fText); // TODO: tweak for post-increment operation 1638 c = utext_current32(fText); 1639 UTRIE_GET16(&fData->fTrie, c, category); 1640 } 1641 if (current >= rangeEnd) { 1642 break; 1643 } 1644 1645 // We now have a dictionary character. Get the appropriate language object 1646 // to deal with it. 1647 const LanguageBreakEngine *lbe = getLanguageBreakEngine(c); 1648 1649 // Ask the language object if there are any breaks. It will leave the text 1650 // pointer on the other side of its range, ready to search for the next one. 1651 if (lbe != NULL) { 1652 foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks); 1653 } 1654 1655 // Reload the loop variables for the next go-round 1656 c = utext_current32(fText); 1657 UTRIE_GET16(&fData->fTrie, c, category); 1658 } 1659 1660 // If we found breaks, build a new break cache. The first and last entries must 1661 // be the original starting and ending position. 1662 if (foundBreakCount > 0) { 1663 int32_t totalBreaks = foundBreakCount; 1664 if (startPos < breaks.elementAti(0)) { 1665 totalBreaks += 1; 1666 } 1667 if (endPos > breaks.peeki()) { 1668 totalBreaks += 1; 1669 } 1670 fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t)); 1671 if (fCachedBreakPositions != NULL) { 1672 int32_t out = 0; 1673 fNumCachedBreakPositions = totalBreaks; 1674 if (startPos < breaks.elementAti(0)) { 1675 fCachedBreakPositions[out++] = startPos; 1676 } 1677 for (int32_t i = 0; i < foundBreakCount; ++i) { 1678 fCachedBreakPositions[out++] = breaks.elementAti(i); 1679 } 1680 if (endPos > fCachedBreakPositions[out-1]) { 1681 fCachedBreakPositions[out] = endPos; 1682 } 1683 // If there are breaks, then by definition, we are replacing the original 1684 // proposed break by one of the breaks we found. Use following() and 1685 // preceding() to do the work. They should never recurse in this case. 1686 if (reverse) { 1687 return preceding(endPos - 1); 1688 } 1689 else { 1690 return following(startPos); 1691 } 1692 } 1693 // If the allocation failed, just fall through to the "no breaks found" case. 1694 } 1695 1696 // If we get here, there were no language-based breaks. Set the text pointer 1697 // to the original proposed break. 1698 utext_setNativeIndex(fText, reverse ? startPos : endPos); 1699 return (reverse ? startPos : endPos); 1700} 1701 1702U_NAMESPACE_END 1703 1704// defined in ucln_cmn.h 1705 1706static U_NAMESPACE_QUALIFIER UStack *gLanguageBreakFactories = NULL; 1707 1708/** 1709 * Release all static memory held by breakiterator. 1710 */ 1711U_CDECL_BEGIN 1712static UBool U_CALLCONV breakiterator_cleanup_dict(void) { 1713 if (gLanguageBreakFactories) { 1714 delete gLanguageBreakFactories; 1715 gLanguageBreakFactories = NULL; 1716 } 1717 return TRUE; 1718} 1719U_CDECL_END 1720 1721U_CDECL_BEGIN 1722static void U_CALLCONV _deleteFactory(void *obj) { 1723 delete (U_NAMESPACE_QUALIFIER LanguageBreakFactory *) obj; 1724} 1725U_CDECL_END 1726U_NAMESPACE_BEGIN 1727 1728static const LanguageBreakEngine* 1729getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType) 1730{ 1731 UBool needsInit; 1732 UErrorCode status = U_ZERO_ERROR; 1733 UMTX_CHECK(NULL, (UBool)(gLanguageBreakFactories == NULL), needsInit); 1734 1735 if (needsInit) { 1736 UStack *factories = new UStack(_deleteFactory, NULL, status); 1737 if (factories != NULL && U_SUCCESS(status)) { 1738 ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); 1739 factories->push(builtIn, status); 1740#ifdef U_LOCAL_SERVICE_HOOK 1741 LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); 1742 if (extra != NULL) { 1743 factories->push(extra, status); 1744 } 1745#endif 1746 } 1747 umtx_lock(NULL); 1748 if (gLanguageBreakFactories == NULL) { 1749 gLanguageBreakFactories = factories; 1750 factories = NULL; 1751 ucln_common_registerCleanup(UCLN_COMMON_BREAKITERATOR_DICT, breakiterator_cleanup_dict); 1752 } 1753 umtx_unlock(NULL); 1754 delete factories; 1755 } 1756 1757 if (gLanguageBreakFactories == NULL) { 1758 return NULL; 1759 } 1760 1761 int32_t i = gLanguageBreakFactories->size(); 1762 const LanguageBreakEngine *lbe = NULL; 1763 while (--i >= 0) { 1764 LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); 1765 lbe = factory->getEngineFor(c, breakType); 1766 if (lbe != NULL) { 1767 break; 1768 } 1769 } 1770 return lbe; 1771} 1772 1773 1774//------------------------------------------------------------------------------- 1775// 1776// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the 1777// the characer c. 1778// 1779//------------------------------------------------------------------------------- 1780const LanguageBreakEngine * 1781RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { 1782 const LanguageBreakEngine *lbe = NULL; 1783 UErrorCode status = U_ZERO_ERROR; 1784 1785 if (fLanguageBreakEngines == NULL) { 1786 fLanguageBreakEngines = new UStack(status); 1787 if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { 1788 delete fLanguageBreakEngines; 1789 fLanguageBreakEngines = 0; 1790 return NULL; 1791 } 1792 } 1793 1794 int32_t i = fLanguageBreakEngines->size(); 1795 while (--i >= 0) { 1796 lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); 1797 if (lbe->handles(c, fBreakType)) { 1798 return lbe; 1799 } 1800 } 1801 1802 // No existing dictionary took the character. See if a factory wants to 1803 // give us a new LanguageBreakEngine for this character. 1804 lbe = getLanguageBreakEngineFromFactory(c, fBreakType); 1805 1806 // If we got one, use it and push it on our stack. 1807 if (lbe != NULL) { 1808 fLanguageBreakEngines->push((void *)lbe, status); 1809 // Even if we can't remember it, we can keep looking it up, so 1810 // return it even if the push fails. 1811 return lbe; 1812 } 1813 1814 // No engine is forthcoming for this character. Add it to the 1815 // reject set. Create the reject break engine if needed. 1816 if (fUnhandledBreakEngine == NULL) { 1817 fUnhandledBreakEngine = new UnhandledEngine(status); 1818 if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { 1819 status = U_MEMORY_ALLOCATION_ERROR; 1820 } 1821 // Put it last so that scripts for which we have an engine get tried 1822 // first. 1823 fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); 1824 // If we can't insert it, or creation failed, get rid of it 1825 if (U_FAILURE(status)) { 1826 delete fUnhandledBreakEngine; 1827 fUnhandledBreakEngine = 0; 1828 return NULL; 1829 } 1830 } 1831 1832 // Tell the reject engine about the character; at its discretion, it may 1833 // add more than just the one character. 1834 fUnhandledBreakEngine->handleCharacter(c, fBreakType); 1835 1836 return fUnhandledBreakEngine; 1837} 1838 1839 1840 1841/*int32_t RuleBasedBreakIterator::getBreakType() const { 1842 return fBreakType; 1843}*/ 1844 1845void RuleBasedBreakIterator::setBreakType(int32_t type) { 1846 fBreakType = type; 1847 reset(); 1848} 1849 1850U_NAMESPACE_END 1851 1852#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 1853