1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4 ******************************************************************************* 5 * Copyright (C) 2005-2016 International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10package com.ibm.icu.text; 11 12import static com.ibm.icu.impl.CharacterIteration.DONE32; 13import static com.ibm.icu.impl.CharacterIteration.next32; 14import static com.ibm.icu.impl.CharacterIteration.nextTrail32; 15import static com.ibm.icu.impl.CharacterIteration.previous32; 16 17import java.io.ByteArrayOutputStream; 18import java.io.IOException; 19import java.io.InputStream; 20import java.io.OutputStream; 21import java.nio.ByteBuffer; 22import java.text.CharacterIterator; 23import java.util.ArrayList; 24import java.util.List; 25 26import com.ibm.icu.impl.CharacterIteration; 27import com.ibm.icu.impl.ICUBinary; 28import com.ibm.icu.impl.ICUDebug; 29import com.ibm.icu.impl.Trie2; 30import com.ibm.icu.lang.UCharacter; 31import com.ibm.icu.lang.UProperty; 32import com.ibm.icu.lang.UScript; 33 34/** 35 * Rule Based Break Iterator 36 * This is a port of the C++ class RuleBasedBreakIterator from ICU4C. 37 * 38 * @stable ICU 2.0 39 */ 40public class RuleBasedBreakIterator extends BreakIterator { 41 //======================================================================= 42 // Constructors & Factories 43 //======================================================================= 44 45 /** 46 * private constructor 47 */ 48 private RuleBasedBreakIterator() { 49 fDictionaryCharCount = 0; 50 synchronized(gAllBreakEngines) { 51 fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines); 52 } 53 } 54 55 /** 56 * Create a break iterator from a precompiled set of break rules. 57 * 58 * Creating a break iterator from the binary rules is much faster than 59 * creating one from source rules. 60 * 61 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 62 * Binary break iterator rules are not guaranteed to be compatible between 63 * different versions of ICU. 64 * 65 * @param is an input stream supplying the compiled binary rules. 66 * @throws IOException if there is an error while reading the rules from the InputStream. 67 * @see #compileRules(String, OutputStream) 68 * @stable ICU 4.8 69 */ 70 public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { 71 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 72 This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); 73 return This; 74 } 75 76 /** 77 * Create a break iterator from a precompiled set of break rules. 78 * 79 * Creating a break iterator from the binary rules is much faster than 80 * creating one from source rules. 81 * 82 * The binary rules are generated by the RuleBasedBreakIterator.compileRules() function. 83 * Binary break iterator rules are not guaranteed to be compatible between 84 * different versions of ICU. 85 * 86 * @param bytes a buffer supplying the compiled binary rules. 87 * @throws IOException if there is an error while reading the rules from the buffer. 88 * @see #compileRules(String, OutputStream) 89 * @internal 90 * @deprecated This API is ICU internal only. 91 */ 92 @Deprecated 93 public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { 94 RuleBasedBreakIterator This = new RuleBasedBreakIterator(); 95 This.fRData = RBBIDataWrapper.get(bytes); 96 return This; 97 } 98 99 /** 100 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. 101 * @param rules The break rules to be used. 102 * @stable ICU 2.2 103 */ 104 public RuleBasedBreakIterator(String rules) { 105 this(); 106 try { 107 ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); 108 compileRules(rules, ruleOS); 109 fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); 110 } catch (IOException e) { 111 ///CLOVER:OFF 112 // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, 113 // causing bogus compiled rules to be produced, but with no compile error raised. 114 RuntimeException rte = new RuntimeException("RuleBasedBreakIterator rule compilation internal error: " 115 + e.getMessage()); 116 throw rte; 117 ///CLOVER:ON 118 } 119 } 120 121 //======================================================================= 122 // Boilerplate 123 //======================================================================= 124 125 /** 126 * Clones this iterator. 127 * @return A newly-constructed RuleBasedBreakIterator with the same 128 * behavior as this one. 129 * @stable ICU 2.0 130 */ 131 @Override 132 public Object clone() { 133 RuleBasedBreakIterator result; 134 result = (RuleBasedBreakIterator)super.clone(); 135 if (fText != null) { 136 result.fText = (CharacterIterator)(fText.clone()); 137 } 138 synchronized (gAllBreakEngines) { 139 result.fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines); 140 } 141 result.fLookAheadMatches = new LookAheadResults(); 142 result.fBreakCache = result.new BreakCache(fBreakCache); 143 result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); 144 return result; 145 } 146 147 148 /** 149 * Returns true if both BreakIterators are of the same class, have the same 150 * rules, and iterate over the same text. 151 * @stable ICU 2.0 152 */ 153 @Override 154 public boolean equals(Object that) { 155 if (that == null) { 156 return false; 157 } 158 if (this == that) { 159 return true; 160 } 161 try { 162 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that; 163 if (fRData != other.fRData && (fRData == null || other.fRData == null)) { 164 return false; 165 } 166 if (fRData != null && other.fRData != null && 167 (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) { 168 return false; 169 } 170 if (fText == null && other.fText == null) { 171 return true; 172 } 173 if (fText == null || other.fText == null || !fText.equals(other.fText)) { 174 return false; 175 } 176 return fPosition == other.fPosition; 177 } 178 catch(ClassCastException e) { 179 return false; 180 } 181 } 182 183 /** 184 * Returns the description (rules) used to create this iterator. 185 * (In ICU4C, the same function is RuleBasedBreakIterator::getRules()) 186 * @stable ICU 2.0 187 */ 188 @Override 189 public String toString() { 190 String retStr = ""; 191 if (fRData != null) { 192 retStr = fRData.fRuleSource; 193 } 194 return retStr; 195 } 196 197 /** 198 * Compute a hashcode for this BreakIterator 199 * @return A hash code 200 * @stable ICU 2.0 201 */ 202 @Override 203 public int hashCode() 204 { 205 return fRData.fRuleSource.hashCode(); 206 } 207 208 209 private static final int START_STATE = 1; // The state number of the starting state 210 private static final int STOP_STATE = 0; // The state-transition value indicating "stop" 211 212 // RBBIRunMode - the state machine runs an extra iteration at the beginning and end 213 // of user text. A variable with this enum type keeps track of where we 214 // are. The state machine only fetches user text input while in RUN mode. 215 private static final int RBBI_START = 0; 216 private static final int RBBI_RUN = 1; 217 private static final int RBBI_END = 2; 218 219 /* 220 * The character iterator through which this BreakIterator accesses the text. 221 */ 222 private CharacterIterator fText = new java.text.StringCharacterIterator(""); 223 224 /** 225 * The rule data for this BreakIterator instance. Package private. 226 */ 227 RBBIDataWrapper fRData; 228 229 /** 230 * The iteration state - current position, rule status for the current position, 231 * and whether the iterator ran off the end, yielding UBRK_DONE. 232 * Current position is pinned to be 0 < position <= text.length. 233 * Current position is always set to a boundary. 234 * 235 * The current position of the iterator. Pinned, 0 < fPosition <= text.length. 236 * Never has the value UBRK_DONE (-1). 237 */ 238 private int fPosition; 239 240 /** 241 * Index of the Rule {tag} values for the most recent match. 242 */ 243 private int fRuleStatusIndex; 244 245 /** 246 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. 247 */ 248 private boolean fDone; 249 250 /** 251 * Cache of previously determined boundary positions. 252 */ 253 private BreakCache fBreakCache = new BreakCache(); 254 255 256 /** 257 * Counter for the number of characters encountered with the "dictionary" 258 * flag set. Normal RBBI iterators don't use it, although the code 259 * for updating it is live. Dictionary Based break iterators (a subclass 260 * of us) access this field directly. 261 * @internal 262 */ 263 private int fDictionaryCharCount; 264 265 private DictionaryCache fDictionaryCache = new DictionaryCache(); 266 267 /* 268 * ICU debug argument name for RBBI 269 */ 270 private static final String RBBI_DEBUG_ARG = "rbbi"; 271 272 /** 273 * Debugging flag. Trace operation of state machine when true. 274 */ 275 private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG) 276 && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0; 277 278 /** 279 * What kind of break iterator this is. 280 * Defaulting BreakType to word gives reasonable dictionary behavior for 281 * Break Iterators that are built from rules. 282 */ 283 private int fBreakType = KIND_WORD; 284 285 /** 286 * The "default" break engine - just skips over ranges of dictionary words, 287 * producing no breaks. Should only be used if characters need to be handled 288 * by a dictionary but we have no dictionary implementation for them. 289 * 290 * Only one instance; shared by all break iterators. 291 */ 292 private static final UnhandledBreakEngine gUnhandledBreakEngine; 293 294 /** 295 * List of all known break engines, common for all break iterators. 296 * Lazily updated as break engines are needed, because instantiation of 297 * break engines is expensive. 298 * 299 * Because gAllBreakEngines can be referenced concurrently from different 300 * BreakIterator instances, all access is synchronized. 301 */ 302 private static final List<LanguageBreakEngine> gAllBreakEngines; 303 304 static { 305 gUnhandledBreakEngine = new UnhandledBreakEngine(); 306 gAllBreakEngines = new ArrayList<LanguageBreakEngine>(); 307 gAllBreakEngines.add(gUnhandledBreakEngine); 308 } 309 310 /** 311 * List of all known break engines. Similar to gAllBreakEngines, but local to a 312 * break iterator, allowing it to be used without synchronization. 313 */ 314 private List<LanguageBreakEngine> fBreakEngines; 315 316 /** 317 * Dump the contents of the state table and character classes for this break iterator. 318 * For debugging only. 319 * @internal 320 * @deprecated This API is ICU internal only. 321 */ 322 @Deprecated 323 public void dump(java.io.PrintStream out) { 324 if (out == null) { 325 out = System.out; 326 } 327 this.fRData.dump(out); 328 } 329 330 /** 331 * Compile a set of source break rules into the binary state tables used 332 * by the break iterator engine. Creating a break iterator from precompiled 333 * rules is much faster than creating one from source rules. 334 * 335 * Binary break rules are not guaranteed to be compatible between different 336 * versions of ICU. 337 * 338 * 339 * @param rules The source form of the break rules 340 * @param ruleBinary An output stream to receive the compiled rules. 341 * @throws IOException If there is an error writing the output. 342 * @see #getInstanceFromCompiledRules(InputStream) 343 * @stable ICU 4.8 344 */ 345 public static void compileRules(String rules, OutputStream ruleBinary) throws IOException { 346 RBBIRuleBuilder.compileRules(rules, ruleBinary); 347 } 348 349 //======================================================================= 350 // BreakIterator overrides 351 //======================================================================= 352 353 /** 354 * Sets the current iteration position to the beginning of the text. 355 * (i.e., the CharacterIterator's starting offset). 356 * @return The offset of the beginning of the text. 357 * @stable ICU 2.0 358 */ 359 @Override 360 public int first() { 361 if (fText == null) { 362 return BreakIterator.DONE; 363 } 364 fText.first(); 365 int start = fText.getIndex(); 366 if (!fBreakCache.seek(start)) { 367 fBreakCache.populateNear(start); 368 } 369 fBreakCache.current(); 370 assert(fPosition == start); 371 return fPosition; 372 } 373 374 /** 375 * Sets the current iteration position to the end of the text. 376 * (i.e., the CharacterIterator's ending offset). 377 * @return The text's past-the-end offset. 378 * @stable ICU 2.0 379 */ 380 @Override 381 public int last() { 382 if (fText == null) { 383 return BreakIterator.DONE; 384 } 385 int endPos = fText.getEndIndex(); 386 boolean endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. 387 assert(endShouldBeBoundary); 388 if (fPosition != endPos) { 389 assert(fPosition == endPos); 390 } 391 return endPos; 392 } 393 394 /** 395 * Advances the iterator either forward or backward the specified number of steps. 396 * Negative values move backward, and positive values move forward. This is 397 * equivalent to repeatedly calling next() or previous(). 398 * @param n The number of steps to move. The sign indicates the direction 399 * (negative is backwards, and positive is forwards). 400 * @return The character offset of the boundary position n boundaries away from 401 * the current one. 402 * @stable ICU 2.0 403 */ 404 @Override 405 public int next(int n) { 406 int result = 0; 407 if (n > 0) { 408 for (; n > 0 && result != DONE; --n) { 409 result = next(); 410 } 411 } else if (n < 0) { 412 for (; n < 0 && result != DONE; ++n) { 413 result = previous(); 414 } 415 } else { 416 result = current(); 417 } 418 return result; 419 } 420 421 /** 422 * Advances the iterator to the next boundary position. 423 * @return The position of the first boundary after this one. 424 * @stable ICU 2.0 425 */ 426 @Override 427 public int next() { 428 fBreakCache.next(); 429 return fDone ? DONE : fPosition; 430 } 431 432 /** 433 * Moves the iterator backwards, to the boundary preceding the current one. 434 * @return The position of the boundary position immediately preceding the starting position. 435 * @stable ICU 2.0 436 */ 437 @Override 438 public int previous() { 439 fBreakCache.previous(); 440 return fDone ? DONE : fPosition; 441 } 442 443 /** 444 * Sets the iterator to refer to the first boundary position following 445 * the specified position. 446 * @param startPos The position from which to begin searching for a break position. 447 * @return The position of the first break after the current position. 448 * @stable ICU 2.0 449 */ 450 @Override 451 public int following(int startPos) { 452 // if the supplied position is before the beginning, return the 453 // text's starting offset 454 if (startPos < fText.getBeginIndex()) { 455 return first(); 456 } 457 458 // Move requested offset to a code point start. It might be on a trail surrogate. 459 // Or it may be beyond the end of the text. 460 startPos = CISetIndex32(fText, startPos); 461 fBreakCache.following(startPos); 462 return fDone ? DONE : fPosition; 463 } 464 465 466 /** 467 * Sets the iterator to refer to the last boundary position before the 468 * specified position. 469 * @param offset The position to begin searching for a break from. 470 * @return The position of the last boundary before the starting position. 471 * @stable ICU 2.0 472 */ 473 @Override 474 public int preceding(int offset) { 475 if (fText == null || offset > fText.getEndIndex()) { 476 return last(); 477 } else if (offset < fText.getBeginIndex()) { 478 return first(); 479 } 480 481 // Move requested offset to a code point start. It might be on a trail surrogate. 482 // int adjustedOffset = CISetIndex32(fText, offset); // TODO: restore to match ICU4C behavior. 483 int adjustedOffset = offset; 484 fBreakCache.preceding(adjustedOffset); 485 return fDone ? DONE : fPosition; 486 487 } 488 489 490 /** 491 * Throw IllegalArgumentException unless begin <= offset < end. 492 * @stable ICU 2.0 493 */ 494 protected static final void checkOffset(int offset, CharacterIterator text) { 495 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) { 496 throw new IllegalArgumentException("offset out of bounds"); 497 } 498 } 499 500 501 /** 502 * Returns true if the specified position is a boundary position. As a side 503 * effect, leaves the iterator pointing to the first boundary position at 504 * or after "offset". 505 * @param offset the offset to check. 506 * @return True if "offset" is a boundary position. 507 * @stable ICU 2.0 508 */ 509 @Override 510 public boolean isBoundary(int offset) { 511 // TODO: behavior difference with ICU4C, which considers out-of-range offsets 512 // to not be boundaries, and to not be errors. 513 checkOffset(offset, fText); 514 515 // Adjust offset to be on a code point boundary and not beyond the end of the text. 516 // Note that isBoundary() is always be false for offsets that are not on code point boundaries. 517 // But we still need the side effect of leaving iteration at the following boundary. 518 int adjustedOffset = CISetIndex32(fText, offset); 519 520 boolean result = false; 521 if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) { 522 result = (fBreakCache.current() == offset); 523 } 524 525 if (!result) { 526 // Not on a boundary. isBoundary() must leave iterator on the following boundary. 527 // fBreakCache.seek(), above, left us on the preceding boundary, so advance one. 528 next(); 529 } 530 return result; 531 532 } 533 534 /** 535 * Returns the current iteration position. Note that UBRK_DONE is never 536 * returned from this function; if iteration has run to the end of a 537 * string, current() will return the length of the string while 538 * next() will return BreakIterator.DONE). 539 * @return The current iteration position. 540 * @stable ICU 2.0 541 */ 542 @Override 543 public int current() { 544 return (fText != null) ? fPosition : BreakIterator.DONE; 545 } 546 547 548 /** 549 * Return the status tag from the break rule that determined the most recently 550 * returned break position. The values appear in the rule source 551 * within brackets, {123}, for example. For rules that do not specify a 552 * status, a default value of 0 is returned. If more than one rule applies, 553 * the numerically largest of the possible status values is returned. 554 * <p> 555 * Of the standard types of ICU break iterators, only the word and line break 556 * iterator provides status values. The values are defined in 557 * class RuleBasedBreakIterator, and allow distinguishing between words 558 * that contain alphabetic letters, "words" that appear to be numbers, 559 * punctuation and spaces, words containing ideographic characters, and 560 * more. Call <code>getRuleStatus</code> after obtaining a boundary 561 * position from <code>next()</code>, <code>previous()</code>, or 562 * any other break iterator functions that returns a boundary position. 563 * <p> 564 * @return the status from the break rule that determined the most recently 565 * returned break position. 566 * 567 * @stable ICU 60 568 */ 569 570 @Override 571 public int getRuleStatus() { 572 // Status records have this form: 573 // Count N <-- fLastRuleStatusIndex points here. 574 // Status val 0 575 // Status val 1 576 // ... 577 // Status val N-1 <-- the value we need to return 578 // The status values are sorted in ascending order. 579 // This function returns the last (largest) of the array of status values. 580 int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex]; 581 int tagVal = fRData.fStatusTable[idx]; 582 return tagVal; 583 } 584 585 /** 586 * Get the status (tag) values from the break rule(s) that determined the most 587 * recently returned break position. The values appear in the rule source 588 * within brackets, {123}, for example. The default status value for rules 589 * that do not explicitly provide one is zero. 590 * <p> 591 * The status values used by the standard ICU break rules are defined 592 * as public constants in class RuleBasedBreakIterator. 593 * <p> 594 * If the size of the output array is insufficient to hold the data, 595 * the output will be truncated to the available length. No exception 596 * will be thrown. 597 * 598 * @param fillInArray an array to be filled in with the status values. 599 * @return The number of rule status values from rules that determined 600 * the most recent boundary returned by the break iterator. 601 * In the event that the array is too small, the return value 602 * is the total number of status values that were available, 603 * not the reduced number that were actually returned. 604 * @stable ICU 60 605 */ 606 @Override 607 public int getRuleStatusVec(int[] fillInArray) { 608 int numStatusVals = fRData.fStatusTable[fRuleStatusIndex]; 609 if (fillInArray != null) { 610 int numToCopy = Math.min(numStatusVals, fillInArray.length); 611 for (int i=0; i<numToCopy; i++) { 612 fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1]; 613 } 614 } 615 return numStatusVals; 616 } 617 618 /** 619 * Return a CharacterIterator over the text being analyzed. This version 620 * of this method returns the actual CharacterIterator we're using internally. 621 * Changing the state of this iterator can have undefined consequences. If 622 * you need to change it, clone it first. 623 * @return An iterator over the text being analyzed. 624 * @stable ICU 2.0 625 */ 626 @Override 627 public CharacterIterator getText() { 628 return fText; 629 } 630 631 /** 632 * Set the iterator to analyze a new piece of text. This function resets 633 * the current iteration position to the beginning of the text. 634 * @param newText An iterator over the text to analyze. 635 * @stable ICU 2.0 636 */ 637 @Override 638 public void setText(CharacterIterator newText) { 639 if (newText != null) { 640 fBreakCache.reset(newText.getBeginIndex(), 0); 641 } else { 642 fBreakCache.reset(); 643 } 644 fDictionaryCache.reset(); 645 fText = newText; 646 this.first(); 647 } 648 649 /** 650 * package private 651 */ 652 void setBreakType(int type) { 653 fBreakType = type; 654 } 655 656 /** 657 * package private 658 */ 659 int getBreakType() { 660 return fBreakType; 661 } 662 663 /** 664 * Control debug, trace and dump options. 665 * @internal 666 */ 667 static final String fDebugEnv = ICUDebug.enabled(RBBI_DEBUG_ARG) ? 668 ICUDebug.value(RBBI_DEBUG_ARG) : null; 669 670 671 private LanguageBreakEngine getLanguageBreakEngine(int c) { 672 673 // We have a dictionary character. 674 // Does an already instantiated break engine handle it? 675 for (LanguageBreakEngine candidate : fBreakEngines) { 676 if (candidate.handles(c, fBreakType)) { 677 return candidate; 678 } 679 } 680 681 synchronized (gAllBreakEngines) { 682 // This break iterator's list of break engines didn't handle the character. 683 // Check the global list, another break iterator may have instantiated the 684 // desired engine. 685 for (LanguageBreakEngine candidate : gAllBreakEngines) { 686 if (candidate.handles(c, fBreakType)) { 687 fBreakEngines.add(candidate); 688 return candidate; 689 } 690 } 691 692 // The global list doesn't have an existing engine, build one. 693 int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT); 694 if (script == UScript.KATAKANA || script == UScript.HIRAGANA) { 695 // Katakana, Hiragana and Han are handled by the same dictionary engine. 696 // Fold them together for mapping from script -> engine. 697 script = UScript.HAN; 698 } 699 700 LanguageBreakEngine eng; 701 try { 702 switch (script) { 703 case UScript.THAI: 704 eng = new ThaiBreakEngine(); 705 break; 706 case UScript.LAO: 707 eng = new LaoBreakEngine(); 708 break; 709 case UScript.MYANMAR: 710 eng = new BurmeseBreakEngine(); 711 break; 712 case UScript.KHMER: 713 eng = new KhmerBreakEngine(); 714 break; 715 case UScript.HAN: 716 if (getBreakType() == KIND_WORD) { 717 eng = new CjkBreakEngine(false); 718 } 719 else { 720 gUnhandledBreakEngine.handleChar(c, getBreakType()); 721 eng = gUnhandledBreakEngine; 722 } 723 break; 724 case UScript.HANGUL: 725 if (getBreakType() == KIND_WORD) { 726 eng = new CjkBreakEngine(true); 727 } else { 728 gUnhandledBreakEngine.handleChar(c, getBreakType()); 729 eng = gUnhandledBreakEngine; 730 } 731 break; 732 default: 733 gUnhandledBreakEngine.handleChar(c, getBreakType()); 734 eng = gUnhandledBreakEngine; 735 break; 736 } 737 } catch (IOException e) { 738 eng = null; 739 } 740 741 if (eng != null && eng != gUnhandledBreakEngine) { 742 gAllBreakEngines.add(eng); 743 fBreakEngines.add(eng); 744 } 745 return eng; 746 } // end synchronized(gAllBreakEngines) 747 } 748 749 private static final int kMaxLookaheads = 8; 750 private static class LookAheadResults { 751 int fUsedSlotLimit; 752 int[] fPositions; 753 int[] fKeys; 754 755 LookAheadResults() { 756 fUsedSlotLimit= 0; 757 fPositions = new int[kMaxLookaheads]; 758 fKeys = new int[kMaxLookaheads]; 759 } 760 761 int getPosition(int key) { 762 for (int i=0; i<fUsedSlotLimit; ++i) { 763 if (fKeys[i] == key) { 764 return fPositions[i]; 765 } 766 } 767 assert(false); 768 return -1; 769 } 770 771 void setPosition(int key, int position) { 772 int i; 773 for (i=0; i<fUsedSlotLimit; ++i) { 774 if (fKeys[i] == key) { 775 fPositions[i] = position; 776 return; 777 } 778 } 779 if (i >= kMaxLookaheads) { 780 assert(false); 781 i = kMaxLookaheads - 1; 782 } 783 fKeys[i] = key; 784 fPositions[i] = position; 785 assert(fUsedSlotLimit == i); 786 fUsedSlotLimit = i + 1; 787 } 788 789 void reset() { 790 fUsedSlotLimit = 0; 791 } 792 }; 793 private LookAheadResults fLookAheadMatches = new LookAheadResults(); 794 795 796 /** 797 * The State Machine Engine for moving forward is here. 798 * This function is the heart of the RBBI run time engine. 799 * 800 * Input 801 * fPosition, the position in the text to begin from. 802 * Output 803 * fPosition: the boundary following the starting position. 804 * fDictionaryCharCount the number of dictionary characters encountered. 805 * If > 0, the segment will be further subdivided 806 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. 807 * 808 * @return the new iterator position 809 * 810 * A note on supplementary characters and the position of underlying 811 * Java CharacterIterator: Normally, a character iterator is positioned at 812 * the char most recently returned by next(). Within this function, when 813 * a supplementary char is being processed, the char iterator is left 814 * sitting on the trail surrogate, in the middle of the code point. 815 * This is different from everywhere else, where an iterator always 816 * points at the lead surrogate of a supplementary. 817 */ 818 private int handleNext() { 819 if (TRACE) { 820 System.out.println("Handle Next pos char state category"); 821 } 822 823 // handleNext always sets the break tag value. 824 // Set the default for it. 825 fRuleStatusIndex = 0; 826 fDictionaryCharCount = 0; 827 828 // caches for quicker access 829 CharacterIterator text = fText; 830 Trie2 trie = fRData.fTrie; 831 832 short[] stateTable = fRData.fFTable; 833 int initialPosition = fPosition; 834 text.setIndex(initialPosition); 835 int result = initialPosition; 836 837 // Set up the starting char 838 int c = text.current(); 839 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 840 c = nextTrail32(text, c); 841 if (c == DONE32) { 842 fDone = true; 843 return BreakIterator.DONE; 844 } 845 } 846 847 // Set the initial state for the state machine 848 int state = START_STATE; 849 int row = fRData.getRowIndex(state); 850 short category = 3; 851 int flagsState = fRData.getStateTableFlags(stateTable); 852 int mode = RBBI_RUN; 853 if ((flagsState & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 854 category = 2; 855 mode = RBBI_START; 856 if (TRACE) { 857 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 858 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 859 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 860 } 861 } 862 fLookAheadMatches.reset(); 863 864 // loop until we reach the end of the text or transition to state 0 865 while (state != STOP_STATE) { 866 if (c == DONE32) { 867 // Reached end of input string. 868 if (mode == RBBI_END) { 869 // We have already run the loop one last time with the 870 // character set to the pseudo {eof} value. Now it is time 871 // to unconditionally bail out. 872 break; 873 } 874 // Run the loop one last time with the fake end-of-input character category 875 mode = RBBI_END; 876 category = 1; 877 } 878 else if (mode == RBBI_RUN) { 879 // Get the char category. An incoming category of 1 or 2 mens that 880 // we are preset for doing the beginning or end of input, and 881 // that we shouldn't get a category from an actual text input character. 882 // 883 884 // look up the current character's character category, which tells us 885 // which column in the state table to look at. 886 // 887 category = (short) trie.get(c); 888 889 // Check the dictionary bit in the character's category. 890 // Counter is only used by dictionary based iterators (subclasses). 891 // Chars that need to be handled by a dictionary have a flag bit set 892 // in their category values. 893 // 894 if ((category & 0x4000) != 0) { 895 fDictionaryCharCount++; 896 // And off the dictionary flag bit. 897 category &= ~0x4000; 898 } 899 900 if (TRACE) { 901 System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); 902 System.out.print(RBBIDataWrapper.intToHexString(c, 10)); 903 System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); 904 } 905 906 // Advance to the next character. 907 // If this is a beginning-of-input loop iteration, don't advance. 908 // The next iteration will be processing the first real input character. 909 c = text.next(); 910 if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) { 911 c = nextTrail32(text, c); 912 } 913 } 914 else { 915 mode = RBBI_RUN; 916 } 917 918 // look up a state transition in the state table 919 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 920 row = fRData.getRowIndex(state); 921 922 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { 923 // Match found, common case 924 result = text.getIndex(); 925 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 926 // The iterator has been left in the middle of a surrogate pair. 927 // We want the start of it. 928 result--; 929 } 930 931 // Remember the break status (tag) values. 932 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; 933 } 934 935 int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; 936 if (completedRule > 0) { 937 // Lookahead match is completed 938 int lookaheadResult = fLookAheadMatches.getPosition(completedRule); 939 if (lookaheadResult >= 0) { 940 fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGIDX]; 941 fPosition = lookaheadResult; 942 return lookaheadResult; 943 } 944 } 945 946 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 947 if (rule != 0) { 948 // At the position of a '/' in a look-ahead match. Record it. 949 int pos = text.getIndex(); 950 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { 951 // The iterator has been left in the middle of a surrogate pair. 952 // We want the beginning of it. 953 pos--; 954 } 955 fLookAheadMatches.setPosition(rule, pos); 956 } 957 958 959 } // End of state machine main loop 960 961 // The state machine is done. Check whether it found a match... 962 963 // If the iterator failed to advance in the match engine force it ahead by one. 964 // This indicates a defect in the break rules, which should always match 965 // at least one character. 966 967 if (result == initialPosition) { 968 if (TRACE) { 969 System.out.println("Iterator did not move. Advancing by 1."); 970 } 971 text.setIndex(initialPosition); 972 next32(text); 973 result = text.getIndex(); 974 fRuleStatusIndex = 0; 975 } 976 977 // Leave the iterator at our result position. 978 // (we may have advanced beyond the last accepting position chasing after 979 // longer matches that never completed.) 980 fPosition = result; 981 982 if (TRACE) { 983 System.out.println("result = " + result); 984 } 985 return result; 986 } 987 988 /** 989 * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules. 990 * This locates a "Safe Position" from which the forward break rules 991 * will operate correctly. A Safe Position is not necessarily a boundary itself. 992 * 993 * The logic of this function is very similar to handleNext(), above. 994 * 995 * @param fromPosition the position in the input text to begin the iteration. 996 * @internal 997 */ 998 private int handlePrevious(int fromPosition) { 999 if (fText == null) { 1000 return 0; 1001 } 1002 1003 int state; 1004 int category = 0; 1005 int mode; 1006 int row; 1007 int c; 1008 int result = 0; 1009 int initialPosition = fromPosition; 1010 fLookAheadMatches.reset(); 1011 short[] stateTable = fRData.fSRTable; 1012 CISetIndex32(fText, fromPosition); 1013 if (fromPosition == fText.getBeginIndex()) { 1014 return BreakIterator.DONE; 1015 } 1016 1017 // set up the starting char 1018 result = initialPosition; 1019 c = previous32(fText); 1020 1021 // Set up the initial state for the state machine 1022 state = START_STATE; 1023 row = fRData.getRowIndex(state); 1024 category = 3; // TODO: obsolete? from the old start/run mode scheme? 1025 mode = RBBI_RUN; 1026 if ((fRData.getStateTableFlags(stateTable) & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) { 1027 category = 2; 1028 mode = RBBI_START; 1029 } 1030 1031 if (TRACE) { 1032 System.out.println("Handle Prev pos char state category "); 1033 } 1034 1035 // loop until we reach the beginning of the text or transition to state 0 1036 // 1037 mainLoop: for (;;) { 1038 if (c == DONE32) { 1039 // Reached end of input string. 1040 if (mode == RBBI_END) { 1041 // We have already done the {eof} iteration. Now is the time 1042 // to unconditionally bail out. 1043 break mainLoop; 1044 } 1045 mode = RBBI_END; 1046 category = 1; 1047 } 1048 1049 if (mode == RBBI_RUN) { 1050 // look up the current character's category, which tells us 1051 // which column in the state table to look at. 1052 // 1053 // And off the dictionary flag bit. For reverse iteration it is not used. 1054 category = (short) fRData.fTrie.get(c); 1055 category &= ~0x4000; 1056 } 1057 1058 if (TRACE) { 1059 System.out.print(" " + fText.getIndex() + " "); 1060 if (0x20 <= c && c < 0x7f) { 1061 System.out.print(" " + c + " "); 1062 } else { 1063 System.out.print(" " + Integer.toHexString(c) + " "); 1064 } 1065 System.out.println(" " + state + " " + category + " "); 1066 } 1067 1068 // State Transition - move machine to its next state 1069 // 1070 state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category]; 1071 row = fRData.getRowIndex(state); 1072 1073 if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) { 1074 // Match found, common case, could have lookahead so we move 1075 // on to check it 1076 result = fText.getIndex(); 1077 } 1078 1079 1080 int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; 1081 if (completedRule > 0) { 1082 // Lookahead match is completed. 1083 int lookaheadResult = fLookAheadMatches.getPosition(completedRule); 1084 if (lookaheadResult >= 0) { 1085 result = lookaheadResult; 1086 break mainLoop; 1087 } 1088 } 1089 int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; 1090 if (rule != 0) { 1091 // At the position of a '/' in a look-ahead match. Record it. 1092 int pos = fText.getIndex(); 1093 fLookAheadMatches.setPosition(rule, pos); 1094 } 1095 1096 if (state == STOP_STATE) { 1097 // Normal loop exit is here 1098 break mainLoop; 1099 } 1100 1101 // then move iterator position backwards one character 1102 // 1103 if (mode == RBBI_RUN) { 1104 c = previous32(fText); 1105 } else { 1106 if (mode == RBBI_START) { 1107 mode = RBBI_RUN; 1108 } 1109 } 1110 1111 1112 } // End of the main loop. 1113 1114 // The state machine is done. Check whether it found a match... 1115 // 1116 // If the iterator failed to move in the match engine, force it back by one code point. 1117 // (This really indicates a defect in the break rules. They should always match 1118 // at least one character.) 1119 if (result == initialPosition) { 1120 CISetIndex32(fText, initialPosition); 1121 previous32(fText); 1122 result = fText.getIndex(); 1123 } 1124 1125 if (TRACE) { 1126 System.out.println("Result = " + result); 1127 } 1128 1129 return result; 1130 } 1131 1132 /** 1133 * Set the index of a CharacterIterator. 1134 * Pin the index to the valid range range of BeginIndex <= index <= EndIndex. 1135 * If the index points to a trail surrogate of a supplementary character, adjust it 1136 * to the start (lead surrogate) index. 1137 * 1138 * @param ci A CharacterIterator to set 1139 * @param index the index to set 1140 * @return the resulting index, possibly pinned or adjusted. 1141 */ 1142 private static int CISetIndex32(CharacterIterator ci, int index) { 1143 if (index <= ci.getBeginIndex()) { 1144 ci.first(); 1145 } else if (index >= ci.getEndIndex()) { 1146 ci.setIndex(ci.getEndIndex()); 1147 } else if (Character.isLowSurrogate(ci.setIndex(index))) { 1148 if (!Character.isHighSurrogate(ci.previous())) { 1149 ci.next(); 1150 } 1151 } 1152 return ci.getIndex(); 1153 } 1154 1155 /* DictionaryCache stores the boundaries obtained from a run of dictionary characters. 1156 * Dictionary boundaries are moved first to this cache, then from here 1157 * to the main BreakCache, where they may inter-leave with non-dictionary 1158 * boundaries. The public BreakIterator API always fetches directly 1159 * from the main BreakCache, not from here. 1160 * 1161 * In common situations, the number of boundaries in a single dictionary run 1162 * should be quite small, it will be terminated by punctuation, spaces, 1163 * or any other non-dictionary characters. The main BreakCache may end 1164 * up with boundaries from multiple dictionary based runs. 1165 * 1166 * The boundaries are stored in a simple ArrayList (vector), with the 1167 * assumption that they will be accessed sequentially. 1168 */ 1169 class DictionaryCache { 1170 1171 void reset() { 1172 fPositionInCache = -1; 1173 fStart = 0; 1174 fLimit = 0; 1175 fFirstRuleStatusIndex = 0; 1176 fOtherRuleStatusIndex = 0; 1177 fBreaks.removeAllElements(); 1178 }; 1179 1180 boolean following(int fromPos) { 1181 if (fromPos >= fLimit || fromPos < fStart) { 1182 fPositionInCache = -1; 1183 return false; 1184 } 1185 1186 // Sequential iteration, move from previous boundary to the following 1187 1188 int r = 0; 1189 if (fPositionInCache >= 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1190 ++fPositionInCache; 1191 if (fPositionInCache >= fBreaks.size()) { 1192 fPositionInCache = -1; 1193 return false; 1194 } 1195 r = fBreaks.elementAt(fPositionInCache); 1196 assert(r > fromPos); 1197 fBoundary = r; 1198 fStatusIndex = fOtherRuleStatusIndex; 1199 return true; 1200 } 1201 1202 // Random indexing. Linear search for the boundary following the given position. 1203 1204 for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) { 1205 r= fBreaks.elementAt(fPositionInCache); 1206 if (r > fromPos) { 1207 fBoundary = r; 1208 fStatusIndex = fOtherRuleStatusIndex; 1209 return true; 1210 } 1211 } 1212 1213 // Internal error. fStart <= fromPos < fLimit, but no cached boundary. 1214 assert(false); 1215 fPositionInCache = -1; 1216 return false; 1217 }; 1218 1219 boolean preceding(int fromPos) { 1220 if (fromPos <= fStart || fromPos > fLimit) { 1221 fPositionInCache = -1; 1222 return false; 1223 } 1224 1225 if (fromPos == fLimit) { 1226 fPositionInCache = fBreaks.size() - 1; 1227 if (fPositionInCache >= 0) { 1228 assert(fBreaks.elementAt(fPositionInCache) == fromPos); 1229 } 1230 } 1231 1232 int r; 1233 if (fPositionInCache > 0 && fPositionInCache < fBreaks.size() && fBreaks.elementAt(fPositionInCache) == fromPos) { 1234 --fPositionInCache; 1235 r = fBreaks.elementAt(fPositionInCache); 1236 assert(r < fromPos); 1237 fBoundary = r; 1238 fStatusIndex = ( r== fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1239 return true; 1240 } 1241 1242 if (fPositionInCache == 0) { 1243 fPositionInCache = -1; 1244 return false; 1245 } 1246 1247 for (fPositionInCache = fBreaks.size()-1; fPositionInCache >= 0; --fPositionInCache) { 1248 r = fBreaks.elementAt(fPositionInCache); 1249 if (r < fromPos) { 1250 fBoundary = r; 1251 fStatusIndex = ( r == fStart) ? fFirstRuleStatusIndex : fOtherRuleStatusIndex; 1252 return true; 1253 } 1254 } 1255 assert(false); 1256 fPositionInCache = -1; 1257 return false; 1258 }; 1259 1260 /** 1261 * Populate the cache with the dictionary based boundaries within a region of text. 1262 * @param startPos The start position of a range of text 1263 * @param endPos The end position of a range of text 1264 * @param firstRuleStatus The rule status index that applies to the break at startPos 1265 * @param otherRuleStatus The rule status index that applies to boundaries other than startPos 1266 * @internal 1267 */ 1268 void populateDictionary(int startPos, int endPos, 1269 int firstRuleStatus, int otherRuleStatus) { 1270 if ((endPos - startPos) <= 1) { 1271 return; 1272 } 1273 1274 reset(); 1275 fFirstRuleStatusIndex = firstRuleStatus; 1276 fOtherRuleStatusIndex = otherRuleStatus; 1277 1278 int rangeStart = startPos; 1279 int rangeEnd = endPos; 1280 1281 int category; 1282 int current; 1283 int foundBreakCount = 0; 1284 1285 // Loop through the text, looking for ranges of dictionary characters. 1286 // For each span, find the appropriate break engine, and ask it to find 1287 // any breaks within the span. 1288 1289 fText.setIndex(rangeStart); 1290 int c = CharacterIteration.current32(fText); 1291 category = (short)fRData.fTrie.get(c); 1292 1293 while(true) { 1294 while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { 1295 c = CharacterIteration.next32(fText); // pre-increment 1296 category = (short)fRData.fTrie.get(c); 1297 } 1298 if (current >= rangeEnd) { 1299 break; 1300 } 1301 1302 // We now have a dictionary character. Get the appropriate language object 1303 // to deal with it. 1304 LanguageBreakEngine lbe = getLanguageBreakEngine(c); 1305 1306 // Ask the language object if there are any breaks. It will add them to the cache and 1307 // leave the text pointer on the other side of its range, ready to search for the next one. 1308 if (lbe != null) { 1309 foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreakType, fBreaks); 1310 } 1311 1312 // Reload the loop variables for the next go-round 1313 c = CharacterIteration.current32(fText); 1314 category = (short)fRData.fTrie.get(c); 1315 } 1316 1317 // If we found breaks, ensure that the first and last entries are 1318 // the original starting and ending position. And initialize the 1319 // cache iteration position to the first entry. 1320 1321 // System.out.printf("foundBreakCount = %d%n", foundBreakCount); 1322 if (foundBreakCount > 0) { 1323 assert(foundBreakCount == fBreaks.size()); 1324 if (startPos < fBreaks.elementAt(0)) { 1325 // The dictionary did not place a boundary at the start of the segment of text. 1326 // Add one now. This should not commonly happen, but it would be easy for interactions 1327 // of the rules for dictionary segments and the break engine implementations to 1328 // inadvertently cause it. Cover it here, just in case. 1329 fBreaks.offer(startPos); 1330 } 1331 if (endPos > fBreaks.peek()) { 1332 fBreaks.push(endPos); 1333 } 1334 fPositionInCache = 0; 1335 // Note: Dictionary matching may extend beyond the original limit. 1336 fStart = fBreaks.elementAt(0); 1337 fLimit = fBreaks.peek(); 1338 } else { 1339 // there were no language-based breaks, even though the segment contained 1340 // dictionary characters. Subsequent attempts to fetch boundaries from the dictionary cache 1341 // for this range will fail, and the calling code will fall back to the rule based boundaries. 1342 } 1343 1344 }; 1345 1346 1347 DictionaryCache() { 1348 fPositionInCache = -1; 1349 fBreaks = new DictionaryBreakEngine.DequeI(); 1350 } 1351 1352 /** 1353 * copy constructor. Used by RuleBasedBreakIterator.clone(). 1354 * 1355 * @param src the source object to be copied. 1356 */ 1357 DictionaryCache(DictionaryCache src) { 1358 try { 1359 fBreaks = (DictionaryBreakEngine.DequeI)src.fBreaks.clone(); 1360 } 1361 catch (CloneNotSupportedException e) { 1362 throw new RuntimeException(e); 1363 } 1364 fPositionInCache = src.fPositionInCache; 1365 fStart = src.fStart; 1366 fLimit = src.fLimit; 1367 fFirstRuleStatusIndex = src.fFirstRuleStatusIndex; 1368 fOtherRuleStatusIndex = src.fOtherRuleStatusIndex; 1369 fBoundary = src.fBoundary; 1370 fStatusIndex = src.fStatusIndex; 1371 } 1372 1373 // A data structure containing the boundaries themselves. Essentially a vector of raw ints. 1374 DictionaryBreakEngine.DequeI fBreaks; 1375 int fPositionInCache; // Index in fBreaks of last boundary returned by following() 1376 // // or preceding(). Optimizes sequential access. 1377 int fStart; // Text position of first boundary in cache. 1378 int fLimit; // Last boundary in cache. Which is the limit of the 1379 // // text segment being handled by the dictionary. 1380 int fFirstRuleStatusIndex; // Rule status info for first boundary. 1381 int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. 1382 int fBoundary; // Current boundary. Set by preceding(), following(). 1383 int fStatusIndex; // Current rule status index. Set by preceding, following(). 1384 }; 1385 1386 1387 1388 1389/* 1390 * class BreakCache 1391 * 1392 * Cache of break boundary positions and rule status values. 1393 * Break iterator API functions, next(), previous(), etc., will use cached results 1394 * when possible, and otherwise cache new results as they are obtained. 1395 * 1396 * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. 1397 * 1398 * The cache is implemented as a single circular buffer. 1399 */ 1400 1401/* 1402 * size of the circular cache buffer. 1403 */ 1404 1405class BreakCache { 1406 1407 BreakCache() { 1408 reset(); 1409 }; 1410 1411 void reset(int pos, int ruleStatus) { 1412 fStartBufIdx = 0; 1413 fEndBufIdx = 0; 1414 fTextIdx = pos; 1415 fBufIdx = 0; 1416 fBoundaries[0] = pos; 1417 fStatuses[0] = (short)ruleStatus; 1418 } 1419 1420 void reset() {reset(0, 0); }; 1421 1422 void next() { 1423 if (fBufIdx == fEndBufIdx) { 1424 fDone = !populateFollowing(); 1425 fPosition = fTextIdx; 1426 fRuleStatusIndex = fStatuses[fBufIdx]; 1427 } else { 1428 fBufIdx = modChunkSize(fBufIdx + 1); 1429 fTextIdx = fPosition = fBoundaries[fBufIdx]; 1430 fRuleStatusIndex = fStatuses[fBufIdx]; 1431 } 1432 }; 1433 1434 void previous() { 1435 int initialBufIdx = fBufIdx; 1436 if (fBufIdx == fStartBufIdx) { 1437 // At start of cache. Prepend to it. 1438 populatePreceding(); 1439 } else { 1440 // Cache already holds the next boundary 1441 fBufIdx = modChunkSize(fBufIdx - 1); 1442 fTextIdx = fBoundaries[fBufIdx]; 1443 } 1444 fDone = (fBufIdx == initialBufIdx); 1445 fPosition = fTextIdx; 1446 fRuleStatusIndex = fStatuses[fBufIdx]; 1447 return; 1448 }; 1449 1450 // Move the iteration state to the position following the startPosition. 1451 // Input position must be pinned to the input length. 1452 void following(int startPos) { 1453 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1454 // startPos is in the cache. Do a next() from that position. 1455 // TODO: an awkward set of interactions with bi->fDone 1456 // seek() does not clear it; it can't because of interactions with populateNear(). 1457 // next() does not clear it in the fast-path case, where everything matters. Maybe it should. 1458 // So clear it here, for the case where seek() succeeded on an iterator that had previously run off the end. 1459 fDone = false; 1460 next(); 1461 } 1462 1463 }; 1464 1465 void preceding(int startPos) { 1466 if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) { 1467 if (startPos == fTextIdx) { 1468 previous(); 1469 } else { 1470 // seek() leaves the BreakCache positioned at the preceding boundary 1471 // if the requested position is between two bounaries. 1472 // current() pushes the BreakCache position out to the BreakIterator itself. 1473 assert(startPos > fTextIdx); 1474 current(); 1475 } 1476 } 1477 return; 1478 }; 1479 1480 /* 1481 * Update the state of the public BreakIterator (fBI) to reflect the 1482 * current state of the break iterator cache (this). 1483 */ 1484 int current() { 1485 fPosition = fTextIdx; 1486 fRuleStatusIndex = fStatuses[fBufIdx]; 1487 fDone = false; 1488 return fTextIdx; 1489 }; 1490 1491 /** 1492 * Add boundaries to the cache near the specified position. 1493 * The given position need not be a boundary itself. 1494 * The input position must be within the range of the text, and 1495 * on a code point boundary. 1496 * If the requested position is a break boundary, leave the iteration 1497 * position on it. 1498 * If the requested position is not a boundary, leave the iteration 1499 * position on the preceding boundary and include both the the 1500 * preceding and following boundaries in the cache. 1501 * Additional boundaries, either preceding or following, may be added 1502 * to the cache as a side effect. 1503 * 1504 * Return false if the operation failed. 1505 */ 1506 boolean populateNear(int position) { 1507 assert(position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]); 1508 1509 // Find a boundary somewhere in the vicinity of the requested position. 1510 // Depending on the safe rules and the text data, it could be either before, at, or after 1511 // the requested position. 1512 1513 1514 // If the requested position is not near already cached positions, clear the existing cache, 1515 // find a near-by boundary and begin new cache contents there. 1516 1517 if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) { 1518 int aBoundary = fText.getBeginIndex(); 1519 int ruleStatusIndex = 0; 1520 // TODO: check for position == length of text. Although may still need to back up to get rule status. 1521 if (position > aBoundary + 20) { 1522 int backupPos = handlePrevious(position); 1523 fPosition = backupPos; 1524 aBoundary = handleNext(); // Ignore dictionary, just finding a rule based boundary. 1525 ruleStatusIndex = fRuleStatusIndex; 1526 } 1527 reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. 1528 } 1529 1530 // Fill in boundaries between existing cache content and the new requested position. 1531 1532 if (fBoundaries[fEndBufIdx] < position) { 1533 // The last position in the cache precedes the requested position. 1534 // Add following position(s) to the cache. 1535 while (fBoundaries[fEndBufIdx] < position) { 1536 if (!populateFollowing()) { 1537 assert false; 1538 return false; 1539 } 1540 } 1541 fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer. 1542 fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra boundaries. 1543 while (fTextIdx > position) { // Move backwards to a position at or preceding the requested pos. 1544 previous(); 1545 } 1546 return true; 1547 } 1548 1549 if (fBoundaries[fStartBufIdx] > position) { 1550 // The first position in the cache is beyond the requested position. 1551 // back up more until we get a boundary <= the requested position. 1552 while (fBoundaries[fStartBufIdx] > position) { 1553 populatePreceding(); 1554 } 1555 fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer. 1556 fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra boundaries. 1557 while (fTextIdx < position) { // Move forwards to a position at or following the requested pos. 1558 next(); 1559 } 1560 if (fTextIdx > position) { 1561 // If position is not itself a boundary, the next() loop above will overshoot. 1562 // Back up one, leaving cache position at the boundary preceding the requested position. 1563 previous(); 1564 } 1565 return true; 1566 } 1567 1568 assert fTextIdx == position; 1569 return true; 1570 1571 }; 1572 1573 /** 1574 * Add boundary(s) to the cache following the current last boundary. 1575 * Return false if at the end of the text, and no more boundaries can be added. 1576 * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. 1577 */ 1578 boolean populateFollowing() { 1579 int fromPosition = fBoundaries[fEndBufIdx]; 1580 int fromRuleStatusIdx = fStatuses[fEndBufIdx]; 1581 int pos = 0; 1582 int ruleStatusIdx = 0; 1583 1584 if (fDictionaryCache.following(fromPosition)) { 1585 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1586 return true; 1587 } 1588 1589 fPosition = fromPosition; 1590 pos = handleNext(); 1591 if (pos == BreakIterator.DONE) { 1592 return false; 1593 } 1594 1595 ruleStatusIdx = fRuleStatusIndex; 1596 if (fDictionaryCharCount > 0) { 1597 // The text segment obtained from the rules includes dictionary characters. 1598 // Subdivide it, with subdivided results going into the dictionary cache. 1599 fDictionaryCache.populateDictionary(fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx); 1600 if (fDictionaryCache.following(fromPosition)) { 1601 addFollowing(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1602 return true; 1603 // TODO: may want to move a sizable chunk of the dictionary cache to the break cache at this point. 1604 // But be careful with interactions with populateNear(). 1605 } 1606 } 1607 1608 // Rule based segment did not include dictionary characters. 1609 // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them, 1610 // meaning that we didn't take the return, above. 1611 // Add its end point to the cache. 1612 addFollowing(pos, ruleStatusIdx, UpdateCachePosition); 1613 1614 // Add several non-dictionary boundaries at this point, to optimize straight forward iteration. 1615 // (subsequent calls to BreakIterator::next() will take the fast path, getting cached results. 1616 // 1617 for (int count=0; count<6; ++count) { 1618 pos = handleNext(); 1619 if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) { 1620 break; 1621 } 1622 addFollowing(pos, fRuleStatusIndex, RetainCachePosition); 1623 } 1624 return true; 1625 }; 1626 1627 /** 1628 * Add one or more boundaries to the cache preceding the first currently cached boundary. 1629 * Leave the iteration position on the first added boundary. 1630 * Return false if no boundaries could be added (if at the start of the text.) 1631 */ 1632 boolean populatePreceding() { 1633 int textBegin = fText.getBeginIndex(); 1634 int fromPosition = fBoundaries[fStartBufIdx]; 1635 if (fromPosition == textBegin) { 1636 return false; 1637 } 1638 1639 int position = textBegin; 1640 int positionStatusIdx = 0; 1641 1642 if (fDictionaryCache.preceding(fromPosition)) { 1643 addPreceding(fDictionaryCache.fBoundary, fDictionaryCache.fStatusIndex, UpdateCachePosition); 1644 return true; 1645 } 1646 1647 int backupPosition = fromPosition; 1648 1649 // Find a boundary somewhere preceding the first already-cached boundary 1650 do { 1651 backupPosition = backupPosition - 30; 1652 if (backupPosition <= textBegin) { 1653 backupPosition = textBegin; 1654 } else { 1655 backupPosition = handlePrevious(backupPosition); 1656 } 1657 if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) { 1658 position = textBegin; 1659 positionStatusIdx = 0; 1660 } else { 1661 fPosition = backupPosition; // TODO: pass starting position in a clearer way. 1662 position = handleNext(); 1663 positionStatusIdx = fRuleStatusIndex; 1664 1665 } 1666 } while (position >= fromPosition); 1667 1668 // Find boundaries between the one we just located and the first already-cached boundary 1669 // Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.. 1670 1671 fSideBuffer.removeAllElements(); 1672 fSideBuffer.push(position); 1673 fSideBuffer.push(positionStatusIdx); 1674 1675 do { 1676 int prevPosition = fPosition = position; 1677 int prevStatusIdx = positionStatusIdx; 1678 position = handleNext(); 1679 positionStatusIdx = fRuleStatusIndex; 1680 if (position == BreakIterator.DONE) { 1681 break; 1682 } 1683 1684 boolean segmentHandledByDictionary = false; 1685 if (fDictionaryCharCount != 0) { 1686 // Segment from the rules includes dictionary characters. 1687 // Subdivide it, with subdivided results going into the dictionary cache. 1688 int dictSegEndPosition = position; 1689 fDictionaryCache.populateDictionary(prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx); 1690 while (fDictionaryCache.following(prevPosition)) { 1691 position = fDictionaryCache.fBoundary; 1692 positionStatusIdx = fDictionaryCache.fStatusIndex; 1693 segmentHandledByDictionary = true; 1694 assert(position > prevPosition); 1695 if (position >= fromPosition) { 1696 break; 1697 } 1698 assert(position <= dictSegEndPosition); 1699 fSideBuffer.push(position); 1700 fSideBuffer.push(positionStatusIdx); 1701 prevPosition = position; 1702 } 1703 assert(position==dictSegEndPosition || position>=fromPosition); 1704 } 1705 1706 if (!segmentHandledByDictionary && position < fromPosition) { 1707 fSideBuffer.push(position); 1708 fSideBuffer.push(positionStatusIdx); 1709 } 1710 } while (position < fromPosition); 1711 1712 // Move boundaries from the side buffer to the main circular buffer. 1713 boolean success = false; 1714 if (!fSideBuffer.isEmpty()) { 1715 positionStatusIdx = fSideBuffer.pop(); 1716 position = fSideBuffer.pop(); 1717 addPreceding(position, positionStatusIdx, UpdateCachePosition); 1718 success = true; 1719 } 1720 1721 while (!fSideBuffer.isEmpty()) { 1722 positionStatusIdx = fSideBuffer.pop(); 1723 position = fSideBuffer.pop(); 1724 if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) { 1725 // No space in circular buffer to hold a new preceding result while 1726 // also retaining the current cache (iteration) position. 1727 // Bailing out is safe; the cache will refill again if needed. 1728 break; 1729 } 1730 } 1731 return success; 1732 }; 1733 1734 1735 static final boolean RetainCachePosition = false; 1736 static final boolean UpdateCachePosition = true; 1737 1738 /* 1739 * Add the boundary following the current position. 1740 * The current position can be left as it was, or changed to the newly added boundary, 1741 * as specified by the update parameter. 1742 */ 1743 void addFollowing(int position, int ruleStatusIdx, boolean update) { 1744 assert(position > fBoundaries[fEndBufIdx]); 1745 assert(ruleStatusIdx <= Short.MAX_VALUE); 1746 int nextIdx = modChunkSize(fEndBufIdx + 1); 1747 if (nextIdx == fStartBufIdx) { 1748 fStartBufIdx = modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1. 1749 } 1750 fBoundaries[nextIdx] = position; 1751 fStatuses[nextIdx] = (short)ruleStatusIdx; 1752 fEndBufIdx = nextIdx; 1753 if (update == UpdateCachePosition) { 1754 // Set current position to the newly added boundary. 1755 fBufIdx = nextIdx; 1756 fTextIdx = position; 1757 } else { 1758 // Retaining the original cache position. 1759 // Check if the added boundary wraps around the buffer, and would over-write the original position. 1760 // It's the responsibility of callers of this function to not add too many. 1761 assert(nextIdx != fBufIdx); 1762 } 1763 1764 }; 1765 1766 1767 /* 1768 * Add the boundary preceding the current position. 1769 * The current position can be left as it was, or changed to the newly added boundary, 1770 * as specified by the update parameter. 1771 */ 1772 boolean addPreceding(int position, int ruleStatusIdx, boolean update) { 1773 assert(position < fBoundaries[fStartBufIdx]); 1774 assert(ruleStatusIdx <= Short.MAX_VALUE); 1775 int nextIdx = modChunkSize(fStartBufIdx - 1); 1776 if (nextIdx == fEndBufIdx) { 1777 if (fBufIdx == fEndBufIdx && update == RetainCachePosition) { 1778 // Failure. The insertion of the new boundary would claim the buffer position that is the 1779 // current iteration position. And we also want to retain the current iteration position. 1780 // (The buffer is already completely full of entries that precede the iteration position.) 1781 return false; 1782 } 1783 fEndBufIdx = modChunkSize(fEndBufIdx - 1); 1784 } 1785 fBoundaries[nextIdx] = position; 1786 fStatuses[nextIdx] = (short)ruleStatusIdx; 1787 fStartBufIdx = nextIdx; 1788 if (update == UpdateCachePosition) { 1789 fBufIdx = nextIdx; 1790 fTextIdx = position; 1791 } 1792 return true; 1793 }; 1794 1795 /** 1796 * Set the cache position to the specified position, or, if the position 1797 * falls between to cached boundaries, to the preceding boundary. 1798 * Fails if the requested position is outside of the range of boundaries currently held by the cache. 1799 * The startPosition must be on a code point boundary. 1800 * 1801 * Return true if successful, false if the specified position is after 1802 * the last cached boundary or before the first. 1803 */ 1804 boolean seek(int pos) { 1805 if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) { 1806 return false; 1807 } 1808 if (pos == fBoundaries[fStartBufIdx]) { 1809 // Common case: seek(0), from BreakIterator::first() 1810 fBufIdx = fStartBufIdx; 1811 fTextIdx = fBoundaries[fBufIdx]; 1812 return true; 1813 } 1814 if (pos == fBoundaries[fEndBufIdx]) { 1815 fBufIdx = fEndBufIdx; 1816 fTextIdx = fBoundaries[fBufIdx]; 1817 return true; 1818 } 1819 1820 int min = fStartBufIdx; 1821 int max = fEndBufIdx; 1822 while (min != max) { 1823 int probe = (min + max + (min>max ? CACHE_SIZE : 0)) / 2; 1824 probe = modChunkSize(probe); 1825 if (fBoundaries[probe] > pos) { 1826 max = probe; 1827 } else { 1828 min = modChunkSize(probe + 1); 1829 } 1830 } 1831 assert(fBoundaries[max] > pos); 1832 fBufIdx = modChunkSize(max - 1); 1833 fTextIdx = fBoundaries[fBufIdx]; 1834 assert(fTextIdx <= pos); 1835 return true; 1836 1837 }; 1838 1839 1840 /** 1841 * copy constructor, used from RuleBasedBreakIterator.clone(). 1842 * 1843 * @param src 1844 */ 1845 BreakCache(BreakCache src) { 1846 fStartBufIdx = src.fStartBufIdx; 1847 fEndBufIdx = src.fEndBufIdx; 1848 fTextIdx = src.fTextIdx; 1849 fBufIdx = src.fBufIdx; 1850 fBoundaries = src.fBoundaries.clone(); 1851 fStatuses = src.fStatuses.clone(); 1852 fSideBuffer = new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents. 1853 } 1854 1855 void dumpCache() { 1856 System.out.printf("fTextIdx:%d fBufIdx:%d%n", fTextIdx, fBufIdx); 1857 for (int i=fStartBufIdx; ; i=modChunkSize(i+1)) { 1858 System.out.printf("%d %d%n", i, fBoundaries[i]); 1859 if (i == fEndBufIdx) { 1860 break; 1861 } 1862 } 1863 }; 1864 1865 private final int modChunkSize(int index) { return index & (CACHE_SIZE - 1); }; 1866 1867 static final int CACHE_SIZE = 128; 1868 // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); 1869 1870 int fStartBufIdx; 1871 int fEndBufIdx; // inclusive 1872 1873 int fTextIdx; 1874 int fBufIdx; 1875 1876 int[] fBoundaries = new int[CACHE_SIZE]; 1877 short[] fStatuses = new short[CACHE_SIZE]; 1878 1879 DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI(); 1880}; 1881 1882 1883 1884 1885} 1886 1887