1/* 2******************************************************************************* 3* Copyright (C) 2011-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* created on: 2011jan06 7* created by: Markus W. Scherer 8* ported from ICU4C ucharstrie.h/.cpp 9*/ 10 11package com.ibm.icu.util; 12 13import java.io.IOException; 14import java.util.ArrayList; 15import java.util.NoSuchElementException; 16 17import com.ibm.icu.text.UTF16; 18import com.ibm.icu.util.BytesTrie.Result; 19 20/** 21 * Light-weight, non-const reader class for a CharsTrie. 22 * Traverses a char-serialized data structure with minimal state, 23 * for mapping strings (16-bit-unit sequences) to non-negative integer values. 24 * 25 * <p>This class is not intended for public subclassing. 26 * 27 * @stable ICU 4.8 28 * @author Markus W. Scherer 29 */ 30public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> { 31 /** 32 * Constructs a CharsTrie reader instance. 33 * 34 * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder, 35 * with the offset indicating the first char of that sequence. 36 * The CharsTrie object will not read more chars than 37 * the CharsTrieBuilder generated in the corresponding build() call. 38 * 39 * <p>The CharSequence is not copied/cloned and must not be modified while 40 * the CharsTrie object is in use. 41 * 42 * @param trieChars CharSequence that contains the serialized trie. 43 * @param offset Root offset of the trie in the CharSequence. 44 * @stable ICU 4.8 45 */ 46 public CharsTrie(CharSequence trieChars, int offset) { 47 chars_=trieChars; 48 pos_=root_=offset; 49 remainingMatchLength_=-1; 50 } 51 52 /** 53 * Clones this trie reader object and its state, 54 * but not the char array which will be shared. 55 * @return A shallow clone of this trie. 56 * @stable ICU 4.8 57 */ 58 @Override 59 public Object clone() throws CloneNotSupportedException { 60 return super.clone(); // A shallow copy is just what we need. 61 } 62 63 /** 64 * Resets this trie to its initial state. 65 * @return this 66 * @stable ICU 4.8 67 */ 68 public CharsTrie reset() { 69 pos_=root_; 70 remainingMatchLength_=-1; 71 return this; 72 } 73 74 /** 75 * CharsTrie state object, for saving a trie's current state 76 * and resetting the trie back to this state later. 77 * @stable ICU 4.8 78 */ 79 public static final class State { 80 /** 81 * Constructs an empty State. 82 * @stable ICU 4.8 83 */ 84 public State() {} 85 private CharSequence chars; 86 private int root; 87 private int pos; 88 private int remainingMatchLength; 89 } 90 91 /** 92 * Saves the state of this trie. 93 * @param state The State object to hold the trie's state. 94 * @return this 95 * @see #resetToState 96 * @stable ICU 4.8 97 */ 98 public CharsTrie saveState(State state) /*const*/ { 99 state.chars=chars_; 100 state.root=root_; 101 state.pos=pos_; 102 state.remainingMatchLength=remainingMatchLength_; 103 return this; 104 } 105 106 /** 107 * Resets this trie to the saved state. 108 * @param state The State object which holds a saved trie state. 109 * @return this 110 * @throws IllegalArgumentException if the state object contains no state, 111 * or the state of a different trie 112 * @see #saveState 113 * @see #reset 114 * @stable ICU 4.8 115 */ 116 public CharsTrie resetToState(State state) { 117 if(chars_==state.chars && chars_!=null && root_==state.root) { 118 pos_=state.pos; 119 remainingMatchLength_=state.remainingMatchLength; 120 } else { 121 throw new IllegalArgumentException("incompatible trie state"); 122 } 123 return this; 124 } 125 126 /** 127 * Determines whether the string so far matches, whether it has a value, 128 * and whether another input char can continue a matching string. 129 * @return The match/value Result. 130 * @stable ICU 4.8 131 */ 132 public Result current() /*const*/ { 133 int pos=pos_; 134 if(pos<0) { 135 return Result.NO_MATCH; 136 } else { 137 int node; 138 return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 139 valueResults_[node>>15] : Result.NO_VALUE; 140 } 141 } 142 143 /** 144 * Traverses the trie from the initial state for this input char. 145 * Equivalent to reset().next(inUnit). 146 * @param inUnit Input char value. Values below 0 and above 0xffff will never match. 147 * @return The match/value Result. 148 * @stable ICU 4.8 149 */ 150 public Result first(int inUnit) { 151 remainingMatchLength_=-1; 152 return nextImpl(root_, inUnit); 153 } 154 155 /** 156 * Traverses the trie from the initial state for the 157 * one or two UTF-16 code units for this input code point. 158 * Equivalent to reset().nextForCodePoint(cp). 159 * @param cp A Unicode code point 0..0x10ffff. 160 * @return The match/value Result. 161 * @stable ICU 4.8 162 */ 163 public Result firstForCodePoint(int cp) { 164 return cp<=0xffff ? 165 first(cp) : 166 (first(UTF16.getLeadSurrogate(cp)).hasNext() ? 167 next(UTF16.getTrailSurrogate(cp)) : 168 Result.NO_MATCH); 169 } 170 171 /** 172 * Traverses the trie from the current state for this input char. 173 * @param inUnit Input char value. Values below 0 and above 0xffff will never match. 174 * @return The match/value Result. 175 * @stable ICU 4.8 176 */ 177 public Result next(int inUnit) { 178 int pos=pos_; 179 if(pos<0) { 180 return Result.NO_MATCH; 181 } 182 int length=remainingMatchLength_; // Actual remaining match length minus 1. 183 if(length>=0) { 184 // Remaining part of a linear-match node. 185 if(inUnit==chars_.charAt(pos++)) { 186 remainingMatchLength_=--length; 187 pos_=pos; 188 int node; 189 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 190 valueResults_[node>>15] : Result.NO_VALUE; 191 } else { 192 stop(); 193 return Result.NO_MATCH; 194 } 195 } 196 return nextImpl(pos, inUnit); 197 } 198 199 /** 200 * Traverses the trie from the current state for the 201 * one or two UTF-16 code units for this input code point. 202 * @param cp A Unicode code point 0..0x10ffff. 203 * @return The match/value Result. 204 * @stable ICU 4.8 205 */ 206 public Result nextForCodePoint(int cp) { 207 return cp<=0xffff ? 208 next(cp) : 209 (next(UTF16.getLeadSurrogate(cp)).hasNext() ? 210 next(UTF16.getTrailSurrogate(cp)) : 211 Result.NO_MATCH); 212 } 213 214 /** 215 * Traverses the trie from the current state for this string. 216 * Equivalent to 217 * <pre> 218 * Result result=current(); 219 * for(each c in s) 220 * if(!result.hasNext()) return Result.NO_MATCH; 221 * result=next(c); 222 * return result; 223 * </pre> 224 * @param s Contains a string. 225 * @param sIndex The start index of the string in s. 226 * @param sLimit The (exclusive) end index of the string in s. 227 * @return The match/value Result. 228 * @stable ICU 4.8 229 */ 230 public Result next(CharSequence s, int sIndex, int sLimit) { 231 if(sIndex>=sLimit) { 232 // Empty input. 233 return current(); 234 } 235 int pos=pos_; 236 if(pos<0) { 237 return Result.NO_MATCH; 238 } 239 int length=remainingMatchLength_; // Actual remaining match length minus 1. 240 for(;;) { 241 // Fetch the next input unit, if there is one. 242 // Continue a linear-match node. 243 char inUnit; 244 for(;;) { 245 if(sIndex==sLimit) { 246 remainingMatchLength_=length; 247 pos_=pos; 248 int node; 249 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 250 valueResults_[node>>15] : Result.NO_VALUE; 251 } 252 inUnit=s.charAt(sIndex++); 253 if(length<0) { 254 remainingMatchLength_=length; 255 break; 256 } 257 if(inUnit!=chars_.charAt(pos)) { 258 stop(); 259 return Result.NO_MATCH; 260 } 261 ++pos; 262 --length; 263 } 264 int node=chars_.charAt(pos++); 265 for(;;) { 266 if(node<kMinLinearMatch) { 267 Result result=branchNext(pos, node, inUnit); 268 if(result==Result.NO_MATCH) { 269 return Result.NO_MATCH; 270 } 271 // Fetch the next input unit, if there is one. 272 if(sIndex==sLimit) { 273 return result; 274 } 275 if(result==Result.FINAL_VALUE) { 276 // No further matching units. 277 stop(); 278 return Result.NO_MATCH; 279 } 280 inUnit=s.charAt(sIndex++); 281 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 282 node=chars_.charAt(pos++); 283 } else if(node<kMinValueLead) { 284 // Match length+1 units. 285 length=node-kMinLinearMatch; // Actual match length minus 1. 286 if(inUnit!=chars_.charAt(pos)) { 287 stop(); 288 return Result.NO_MATCH; 289 } 290 ++pos; 291 --length; 292 break; 293 } else if((node&kValueIsFinal)!=0) { 294 // No further matching units. 295 stop(); 296 return Result.NO_MATCH; 297 } else { 298 // Skip intermediate value. 299 pos=skipNodeValue(pos, node); 300 node&=kNodeTypeMask; 301 } 302 } 303 } 304 } 305 306 /** 307 * Returns a matching string's value if called immediately after 308 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 309 * getValue() can be called multiple times. 310 * 311 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 312 * @return The value for the string so far. 313 * @stable ICU 4.8 314 */ 315 public int getValue() /*const*/ { 316 int pos=pos_; 317 int leadUnit=chars_.charAt(pos++); 318 assert(leadUnit>=kMinValueLead); 319 return (leadUnit&kValueIsFinal)!=0 ? 320 readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit); 321 } 322 323 /** 324 * Determines whether all strings reachable from the current state 325 * map to the same value, and if so, returns that value. 326 * @return The unique value in bits 32..1 with bit 0 set, 327 * if all strings reachable from the current state 328 * map to the same value; otherwise returns 0. 329 * @stable ICU 4.8 330 */ 331 public long getUniqueValue() /*const*/ { 332 int pos=pos_; 333 if(pos<0) { 334 return 0; 335 } 336 // Skip the rest of a pending linear-match node. 337 long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0); 338 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 339 return (uniqueValue<<31)>>31; 340 } 341 342 /** 343 * Finds each char which continues the string from the current state. 344 * That is, each char c for which it would be next(c)!=Result.NO_MATCH now. 345 * @param out Each next char is appended to this object. 346 * (Only uses the out.append(c) method.) 347 * @return The number of chars which continue the string from here. 348 * @stable ICU 4.8 349 */ 350 public int getNextChars(Appendable out) /*const*/ { 351 int pos=pos_; 352 if(pos<0) { 353 return 0; 354 } 355 if(remainingMatchLength_>=0) { 356 append(out, chars_.charAt(pos)); // Next unit of a pending linear-match node. 357 return 1; 358 } 359 int node=chars_.charAt(pos++); 360 if(node>=kMinValueLead) { 361 if((node&kValueIsFinal)!=0) { 362 return 0; 363 } else { 364 pos=skipNodeValue(pos, node); 365 node&=kNodeTypeMask; 366 } 367 } 368 if(node<kMinLinearMatch) { 369 if(node==0) { 370 node=chars_.charAt(pos++); 371 } 372 getNextBranchChars(chars_, pos, ++node, out); 373 return node; 374 } else { 375 // First unit of the linear-match node. 376 append(out, chars_.charAt(pos)); 377 return 1; 378 } 379 } 380 381 /** 382 * Iterates from the current state of this trie. 383 * @return A new CharsTrie.Iterator. 384 * @stable ICU 4.8 385 */ 386 public Iterator iterator() { 387 return new Iterator(chars_, pos_, remainingMatchLength_, 0); 388 } 389 390 /** 391 * Iterates from the current state of this trie. 392 * @param maxStringLength If 0, the iterator returns full strings. 393 * Otherwise, the iterator returns strings with this maximum length. 394 * @return A new CharsTrie.Iterator. 395 * @stable ICU 4.8 396 */ 397 public Iterator iterator(int maxStringLength) { 398 return new Iterator(chars_, pos_, remainingMatchLength_, maxStringLength); 399 } 400 401 /** 402 * Iterates from the root of a char-serialized BytesTrie. 403 * @param trieChars CharSequence that contains the serialized trie. 404 * @param offset Root offset of the trie in the CharSequence. 405 * @param maxStringLength If 0, the iterator returns full strings. 406 * Otherwise, the iterator returns strings with this maximum length. 407 * @return A new CharsTrie.Iterator. 408 * @stable ICU 4.8 409 */ 410 public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) { 411 return new Iterator(trieChars, offset, -1, maxStringLength); 412 } 413 414 /** 415 * Return value type for the Iterator. 416 * @stable ICU 4.8 417 */ 418 public static final class Entry { 419 /** 420 * The string. 421 * @stable ICU 4.8 422 */ 423 public CharSequence chars; 424 /** 425 * The value associated with the string. 426 * @stable ICU 4.8 427 */ 428 public int value; 429 430 private Entry() { 431 } 432 } 433 434 /** 435 * Iterator for all of the (string, value) pairs in a CharsTrie. 436 * @stable ICU 4.8 437 */ 438 public static final class Iterator implements java.util.Iterator<Entry> { 439 private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) { 440 chars_=trieChars; 441 pos_=initialPos_=offset; 442 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 443 maxLength_=maxStringLength; 444 int length=remainingMatchLength_; // Actual remaining match length minus 1. 445 if(length>=0) { 446 // Pending linear-match node, append remaining bytes to str_. 447 ++length; 448 if(maxLength_>0 && length>maxLength_) { 449 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 450 } 451 str_.append(chars_, pos_, pos_+length); 452 pos_+=length; 453 remainingMatchLength_-=length; 454 } 455 } 456 457 /** 458 * Resets this iterator to its initial state. 459 * @return this 460 * @stable ICU 4.8 461 */ 462 public Iterator reset() { 463 pos_=initialPos_; 464 remainingMatchLength_=initialRemainingMatchLength_; 465 skipValue_=false; 466 int length=remainingMatchLength_+1; // Remaining match length. 467 if(maxLength_>0 && length>maxLength_) { 468 length=maxLength_; 469 } 470 str_.setLength(length); 471 pos_+=length; 472 remainingMatchLength_-=length; 473 stack_.clear(); 474 return this; 475 } 476 477 /** 478 * @return true if there are more elements. 479 * @stable ICU 4.8 480 */ 481 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 482 483 /** 484 * Finds the next (string, value) pair if there is one. 485 * 486 * If the string is truncated to the maximum length and does not 487 * have a real value, then the value is set to -1. 488 * In this case, this "not a real value" is indistinguishable from 489 * a real value of -1. 490 * @return An Entry with the string and value of the next element. 491 * @throws NoSuchElementException - iteration has no more elements. 492 * @stable ICU 4.8 493 */ 494 public Entry next() { 495 int pos=pos_; 496 if(pos<0) { 497 if(stack_.isEmpty()) { 498 throw new NoSuchElementException(); 499 } 500 // Pop the state off the stack and continue with the next outbound edge of 501 // the branch node. 502 long top=stack_.remove(stack_.size()-1); 503 int length=(int)top; 504 pos=(int)(top>>32); 505 str_.setLength(length&0xffff); 506 length>>>=16; 507 if(length>1) { 508 pos=branchNext(pos, length); 509 if(pos<0) { 510 return entry_; // Reached a final value. 511 } 512 } else { 513 str_.append(chars_.charAt(pos++)); 514 } 515 } 516 if(remainingMatchLength_>=0) { 517 // We only get here if we started in a pending linear-match node 518 // with more than maxLength remaining units. 519 return truncateAndStop(); 520 } 521 for(;;) { 522 int node=chars_.charAt(pos++); 523 if(node>=kMinValueLead) { 524 if(skipValue_) { 525 pos=skipNodeValue(pos, node); 526 node&=kNodeTypeMask; 527 skipValue_=false; 528 } else { 529 // Deliver value for the string so far. 530 boolean isFinal=(node&kValueIsFinal)!=0; 531 if(isFinal) { 532 entry_.value=readValue(chars_, pos, node&0x7fff); 533 } else { 534 entry_.value=readNodeValue(chars_, pos, node); 535 } 536 if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) { 537 pos_=-1; 538 } else { 539 // We cannot skip the value right here because it shares its 540 // lead unit with a match node which we have to evaluate 541 // next time. 542 // Instead, keep pos_ on the node lead unit itself. 543 pos_=pos-1; 544 skipValue_=true; 545 } 546 entry_.chars=str_; 547 return entry_; 548 } 549 } 550 if(maxLength_>0 && str_.length()==maxLength_) { 551 return truncateAndStop(); 552 } 553 if(node<kMinLinearMatch) { 554 if(node==0) { 555 node=chars_.charAt(pos++); 556 } 557 pos=branchNext(pos, node+1); 558 if(pos<0) { 559 return entry_; // Reached a final value. 560 } 561 } else { 562 // Linear-match node, append length units to str_. 563 int length=node-kMinLinearMatch+1; 564 if(maxLength_>0 && str_.length()+length>maxLength_) { 565 str_.append(chars_, pos, pos+maxLength_-str_.length()); 566 return truncateAndStop(); 567 } 568 str_.append(chars_, pos, pos+length); 569 pos+=length; 570 } 571 } 572 } 573 574 /** 575 * Iterator.remove() is not supported. 576 * @throws UnsupportedOperationException (always) 577 * @stable ICU 4.8 578 */ 579 public void remove() { 580 throw new UnsupportedOperationException(); 581 } 582 583 private Entry truncateAndStop() { 584 pos_=-1; 585 // We reset entry_.chars every time we return entry_ 586 // just because the caller might have modified the Entry. 587 entry_.chars=str_; 588 entry_.value=-1; // no real value for str 589 return entry_; 590 } 591 592 private int branchNext(int pos, int length) { 593 while(length>kMaxBranchLinearSubNodeLength) { 594 ++pos; // ignore the comparison unit 595 // Push state for the greater-or-equal edge. 596 stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length()); 597 // Follow the less-than edge. 598 length>>=1; 599 pos=jumpByDelta(chars_, pos); 600 } 601 // List of key-value pairs where values are either final values or jump deltas. 602 // Read the first (key, value) pair. 603 char trieUnit=chars_.charAt(pos++); 604 int node=chars_.charAt(pos++); 605 boolean isFinal=(node&kValueIsFinal)!=0; 606 int value=readValue(chars_, pos, node&=0x7fff); 607 pos=skipValue(pos, node); 608 stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length()); 609 str_.append(trieUnit); 610 if(isFinal) { 611 pos_=-1; 612 entry_.chars=str_; 613 entry_.value=value; 614 return -1; 615 } else { 616 return pos+value; 617 } 618 } 619 620 private CharSequence chars_; 621 private int pos_; 622 private int initialPos_; 623 private int remainingMatchLength_; 624 private int initialRemainingMatchLength_; 625 private boolean skipValue_; // Skip intermediate value which was already delivered. 626 627 private StringBuilder str_=new StringBuilder(); 628 private int maxLength_; 629 private Entry entry_=new Entry(); 630 631 // The stack stores longs for backtracking to another 632 // outbound edge of a branch node. 633 // Each long has the offset in chars_ in bits 62..32, 634 // the str_.length() from before the node in bits 15..0, 635 // and the remaining branch length in bits 31..16. 636 // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31, 637 // but the code looks more confusing that way.) 638 private ArrayList<Long> stack_=new ArrayList<Long>(); 639 } 640 641 private void stop() { 642 pos_=-1; 643 } 644 645 // Reads a compact 32-bit integer. 646 // pos is already after the leadUnit, and the lead unit has bit 15 reset. 647 private static int readValue(CharSequence chars, int pos, int leadUnit) { 648 int value; 649 if(leadUnit<kMinTwoUnitValueLead) { 650 value=leadUnit; 651 } else if(leadUnit<kThreeUnitValueLead) { 652 value=((leadUnit-kMinTwoUnitValueLead)<<16)|chars.charAt(pos); 653 } else { 654 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 655 } 656 return value; 657 } 658 private static int skipValue(int pos, int leadUnit) { 659 if(leadUnit>=kMinTwoUnitValueLead) { 660 if(leadUnit<kThreeUnitValueLead) { 661 ++pos; 662 } else { 663 pos+=2; 664 } 665 } 666 return pos; 667 } 668 private static int skipValue(CharSequence chars, int pos) { 669 int leadUnit=chars.charAt(pos++); 670 return skipValue(pos, leadUnit&0x7fff); 671 } 672 673 private static int readNodeValue(CharSequence chars, int pos, int leadUnit) { 674 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 675 int value; 676 if(leadUnit<kMinTwoUnitNodeValueLead) { 677 value=(leadUnit>>6)-1; 678 } else if(leadUnit<kThreeUnitNodeValueLead) { 679 value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|chars.charAt(pos); 680 } else { 681 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 682 } 683 return value; 684 } 685 private static int skipNodeValue(int pos, int leadUnit) { 686 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 687 if(leadUnit>=kMinTwoUnitNodeValueLead) { 688 if(leadUnit<kThreeUnitNodeValueLead) { 689 ++pos; 690 } else { 691 pos+=2; 692 } 693 } 694 return pos; 695 } 696 697 private static int jumpByDelta(CharSequence chars, int pos) { 698 int delta=chars.charAt(pos++); 699 if(delta>=kMinTwoUnitDeltaLead) { 700 if(delta==kThreeUnitDeltaLead) { 701 delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 702 pos+=2; 703 } else { 704 delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++); 705 } 706 } 707 return pos+delta; 708 } 709 710 private static int skipDelta(CharSequence chars, int pos) { 711 int delta=chars.charAt(pos++); 712 if(delta>=kMinTwoUnitDeltaLead) { 713 if(delta==kThreeUnitDeltaLead) { 714 pos+=2; 715 } else { 716 ++pos; 717 } 718 } 719 return pos; 720 } 721 722 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 723 724 // Handles a branch node for both next(unit) and next(string). 725 private Result branchNext(int pos, int length, int inUnit) { 726 // Branch according to the current unit. 727 if(length==0) { 728 length=chars_.charAt(pos++); 729 } 730 ++length; 731 // The length of the branch is the number of units to select from. 732 // The data structure encodes a binary search. 733 while(length>kMaxBranchLinearSubNodeLength) { 734 if(inUnit<chars_.charAt(pos++)) { 735 length>>=1; 736 pos=jumpByDelta(chars_, pos); 737 } else { 738 length=length-(length>>1); 739 pos=skipDelta(chars_, pos); 740 } 741 } 742 // Drop down to linear search for the last few units. 743 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 744 // and divides length by 2. 745 do { 746 if(inUnit==chars_.charAt(pos++)) { 747 Result result; 748 int node=chars_.charAt(pos); 749 if((node&kValueIsFinal)!=0) { 750 // Leave the final value for getValue() to read. 751 result=Result.FINAL_VALUE; 752 } else { 753 // Use the non-final value as the jump delta. 754 ++pos; 755 // int delta=readValue(pos, node); 756 int delta; 757 if(node<kMinTwoUnitValueLead) { 758 delta=node; 759 } else if(node<kThreeUnitValueLead) { 760 delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++); 761 } else { 762 delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1); 763 pos+=2; 764 } 765 // end readValue() 766 pos+=delta; 767 node=chars_.charAt(pos); 768 result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; 769 } 770 pos_=pos; 771 return result; 772 } 773 --length; 774 pos=skipValue(chars_, pos); 775 } while(length>1); 776 if(inUnit==chars_.charAt(pos++)) { 777 pos_=pos; 778 int node=chars_.charAt(pos); 779 return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; 780 } else { 781 stop(); 782 return Result.NO_MATCH; 783 } 784 } 785 786 // Requires remainingLength_<0. 787 private Result nextImpl(int pos, int inUnit) { 788 int node=chars_.charAt(pos++); 789 for(;;) { 790 if(node<kMinLinearMatch) { 791 return branchNext(pos, node, inUnit); 792 } else if(node<kMinValueLead) { 793 // Match the first of length+1 units. 794 int length=node-kMinLinearMatch; // Actual match length minus 1. 795 if(inUnit==chars_.charAt(pos++)) { 796 remainingMatchLength_=--length; 797 pos_=pos; 798 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 799 valueResults_[node>>15] : Result.NO_VALUE; 800 } else { 801 // No match. 802 break; 803 } 804 } else if((node&kValueIsFinal)!=0) { 805 // No further matching units. 806 break; 807 } else { 808 // Skip intermediate value. 809 pos=skipNodeValue(pos, node); 810 node&=kNodeTypeMask; 811 } 812 } 813 stop(); 814 return Result.NO_MATCH; 815 } 816 817 // Helper functions for getUniqueValue(). 818 // Recursively finds a unique value (or whether there is not a unique one) 819 // from a branch. 820 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 821 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. 822 private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length, 823 long uniqueValue) { 824 while(length>kMaxBranchLinearSubNodeLength) { 825 ++pos; // ignore the comparison unit 826 uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue); 827 if(uniqueValue==0) { 828 return 0; 829 } 830 length=length-(length>>1); 831 pos=skipDelta(chars, pos); 832 } 833 do { 834 ++pos; // ignore a comparison unit 835 // handle its value 836 int node=chars.charAt(pos++); 837 boolean isFinal=(node&kValueIsFinal)!=0; 838 node&=0x7fff; 839 int value=readValue(chars, pos, node); 840 pos=skipValue(pos, node); 841 if(isFinal) { 842 if(uniqueValue!=0) { 843 if(value!=(int)(uniqueValue>>1)) { 844 return 0; 845 } 846 } else { 847 uniqueValue=((long)value<<1)|1; 848 } 849 } else { 850 uniqueValue=findUniqueValue(chars, pos+value, uniqueValue); 851 if(uniqueValue==0) { 852 return 0; 853 } 854 } 855 } while(--length>1); 856 // ignore the last comparison byte 857 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 858 } 859 // Recursively finds a unique value (or whether there is not a unique one) 860 // starting from a position on a node lead unit. 861 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 862 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. 863 private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) { 864 int node=chars.charAt(pos++); 865 for(;;) { 866 if(node<kMinLinearMatch) { 867 if(node==0) { 868 node=chars.charAt(pos++); 869 } 870 uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue); 871 if(uniqueValue==0) { 872 return 0; 873 } 874 pos=(int)(uniqueValue>>>33); 875 node=chars.charAt(pos++); 876 } else if(node<kMinValueLead) { 877 // linear-match node 878 pos+=node-kMinLinearMatch+1; // Ignore the match units. 879 node=chars.charAt(pos++); 880 } else { 881 boolean isFinal=(node&kValueIsFinal)!=0; 882 int value; 883 if(isFinal) { 884 value=readValue(chars, pos, node&0x7fff); 885 } else { 886 value=readNodeValue(chars, pos, node); 887 } 888 if(uniqueValue!=0) { 889 if(value!=(int)(uniqueValue>>1)) { 890 return 0; 891 } 892 } else { 893 uniqueValue=((long)value<<1)|1; 894 } 895 if(isFinal) { 896 return uniqueValue; 897 } 898 pos=skipNodeValue(pos, node); 899 node&=kNodeTypeMask; 900 } 901 } 902 } 903 904 // Helper functions for getNextChars(). 905 // getNextChars() when pos is on a branch node. 906 private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) { 907 while(length>kMaxBranchLinearSubNodeLength) { 908 ++pos; // ignore the comparison unit 909 getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out); 910 length=length-(length>>1); 911 pos=skipDelta(chars, pos); 912 } 913 do { 914 append(out, chars.charAt(pos++)); 915 pos=skipValue(chars, pos); 916 } while(--length>1); 917 append(out, chars.charAt(pos)); 918 } 919 private static void append(Appendable out, int c) { 920 try { 921 out.append((char)c); 922 } catch(IOException e) { 923 throw new ICUUncheckedIOException(e); 924 } 925 } 926 927 // CharsTrie data structure 928 // 929 // The trie consists of a series of char-serialized nodes for incremental 930 // Unicode string/char sequence matching. (char=16-bit unsigned integer) 931 // The root node is at the beginning of the trie data. 932 // 933 // Types of nodes are distinguished by their node lead unit ranges. 934 // After each node, except a final-value node, another node follows to 935 // encode match values or continue matching further units. 936 // 937 // Node types: 938 // - Final-value node: Stores a 32-bit integer in a compact, variable-length format. 939 // The value is for the string/char sequence so far. 940 // - Match node, optionally with an intermediate value in a different compact format. 941 // The value, if present, is for the string/char sequence so far. 942 // 943 // Aside from the value, which uses the node lead unit's high bits: 944 // 945 // - Linear-match node: Matches a number of units. 946 // - Branch node: Branches to other nodes according to the current input unit. 947 // The node unit is the length of the branch (number of units to select from) 948 // minus 1. It is followed by a sub-node: 949 // - If the length is at most kMaxBranchLinearSubNodeLength, then 950 // there are length-1 (key, value) pairs and then one more comparison unit. 951 // If one of the key units matches, then the value is either a final value for 952 // the string so far, or a "jump" delta to the next node. 953 // If the last unit matches, then matching continues with the next node. 954 // (Values have the same encoding as final-value nodes.) 955 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 956 // there is one unit and one "jump" delta. 957 // If the input unit is less than the sub-node unit, then "jump" by delta to 958 // the next sub-node which will have a length of length/2. 959 // (The delta has its own compact encoding.) 960 // Otherwise, skip the "jump" delta to the next sub-node 961 // which will have a length of length-length/2. 962 963 // Match-node lead unit values, after masking off intermediate-value bits: 964 965 // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise 966 // the length is one more than the next unit. 967 968 // For a branch sub-node with at most this many entries, we drop down 969 // to a linear search. 970 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 971 972 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node. 973 /*package*/ static final int kMinLinearMatch=0x30; 974 /*package*/ static final int kMaxLinearMatchLength=0x10; 975 976 // Match-node lead unit bits 14..6 for the optional intermediate value. 977 // If these bits are 0, then there is no intermediate value. 978 // Otherwise, see the *NodeValue* constants below. 979 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040 980 /*package*/ static final int kNodeTypeMask=kMinValueLead-1; // 0x003f 981 982 // A final-value node has bit 15 set. 983 /*package*/ static final int kValueIsFinal=0x8000; 984 985 // Compact value: After testing and masking off bit 15, use the following thresholds. 986 /*package*/ static final int kMaxOneUnitValue=0x3fff; 987 988 /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000 989 /*package*/ static final int kThreeUnitValueLead=0x7fff; 990 991 /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff 992 993 // Compact intermediate-value integer, lead unit shared with a branch or linear-match node. 994 /*package*/ static final int kMaxOneUnitNodeValue=0xff; 995 /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040 996 /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0; 997 998 /*package*/ static final int kMaxTwoUnitNodeValue= 999 ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff 1000 1001 // Compact delta integers. 1002 /*package*/ static final int kMaxOneUnitDelta=0xfbff; 1003 /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00 1004 /*package*/ static final int kThreeUnitDeltaLead=0xffff; 1005 1006 /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff 1007 1008 // Fixed value referencing the CharsTrie words. 1009 private CharSequence chars_; 1010 private int root_; 1011 1012 // Iterator variables. 1013 1014 // Pointer to next trie unit to read. NULL if no more matches. 1015 private int pos_; 1016 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1017 private int remainingMatchLength_; 1018} 1019