1/* GENERATED SOURCE. DO NOT MODIFY. */ 2// © 2016 and later: Unicode, Inc. and others. 3// License & terms of use: http://www.unicode.org/copyright.html#License 4/* 5******************************************************************************* 6* Copyright (C) 2011-2014, International Business Machines 7* Corporation and others. All Rights Reserved. 8******************************************************************************* 9* created on: 2011jan06 10* created by: Markus W. Scherer 11* ported from ICU4C ucharstrie.h/.cpp 12*/ 13 14package android.icu.util; 15 16import java.io.IOException; 17import java.util.ArrayList; 18import java.util.NoSuchElementException; 19 20import android.icu.text.UTF16; 21import android.icu.util.BytesTrie.Result; 22 23/** 24 * Light-weight, non-const reader class for a CharsTrie. 25 * Traverses a char-serialized data structure with minimal state, 26 * for mapping strings (16-bit-unit sequences) to non-negative integer values. 27 * 28 * <p>This class is not intended for public subclassing. 29 * 30 * @author Markus W. Scherer 31 * @hide Only a subset of ICU is exposed in Android 32 */ 33public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> { 34 /** 35 * Constructs a CharsTrie reader instance. 36 * 37 * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder, 38 * with the offset indicating the first char of that sequence. 39 * The CharsTrie object will not read more chars than 40 * the CharsTrieBuilder generated in the corresponding build() call. 41 * 42 * <p>The CharSequence is not copied/cloned and must not be modified while 43 * the CharsTrie object is in use. 44 * 45 * @param trieChars CharSequence that contains the serialized trie. 46 * @param offset Root offset of the trie in the CharSequence. 47 */ 48 public CharsTrie(CharSequence trieChars, int offset) { 49 chars_=trieChars; 50 pos_=root_=offset; 51 remainingMatchLength_=-1; 52 } 53 54 /** 55 * Clones this trie reader object and its state, 56 * but not the char array which will be shared. 57 * @return A shallow clone of this trie. 58 */ 59 @Override 60 public Object clone() throws CloneNotSupportedException { 61 return super.clone(); // A shallow copy is just what we need. 62 } 63 64 /** 65 * Resets this trie to its initial state. 66 * @return this 67 */ 68 public CharsTrie reset() { 69 pos_=root_; 70 remainingMatchLength_=-1; 71 return this; 72 } 73 74 /** 75 * CharsTrie state object, for saving a trie's current state 76 * and resetting the trie back to this state later. 77 */ 78 public static final class State { 79 /** 80 * Constructs an empty State. 81 */ 82 public State() {} 83 private CharSequence chars; 84 private int root; 85 private int pos; 86 private int remainingMatchLength; 87 } 88 89 /** 90 * Saves the state of this trie. 91 * @param state The State object to hold the trie's state. 92 * @return this 93 * @see #resetToState 94 */ 95 public CharsTrie saveState(State state) /*const*/ { 96 state.chars=chars_; 97 state.root=root_; 98 state.pos=pos_; 99 state.remainingMatchLength=remainingMatchLength_; 100 return this; 101 } 102 103 /** 104 * Resets this trie to the saved state. 105 * @param state The State object which holds a saved trie state. 106 * @return this 107 * @throws IllegalArgumentException if the state object contains no state, 108 * or the state of a different trie 109 * @see #saveState 110 * @see #reset 111 */ 112 public CharsTrie resetToState(State state) { 113 if(chars_==state.chars && chars_!=null && root_==state.root) { 114 pos_=state.pos; 115 remainingMatchLength_=state.remainingMatchLength; 116 } else { 117 throw new IllegalArgumentException("incompatible trie state"); 118 } 119 return this; 120 } 121 122 /** 123 * Determines whether the string so far matches, whether it has a value, 124 * and whether another input char can continue a matching string. 125 * @return The match/value Result. 126 */ 127 public Result current() /*const*/ { 128 int pos=pos_; 129 if(pos<0) { 130 return Result.NO_MATCH; 131 } else { 132 int node; 133 return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 134 valueResults_[node>>15] : Result.NO_VALUE; 135 } 136 } 137 138 /** 139 * Traverses the trie from the initial state for this input char. 140 * Equivalent to reset().next(inUnit). 141 * @param inUnit Input char value. Values below 0 and above 0xffff will never match. 142 * @return The match/value Result. 143 */ 144 public Result first(int inUnit) { 145 remainingMatchLength_=-1; 146 return nextImpl(root_, inUnit); 147 } 148 149 /** 150 * Traverses the trie from the initial state for the 151 * one or two UTF-16 code units for this input code point. 152 * Equivalent to reset().nextForCodePoint(cp). 153 * @param cp A Unicode code point 0..0x10ffff. 154 * @return The match/value Result. 155 */ 156 public Result firstForCodePoint(int cp) { 157 return cp<=0xffff ? 158 first(cp) : 159 (first(UTF16.getLeadSurrogate(cp)).hasNext() ? 160 next(UTF16.getTrailSurrogate(cp)) : 161 Result.NO_MATCH); 162 } 163 164 /** 165 * Traverses the trie from the current state for this input char. 166 * @param inUnit Input char value. Values below 0 and above 0xffff will never match. 167 * @return The match/value Result. 168 */ 169 public Result next(int inUnit) { 170 int pos=pos_; 171 if(pos<0) { 172 return Result.NO_MATCH; 173 } 174 int length=remainingMatchLength_; // Actual remaining match length minus 1. 175 if(length>=0) { 176 // Remaining part of a linear-match node. 177 if(inUnit==chars_.charAt(pos++)) { 178 remainingMatchLength_=--length; 179 pos_=pos; 180 int node; 181 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 182 valueResults_[node>>15] : Result.NO_VALUE; 183 } else { 184 stop(); 185 return Result.NO_MATCH; 186 } 187 } 188 return nextImpl(pos, inUnit); 189 } 190 191 /** 192 * Traverses the trie from the current state for the 193 * one or two UTF-16 code units for this input code point. 194 * @param cp A Unicode code point 0..0x10ffff. 195 * @return The match/value Result. 196 */ 197 public Result nextForCodePoint(int cp) { 198 return cp<=0xffff ? 199 next(cp) : 200 (next(UTF16.getLeadSurrogate(cp)).hasNext() ? 201 next(UTF16.getTrailSurrogate(cp)) : 202 Result.NO_MATCH); 203 } 204 205 /** 206 * Traverses the trie from the current state for this string. 207 * Equivalent to 208 * <pre> 209 * Result result=current(); 210 * for(each c in s) 211 * if(!result.hasNext()) return Result.NO_MATCH; 212 * result=next(c); 213 * return result; 214 * </pre> 215 * @param s Contains a string. 216 * @param sIndex The start index of the string in s. 217 * @param sLimit The (exclusive) end index of the string in s. 218 * @return The match/value Result. 219 */ 220 public Result next(CharSequence s, int sIndex, int sLimit) { 221 if(sIndex>=sLimit) { 222 // Empty input. 223 return current(); 224 } 225 int pos=pos_; 226 if(pos<0) { 227 return Result.NO_MATCH; 228 } 229 int length=remainingMatchLength_; // Actual remaining match length minus 1. 230 for(;;) { 231 // Fetch the next input unit, if there is one. 232 // Continue a linear-match node. 233 char inUnit; 234 for(;;) { 235 if(sIndex==sLimit) { 236 remainingMatchLength_=length; 237 pos_=pos; 238 int node; 239 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 240 valueResults_[node>>15] : Result.NO_VALUE; 241 } 242 inUnit=s.charAt(sIndex++); 243 if(length<0) { 244 remainingMatchLength_=length; 245 break; 246 } 247 if(inUnit!=chars_.charAt(pos)) { 248 stop(); 249 return Result.NO_MATCH; 250 } 251 ++pos; 252 --length; 253 } 254 int node=chars_.charAt(pos++); 255 for(;;) { 256 if(node<kMinLinearMatch) { 257 Result result=branchNext(pos, node, inUnit); 258 if(result==Result.NO_MATCH) { 259 return Result.NO_MATCH; 260 } 261 // Fetch the next input unit, if there is one. 262 if(sIndex==sLimit) { 263 return result; 264 } 265 if(result==Result.FINAL_VALUE) { 266 // No further matching units. 267 stop(); 268 return Result.NO_MATCH; 269 } 270 inUnit=s.charAt(sIndex++); 271 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 272 node=chars_.charAt(pos++); 273 } else if(node<kMinValueLead) { 274 // Match length+1 units. 275 length=node-kMinLinearMatch; // Actual match length minus 1. 276 if(inUnit!=chars_.charAt(pos)) { 277 stop(); 278 return Result.NO_MATCH; 279 } 280 ++pos; 281 --length; 282 break; 283 } else if((node&kValueIsFinal)!=0) { 284 // No further matching units. 285 stop(); 286 return Result.NO_MATCH; 287 } else { 288 // Skip intermediate value. 289 pos=skipNodeValue(pos, node); 290 node&=kNodeTypeMask; 291 } 292 } 293 } 294 } 295 296 /** 297 * Returns a matching string's value if called immediately after 298 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 299 * getValue() can be called multiple times. 300 * 301 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 302 * @return The value for the string so far. 303 */ 304 public int getValue() /*const*/ { 305 int pos=pos_; 306 int leadUnit=chars_.charAt(pos++); 307 assert(leadUnit>=kMinValueLead); 308 return (leadUnit&kValueIsFinal)!=0 ? 309 readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit); 310 } 311 312 /** 313 * Determines whether all strings reachable from the current state 314 * map to the same value, and if so, returns that value. 315 * @return The unique value in bits 32..1 with bit 0 set, 316 * if all strings reachable from the current state 317 * map to the same value; otherwise returns 0. 318 */ 319 public long getUniqueValue() /*const*/ { 320 int pos=pos_; 321 if(pos<0) { 322 return 0; 323 } 324 // Skip the rest of a pending linear-match node. 325 long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0); 326 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 327 return (uniqueValue<<31)>>31; 328 } 329 330 /** 331 * Finds each char which continues the string from the current state. 332 * That is, each char c for which it would be next(c)!=Result.NO_MATCH now. 333 * @param out Each next char is appended to this object. 334 * (Only uses the out.append(c) method.) 335 * @return The number of chars which continue the string from here. 336 */ 337 public int getNextChars(Appendable out) /*const*/ { 338 int pos=pos_; 339 if(pos<0) { 340 return 0; 341 } 342 if(remainingMatchLength_>=0) { 343 append(out, chars_.charAt(pos)); // Next unit of a pending linear-match node. 344 return 1; 345 } 346 int node=chars_.charAt(pos++); 347 if(node>=kMinValueLead) { 348 if((node&kValueIsFinal)!=0) { 349 return 0; 350 } else { 351 pos=skipNodeValue(pos, node); 352 node&=kNodeTypeMask; 353 } 354 } 355 if(node<kMinLinearMatch) { 356 if(node==0) { 357 node=chars_.charAt(pos++); 358 } 359 getNextBranchChars(chars_, pos, ++node, out); 360 return node; 361 } else { 362 // First unit of the linear-match node. 363 append(out, chars_.charAt(pos)); 364 return 1; 365 } 366 } 367 368 /** 369 * Iterates from the current state of this trie. 370 * @return A new CharsTrie.Iterator. 371 */ 372 @Override 373 public Iterator iterator() { 374 return new Iterator(chars_, pos_, remainingMatchLength_, 0); 375 } 376 377 /** 378 * Iterates from the current state of this trie. 379 * @param maxStringLength If 0, the iterator returns full strings. 380 * Otherwise, the iterator returns strings with this maximum length. 381 * @return A new CharsTrie.Iterator. 382 */ 383 public Iterator iterator(int maxStringLength) { 384 return new Iterator(chars_, pos_, remainingMatchLength_, maxStringLength); 385 } 386 387 /** 388 * Iterates from the root of a char-serialized BytesTrie. 389 * @param trieChars CharSequence that contains the serialized trie. 390 * @param offset Root offset of the trie in the CharSequence. 391 * @param maxStringLength If 0, the iterator returns full strings. 392 * Otherwise, the iterator returns strings with this maximum length. 393 * @return A new CharsTrie.Iterator. 394 */ 395 public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) { 396 return new Iterator(trieChars, offset, -1, maxStringLength); 397 } 398 399 /** 400 * Return value type for the Iterator. 401 */ 402 public static final class Entry { 403 /** 404 * The string. 405 */ 406 public CharSequence chars; 407 /** 408 * The value associated with the string. 409 */ 410 public int value; 411 412 private Entry() { 413 } 414 } 415 416 /** 417 * Iterator for all of the (string, value) pairs in a CharsTrie. 418 */ 419 public static final class Iterator implements java.util.Iterator<Entry> { 420 private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) { 421 chars_=trieChars; 422 pos_=initialPos_=offset; 423 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 424 maxLength_=maxStringLength; 425 int length=remainingMatchLength_; // Actual remaining match length minus 1. 426 if(length>=0) { 427 // Pending linear-match node, append remaining bytes to str_. 428 ++length; 429 if(maxLength_>0 && length>maxLength_) { 430 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 431 } 432 str_.append(chars_, pos_, pos_+length); 433 pos_+=length; 434 remainingMatchLength_-=length; 435 } 436 } 437 438 /** 439 * Resets this iterator to its initial state. 440 * @return this 441 */ 442 public Iterator reset() { 443 pos_=initialPos_; 444 remainingMatchLength_=initialRemainingMatchLength_; 445 skipValue_=false; 446 int length=remainingMatchLength_+1; // Remaining match length. 447 if(maxLength_>0 && length>maxLength_) { 448 length=maxLength_; 449 } 450 str_.setLength(length); 451 pos_+=length; 452 remainingMatchLength_-=length; 453 stack_.clear(); 454 return this; 455 } 456 457 /** 458 * @return true if there are more elements. 459 */ 460 @Override 461 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 462 463 /** 464 * Finds the next (string, value) pair if there is one. 465 * 466 * If the string is truncated to the maximum length and does not 467 * have a real value, then the value is set to -1. 468 * In this case, this "not a real value" is indistinguishable from 469 * a real value of -1. 470 * @return An Entry with the string and value of the next element. 471 * @throws NoSuchElementException - iteration has no more elements. 472 */ 473 @Override 474 public Entry next() { 475 int pos=pos_; 476 if(pos<0) { 477 if(stack_.isEmpty()) { 478 throw new NoSuchElementException(); 479 } 480 // Pop the state off the stack and continue with the next outbound edge of 481 // the branch node. 482 long top=stack_.remove(stack_.size()-1); 483 int length=(int)top; 484 pos=(int)(top>>32); 485 str_.setLength(length&0xffff); 486 length>>>=16; 487 if(length>1) { 488 pos=branchNext(pos, length); 489 if(pos<0) { 490 return entry_; // Reached a final value. 491 } 492 } else { 493 str_.append(chars_.charAt(pos++)); 494 } 495 } 496 if(remainingMatchLength_>=0) { 497 // We only get here if we started in a pending linear-match node 498 // with more than maxLength remaining units. 499 return truncateAndStop(); 500 } 501 for(;;) { 502 int node=chars_.charAt(pos++); 503 if(node>=kMinValueLead) { 504 if(skipValue_) { 505 pos=skipNodeValue(pos, node); 506 node&=kNodeTypeMask; 507 skipValue_=false; 508 } else { 509 // Deliver value for the string so far. 510 boolean isFinal=(node&kValueIsFinal)!=0; 511 if(isFinal) { 512 entry_.value=readValue(chars_, pos, node&0x7fff); 513 } else { 514 entry_.value=readNodeValue(chars_, pos, node); 515 } 516 if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) { 517 pos_=-1; 518 } else { 519 // We cannot skip the value right here because it shares its 520 // lead unit with a match node which we have to evaluate 521 // next time. 522 // Instead, keep pos_ on the node lead unit itself. 523 pos_=pos-1; 524 skipValue_=true; 525 } 526 entry_.chars=str_; 527 return entry_; 528 } 529 } 530 if(maxLength_>0 && str_.length()==maxLength_) { 531 return truncateAndStop(); 532 } 533 if(node<kMinLinearMatch) { 534 if(node==0) { 535 node=chars_.charAt(pos++); 536 } 537 pos=branchNext(pos, node+1); 538 if(pos<0) { 539 return entry_; // Reached a final value. 540 } 541 } else { 542 // Linear-match node, append length units to str_. 543 int length=node-kMinLinearMatch+1; 544 if(maxLength_>0 && str_.length()+length>maxLength_) { 545 str_.append(chars_, pos, pos+maxLength_-str_.length()); 546 return truncateAndStop(); 547 } 548 str_.append(chars_, pos, pos+length); 549 pos+=length; 550 } 551 } 552 } 553 554 /** 555 * Iterator.remove() is not supported. 556 * @throws UnsupportedOperationException (always) 557 */ 558 @Override 559 public void remove() { 560 throw new UnsupportedOperationException(); 561 } 562 563 private Entry truncateAndStop() { 564 pos_=-1; 565 // We reset entry_.chars every time we return entry_ 566 // just because the caller might have modified the Entry. 567 entry_.chars=str_; 568 entry_.value=-1; // no real value for str 569 return entry_; 570 } 571 572 private int branchNext(int pos, int length) { 573 while(length>kMaxBranchLinearSubNodeLength) { 574 ++pos; // ignore the comparison unit 575 // Push state for the greater-or-equal edge. 576 stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length()); 577 // Follow the less-than edge. 578 length>>=1; 579 pos=jumpByDelta(chars_, pos); 580 } 581 // List of key-value pairs where values are either final values or jump deltas. 582 // Read the first (key, value) pair. 583 char trieUnit=chars_.charAt(pos++); 584 int node=chars_.charAt(pos++); 585 boolean isFinal=(node&kValueIsFinal)!=0; 586 int value=readValue(chars_, pos, node&=0x7fff); 587 pos=skipValue(pos, node); 588 stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length()); 589 str_.append(trieUnit); 590 if(isFinal) { 591 pos_=-1; 592 entry_.chars=str_; 593 entry_.value=value; 594 return -1; 595 } else { 596 return pos+value; 597 } 598 } 599 600 private CharSequence chars_; 601 private int pos_; 602 private int initialPos_; 603 private int remainingMatchLength_; 604 private int initialRemainingMatchLength_; 605 private boolean skipValue_; // Skip intermediate value which was already delivered. 606 607 private StringBuilder str_=new StringBuilder(); 608 private int maxLength_; 609 private Entry entry_=new Entry(); 610 611 // The stack stores longs for backtracking to another 612 // outbound edge of a branch node. 613 // Each long has the offset in chars_ in bits 62..32, 614 // the str_.length() from before the node in bits 15..0, 615 // and the remaining branch length in bits 31..16. 616 // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31, 617 // but the code looks more confusing that way.) 618 private ArrayList<Long> stack_=new ArrayList<Long>(); 619 } 620 621 private void stop() { 622 pos_=-1; 623 } 624 625 // Reads a compact 32-bit integer. 626 // pos is already after the leadUnit, and the lead unit has bit 15 reset. 627 private static int readValue(CharSequence chars, int pos, int leadUnit) { 628 int value; 629 if(leadUnit<kMinTwoUnitValueLead) { 630 value=leadUnit; 631 } else if(leadUnit<kThreeUnitValueLead) { 632 value=((leadUnit-kMinTwoUnitValueLead)<<16)|chars.charAt(pos); 633 } else { 634 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 635 } 636 return value; 637 } 638 private static int skipValue(int pos, int leadUnit) { 639 if(leadUnit>=kMinTwoUnitValueLead) { 640 if(leadUnit<kThreeUnitValueLead) { 641 ++pos; 642 } else { 643 pos+=2; 644 } 645 } 646 return pos; 647 } 648 private static int skipValue(CharSequence chars, int pos) { 649 int leadUnit=chars.charAt(pos++); 650 return skipValue(pos, leadUnit&0x7fff); 651 } 652 653 private static int readNodeValue(CharSequence chars, int pos, int leadUnit) { 654 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 655 int value; 656 if(leadUnit<kMinTwoUnitNodeValueLead) { 657 value=(leadUnit>>6)-1; 658 } else if(leadUnit<kThreeUnitNodeValueLead) { 659 value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|chars.charAt(pos); 660 } else { 661 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 662 } 663 return value; 664 } 665 private static int skipNodeValue(int pos, int leadUnit) { 666 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 667 if(leadUnit>=kMinTwoUnitNodeValueLead) { 668 if(leadUnit<kThreeUnitNodeValueLead) { 669 ++pos; 670 } else { 671 pos+=2; 672 } 673 } 674 return pos; 675 } 676 677 private static int jumpByDelta(CharSequence chars, int pos) { 678 int delta=chars.charAt(pos++); 679 if(delta>=kMinTwoUnitDeltaLead) { 680 if(delta==kThreeUnitDeltaLead) { 681 delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 682 pos+=2; 683 } else { 684 delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++); 685 } 686 } 687 return pos+delta; 688 } 689 690 private static int skipDelta(CharSequence chars, int pos) { 691 int delta=chars.charAt(pos++); 692 if(delta>=kMinTwoUnitDeltaLead) { 693 if(delta==kThreeUnitDeltaLead) { 694 pos+=2; 695 } else { 696 ++pos; 697 } 698 } 699 return pos; 700 } 701 702 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 703 704 // Handles a branch node for both next(unit) and next(string). 705 private Result branchNext(int pos, int length, int inUnit) { 706 // Branch according to the current unit. 707 if(length==0) { 708 length=chars_.charAt(pos++); 709 } 710 ++length; 711 // The length of the branch is the number of units to select from. 712 // The data structure encodes a binary search. 713 while(length>kMaxBranchLinearSubNodeLength) { 714 if(inUnit<chars_.charAt(pos++)) { 715 length>>=1; 716 pos=jumpByDelta(chars_, pos); 717 } else { 718 length=length-(length>>1); 719 pos=skipDelta(chars_, pos); 720 } 721 } 722 // Drop down to linear search for the last few units. 723 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 724 // and divides length by 2. 725 do { 726 if(inUnit==chars_.charAt(pos++)) { 727 Result result; 728 int node=chars_.charAt(pos); 729 if((node&kValueIsFinal)!=0) { 730 // Leave the final value for getValue() to read. 731 result=Result.FINAL_VALUE; 732 } else { 733 // Use the non-final value as the jump delta. 734 ++pos; 735 // int delta=readValue(pos, node); 736 int delta; 737 if(node<kMinTwoUnitValueLead) { 738 delta=node; 739 } else if(node<kThreeUnitValueLead) { 740 delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++); 741 } else { 742 delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1); 743 pos+=2; 744 } 745 // end readValue() 746 pos+=delta; 747 node=chars_.charAt(pos); 748 result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; 749 } 750 pos_=pos; 751 return result; 752 } 753 --length; 754 pos=skipValue(chars_, pos); 755 } while(length>1); 756 if(inUnit==chars_.charAt(pos++)) { 757 pos_=pos; 758 int node=chars_.charAt(pos); 759 return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; 760 } else { 761 stop(); 762 return Result.NO_MATCH; 763 } 764 } 765 766 // Requires remainingLength_<0. 767 private Result nextImpl(int pos, int inUnit) { 768 int node=chars_.charAt(pos++); 769 for(;;) { 770 if(node<kMinLinearMatch) { 771 return branchNext(pos, node, inUnit); 772 } else if(node<kMinValueLead) { 773 // Match the first of length+1 units. 774 int length=node-kMinLinearMatch; // Actual match length minus 1. 775 if(inUnit==chars_.charAt(pos++)) { 776 remainingMatchLength_=--length; 777 pos_=pos; 778 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 779 valueResults_[node>>15] : Result.NO_VALUE; 780 } else { 781 // No match. 782 break; 783 } 784 } else if((node&kValueIsFinal)!=0) { 785 // No further matching units. 786 break; 787 } else { 788 // Skip intermediate value. 789 pos=skipNodeValue(pos, node); 790 node&=kNodeTypeMask; 791 } 792 } 793 stop(); 794 return Result.NO_MATCH; 795 } 796 797 // Helper functions for getUniqueValue(). 798 // Recursively finds a unique value (or whether there is not a unique one) 799 // from a branch. 800 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 801 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. 802 private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length, 803 long uniqueValue) { 804 while(length>kMaxBranchLinearSubNodeLength) { 805 ++pos; // ignore the comparison unit 806 uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue); 807 if(uniqueValue==0) { 808 return 0; 809 } 810 length=length-(length>>1); 811 pos=skipDelta(chars, pos); 812 } 813 do { 814 ++pos; // ignore a comparison unit 815 // handle its value 816 int node=chars.charAt(pos++); 817 boolean isFinal=(node&kValueIsFinal)!=0; 818 node&=0x7fff; 819 int value=readValue(chars, pos, node); 820 pos=skipValue(pos, node); 821 if(isFinal) { 822 if(uniqueValue!=0) { 823 if(value!=(int)(uniqueValue>>1)) { 824 return 0; 825 } 826 } else { 827 uniqueValue=((long)value<<1)|1; 828 } 829 } else { 830 uniqueValue=findUniqueValue(chars, pos+value, uniqueValue); 831 if(uniqueValue==0) { 832 return 0; 833 } 834 } 835 } while(--length>1); 836 // ignore the last comparison byte 837 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 838 } 839 // Recursively finds a unique value (or whether there is not a unique one) 840 // starting from a position on a node lead unit. 841 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 842 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. 843 private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) { 844 int node=chars.charAt(pos++); 845 for(;;) { 846 if(node<kMinLinearMatch) { 847 if(node==0) { 848 node=chars.charAt(pos++); 849 } 850 uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue); 851 if(uniqueValue==0) { 852 return 0; 853 } 854 pos=(int)(uniqueValue>>>33); 855 node=chars.charAt(pos++); 856 } else if(node<kMinValueLead) { 857 // linear-match node 858 pos+=node-kMinLinearMatch+1; // Ignore the match units. 859 node=chars.charAt(pos++); 860 } else { 861 boolean isFinal=(node&kValueIsFinal)!=0; 862 int value; 863 if(isFinal) { 864 value=readValue(chars, pos, node&0x7fff); 865 } else { 866 value=readNodeValue(chars, pos, node); 867 } 868 if(uniqueValue!=0) { 869 if(value!=(int)(uniqueValue>>1)) { 870 return 0; 871 } 872 } else { 873 uniqueValue=((long)value<<1)|1; 874 } 875 if(isFinal) { 876 return uniqueValue; 877 } 878 pos=skipNodeValue(pos, node); 879 node&=kNodeTypeMask; 880 } 881 } 882 } 883 884 // Helper functions for getNextChars(). 885 // getNextChars() when pos is on a branch node. 886 private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) { 887 while(length>kMaxBranchLinearSubNodeLength) { 888 ++pos; // ignore the comparison unit 889 getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out); 890 length=length-(length>>1); 891 pos=skipDelta(chars, pos); 892 } 893 do { 894 append(out, chars.charAt(pos++)); 895 pos=skipValue(chars, pos); 896 } while(--length>1); 897 append(out, chars.charAt(pos)); 898 } 899 private static void append(Appendable out, int c) { 900 try { 901 out.append((char)c); 902 } catch(IOException e) { 903 throw new ICUUncheckedIOException(e); 904 } 905 } 906 907 // CharsTrie data structure 908 // 909 // The trie consists of a series of char-serialized nodes for incremental 910 // Unicode string/char sequence matching. (char=16-bit unsigned integer) 911 // The root node is at the beginning of the trie data. 912 // 913 // Types of nodes are distinguished by their node lead unit ranges. 914 // After each node, except a final-value node, another node follows to 915 // encode match values or continue matching further units. 916 // 917 // Node types: 918 // - Final-value node: Stores a 32-bit integer in a compact, variable-length format. 919 // The value is for the string/char sequence so far. 920 // - Match node, optionally with an intermediate value in a different compact format. 921 // The value, if present, is for the string/char sequence so far. 922 // 923 // Aside from the value, which uses the node lead unit's high bits: 924 // 925 // - Linear-match node: Matches a number of units. 926 // - Branch node: Branches to other nodes according to the current input unit. 927 // The node unit is the length of the branch (number of units to select from) 928 // minus 1. It is followed by a sub-node: 929 // - If the length is at most kMaxBranchLinearSubNodeLength, then 930 // there are length-1 (key, value) pairs and then one more comparison unit. 931 // If one of the key units matches, then the value is either a final value for 932 // the string so far, or a "jump" delta to the next node. 933 // If the last unit matches, then matching continues with the next node. 934 // (Values have the same encoding as final-value nodes.) 935 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 936 // there is one unit and one "jump" delta. 937 // If the input unit is less than the sub-node unit, then "jump" by delta to 938 // the next sub-node which will have a length of length/2. 939 // (The delta has its own compact encoding.) 940 // Otherwise, skip the "jump" delta to the next sub-node 941 // which will have a length of length-length/2. 942 943 // Match-node lead unit values, after masking off intermediate-value bits: 944 945 // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise 946 // the length is one more than the next unit. 947 948 // For a branch sub-node with at most this many entries, we drop down 949 // to a linear search. 950 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 951 952 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node. 953 /*package*/ static final int kMinLinearMatch=0x30; 954 /*package*/ static final int kMaxLinearMatchLength=0x10; 955 956 // Match-node lead unit bits 14..6 for the optional intermediate value. 957 // If these bits are 0, then there is no intermediate value. 958 // Otherwise, see the *NodeValue* constants below. 959 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040 960 /*package*/ static final int kNodeTypeMask=kMinValueLead-1; // 0x003f 961 962 // A final-value node has bit 15 set. 963 /*package*/ static final int kValueIsFinal=0x8000; 964 965 // Compact value: After testing and masking off bit 15, use the following thresholds. 966 /*package*/ static final int kMaxOneUnitValue=0x3fff; 967 968 /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000 969 /*package*/ static final int kThreeUnitValueLead=0x7fff; 970 971 /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff 972 973 // Compact intermediate-value integer, lead unit shared with a branch or linear-match node. 974 /*package*/ static final int kMaxOneUnitNodeValue=0xff; 975 /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040 976 /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0; 977 978 /*package*/ static final int kMaxTwoUnitNodeValue= 979 ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff 980 981 // Compact delta integers. 982 /*package*/ static final int kMaxOneUnitDelta=0xfbff; 983 /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00 984 /*package*/ static final int kThreeUnitDeltaLead=0xffff; 985 986 /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff 987 988 // Fixed value referencing the CharsTrie words. 989 private CharSequence chars_; 990 private int root_; 991 992 // Iterator variables. 993 994 // Pointer to next trie unit to read. NULL if no more matches. 995 private int pos_; 996 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 997 private int remainingMatchLength_; 998} 999