BytesTrie.java revision 836e6b40a94ec3fb7545a76cb072960442b7eee9
1/* GENERATED SOURCE. DO NOT MODIFY. */ 2/* 3******************************************************************************* 4* Copyright (C) 2010-2014, International Business Machines 5* Corporation and others. All Rights Reserved. 6******************************************************************************* 7* created on: 2010nov23 8* created by: Markus W. Scherer 9* ported from ICU4C bytestrie.h/.cpp 10*/ 11package android.icu.util; 12 13import java.io.IOException; 14import java.nio.ByteBuffer; 15import java.util.ArrayList; 16import java.util.NoSuchElementException; 17 18/** 19 * Light-weight, non-const reader class for a BytesTrie. 20 * Traverses a byte-serialized data structure with minimal state, 21 * for mapping byte sequences to non-negative integer values. 22 * 23 * <p>This class is not intended for public subclassing. 24 * 25 * @stable ICU 4.8 26 * @author Markus W. Scherer 27 * @hide Only a subset of ICU is exposed in Android 28 * @hide All android.icu classes are currently hidden 29 */ 30public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> { 31 /** 32 * Constructs a BytesTrie reader instance. 33 * 34 * <p>The array must contain a copy of a byte sequence from the BytesTrieBuilder, 35 * with the offset indicating the first byte of that sequence. 36 * The BytesTrie object will not read more bytes than 37 * the BytesTrieBuilder generated in the corresponding build() call. 38 * 39 * <p>The array is not copied/cloned and must not be modified while 40 * the BytesTrie object is in use. 41 * 42 * @param trieBytes Bytes array that contains the serialized trie. 43 * @param offset Root offset of the trie in the array. 44 * @stable ICU 4.8 45 */ 46 public BytesTrie(byte[] trieBytes, int offset) { 47 bytes_=trieBytes; 48 pos_=root_=offset; 49 remainingMatchLength_=-1; 50 } 51 52 /** 53 * Clones this trie reader object and its state, 54 * but not the byte array which will be shared. 55 * @return A shallow clone of this trie. 56 * @stable ICU 4.8 57 */ 58 @Override 59 public Object clone() throws CloneNotSupportedException { 60 return super.clone(); // A shallow copy is just what we need. 61 } 62 63 /** 64 * Resets this trie to its initial state. 65 * @return this 66 * @stable ICU 4.8 67 */ 68 public BytesTrie reset() { 69 pos_=root_; 70 remainingMatchLength_=-1; 71 return this; 72 } 73 74 /** 75 * BytesTrie state object, for saving a trie's current state 76 * and resetting the trie back to this state later. 77 * @stable ICU 4.8 78 */ 79 public static final class State { 80 /** 81 * Constructs an empty State. 82 * @stable ICU 4.8 83 */ 84 public State() {} 85 private byte[] bytes; 86 private int root; 87 private int pos; 88 private int remainingMatchLength; 89 } 90 91 /** 92 * Saves the state of this trie. 93 * @param state The State object to hold the trie's state. 94 * @return this 95 * @see #resetToState 96 * @stable ICU 4.8 97 */ 98 public BytesTrie saveState(State state) /*const*/ { 99 state.bytes=bytes_; 100 state.root=root_; 101 state.pos=pos_; 102 state.remainingMatchLength=remainingMatchLength_; 103 return this; 104 } 105 106 /** 107 * Resets this trie to the saved state. 108 * @param state The State object which holds a saved trie state. 109 * @return this 110 * @throws IllegalArgumentException if the state object contains no state, 111 * or the state of a different trie 112 * @see #saveState 113 * @see #reset 114 * @stable ICU 4.8 115 */ 116 public BytesTrie resetToState(State state) { 117 if(bytes_==state.bytes && bytes_!=null && root_==state.root) { 118 pos_=state.pos; 119 remainingMatchLength_=state.remainingMatchLength; 120 } else { 121 throw new IllegalArgumentException("incompatible trie state"); 122 } 123 return this; 124 } 125 126 /** 127 * Return values for BytesTrie.next(), CharsTrie.next() and similar methods. 128 * @stable ICU 4.8 129 */ 130 public enum Result { 131 /** 132 * The input unit(s) did not continue a matching string. 133 * Once current()/next() return NO_MATCH, 134 * all further calls to current()/next() will also return NO_MATCH, 135 * until the trie is reset to its original state or to a saved state. 136 * @stable ICU 4.8 137 */ 138 NO_MATCH, 139 /** 140 * The input unit(s) continued a matching string 141 * but there is no value for the string so far. 142 * (It is a prefix of a longer string.) 143 * @stable ICU 4.8 144 */ 145 NO_VALUE, 146 /** 147 * The input unit(s) continued a matching string 148 * and there is a value for the string so far. 149 * This value will be returned by getValue(). 150 * No further input byte/unit can continue a matching string. 151 * @stable ICU 4.8 152 */ 153 FINAL_VALUE, 154 /** 155 * The input unit(s) continued a matching string 156 * and there is a value for the string so far. 157 * This value will be returned by getValue(). 158 * Another input byte/unit can continue a matching string. 159 * @stable ICU 4.8 160 */ 161 INTERMEDIATE_VALUE; 162 163 // Note: The following methods assume the particular order 164 // of enum constants, treating the ordinal() values like bit sets. 165 // Do not reorder the enum constants! 166 167 /** 168 * Same as (result!=NO_MATCH). 169 * @return true if the input bytes/units so far are part of a matching string/byte sequence. 170 * @stable ICU 4.8 171 */ 172 public boolean matches() { return this!=NO_MATCH; } 173 174 /** 175 * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE). 176 * @return true if there is a value for the input bytes/units so far. 177 * @see #getValue 178 * @stable ICU 4.8 179 */ 180 public boolean hasValue() { return ordinal()>=2; } 181 182 /** 183 * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE). 184 * @return true if another input byte/unit can continue a matching string. 185 * @stable ICU 4.8 186 */ 187 public boolean hasNext() { return (ordinal()&1)!=0; } 188 } 189 190 /** 191 * Determines whether the byte sequence so far matches, whether it has a value, 192 * and whether another input byte can continue a matching byte sequence. 193 * @return The match/value Result. 194 * @stable ICU 4.8 195 */ 196 public Result current() /*const*/ { 197 int pos=pos_; 198 if(pos<0) { 199 return Result.NO_MATCH; 200 } else { 201 int node; 202 return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 203 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 204 } 205 } 206 207 /** 208 * Traverses the trie from the initial state for this input byte. 209 * Equivalent to reset().next(inByte). 210 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 211 * Values below -0x100 and above 0xff will never match. 212 * @return The match/value Result. 213 * @stable ICU 4.8 214 */ 215 public Result first(int inByte) { 216 remainingMatchLength_=-1; 217 if(inByte<0) { 218 inByte+=0x100; 219 } 220 return nextImpl(root_, inByte); 221 } 222 223 /** 224 * Traverses the trie from the current state for this input byte. 225 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 226 * Values below -0x100 and above 0xff will never match. 227 * @return The match/value Result. 228 * @stable ICU 4.8 229 */ 230 public Result next(int inByte) { 231 int pos=pos_; 232 if(pos<0) { 233 return Result.NO_MATCH; 234 } 235 if(inByte<0) { 236 inByte+=0x100; 237 } 238 int length=remainingMatchLength_; // Actual remaining match length minus 1. 239 if(length>=0) { 240 // Remaining part of a linear-match node. 241 if(inByte==(bytes_[pos++]&0xff)) { 242 remainingMatchLength_=--length; 243 pos_=pos; 244 int node; 245 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 246 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 247 } else { 248 stop(); 249 return Result.NO_MATCH; 250 } 251 } 252 return nextImpl(pos, inByte); 253 } 254 255 /** 256 * Traverses the trie from the current state for this byte sequence. 257 * Equivalent to 258 * <pre> 259 * Result result=current(); 260 * for(each c in s) 261 * if(!result.hasNext()) return Result.NO_MATCH; 262 * result=next(c); 263 * return result; 264 * </pre> 265 * @param s Contains a string or byte sequence. 266 * @param sIndex The start index of the byte sequence in s. 267 * @param sLimit The (exclusive) end index of the byte sequence in s. 268 * @return The match/value Result. 269 * @stable ICU 4.8 270 */ 271 public Result next(byte[] s, int sIndex, int sLimit) { 272 if(sIndex>=sLimit) { 273 // Empty input. 274 return current(); 275 } 276 int pos=pos_; 277 if(pos<0) { 278 return Result.NO_MATCH; 279 } 280 int length=remainingMatchLength_; // Actual remaining match length minus 1. 281 for(;;) { 282 // Fetch the next input byte, if there is one. 283 // Continue a linear-match node. 284 byte inByte; 285 for(;;) { 286 if(sIndex==sLimit) { 287 remainingMatchLength_=length; 288 pos_=pos; 289 int node; 290 return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ? 291 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 292 } 293 inByte=s[sIndex++]; 294 if(length<0) { 295 remainingMatchLength_=length; 296 break; 297 } 298 if(inByte!=bytes_[pos]) { 299 stop(); 300 return Result.NO_MATCH; 301 } 302 ++pos; 303 --length; 304 } 305 for(;;) { 306 int node=bytes_[pos++]&0xff; 307 if(node<kMinLinearMatch) { 308 Result result=branchNext(pos, node, inByte&0xff); 309 if(result==Result.NO_MATCH) { 310 return Result.NO_MATCH; 311 } 312 // Fetch the next input byte, if there is one. 313 if(sIndex==sLimit) { 314 return result; 315 } 316 if(result==Result.FINAL_VALUE) { 317 // No further matching bytes. 318 stop(); 319 return Result.NO_MATCH; 320 } 321 inByte=s[sIndex++]; 322 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 323 } else if(node<kMinValueLead) { 324 // Match length+1 bytes. 325 length=node-kMinLinearMatch; // Actual match length minus 1. 326 if(inByte!=bytes_[pos]) { 327 stop(); 328 return Result.NO_MATCH; 329 } 330 ++pos; 331 --length; 332 break; 333 } else if((node&kValueIsFinal)!=0) { 334 // No further matching bytes. 335 stop(); 336 return Result.NO_MATCH; 337 } else { 338 // Skip intermediate value. 339 pos=skipValue(pos, node); 340 // The next node must not also be a value node. 341 assert((bytes_[pos]&0xff)<kMinValueLead); 342 } 343 } 344 } 345 } 346 347 /** 348 * Returns a matching byte sequence's value if called immediately after 349 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 350 * getValue() can be called multiple times. 351 * 352 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 353 * @return The value for the byte sequence so far. 354 * @stable ICU 4.8 355 */ 356 public int getValue() /*const*/ { 357 int pos=pos_; 358 int leadByte=bytes_[pos++]&0xff; 359 assert(leadByte>=kMinValueLead); 360 return readValue(bytes_, pos, leadByte>>1); 361 } 362 363 /** 364 * Determines whether all byte sequences reachable from the current state 365 * map to the same value, and if so, returns that value. 366 * @return The unique value in bits 32..1 with bit 0 set, 367 * if all byte sequences reachable from the current state 368 * map to the same value; otherwise returns 0. 369 * @stable ICU 4.8 370 */ 371 public long getUniqueValue() /*const*/ { 372 int pos=pos_; 373 if(pos<0) { 374 return 0; 375 } 376 // Skip the rest of a pending linear-match node. 377 long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0); 378 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 379 return (uniqueValue<<31)>>31; 380 } 381 382 /** 383 * Finds each byte which continues the byte sequence from the current state. 384 * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now. 385 * @param out Each next byte is 0-extended to a char and appended to this object. 386 * (Only uses the out.append(c) method.) 387 * @return The number of bytes which continue the byte sequence from here. 388 * @stable ICU 4.8 389 */ 390 public int getNextBytes(Appendable out) /*const*/ { 391 int pos=pos_; 392 if(pos<0) { 393 return 0; 394 } 395 if(remainingMatchLength_>=0) { 396 append(out, bytes_[pos]&0xff); // Next byte of a pending linear-match node. 397 return 1; 398 } 399 int node=bytes_[pos++]&0xff; 400 if(node>=kMinValueLead) { 401 if((node&kValueIsFinal)!=0) { 402 return 0; 403 } else { 404 pos=skipValue(pos, node); 405 node=bytes_[pos++]&0xff; 406 assert(node<kMinValueLead); 407 } 408 } 409 if(node<kMinLinearMatch) { 410 if(node==0) { 411 node=bytes_[pos++]&0xff; 412 } 413 getNextBranchBytes(bytes_, pos, ++node, out); 414 return node; 415 } else { 416 // First byte of the linear-match node. 417 append(out, bytes_[pos]&0xff); 418 return 1; 419 } 420 } 421 422 /** 423 * Iterates from the current state of this trie. 424 * @return A new BytesTrie.Iterator. 425 * @stable ICU 4.8 426 */ 427 public Iterator iterator() { 428 return new Iterator(bytes_, pos_, remainingMatchLength_, 0); 429 } 430 431 /** 432 * Iterates from the current state of this trie. 433 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 434 * Otherwise, the iterator returns strings with this maximum length. 435 * @return A new BytesTrie.Iterator. 436 * @stable ICU 4.8 437 */ 438 public Iterator iterator(int maxStringLength) { 439 return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength); 440 } 441 442 /** 443 * Iterates from the root of a byte-serialized BytesTrie. 444 * @param trieBytes Bytes array that contains the serialized trie. 445 * @param offset Root offset of the trie in the array. 446 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 447 * Otherwise, the iterator returns strings with this maximum length. 448 * @return A new BytesTrie.Iterator. 449 * @stable ICU 4.8 450 */ 451 public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) { 452 return new Iterator(trieBytes, offset, -1, maxStringLength); 453 } 454 455 /** 456 * Return value type for the Iterator. 457 * @stable ICU 4.8 458 */ 459 public static final class Entry { 460 private Entry(int capacity) { 461 bytes=new byte[capacity]; 462 } 463 464 /** 465 * @return The length of the byte sequence. 466 * @stable ICU 4.8 467 */ 468 public int bytesLength() { return length; } 469 /** 470 * Returns a byte of the byte sequence. 471 * @param index An index into the byte sequence. 472 * @return The index-th byte sequence byte. 473 * @stable ICU 4.8 474 */ 475 public byte byteAt(int index) { return bytes[index]; } 476 /** 477 * Copies the byte sequence into a byte array. 478 * @param dest Destination byte array. 479 * @param destOffset Starting offset to where in dest the byte sequence is copied. 480 * @stable ICU 4.8 481 */ 482 public void copyBytesTo(byte[] dest, int destOffset) { 483 System.arraycopy(bytes, 0, dest, destOffset, length); 484 } 485 /** 486 * @return The byte sequence as a read-only ByteBuffer. 487 * @stable ICU 4.8 488 */ 489 public ByteBuffer bytesAsByteBuffer() { 490 return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer(); 491 } 492 493 /** 494 * The value associated with the byte sequence. 495 * @stable ICU 4.8 496 */ 497 public int value; 498 499 private void ensureCapacity(int len) { 500 if(bytes.length<len) { 501 byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)]; 502 System.arraycopy(bytes, 0, newBytes, 0, length); 503 bytes=newBytes; 504 } 505 } 506 private void append(byte b) { 507 ensureCapacity(length+1); 508 bytes[length++]=b; 509 } 510 private void append(byte[] b, int off, int len) { 511 ensureCapacity(length+len); 512 System.arraycopy(b, off, bytes, length, len); 513 length+=len; 514 } 515 private void truncateString(int newLength) { length=newLength; } 516 517 private byte[] bytes; 518 private int length; 519 } 520 521 /** 522 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 523 * @stable ICU 4.8 524 */ 525 public static final class Iterator implements java.util.Iterator<Entry> { 526 private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) { 527 bytes_=trieBytes; 528 pos_=initialPos_=offset; 529 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 530 maxLength_=maxStringLength; 531 entry_=new Entry(maxLength_!=0 ? maxLength_ : 32); 532 int length=remainingMatchLength_; // Actual remaining match length minus 1. 533 if(length>=0) { 534 // Pending linear-match node, append remaining bytes to entry_. 535 ++length; 536 if(maxLength_>0 && length>maxLength_) { 537 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 538 } 539 entry_.append(bytes_, pos_, length); 540 pos_+=length; 541 remainingMatchLength_-=length; 542 } 543 } 544 545 /** 546 * Resets this iterator to its initial state. 547 * @return this 548 * @stable ICU 4.8 549 */ 550 public Iterator reset() { 551 pos_=initialPos_; 552 remainingMatchLength_=initialRemainingMatchLength_; 553 int length=remainingMatchLength_+1; // Remaining match length. 554 if(maxLength_>0 && length>maxLength_) { 555 length=maxLength_; 556 } 557 entry_.truncateString(length); 558 pos_+=length; 559 remainingMatchLength_-=length; 560 stack_.clear(); 561 return this; 562 } 563 564 /** 565 * @return true if there are more elements. 566 * @stable ICU 4.8 567 */ 568 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 569 570 /** 571 * Finds the next (byte sequence, value) pair if there is one. 572 * 573 * If the byte sequence is truncated to the maximum length and does not 574 * have a real value, then the value is set to -1. 575 * In this case, this "not a real value" is indistinguishable from 576 * a real value of -1. 577 * @return An Entry with the string and value of the next element. 578 * @throws NoSuchElementException - iteration has no more elements. 579 * @stable ICU 4.8 580 */ 581 public Entry next() { 582 int pos=pos_; 583 if(pos<0) { 584 if(stack_.isEmpty()) { 585 throw new NoSuchElementException(); 586 } 587 // Pop the state off the stack and continue with the next outbound edge of 588 // the branch node. 589 long top=stack_.remove(stack_.size()-1); 590 int length=(int)top; 591 pos=(int)(top>>32); 592 entry_.truncateString(length&0xffff); 593 length>>>=16; 594 if(length>1) { 595 pos=branchNext(pos, length); 596 if(pos<0) { 597 return entry_; // Reached a final value. 598 } 599 } else { 600 entry_.append(bytes_[pos++]); 601 } 602 } 603 if(remainingMatchLength_>=0) { 604 // We only get here if we started in a pending linear-match node 605 // with more than maxLength remaining bytes. 606 return truncateAndStop(); 607 } 608 for(;;) { 609 int node=bytes_[pos++]&0xff; 610 if(node>=kMinValueLead) { 611 // Deliver value for the byte sequence so far. 612 boolean isFinal=(node&kValueIsFinal)!=0; 613 entry_.value=readValue(bytes_, pos, node>>1); 614 if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) { 615 pos_=-1; 616 } else { 617 pos_=skipValue(pos, node); 618 } 619 return entry_; 620 } 621 if(maxLength_>0 && entry_.length==maxLength_) { 622 return truncateAndStop(); 623 } 624 if(node<kMinLinearMatch) { 625 if(node==0) { 626 node=bytes_[pos++]&0xff; 627 } 628 pos=branchNext(pos, node+1); 629 if(pos<0) { 630 return entry_; // Reached a final value. 631 } 632 } else { 633 // Linear-match node, append length bytes to entry_. 634 int length=node-kMinLinearMatch+1; 635 if(maxLength_>0 && entry_.length+length>maxLength_) { 636 entry_.append(bytes_, pos, maxLength_-entry_.length); 637 return truncateAndStop(); 638 } 639 entry_.append(bytes_, pos, length); 640 pos+=length; 641 } 642 } 643 } 644 645 /** 646 * Iterator.remove() is not supported. 647 * @throws UnsupportedOperationException (always) 648 * @stable ICU 4.8 649 */ 650 public void remove() { 651 throw new UnsupportedOperationException(); 652 } 653 654 private Entry truncateAndStop() { 655 pos_=-1; 656 entry_.value=-1; // no real value for str 657 return entry_; 658 } 659 660 private int branchNext(int pos, int length) { 661 while(length>kMaxBranchLinearSubNodeLength) { 662 ++pos; // ignore the comparison byte 663 // Push state for the greater-or-equal edge. 664 stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length); 665 // Follow the less-than edge. 666 length>>=1; 667 pos=jumpByDelta(bytes_, pos); 668 } 669 // List of key-value pairs where values are either final values or jump deltas. 670 // Read the first (key, value) pair. 671 byte trieByte=bytes_[pos++]; 672 int node=bytes_[pos++]&0xff; 673 boolean isFinal=(node&kValueIsFinal)!=0; 674 int value=readValue(bytes_, pos, node>>1); 675 pos=skipValue(pos, node); 676 stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length); 677 entry_.append(trieByte); 678 if(isFinal) { 679 pos_=-1; 680 entry_.value=value; 681 return -1; 682 } else { 683 return pos+value; 684 } 685 } 686 687 private byte[] bytes_; 688 private int pos_; 689 private int initialPos_; 690 private int remainingMatchLength_; 691 private int initialRemainingMatchLength_; 692 693 private int maxLength_; 694 private Entry entry_; 695 696 // The stack stores longs for backtracking to another 697 // outbound edge of a branch node. 698 // Each long has the offset from bytes_ in bits 62..32, 699 // the entry_.stringLength() from before the node in bits 15..0, 700 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 701 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 702 // but the code looks more confusing that way.) 703 private ArrayList<Long> stack_=new ArrayList<Long>(); 704 } 705 706 private void stop() { 707 pos_=-1; 708 } 709 710 // Reads a compact 32-bit integer. 711 // pos is already after the leadByte, and the lead byte is already shifted right by 1. 712 private static int readValue(byte[] bytes, int pos, int leadByte) { 713 int value; 714 if(leadByte<kMinTwoByteValueLead) { 715 value=leadByte-kMinOneByteValueLead; 716 } else if(leadByte<kMinThreeByteValueLead) { 717 value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff); 718 } else if(leadByte<kFourByteValueLead) { 719 value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 720 } else if(leadByte==kFourByteValueLead) { 721 value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 722 } else { 723 value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 724 } 725 return value; 726 } 727 private static int skipValue(int pos, int leadByte) { 728 assert(leadByte>=kMinValueLead); 729 if(leadByte>=(kMinTwoByteValueLead<<1)) { 730 if(leadByte<(kMinThreeByteValueLead<<1)) { 731 ++pos; 732 } else if(leadByte<(kFourByteValueLead<<1)) { 733 pos+=2; 734 } else { 735 pos+=3+((leadByte>>1)&1); 736 } 737 } 738 return pos; 739 } 740 private static int skipValue(byte[] bytes, int pos) { 741 int leadByte=bytes[pos++]&0xff; 742 return skipValue(pos, leadByte); 743 } 744 745 // Reads a jump delta and jumps. 746 private static int jumpByDelta(byte[] bytes, int pos) { 747 int delta=bytes[pos++]&0xff; 748 if(delta<kMinTwoByteDeltaLead) { 749 // nothing to do 750 } else if(delta<kMinThreeByteDeltaLead) { 751 delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff); 752 } else if(delta<kFourByteDeltaLead) { 753 delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 754 pos+=2; 755 } else if(delta==kFourByteDeltaLead) { 756 delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 757 pos+=3; 758 } else { 759 delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 760 pos+=4; 761 } 762 return pos+delta; 763 } 764 765 private static int skipDelta(byte[] bytes, int pos) { 766 int delta=bytes[pos++]&0xff; 767 if(delta>=kMinTwoByteDeltaLead) { 768 if(delta<kMinThreeByteDeltaLead) { 769 ++pos; 770 } else if(delta<kFourByteDeltaLead) { 771 pos+=2; 772 } else { 773 pos+=3+(delta&1); 774 } 775 } 776 return pos; 777 } 778 779 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 780 781 // Handles a branch node for both next(byte) and next(string). 782 private Result branchNext(int pos, int length, int inByte) { 783 // Branch according to the current byte. 784 if(length==0) { 785 length=bytes_[pos++]&0xff; 786 } 787 ++length; 788 // The length of the branch is the number of bytes to select from. 789 // The data structure encodes a binary search. 790 while(length>kMaxBranchLinearSubNodeLength) { 791 if(inByte<(bytes_[pos++]&0xff)) { 792 length>>=1; 793 pos=jumpByDelta(bytes_, pos); 794 } else { 795 length=length-(length>>1); 796 pos=skipDelta(bytes_, pos); 797 } 798 } 799 // Drop down to linear search for the last few bytes. 800 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 801 // and divides length by 2. 802 do { 803 if(inByte==(bytes_[pos++]&0xff)) { 804 Result result; 805 int node=bytes_[pos]&0xff; 806 assert(node>=kMinValueLead); 807 if((node&kValueIsFinal)!=0) { 808 // Leave the final value for getValue() to read. 809 result=Result.FINAL_VALUE; 810 } else { 811 // Use the non-final value as the jump delta. 812 ++pos; 813 // int delta=readValue(pos, node>>1); 814 node>>=1; 815 int delta; 816 if(node<kMinTwoByteValueLead) { 817 delta=node-kMinOneByteValueLead; 818 } else if(node<kMinThreeByteValueLead) { 819 delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff); 820 } else if(node<kFourByteValueLead) { 821 delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff); 822 pos+=2; 823 } else if(node==kFourByteValueLead) { 824 delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff); 825 pos+=3; 826 } else { 827 delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff); 828 pos+=4; 829 } 830 // end readValue() 831 pos+=delta; 832 node=bytes_[pos]&0xff; 833 result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 834 } 835 pos_=pos; 836 return result; 837 } 838 --length; 839 pos=skipValue(bytes_, pos); 840 } while(length>1); 841 if(inByte==(bytes_[pos++]&0xff)) { 842 pos_=pos; 843 int node=bytes_[pos]&0xff; 844 return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 845 } else { 846 stop(); 847 return Result.NO_MATCH; 848 } 849 } 850 851 // Requires remainingLength_<0. 852 private Result nextImpl(int pos, int inByte) { 853 for(;;) { 854 int node=bytes_[pos++]&0xff; 855 if(node<kMinLinearMatch) { 856 return branchNext(pos, node, inByte); 857 } else if(node<kMinValueLead) { 858 // Match the first of length+1 bytes. 859 int length=node-kMinLinearMatch; // Actual match length minus 1. 860 if(inByte==(bytes_[pos++]&0xff)) { 861 remainingMatchLength_=--length; 862 pos_=pos; 863 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 864 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 865 } else { 866 // No match. 867 break; 868 } 869 } else if((node&kValueIsFinal)!=0) { 870 // No further matching bytes. 871 break; 872 } else { 873 // Skip intermediate value. 874 pos=skipValue(pos, node); 875 // The next node must not also be a value node. 876 assert((bytes_[pos]&0xff)<kMinValueLead); 877 } 878 } 879 stop(); 880 return Result.NO_MATCH; 881 } 882 883 // Helper functions for getUniqueValue(). 884 // Recursively finds a unique value (or whether there is not a unique one) 885 // from a branch. 886 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 887 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. 888 private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length, 889 long uniqueValue) { 890 while(length>kMaxBranchLinearSubNodeLength) { 891 ++pos; // ignore the comparison byte 892 uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue); 893 if(uniqueValue==0) { 894 return 0; 895 } 896 length=length-(length>>1); 897 pos=skipDelta(bytes, pos); 898 } 899 do { 900 ++pos; // ignore a comparison byte 901 // handle its value 902 int node=bytes[pos++]&0xff; 903 boolean isFinal=(node&kValueIsFinal)!=0; 904 int value=readValue(bytes, pos, node>>1); 905 pos=skipValue(pos, node); 906 if(isFinal) { 907 if(uniqueValue!=0) { 908 if(value!=(int)(uniqueValue>>1)) { 909 return 0; 910 } 911 } else { 912 uniqueValue=((long)value<<1)|1; 913 } 914 } else { 915 uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue); 916 if(uniqueValue==0) { 917 return 0; 918 } 919 } 920 } while(--length>1); 921 // ignore the last comparison byte 922 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 923 } 924 // Recursively finds a unique value (or whether there is not a unique one) 925 // starting from a position on a node lead byte. 926 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 927 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. 928 private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) { 929 for(;;) { 930 int node=bytes[pos++]&0xff; 931 if(node<kMinLinearMatch) { 932 if(node==0) { 933 node=bytes[pos++]&0xff; 934 } 935 uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue); 936 if(uniqueValue==0) { 937 return 0; 938 } 939 pos=(int)(uniqueValue>>>33); 940 } else if(node<kMinValueLead) { 941 // linear-match node 942 pos+=node-kMinLinearMatch+1; // Ignore the match bytes. 943 } else { 944 boolean isFinal=(node&kValueIsFinal)!=0; 945 int value=readValue(bytes, pos, node>>1); 946 if(uniqueValue!=0) { 947 if(value!=(int)(uniqueValue>>1)) { 948 return 0; 949 } 950 } else { 951 uniqueValue=((long)value<<1)|1; 952 } 953 if(isFinal) { 954 return uniqueValue; 955 } 956 pos=skipValue(pos, node); 957 } 958 } 959 } 960 961 // Helper functions for getNextBytes(). 962 // getNextBytes() when pos is on a branch node. 963 private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) { 964 while(length>kMaxBranchLinearSubNodeLength) { 965 ++pos; // ignore the comparison byte 966 getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out); 967 length=length-(length>>1); 968 pos=skipDelta(bytes, pos); 969 } 970 do { 971 append(out, bytes[pos++]&0xff); 972 pos=skipValue(bytes, pos); 973 } while(--length>1); 974 append(out, bytes[pos]&0xff); 975 } 976 private static void append(Appendable out, int c) { 977 try { 978 out.append((char)c); 979 } catch(IOException e) { 980 throw new ICUUncheckedIOException(e); 981 } 982 } 983 984 // BytesTrie data structure 985 // 986 // The trie consists of a series of byte-serialized nodes for incremental 987 // string/byte sequence matching. The root node is at the beginning of the trie data. 988 // 989 // Types of nodes are distinguished by their node lead byte ranges. 990 // After each node, except a final-value node, another node follows to 991 // encode match values or continue matching further bytes. 992 // 993 // Node types: 994 // - Value node: Stores a 32-bit integer in a compact, variable-length format. 995 // The value is for the string/byte sequence so far. 996 // One node bit indicates whether the value is final or whether 997 // matching continues with the next node. 998 // - Linear-match node: Matches a number of bytes. 999 // - Branch node: Branches to other nodes according to the current input byte. 1000 // The node byte is the length of the branch (number of bytes to select from) 1001 // minus 1. It is followed by a sub-node: 1002 // - If the length is at most kMaxBranchLinearSubNodeLength, then 1003 // there are length-1 (key, value) pairs and then one more comparison byte. 1004 // If one of the key bytes matches, then the value is either a final value for 1005 // the string/byte sequence so far, or a "jump" delta to the next node. 1006 // If the last byte matches, then matching continues with the next node. 1007 // (Values have the same encoding as value nodes.) 1008 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 1009 // there is one byte and one "jump" delta. 1010 // If the input byte is less than the sub-node byte, then "jump" by delta to 1011 // the next sub-node which will have a length of length/2. 1012 // (The delta has its own compact encoding.) 1013 // Otherwise, skip the "jump" delta to the next sub-node 1014 // which will have a length of length-length/2. 1015 1016 // Node lead byte values. 1017 1018 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 1019 // the length is one more than the next byte. 1020 1021 // For a branch sub-node with at most this many entries, we drop down 1022 // to a linear search. 1023 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 1024 1025 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 1026 /*package*/ static final int kMinLinearMatch=0x10; 1027 /*package*/ static final int kMaxLinearMatchLength=0x10; 1028 1029 // 20..ff: Variable-length value node. 1030 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 1031 // Then shift-right by 1 bit. 1032 // The remaining lead byte value indicates the number of following bytes (0..4) 1033 // and contains the value's top bits. 1034 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 1035 // It is a final value if bit 0 is set. 1036 private static final int kValueIsFinal=1; 1037 1038 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 1039 /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2; // 0x10 1040 /*package*/ static final int kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 1041 1042 /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 1043 /*package*/ static final int kMaxTwoByteValue=0x1aff; 1044 1045 /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 1046 /*package*/ static final int kFourByteValueLead=0x7e; 1047 1048 // A little more than Unicode code points. (0x11ffff) 1049 /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 1050 1051 /*package*/ static final int kFiveByteValueLead=0x7f; 1052 1053 // Compact delta integers. 1054 /*package*/ static final int kMaxOneByteDelta=0xbf; 1055 /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 1056 /*package*/ static final int kMinThreeByteDeltaLead=0xf0; 1057 /*package*/ static final int kFourByteDeltaLead=0xfe; 1058 /*package*/ static final int kFiveByteDeltaLead=0xff; 1059 1060 /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 1061 /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 1062 1063 // Fixed value referencing the BytesTrie bytes. 1064 private byte[] bytes_; 1065 private int root_; 1066 1067 // Iterator variables. 1068 1069 // Index of next trie byte to read. Negative if no more matches. 1070 private int pos_; 1071 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1072 private int remainingMatchLength_; 1073}; 1074