BytesTrie.java revision d4d3fc6e0d919d96cdb282a05a05327cdf2c7f4e
1/* GENERATED SOURCE. DO NOT MODIFY. */ 2/* 3******************************************************************************* 4* Copyright (C) 2010-2014, International Business Machines 5* Corporation and others. All Rights Reserved. 6******************************************************************************* 7* created on: 2010nov23 8* created by: Markus W. Scherer 9* ported from ICU4C bytestrie.h/.cpp 10*/ 11package android.icu.util; 12 13import java.io.IOException; 14import java.nio.ByteBuffer; 15import java.util.ArrayList; 16import java.util.NoSuchElementException; 17 18/** 19 * Light-weight, non-const reader class for a BytesTrie. 20 * Traverses a byte-serialized data structure with minimal state, 21 * for mapping byte sequences to non-negative integer values. 22 * 23 * <p>This class is not intended for public subclassing. 24 * 25 * @author Markus W. Scherer 26 * @hide Only a subset of ICU is exposed in Android 27 */ 28public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> { 29 /** 30 * Constructs a BytesTrie reader instance. 31 * 32 * <p>The array must contain a copy of a byte sequence from the BytesTrieBuilder, 33 * with the offset indicating the first byte of that sequence. 34 * The BytesTrie object will not read more bytes than 35 * the BytesTrieBuilder generated in the corresponding build() call. 36 * 37 * <p>The array is not copied/cloned and must not be modified while 38 * the BytesTrie object is in use. 39 * 40 * @param trieBytes Bytes array that contains the serialized trie. 41 * @param offset Root offset of the trie in the array. 42 */ 43 public BytesTrie(byte[] trieBytes, int offset) { 44 bytes_=trieBytes; 45 pos_=root_=offset; 46 remainingMatchLength_=-1; 47 } 48 49 /** 50 * Clones this trie reader object and its state, 51 * but not the byte array which will be shared. 52 * @return A shallow clone of this trie. 53 */ 54 @Override 55 public Object clone() throws CloneNotSupportedException { 56 return super.clone(); // A shallow copy is just what we need. 57 } 58 59 /** 60 * Resets this trie to its initial state. 61 * @return this 62 */ 63 public BytesTrie reset() { 64 pos_=root_; 65 remainingMatchLength_=-1; 66 return this; 67 } 68 69 /** 70 * BytesTrie state object, for saving a trie's current state 71 * and resetting the trie back to this state later. 72 */ 73 public static final class State { 74 /** 75 * Constructs an empty State. 76 */ 77 public State() {} 78 private byte[] bytes; 79 private int root; 80 private int pos; 81 private int remainingMatchLength; 82 } 83 84 /** 85 * Saves the state of this trie. 86 * @param state The State object to hold the trie's state. 87 * @return this 88 * @see #resetToState 89 */ 90 public BytesTrie saveState(State state) /*const*/ { 91 state.bytes=bytes_; 92 state.root=root_; 93 state.pos=pos_; 94 state.remainingMatchLength=remainingMatchLength_; 95 return this; 96 } 97 98 /** 99 * Resets this trie to the saved state. 100 * @param state The State object which holds a saved trie state. 101 * @return this 102 * @throws IllegalArgumentException if the state object contains no state, 103 * or the state of a different trie 104 * @see #saveState 105 * @see #reset 106 */ 107 public BytesTrie resetToState(State state) { 108 if(bytes_==state.bytes && bytes_!=null && root_==state.root) { 109 pos_=state.pos; 110 remainingMatchLength_=state.remainingMatchLength; 111 } else { 112 throw new IllegalArgumentException("incompatible trie state"); 113 } 114 return this; 115 } 116 117 /** 118 * Return values for BytesTrie.next(), CharsTrie.next() and similar methods. 119 */ 120 public enum Result { 121 /** 122 * The input unit(s) did not continue a matching string. 123 * Once current()/next() return NO_MATCH, 124 * all further calls to current()/next() will also return NO_MATCH, 125 * until the trie is reset to its original state or to a saved state. 126 */ 127 NO_MATCH, 128 /** 129 * The input unit(s) continued a matching string 130 * but there is no value for the string so far. 131 * (It is a prefix of a longer string.) 132 */ 133 NO_VALUE, 134 /** 135 * The input unit(s) continued a matching string 136 * and there is a value for the string so far. 137 * This value will be returned by getValue(). 138 * No further input byte/unit can continue a matching string. 139 */ 140 FINAL_VALUE, 141 /** 142 * The input unit(s) continued a matching string 143 * and there is a value for the string so far. 144 * This value will be returned by getValue(). 145 * Another input byte/unit can continue a matching string. 146 */ 147 INTERMEDIATE_VALUE; 148 149 // Note: The following methods assume the particular order 150 // of enum constants, treating the ordinal() values like bit sets. 151 // Do not reorder the enum constants! 152 153 /** 154 * Same as (result!=NO_MATCH). 155 * @return true if the input bytes/units so far are part of a matching string/byte sequence. 156 */ 157 public boolean matches() { return this!=NO_MATCH; } 158 159 /** 160 * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE). 161 * @return true if there is a value for the input bytes/units so far. 162 * @see #getValue 163 */ 164 public boolean hasValue() { return ordinal()>=2; } 165 166 /** 167 * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE). 168 * @return true if another input byte/unit can continue a matching string. 169 */ 170 public boolean hasNext() { return (ordinal()&1)!=0; } 171 } 172 173 /** 174 * Determines whether the byte sequence so far matches, whether it has a value, 175 * and whether another input byte can continue a matching byte sequence. 176 * @return The match/value Result. 177 */ 178 public Result current() /*const*/ { 179 int pos=pos_; 180 if(pos<0) { 181 return Result.NO_MATCH; 182 } else { 183 int node; 184 return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 185 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 186 } 187 } 188 189 /** 190 * Traverses the trie from the initial state for this input byte. 191 * Equivalent to reset().next(inByte). 192 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 193 * Values below -0x100 and above 0xff will never match. 194 * @return The match/value Result. 195 */ 196 public Result first(int inByte) { 197 remainingMatchLength_=-1; 198 if(inByte<0) { 199 inByte+=0x100; 200 } 201 return nextImpl(root_, inByte); 202 } 203 204 /** 205 * Traverses the trie from the current state for this input byte. 206 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 207 * Values below -0x100 and above 0xff will never match. 208 * @return The match/value Result. 209 */ 210 public Result next(int inByte) { 211 int pos=pos_; 212 if(pos<0) { 213 return Result.NO_MATCH; 214 } 215 if(inByte<0) { 216 inByte+=0x100; 217 } 218 int length=remainingMatchLength_; // Actual remaining match length minus 1. 219 if(length>=0) { 220 // Remaining part of a linear-match node. 221 if(inByte==(bytes_[pos++]&0xff)) { 222 remainingMatchLength_=--length; 223 pos_=pos; 224 int node; 225 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 226 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 227 } else { 228 stop(); 229 return Result.NO_MATCH; 230 } 231 } 232 return nextImpl(pos, inByte); 233 } 234 235 /** 236 * Traverses the trie from the current state for this byte sequence. 237 * Equivalent to 238 * <pre> 239 * Result result=current(); 240 * for(each c in s) 241 * if(!result.hasNext()) return Result.NO_MATCH; 242 * result=next(c); 243 * return result; 244 * </pre> 245 * @param s Contains a string or byte sequence. 246 * @param sIndex The start index of the byte sequence in s. 247 * @param sLimit The (exclusive) end index of the byte sequence in s. 248 * @return The match/value Result. 249 */ 250 public Result next(byte[] s, int sIndex, int sLimit) { 251 if(sIndex>=sLimit) { 252 // Empty input. 253 return current(); 254 } 255 int pos=pos_; 256 if(pos<0) { 257 return Result.NO_MATCH; 258 } 259 int length=remainingMatchLength_; // Actual remaining match length minus 1. 260 for(;;) { 261 // Fetch the next input byte, if there is one. 262 // Continue a linear-match node. 263 byte inByte; 264 for(;;) { 265 if(sIndex==sLimit) { 266 remainingMatchLength_=length; 267 pos_=pos; 268 int node; 269 return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ? 270 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 271 } 272 inByte=s[sIndex++]; 273 if(length<0) { 274 remainingMatchLength_=length; 275 break; 276 } 277 if(inByte!=bytes_[pos]) { 278 stop(); 279 return Result.NO_MATCH; 280 } 281 ++pos; 282 --length; 283 } 284 for(;;) { 285 int node=bytes_[pos++]&0xff; 286 if(node<kMinLinearMatch) { 287 Result result=branchNext(pos, node, inByte&0xff); 288 if(result==Result.NO_MATCH) { 289 return Result.NO_MATCH; 290 } 291 // Fetch the next input byte, if there is one. 292 if(sIndex==sLimit) { 293 return result; 294 } 295 if(result==Result.FINAL_VALUE) { 296 // No further matching bytes. 297 stop(); 298 return Result.NO_MATCH; 299 } 300 inByte=s[sIndex++]; 301 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 302 } else if(node<kMinValueLead) { 303 // Match length+1 bytes. 304 length=node-kMinLinearMatch; // Actual match length minus 1. 305 if(inByte!=bytes_[pos]) { 306 stop(); 307 return Result.NO_MATCH; 308 } 309 ++pos; 310 --length; 311 break; 312 } else if((node&kValueIsFinal)!=0) { 313 // No further matching bytes. 314 stop(); 315 return Result.NO_MATCH; 316 } else { 317 // Skip intermediate value. 318 pos=skipValue(pos, node); 319 // The next node must not also be a value node. 320 assert((bytes_[pos]&0xff)<kMinValueLead); 321 } 322 } 323 } 324 } 325 326 /** 327 * Returns a matching byte sequence's value if called immediately after 328 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 329 * getValue() can be called multiple times. 330 * 331 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 332 * @return The value for the byte sequence so far. 333 */ 334 public int getValue() /*const*/ { 335 int pos=pos_; 336 int leadByte=bytes_[pos++]&0xff; 337 assert(leadByte>=kMinValueLead); 338 return readValue(bytes_, pos, leadByte>>1); 339 } 340 341 /** 342 * Determines whether all byte sequences reachable from the current state 343 * map to the same value, and if so, returns that value. 344 * @return The unique value in bits 32..1 with bit 0 set, 345 * if all byte sequences reachable from the current state 346 * map to the same value; otherwise returns 0. 347 */ 348 public long getUniqueValue() /*const*/ { 349 int pos=pos_; 350 if(pos<0) { 351 return 0; 352 } 353 // Skip the rest of a pending linear-match node. 354 long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0); 355 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 356 return (uniqueValue<<31)>>31; 357 } 358 359 /** 360 * Finds each byte which continues the byte sequence from the current state. 361 * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now. 362 * @param out Each next byte is 0-extended to a char and appended to this object. 363 * (Only uses the out.append(c) method.) 364 * @return The number of bytes which continue the byte sequence from here. 365 */ 366 public int getNextBytes(Appendable out) /*const*/ { 367 int pos=pos_; 368 if(pos<0) { 369 return 0; 370 } 371 if(remainingMatchLength_>=0) { 372 append(out, bytes_[pos]&0xff); // Next byte of a pending linear-match node. 373 return 1; 374 } 375 int node=bytes_[pos++]&0xff; 376 if(node>=kMinValueLead) { 377 if((node&kValueIsFinal)!=0) { 378 return 0; 379 } else { 380 pos=skipValue(pos, node); 381 node=bytes_[pos++]&0xff; 382 assert(node<kMinValueLead); 383 } 384 } 385 if(node<kMinLinearMatch) { 386 if(node==0) { 387 node=bytes_[pos++]&0xff; 388 } 389 getNextBranchBytes(bytes_, pos, ++node, out); 390 return node; 391 } else { 392 // First byte of the linear-match node. 393 append(out, bytes_[pos]&0xff); 394 return 1; 395 } 396 } 397 398 /** 399 * Iterates from the current state of this trie. 400 * @return A new BytesTrie.Iterator. 401 */ 402 public Iterator iterator() { 403 return new Iterator(bytes_, pos_, remainingMatchLength_, 0); 404 } 405 406 /** 407 * Iterates from the current state of this trie. 408 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 409 * Otherwise, the iterator returns strings with this maximum length. 410 * @return A new BytesTrie.Iterator. 411 */ 412 public Iterator iterator(int maxStringLength) { 413 return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength); 414 } 415 416 /** 417 * Iterates from the root of a byte-serialized BytesTrie. 418 * @param trieBytes Bytes array that contains the serialized trie. 419 * @param offset Root offset of the trie in the array. 420 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 421 * Otherwise, the iterator returns strings with this maximum length. 422 * @return A new BytesTrie.Iterator. 423 */ 424 public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) { 425 return new Iterator(trieBytes, offset, -1, maxStringLength); 426 } 427 428 /** 429 * Return value type for the Iterator. 430 */ 431 public static final class Entry { 432 private Entry(int capacity) { 433 bytes=new byte[capacity]; 434 } 435 436 /** 437 * @return The length of the byte sequence. 438 */ 439 public int bytesLength() { return length; } 440 /** 441 * Returns a byte of the byte sequence. 442 * @param index An index into the byte sequence. 443 * @return The index-th byte sequence byte. 444 */ 445 public byte byteAt(int index) { return bytes[index]; } 446 /** 447 * Copies the byte sequence into a byte array. 448 * @param dest Destination byte array. 449 * @param destOffset Starting offset to where in dest the byte sequence is copied. 450 */ 451 public void copyBytesTo(byte[] dest, int destOffset) { 452 System.arraycopy(bytes, 0, dest, destOffset, length); 453 } 454 /** 455 * @return The byte sequence as a read-only ByteBuffer. 456 */ 457 public ByteBuffer bytesAsByteBuffer() { 458 return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer(); 459 } 460 461 /** 462 * The value associated with the byte sequence. 463 */ 464 public int value; 465 466 private void ensureCapacity(int len) { 467 if(bytes.length<len) { 468 byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)]; 469 System.arraycopy(bytes, 0, newBytes, 0, length); 470 bytes=newBytes; 471 } 472 } 473 private void append(byte b) { 474 ensureCapacity(length+1); 475 bytes[length++]=b; 476 } 477 private void append(byte[] b, int off, int len) { 478 ensureCapacity(length+len); 479 System.arraycopy(b, off, bytes, length, len); 480 length+=len; 481 } 482 private void truncateString(int newLength) { length=newLength; } 483 484 private byte[] bytes; 485 private int length; 486 } 487 488 /** 489 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 490 */ 491 public static final class Iterator implements java.util.Iterator<Entry> { 492 private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) { 493 bytes_=trieBytes; 494 pos_=initialPos_=offset; 495 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 496 maxLength_=maxStringLength; 497 entry_=new Entry(maxLength_!=0 ? maxLength_ : 32); 498 int length=remainingMatchLength_; // Actual remaining match length minus 1. 499 if(length>=0) { 500 // Pending linear-match node, append remaining bytes to entry_. 501 ++length; 502 if(maxLength_>0 && length>maxLength_) { 503 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 504 } 505 entry_.append(bytes_, pos_, length); 506 pos_+=length; 507 remainingMatchLength_-=length; 508 } 509 } 510 511 /** 512 * Resets this iterator to its initial state. 513 * @return this 514 */ 515 public Iterator reset() { 516 pos_=initialPos_; 517 remainingMatchLength_=initialRemainingMatchLength_; 518 int length=remainingMatchLength_+1; // Remaining match length. 519 if(maxLength_>0 && length>maxLength_) { 520 length=maxLength_; 521 } 522 entry_.truncateString(length); 523 pos_+=length; 524 remainingMatchLength_-=length; 525 stack_.clear(); 526 return this; 527 } 528 529 /** 530 * @return true if there are more elements. 531 */ 532 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 533 534 /** 535 * Finds the next (byte sequence, value) pair if there is one. 536 * 537 * If the byte sequence is truncated to the maximum length and does not 538 * have a real value, then the value is set to -1. 539 * In this case, this "not a real value" is indistinguishable from 540 * a real value of -1. 541 * @return An Entry with the string and value of the next element. 542 * @throws NoSuchElementException - iteration has no more elements. 543 */ 544 public Entry next() { 545 int pos=pos_; 546 if(pos<0) { 547 if(stack_.isEmpty()) { 548 throw new NoSuchElementException(); 549 } 550 // Pop the state off the stack and continue with the next outbound edge of 551 // the branch node. 552 long top=stack_.remove(stack_.size()-1); 553 int length=(int)top; 554 pos=(int)(top>>32); 555 entry_.truncateString(length&0xffff); 556 length>>>=16; 557 if(length>1) { 558 pos=branchNext(pos, length); 559 if(pos<0) { 560 return entry_; // Reached a final value. 561 } 562 } else { 563 entry_.append(bytes_[pos++]); 564 } 565 } 566 if(remainingMatchLength_>=0) { 567 // We only get here if we started in a pending linear-match node 568 // with more than maxLength remaining bytes. 569 return truncateAndStop(); 570 } 571 for(;;) { 572 int node=bytes_[pos++]&0xff; 573 if(node>=kMinValueLead) { 574 // Deliver value for the byte sequence so far. 575 boolean isFinal=(node&kValueIsFinal)!=0; 576 entry_.value=readValue(bytes_, pos, node>>1); 577 if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) { 578 pos_=-1; 579 } else { 580 pos_=skipValue(pos, node); 581 } 582 return entry_; 583 } 584 if(maxLength_>0 && entry_.length==maxLength_) { 585 return truncateAndStop(); 586 } 587 if(node<kMinLinearMatch) { 588 if(node==0) { 589 node=bytes_[pos++]&0xff; 590 } 591 pos=branchNext(pos, node+1); 592 if(pos<0) { 593 return entry_; // Reached a final value. 594 } 595 } else { 596 // Linear-match node, append length bytes to entry_. 597 int length=node-kMinLinearMatch+1; 598 if(maxLength_>0 && entry_.length+length>maxLength_) { 599 entry_.append(bytes_, pos, maxLength_-entry_.length); 600 return truncateAndStop(); 601 } 602 entry_.append(bytes_, pos, length); 603 pos+=length; 604 } 605 } 606 } 607 608 /** 609 * Iterator.remove() is not supported. 610 * @throws UnsupportedOperationException (always) 611 */ 612 public void remove() { 613 throw new UnsupportedOperationException(); 614 } 615 616 private Entry truncateAndStop() { 617 pos_=-1; 618 entry_.value=-1; // no real value for str 619 return entry_; 620 } 621 622 private int branchNext(int pos, int length) { 623 while(length>kMaxBranchLinearSubNodeLength) { 624 ++pos; // ignore the comparison byte 625 // Push state for the greater-or-equal edge. 626 stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length); 627 // Follow the less-than edge. 628 length>>=1; 629 pos=jumpByDelta(bytes_, pos); 630 } 631 // List of key-value pairs where values are either final values or jump deltas. 632 // Read the first (key, value) pair. 633 byte trieByte=bytes_[pos++]; 634 int node=bytes_[pos++]&0xff; 635 boolean isFinal=(node&kValueIsFinal)!=0; 636 int value=readValue(bytes_, pos, node>>1); 637 pos=skipValue(pos, node); 638 stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length); 639 entry_.append(trieByte); 640 if(isFinal) { 641 pos_=-1; 642 entry_.value=value; 643 return -1; 644 } else { 645 return pos+value; 646 } 647 } 648 649 private byte[] bytes_; 650 private int pos_; 651 private int initialPos_; 652 private int remainingMatchLength_; 653 private int initialRemainingMatchLength_; 654 655 private int maxLength_; 656 private Entry entry_; 657 658 // The stack stores longs for backtracking to another 659 // outbound edge of a branch node. 660 // Each long has the offset from bytes_ in bits 62..32, 661 // the entry_.stringLength() from before the node in bits 15..0, 662 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 663 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 664 // but the code looks more confusing that way.) 665 private ArrayList<Long> stack_=new ArrayList<Long>(); 666 } 667 668 private void stop() { 669 pos_=-1; 670 } 671 672 // Reads a compact 32-bit integer. 673 // pos is already after the leadByte, and the lead byte is already shifted right by 1. 674 private static int readValue(byte[] bytes, int pos, int leadByte) { 675 int value; 676 if(leadByte<kMinTwoByteValueLead) { 677 value=leadByte-kMinOneByteValueLead; 678 } else if(leadByte<kMinThreeByteValueLead) { 679 value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff); 680 } else if(leadByte<kFourByteValueLead) { 681 value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 682 } else if(leadByte==kFourByteValueLead) { 683 value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 684 } else { 685 value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 686 } 687 return value; 688 } 689 private static int skipValue(int pos, int leadByte) { 690 assert(leadByte>=kMinValueLead); 691 if(leadByte>=(kMinTwoByteValueLead<<1)) { 692 if(leadByte<(kMinThreeByteValueLead<<1)) { 693 ++pos; 694 } else if(leadByte<(kFourByteValueLead<<1)) { 695 pos+=2; 696 } else { 697 pos+=3+((leadByte>>1)&1); 698 } 699 } 700 return pos; 701 } 702 private static int skipValue(byte[] bytes, int pos) { 703 int leadByte=bytes[pos++]&0xff; 704 return skipValue(pos, leadByte); 705 } 706 707 // Reads a jump delta and jumps. 708 private static int jumpByDelta(byte[] bytes, int pos) { 709 int delta=bytes[pos++]&0xff; 710 if(delta<kMinTwoByteDeltaLead) { 711 // nothing to do 712 } else if(delta<kMinThreeByteDeltaLead) { 713 delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff); 714 } else if(delta<kFourByteDeltaLead) { 715 delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 716 pos+=2; 717 } else if(delta==kFourByteDeltaLead) { 718 delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 719 pos+=3; 720 } else { 721 delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 722 pos+=4; 723 } 724 return pos+delta; 725 } 726 727 private static int skipDelta(byte[] bytes, int pos) { 728 int delta=bytes[pos++]&0xff; 729 if(delta>=kMinTwoByteDeltaLead) { 730 if(delta<kMinThreeByteDeltaLead) { 731 ++pos; 732 } else if(delta<kFourByteDeltaLead) { 733 pos+=2; 734 } else { 735 pos+=3+(delta&1); 736 } 737 } 738 return pos; 739 } 740 741 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 742 743 // Handles a branch node for both next(byte) and next(string). 744 private Result branchNext(int pos, int length, int inByte) { 745 // Branch according to the current byte. 746 if(length==0) { 747 length=bytes_[pos++]&0xff; 748 } 749 ++length; 750 // The length of the branch is the number of bytes to select from. 751 // The data structure encodes a binary search. 752 while(length>kMaxBranchLinearSubNodeLength) { 753 if(inByte<(bytes_[pos++]&0xff)) { 754 length>>=1; 755 pos=jumpByDelta(bytes_, pos); 756 } else { 757 length=length-(length>>1); 758 pos=skipDelta(bytes_, pos); 759 } 760 } 761 // Drop down to linear search for the last few bytes. 762 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 763 // and divides length by 2. 764 do { 765 if(inByte==(bytes_[pos++]&0xff)) { 766 Result result; 767 int node=bytes_[pos]&0xff; 768 assert(node>=kMinValueLead); 769 if((node&kValueIsFinal)!=0) { 770 // Leave the final value for getValue() to read. 771 result=Result.FINAL_VALUE; 772 } else { 773 // Use the non-final value as the jump delta. 774 ++pos; 775 // int delta=readValue(pos, node>>1); 776 node>>=1; 777 int delta; 778 if(node<kMinTwoByteValueLead) { 779 delta=node-kMinOneByteValueLead; 780 } else if(node<kMinThreeByteValueLead) { 781 delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff); 782 } else if(node<kFourByteValueLead) { 783 delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff); 784 pos+=2; 785 } else if(node==kFourByteValueLead) { 786 delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff); 787 pos+=3; 788 } else { 789 delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff); 790 pos+=4; 791 } 792 // end readValue() 793 pos+=delta; 794 node=bytes_[pos]&0xff; 795 result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 796 } 797 pos_=pos; 798 return result; 799 } 800 --length; 801 pos=skipValue(bytes_, pos); 802 } while(length>1); 803 if(inByte==(bytes_[pos++]&0xff)) { 804 pos_=pos; 805 int node=bytes_[pos]&0xff; 806 return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 807 } else { 808 stop(); 809 return Result.NO_MATCH; 810 } 811 } 812 813 // Requires remainingLength_<0. 814 private Result nextImpl(int pos, int inByte) { 815 for(;;) { 816 int node=bytes_[pos++]&0xff; 817 if(node<kMinLinearMatch) { 818 return branchNext(pos, node, inByte); 819 } else if(node<kMinValueLead) { 820 // Match the first of length+1 bytes. 821 int length=node-kMinLinearMatch; // Actual match length minus 1. 822 if(inByte==(bytes_[pos++]&0xff)) { 823 remainingMatchLength_=--length; 824 pos_=pos; 825 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 826 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 827 } else { 828 // No match. 829 break; 830 } 831 } else if((node&kValueIsFinal)!=0) { 832 // No further matching bytes. 833 break; 834 } else { 835 // Skip intermediate value. 836 pos=skipValue(pos, node); 837 // The next node must not also be a value node. 838 assert((bytes_[pos]&0xff)<kMinValueLead); 839 } 840 } 841 stop(); 842 return Result.NO_MATCH; 843 } 844 845 // Helper functions for getUniqueValue(). 846 // Recursively finds a unique value (or whether there is not a unique one) 847 // from a branch. 848 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 849 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. 850 private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length, 851 long uniqueValue) { 852 while(length>kMaxBranchLinearSubNodeLength) { 853 ++pos; // ignore the comparison byte 854 uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue); 855 if(uniqueValue==0) { 856 return 0; 857 } 858 length=length-(length>>1); 859 pos=skipDelta(bytes, pos); 860 } 861 do { 862 ++pos; // ignore a comparison byte 863 // handle its value 864 int node=bytes[pos++]&0xff; 865 boolean isFinal=(node&kValueIsFinal)!=0; 866 int value=readValue(bytes, pos, node>>1); 867 pos=skipValue(pos, node); 868 if(isFinal) { 869 if(uniqueValue!=0) { 870 if(value!=(int)(uniqueValue>>1)) { 871 return 0; 872 } 873 } else { 874 uniqueValue=((long)value<<1)|1; 875 } 876 } else { 877 uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue); 878 if(uniqueValue==0) { 879 return 0; 880 } 881 } 882 } while(--length>1); 883 // ignore the last comparison byte 884 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 885 } 886 // Recursively finds a unique value (or whether there is not a unique one) 887 // starting from a position on a node lead byte. 888 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 889 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. 890 private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) { 891 for(;;) { 892 int node=bytes[pos++]&0xff; 893 if(node<kMinLinearMatch) { 894 if(node==0) { 895 node=bytes[pos++]&0xff; 896 } 897 uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue); 898 if(uniqueValue==0) { 899 return 0; 900 } 901 pos=(int)(uniqueValue>>>33); 902 } else if(node<kMinValueLead) { 903 // linear-match node 904 pos+=node-kMinLinearMatch+1; // Ignore the match bytes. 905 } else { 906 boolean isFinal=(node&kValueIsFinal)!=0; 907 int value=readValue(bytes, pos, node>>1); 908 if(uniqueValue!=0) { 909 if(value!=(int)(uniqueValue>>1)) { 910 return 0; 911 } 912 } else { 913 uniqueValue=((long)value<<1)|1; 914 } 915 if(isFinal) { 916 return uniqueValue; 917 } 918 pos=skipValue(pos, node); 919 } 920 } 921 } 922 923 // Helper functions for getNextBytes(). 924 // getNextBytes() when pos is on a branch node. 925 private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) { 926 while(length>kMaxBranchLinearSubNodeLength) { 927 ++pos; // ignore the comparison byte 928 getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out); 929 length=length-(length>>1); 930 pos=skipDelta(bytes, pos); 931 } 932 do { 933 append(out, bytes[pos++]&0xff); 934 pos=skipValue(bytes, pos); 935 } while(--length>1); 936 append(out, bytes[pos]&0xff); 937 } 938 private static void append(Appendable out, int c) { 939 try { 940 out.append((char)c); 941 } catch(IOException e) { 942 throw new ICUUncheckedIOException(e); 943 } 944 } 945 946 // BytesTrie data structure 947 // 948 // The trie consists of a series of byte-serialized nodes for incremental 949 // string/byte sequence matching. The root node is at the beginning of the trie data. 950 // 951 // Types of nodes are distinguished by their node lead byte ranges. 952 // After each node, except a final-value node, another node follows to 953 // encode match values or continue matching further bytes. 954 // 955 // Node types: 956 // - Value node: Stores a 32-bit integer in a compact, variable-length format. 957 // The value is for the string/byte sequence so far. 958 // One node bit indicates whether the value is final or whether 959 // matching continues with the next node. 960 // - Linear-match node: Matches a number of bytes. 961 // - Branch node: Branches to other nodes according to the current input byte. 962 // The node byte is the length of the branch (number of bytes to select from) 963 // minus 1. It is followed by a sub-node: 964 // - If the length is at most kMaxBranchLinearSubNodeLength, then 965 // there are length-1 (key, value) pairs and then one more comparison byte. 966 // If one of the key bytes matches, then the value is either a final value for 967 // the string/byte sequence so far, or a "jump" delta to the next node. 968 // If the last byte matches, then matching continues with the next node. 969 // (Values have the same encoding as value nodes.) 970 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 971 // there is one byte and one "jump" delta. 972 // If the input byte is less than the sub-node byte, then "jump" by delta to 973 // the next sub-node which will have a length of length/2. 974 // (The delta has its own compact encoding.) 975 // Otherwise, skip the "jump" delta to the next sub-node 976 // which will have a length of length-length/2. 977 978 // Node lead byte values. 979 980 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 981 // the length is one more than the next byte. 982 983 // For a branch sub-node with at most this many entries, we drop down 984 // to a linear search. 985 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 986 987 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 988 /*package*/ static final int kMinLinearMatch=0x10; 989 /*package*/ static final int kMaxLinearMatchLength=0x10; 990 991 // 20..ff: Variable-length value node. 992 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 993 // Then shift-right by 1 bit. 994 // The remaining lead byte value indicates the number of following bytes (0..4) 995 // and contains the value's top bits. 996 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 997 // It is a final value if bit 0 is set. 998 private static final int kValueIsFinal=1; 999 1000 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 1001 /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2; // 0x10 1002 /*package*/ static final int kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 1003 1004 /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 1005 /*package*/ static final int kMaxTwoByteValue=0x1aff; 1006 1007 /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 1008 /*package*/ static final int kFourByteValueLead=0x7e; 1009 1010 // A little more than Unicode code points. (0x11ffff) 1011 /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 1012 1013 /*package*/ static final int kFiveByteValueLead=0x7f; 1014 1015 // Compact delta integers. 1016 /*package*/ static final int kMaxOneByteDelta=0xbf; 1017 /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 1018 /*package*/ static final int kMinThreeByteDeltaLead=0xf0; 1019 /*package*/ static final int kFourByteDeltaLead=0xfe; 1020 /*package*/ static final int kFiveByteDeltaLead=0xff; 1021 1022 /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 1023 /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 1024 1025 // Fixed value referencing the BytesTrie bytes. 1026 private byte[] bytes_; 1027 private int root_; 1028 1029 // Iterator variables. 1030 1031 // Index of next trie byte to read. Negative if no more matches. 1032 private int pos_; 1033 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1034 private int remainingMatchLength_; 1035}; 1036