1/* 2******************************************************************************* 3* Copyright (C) 2010-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* created on: 2010nov23 7* created by: Markus W. Scherer 8* ported from ICU4C bytestrie.h/.cpp 9*/ 10package com.ibm.icu.util; 11 12import java.io.IOException; 13import java.nio.ByteBuffer; 14import java.util.ArrayList; 15import java.util.NoSuchElementException; 16 17/** 18 * Light-weight, non-const reader class for a BytesTrie. 19 * Traverses a byte-serialized data structure with minimal state, 20 * for mapping byte sequences to non-negative integer values. 21 * 22 * <p>This class is not intended for public subclassing. 23 * 24 * @stable ICU 4.8 25 * @author Markus W. Scherer 26 */ 27public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> { 28 /** 29 * Constructs a BytesTrie reader instance. 30 * 31 * <p>The array must contain a copy of a byte sequence from the BytesTrieBuilder, 32 * with the offset indicating the first byte of that sequence. 33 * The BytesTrie object will not read more bytes than 34 * the BytesTrieBuilder generated in the corresponding build() call. 35 * 36 * <p>The array is not copied/cloned and must not be modified while 37 * the BytesTrie object is in use. 38 * 39 * @param trieBytes Bytes array that contains the serialized trie. 40 * @param offset Root offset of the trie in the array. 41 * @stable ICU 4.8 42 */ 43 public BytesTrie(byte[] trieBytes, int offset) { 44 bytes_=trieBytes; 45 pos_=root_=offset; 46 remainingMatchLength_=-1; 47 } 48 49 /** 50 * Clones this trie reader object and its state, 51 * but not the byte array which will be shared. 52 * @return A shallow clone of this trie. 53 * @stable ICU 4.8 54 */ 55 @Override 56 public Object clone() throws CloneNotSupportedException { 57 return super.clone(); // A shallow copy is just what we need. 58 } 59 60 /** 61 * Resets this trie to its initial state. 62 * @return this 63 * @stable ICU 4.8 64 */ 65 public BytesTrie reset() { 66 pos_=root_; 67 remainingMatchLength_=-1; 68 return this; 69 } 70 71 /** 72 * BytesTrie state object, for saving a trie's current state 73 * and resetting the trie back to this state later. 74 * @stable ICU 4.8 75 */ 76 public static final class State { 77 /** 78 * Constructs an empty State. 79 * @stable ICU 4.8 80 */ 81 public State() {} 82 private byte[] bytes; 83 private int root; 84 private int pos; 85 private int remainingMatchLength; 86 } 87 88 /** 89 * Saves the state of this trie. 90 * @param state The State object to hold the trie's state. 91 * @return this 92 * @see #resetToState 93 * @stable ICU 4.8 94 */ 95 public BytesTrie saveState(State state) /*const*/ { 96 state.bytes=bytes_; 97 state.root=root_; 98 state.pos=pos_; 99 state.remainingMatchLength=remainingMatchLength_; 100 return this; 101 } 102 103 /** 104 * Resets this trie to the saved state. 105 * @param state The State object which holds a saved trie state. 106 * @return this 107 * @throws IllegalArgumentException if the state object contains no state, 108 * or the state of a different trie 109 * @see #saveState 110 * @see #reset 111 * @stable ICU 4.8 112 */ 113 public BytesTrie resetToState(State state) { 114 if(bytes_==state.bytes && bytes_!=null && root_==state.root) { 115 pos_=state.pos; 116 remainingMatchLength_=state.remainingMatchLength; 117 } else { 118 throw new IllegalArgumentException("incompatible trie state"); 119 } 120 return this; 121 } 122 123 /** 124 * Return values for BytesTrie.next(), CharsTrie.next() and similar methods. 125 * @stable ICU 4.8 126 */ 127 public enum Result { 128 /** 129 * The input unit(s) did not continue a matching string. 130 * Once current()/next() return NO_MATCH, 131 * all further calls to current()/next() will also return NO_MATCH, 132 * until the trie is reset to its original state or to a saved state. 133 * @stable ICU 4.8 134 */ 135 NO_MATCH, 136 /** 137 * The input unit(s) continued a matching string 138 * but there is no value for the string so far. 139 * (It is a prefix of a longer string.) 140 * @stable ICU 4.8 141 */ 142 NO_VALUE, 143 /** 144 * The input unit(s) continued a matching string 145 * and there is a value for the string so far. 146 * This value will be returned by getValue(). 147 * No further input byte/unit can continue a matching string. 148 * @stable ICU 4.8 149 */ 150 FINAL_VALUE, 151 /** 152 * The input unit(s) continued a matching string 153 * and there is a value for the string so far. 154 * This value will be returned by getValue(). 155 * Another input byte/unit can continue a matching string. 156 * @stable ICU 4.8 157 */ 158 INTERMEDIATE_VALUE; 159 160 // Note: The following methods assume the particular order 161 // of enum constants, treating the ordinal() values like bit sets. 162 // Do not reorder the enum constants! 163 164 /** 165 * Same as (result!=NO_MATCH). 166 * @return true if the input bytes/units so far are part of a matching string/byte sequence. 167 * @stable ICU 4.8 168 */ 169 public boolean matches() { return this!=NO_MATCH; } 170 171 /** 172 * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE). 173 * @return true if there is a value for the input bytes/units so far. 174 * @see #getValue 175 * @stable ICU 4.8 176 */ 177 public boolean hasValue() { return ordinal()>=2; } 178 179 /** 180 * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE). 181 * @return true if another input byte/unit can continue a matching string. 182 * @stable ICU 4.8 183 */ 184 public boolean hasNext() { return (ordinal()&1)!=0; } 185 } 186 187 /** 188 * Determines whether the byte sequence so far matches, whether it has a value, 189 * and whether another input byte can continue a matching byte sequence. 190 * @return The match/value Result. 191 * @stable ICU 4.8 192 */ 193 public Result current() /*const*/ { 194 int pos=pos_; 195 if(pos<0) { 196 return Result.NO_MATCH; 197 } else { 198 int node; 199 return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 200 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 201 } 202 } 203 204 /** 205 * Traverses the trie from the initial state for this input byte. 206 * Equivalent to reset().next(inByte). 207 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 208 * Values below -0x100 and above 0xff will never match. 209 * @return The match/value Result. 210 * @stable ICU 4.8 211 */ 212 public Result first(int inByte) { 213 remainingMatchLength_=-1; 214 if(inByte<0) { 215 inByte+=0x100; 216 } 217 return nextImpl(root_, inByte); 218 } 219 220 /** 221 * Traverses the trie from the current state for this input byte. 222 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 223 * Values below -0x100 and above 0xff will never match. 224 * @return The match/value Result. 225 * @stable ICU 4.8 226 */ 227 public Result next(int inByte) { 228 int pos=pos_; 229 if(pos<0) { 230 return Result.NO_MATCH; 231 } 232 if(inByte<0) { 233 inByte+=0x100; 234 } 235 int length=remainingMatchLength_; // Actual remaining match length minus 1. 236 if(length>=0) { 237 // Remaining part of a linear-match node. 238 if(inByte==(bytes_[pos++]&0xff)) { 239 remainingMatchLength_=--length; 240 pos_=pos; 241 int node; 242 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 243 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 244 } else { 245 stop(); 246 return Result.NO_MATCH; 247 } 248 } 249 return nextImpl(pos, inByte); 250 } 251 252 /** 253 * Traverses the trie from the current state for this byte sequence. 254 * Equivalent to 255 * <pre> 256 * Result result=current(); 257 * for(each c in s) 258 * if(!result.hasNext()) return Result.NO_MATCH; 259 * result=next(c); 260 * return result; 261 * </pre> 262 * @param s Contains a string or byte sequence. 263 * @param sIndex The start index of the byte sequence in s. 264 * @param sLimit The (exclusive) end index of the byte sequence in s. 265 * @return The match/value Result. 266 * @stable ICU 4.8 267 */ 268 public Result next(byte[] s, int sIndex, int sLimit) { 269 if(sIndex>=sLimit) { 270 // Empty input. 271 return current(); 272 } 273 int pos=pos_; 274 if(pos<0) { 275 return Result.NO_MATCH; 276 } 277 int length=remainingMatchLength_; // Actual remaining match length minus 1. 278 for(;;) { 279 // Fetch the next input byte, if there is one. 280 // Continue a linear-match node. 281 byte inByte; 282 for(;;) { 283 if(sIndex==sLimit) { 284 remainingMatchLength_=length; 285 pos_=pos; 286 int node; 287 return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ? 288 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 289 } 290 inByte=s[sIndex++]; 291 if(length<0) { 292 remainingMatchLength_=length; 293 break; 294 } 295 if(inByte!=bytes_[pos]) { 296 stop(); 297 return Result.NO_MATCH; 298 } 299 ++pos; 300 --length; 301 } 302 for(;;) { 303 int node=bytes_[pos++]&0xff; 304 if(node<kMinLinearMatch) { 305 Result result=branchNext(pos, node, inByte&0xff); 306 if(result==Result.NO_MATCH) { 307 return Result.NO_MATCH; 308 } 309 // Fetch the next input byte, if there is one. 310 if(sIndex==sLimit) { 311 return result; 312 } 313 if(result==Result.FINAL_VALUE) { 314 // No further matching bytes. 315 stop(); 316 return Result.NO_MATCH; 317 } 318 inByte=s[sIndex++]; 319 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 320 } else if(node<kMinValueLead) { 321 // Match length+1 bytes. 322 length=node-kMinLinearMatch; // Actual match length minus 1. 323 if(inByte!=bytes_[pos]) { 324 stop(); 325 return Result.NO_MATCH; 326 } 327 ++pos; 328 --length; 329 break; 330 } else if((node&kValueIsFinal)!=0) { 331 // No further matching bytes. 332 stop(); 333 return Result.NO_MATCH; 334 } else { 335 // Skip intermediate value. 336 pos=skipValue(pos, node); 337 // The next node must not also be a value node. 338 assert((bytes_[pos]&0xff)<kMinValueLead); 339 } 340 } 341 } 342 } 343 344 /** 345 * Returns a matching byte sequence's value if called immediately after 346 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 347 * getValue() can be called multiple times. 348 * 349 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 350 * @return The value for the byte sequence so far. 351 * @stable ICU 4.8 352 */ 353 public int getValue() /*const*/ { 354 int pos=pos_; 355 int leadByte=bytes_[pos++]&0xff; 356 assert(leadByte>=kMinValueLead); 357 return readValue(bytes_, pos, leadByte>>1); 358 } 359 360 /** 361 * Determines whether all byte sequences reachable from the current state 362 * map to the same value, and if so, returns that value. 363 * @return The unique value in bits 32..1 with bit 0 set, 364 * if all byte sequences reachable from the current state 365 * map to the same value; otherwise returns 0. 366 * @stable ICU 4.8 367 */ 368 public long getUniqueValue() /*const*/ { 369 int pos=pos_; 370 if(pos<0) { 371 return 0; 372 } 373 // Skip the rest of a pending linear-match node. 374 long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0); 375 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 376 return (uniqueValue<<31)>>31; 377 } 378 379 /** 380 * Finds each byte which continues the byte sequence from the current state. 381 * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now. 382 * @param out Each next byte is 0-extended to a char and appended to this object. 383 * (Only uses the out.append(c) method.) 384 * @return The number of bytes which continue the byte sequence from here. 385 * @stable ICU 4.8 386 */ 387 public int getNextBytes(Appendable out) /*const*/ { 388 int pos=pos_; 389 if(pos<0) { 390 return 0; 391 } 392 if(remainingMatchLength_>=0) { 393 append(out, bytes_[pos]&0xff); // Next byte of a pending linear-match node. 394 return 1; 395 } 396 int node=bytes_[pos++]&0xff; 397 if(node>=kMinValueLead) { 398 if((node&kValueIsFinal)!=0) { 399 return 0; 400 } else { 401 pos=skipValue(pos, node); 402 node=bytes_[pos++]&0xff; 403 assert(node<kMinValueLead); 404 } 405 } 406 if(node<kMinLinearMatch) { 407 if(node==0) { 408 node=bytes_[pos++]&0xff; 409 } 410 getNextBranchBytes(bytes_, pos, ++node, out); 411 return node; 412 } else { 413 // First byte of the linear-match node. 414 append(out, bytes_[pos]&0xff); 415 return 1; 416 } 417 } 418 419 /** 420 * Iterates from the current state of this trie. 421 * @return A new BytesTrie.Iterator. 422 * @stable ICU 4.8 423 */ 424 public Iterator iterator() { 425 return new Iterator(bytes_, pos_, remainingMatchLength_, 0); 426 } 427 428 /** 429 * Iterates from the current state of this trie. 430 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 431 * Otherwise, the iterator returns strings with this maximum length. 432 * @return A new BytesTrie.Iterator. 433 * @stable ICU 4.8 434 */ 435 public Iterator iterator(int maxStringLength) { 436 return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength); 437 } 438 439 /** 440 * Iterates from the root of a byte-serialized BytesTrie. 441 * @param trieBytes Bytes array that contains the serialized trie. 442 * @param offset Root offset of the trie in the array. 443 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 444 * Otherwise, the iterator returns strings with this maximum length. 445 * @return A new BytesTrie.Iterator. 446 * @stable ICU 4.8 447 */ 448 public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) { 449 return new Iterator(trieBytes, offset, -1, maxStringLength); 450 } 451 452 /** 453 * Return value type for the Iterator. 454 * @stable ICU 4.8 455 */ 456 public static final class Entry { 457 private Entry(int capacity) { 458 bytes=new byte[capacity]; 459 } 460 461 /** 462 * @return The length of the byte sequence. 463 * @stable ICU 4.8 464 */ 465 public int bytesLength() { return length; } 466 /** 467 * Returns a byte of the byte sequence. 468 * @param index An index into the byte sequence. 469 * @return The index-th byte sequence byte. 470 * @stable ICU 4.8 471 */ 472 public byte byteAt(int index) { return bytes[index]; } 473 /** 474 * Copies the byte sequence into a byte array. 475 * @param dest Destination byte array. 476 * @param destOffset Starting offset to where in dest the byte sequence is copied. 477 * @stable ICU 4.8 478 */ 479 public void copyBytesTo(byte[] dest, int destOffset) { 480 System.arraycopy(bytes, 0, dest, destOffset, length); 481 } 482 /** 483 * @return The byte sequence as a read-only ByteBuffer. 484 * @stable ICU 4.8 485 */ 486 public ByteBuffer bytesAsByteBuffer() { 487 return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer(); 488 } 489 490 /** 491 * The value associated with the byte sequence. 492 * @stable ICU 4.8 493 */ 494 public int value; 495 496 private void ensureCapacity(int len) { 497 if(bytes.length<len) { 498 byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)]; 499 System.arraycopy(bytes, 0, newBytes, 0, length); 500 bytes=newBytes; 501 } 502 } 503 private void append(byte b) { 504 ensureCapacity(length+1); 505 bytes[length++]=b; 506 } 507 private void append(byte[] b, int off, int len) { 508 ensureCapacity(length+len); 509 System.arraycopy(b, off, bytes, length, len); 510 length+=len; 511 } 512 private void truncateString(int newLength) { length=newLength; } 513 514 private byte[] bytes; 515 private int length; 516 } 517 518 /** 519 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 520 * @stable ICU 4.8 521 */ 522 public static final class Iterator implements java.util.Iterator<Entry> { 523 private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) { 524 bytes_=trieBytes; 525 pos_=initialPos_=offset; 526 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 527 maxLength_=maxStringLength; 528 entry_=new Entry(maxLength_!=0 ? maxLength_ : 32); 529 int length=remainingMatchLength_; // Actual remaining match length minus 1. 530 if(length>=0) { 531 // Pending linear-match node, append remaining bytes to entry_. 532 ++length; 533 if(maxLength_>0 && length>maxLength_) { 534 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 535 } 536 entry_.append(bytes_, pos_, length); 537 pos_+=length; 538 remainingMatchLength_-=length; 539 } 540 } 541 542 /** 543 * Resets this iterator to its initial state. 544 * @return this 545 * @stable ICU 4.8 546 */ 547 public Iterator reset() { 548 pos_=initialPos_; 549 remainingMatchLength_=initialRemainingMatchLength_; 550 int length=remainingMatchLength_+1; // Remaining match length. 551 if(maxLength_>0 && length>maxLength_) { 552 length=maxLength_; 553 } 554 entry_.truncateString(length); 555 pos_+=length; 556 remainingMatchLength_-=length; 557 stack_.clear(); 558 return this; 559 } 560 561 /** 562 * @return true if there are more elements. 563 * @stable ICU 4.8 564 */ 565 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 566 567 /** 568 * Finds the next (byte sequence, value) pair if there is one. 569 * 570 * If the byte sequence is truncated to the maximum length and does not 571 * have a real value, then the value is set to -1. 572 * In this case, this "not a real value" is indistinguishable from 573 * a real value of -1. 574 * @return An Entry with the string and value of the next element. 575 * @throws NoSuchElementException - iteration has no more elements. 576 * @stable ICU 4.8 577 */ 578 public Entry next() { 579 int pos=pos_; 580 if(pos<0) { 581 if(stack_.isEmpty()) { 582 throw new NoSuchElementException(); 583 } 584 // Pop the state off the stack and continue with the next outbound edge of 585 // the branch node. 586 long top=stack_.remove(stack_.size()-1); 587 int length=(int)top; 588 pos=(int)(top>>32); 589 entry_.truncateString(length&0xffff); 590 length>>>=16; 591 if(length>1) { 592 pos=branchNext(pos, length); 593 if(pos<0) { 594 return entry_; // Reached a final value. 595 } 596 } else { 597 entry_.append(bytes_[pos++]); 598 } 599 } 600 if(remainingMatchLength_>=0) { 601 // We only get here if we started in a pending linear-match node 602 // with more than maxLength remaining bytes. 603 return truncateAndStop(); 604 } 605 for(;;) { 606 int node=bytes_[pos++]&0xff; 607 if(node>=kMinValueLead) { 608 // Deliver value for the byte sequence so far. 609 boolean isFinal=(node&kValueIsFinal)!=0; 610 entry_.value=readValue(bytes_, pos, node>>1); 611 if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) { 612 pos_=-1; 613 } else { 614 pos_=skipValue(pos, node); 615 } 616 return entry_; 617 } 618 if(maxLength_>0 && entry_.length==maxLength_) { 619 return truncateAndStop(); 620 } 621 if(node<kMinLinearMatch) { 622 if(node==0) { 623 node=bytes_[pos++]&0xff; 624 } 625 pos=branchNext(pos, node+1); 626 if(pos<0) { 627 return entry_; // Reached a final value. 628 } 629 } else { 630 // Linear-match node, append length bytes to entry_. 631 int length=node-kMinLinearMatch+1; 632 if(maxLength_>0 && entry_.length+length>maxLength_) { 633 entry_.append(bytes_, pos, maxLength_-entry_.length); 634 return truncateAndStop(); 635 } 636 entry_.append(bytes_, pos, length); 637 pos+=length; 638 } 639 } 640 } 641 642 /** 643 * Iterator.remove() is not supported. 644 * @throws UnsupportedOperationException (always) 645 * @stable ICU 4.8 646 */ 647 public void remove() { 648 throw new UnsupportedOperationException(); 649 } 650 651 private Entry truncateAndStop() { 652 pos_=-1; 653 entry_.value=-1; // no real value for str 654 return entry_; 655 } 656 657 private int branchNext(int pos, int length) { 658 while(length>kMaxBranchLinearSubNodeLength) { 659 ++pos; // ignore the comparison byte 660 // Push state for the greater-or-equal edge. 661 stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length); 662 // Follow the less-than edge. 663 length>>=1; 664 pos=jumpByDelta(bytes_, pos); 665 } 666 // List of key-value pairs where values are either final values or jump deltas. 667 // Read the first (key, value) pair. 668 byte trieByte=bytes_[pos++]; 669 int node=bytes_[pos++]&0xff; 670 boolean isFinal=(node&kValueIsFinal)!=0; 671 int value=readValue(bytes_, pos, node>>1); 672 pos=skipValue(pos, node); 673 stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length); 674 entry_.append(trieByte); 675 if(isFinal) { 676 pos_=-1; 677 entry_.value=value; 678 return -1; 679 } else { 680 return pos+value; 681 } 682 } 683 684 private byte[] bytes_; 685 private int pos_; 686 private int initialPos_; 687 private int remainingMatchLength_; 688 private int initialRemainingMatchLength_; 689 690 private int maxLength_; 691 private Entry entry_; 692 693 // The stack stores longs for backtracking to another 694 // outbound edge of a branch node. 695 // Each long has the offset from bytes_ in bits 62..32, 696 // the entry_.stringLength() from before the node in bits 15..0, 697 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 698 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 699 // but the code looks more confusing that way.) 700 private ArrayList<Long> stack_=new ArrayList<Long>(); 701 } 702 703 private void stop() { 704 pos_=-1; 705 } 706 707 // Reads a compact 32-bit integer. 708 // pos is already after the leadByte, and the lead byte is already shifted right by 1. 709 private static int readValue(byte[] bytes, int pos, int leadByte) { 710 int value; 711 if(leadByte<kMinTwoByteValueLead) { 712 value=leadByte-kMinOneByteValueLead; 713 } else if(leadByte<kMinThreeByteValueLead) { 714 value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff); 715 } else if(leadByte<kFourByteValueLead) { 716 value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 717 } else if(leadByte==kFourByteValueLead) { 718 value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 719 } else { 720 value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 721 } 722 return value; 723 } 724 private static int skipValue(int pos, int leadByte) { 725 assert(leadByte>=kMinValueLead); 726 if(leadByte>=(kMinTwoByteValueLead<<1)) { 727 if(leadByte<(kMinThreeByteValueLead<<1)) { 728 ++pos; 729 } else if(leadByte<(kFourByteValueLead<<1)) { 730 pos+=2; 731 } else { 732 pos+=3+((leadByte>>1)&1); 733 } 734 } 735 return pos; 736 } 737 private static int skipValue(byte[] bytes, int pos) { 738 int leadByte=bytes[pos++]&0xff; 739 return skipValue(pos, leadByte); 740 } 741 742 // Reads a jump delta and jumps. 743 private static int jumpByDelta(byte[] bytes, int pos) { 744 int delta=bytes[pos++]&0xff; 745 if(delta<kMinTwoByteDeltaLead) { 746 // nothing to do 747 } else if(delta<kMinThreeByteDeltaLead) { 748 delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff); 749 } else if(delta<kFourByteDeltaLead) { 750 delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 751 pos+=2; 752 } else if(delta==kFourByteDeltaLead) { 753 delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 754 pos+=3; 755 } else { 756 delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 757 pos+=4; 758 } 759 return pos+delta; 760 } 761 762 private static int skipDelta(byte[] bytes, int pos) { 763 int delta=bytes[pos++]&0xff; 764 if(delta>=kMinTwoByteDeltaLead) { 765 if(delta<kMinThreeByteDeltaLead) { 766 ++pos; 767 } else if(delta<kFourByteDeltaLead) { 768 pos+=2; 769 } else { 770 pos+=3+(delta&1); 771 } 772 } 773 return pos; 774 } 775 776 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 777 778 // Handles a branch node for both next(byte) and next(string). 779 private Result branchNext(int pos, int length, int inByte) { 780 // Branch according to the current byte. 781 if(length==0) { 782 length=bytes_[pos++]&0xff; 783 } 784 ++length; 785 // The length of the branch is the number of bytes to select from. 786 // The data structure encodes a binary search. 787 while(length>kMaxBranchLinearSubNodeLength) { 788 if(inByte<(bytes_[pos++]&0xff)) { 789 length>>=1; 790 pos=jumpByDelta(bytes_, pos); 791 } else { 792 length=length-(length>>1); 793 pos=skipDelta(bytes_, pos); 794 } 795 } 796 // Drop down to linear search for the last few bytes. 797 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 798 // and divides length by 2. 799 do { 800 if(inByte==(bytes_[pos++]&0xff)) { 801 Result result; 802 int node=bytes_[pos]&0xff; 803 assert(node>=kMinValueLead); 804 if((node&kValueIsFinal)!=0) { 805 // Leave the final value for getValue() to read. 806 result=Result.FINAL_VALUE; 807 } else { 808 // Use the non-final value as the jump delta. 809 ++pos; 810 // int delta=readValue(pos, node>>1); 811 node>>=1; 812 int delta; 813 if(node<kMinTwoByteValueLead) { 814 delta=node-kMinOneByteValueLead; 815 } else if(node<kMinThreeByteValueLead) { 816 delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff); 817 } else if(node<kFourByteValueLead) { 818 delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff); 819 pos+=2; 820 } else if(node==kFourByteValueLead) { 821 delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff); 822 pos+=3; 823 } else { 824 delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff); 825 pos+=4; 826 } 827 // end readValue() 828 pos+=delta; 829 node=bytes_[pos]&0xff; 830 result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 831 } 832 pos_=pos; 833 return result; 834 } 835 --length; 836 pos=skipValue(bytes_, pos); 837 } while(length>1); 838 if(inByte==(bytes_[pos++]&0xff)) { 839 pos_=pos; 840 int node=bytes_[pos]&0xff; 841 return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 842 } else { 843 stop(); 844 return Result.NO_MATCH; 845 } 846 } 847 848 // Requires remainingLength_<0. 849 private Result nextImpl(int pos, int inByte) { 850 for(;;) { 851 int node=bytes_[pos++]&0xff; 852 if(node<kMinLinearMatch) { 853 return branchNext(pos, node, inByte); 854 } else if(node<kMinValueLead) { 855 // Match the first of length+1 bytes. 856 int length=node-kMinLinearMatch; // Actual match length minus 1. 857 if(inByte==(bytes_[pos++]&0xff)) { 858 remainingMatchLength_=--length; 859 pos_=pos; 860 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 861 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 862 } else { 863 // No match. 864 break; 865 } 866 } else if((node&kValueIsFinal)!=0) { 867 // No further matching bytes. 868 break; 869 } else { 870 // Skip intermediate value. 871 pos=skipValue(pos, node); 872 // The next node must not also be a value node. 873 assert((bytes_[pos]&0xff)<kMinValueLead); 874 } 875 } 876 stop(); 877 return Result.NO_MATCH; 878 } 879 880 // Helper functions for getUniqueValue(). 881 // Recursively finds a unique value (or whether there is not a unique one) 882 // from a branch. 883 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 884 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. 885 private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length, 886 long uniqueValue) { 887 while(length>kMaxBranchLinearSubNodeLength) { 888 ++pos; // ignore the comparison byte 889 uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue); 890 if(uniqueValue==0) { 891 return 0; 892 } 893 length=length-(length>>1); 894 pos=skipDelta(bytes, pos); 895 } 896 do { 897 ++pos; // ignore a comparison byte 898 // handle its value 899 int node=bytes[pos++]&0xff; 900 boolean isFinal=(node&kValueIsFinal)!=0; 901 int value=readValue(bytes, pos, node>>1); 902 pos=skipValue(pos, node); 903 if(isFinal) { 904 if(uniqueValue!=0) { 905 if(value!=(int)(uniqueValue>>1)) { 906 return 0; 907 } 908 } else { 909 uniqueValue=((long)value<<1)|1; 910 } 911 } else { 912 uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue); 913 if(uniqueValue==0) { 914 return 0; 915 } 916 } 917 } while(--length>1); 918 // ignore the last comparison byte 919 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 920 } 921 // Recursively finds a unique value (or whether there is not a unique one) 922 // starting from a position on a node lead byte. 923 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 924 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. 925 private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) { 926 for(;;) { 927 int node=bytes[pos++]&0xff; 928 if(node<kMinLinearMatch) { 929 if(node==0) { 930 node=bytes[pos++]&0xff; 931 } 932 uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue); 933 if(uniqueValue==0) { 934 return 0; 935 } 936 pos=(int)(uniqueValue>>>33); 937 } else if(node<kMinValueLead) { 938 // linear-match node 939 pos+=node-kMinLinearMatch+1; // Ignore the match bytes. 940 } else { 941 boolean isFinal=(node&kValueIsFinal)!=0; 942 int value=readValue(bytes, pos, node>>1); 943 if(uniqueValue!=0) { 944 if(value!=(int)(uniqueValue>>1)) { 945 return 0; 946 } 947 } else { 948 uniqueValue=((long)value<<1)|1; 949 } 950 if(isFinal) { 951 return uniqueValue; 952 } 953 pos=skipValue(pos, node); 954 } 955 } 956 } 957 958 // Helper functions for getNextBytes(). 959 // getNextBytes() when pos is on a branch node. 960 private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) { 961 while(length>kMaxBranchLinearSubNodeLength) { 962 ++pos; // ignore the comparison byte 963 getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out); 964 length=length-(length>>1); 965 pos=skipDelta(bytes, pos); 966 } 967 do { 968 append(out, bytes[pos++]&0xff); 969 pos=skipValue(bytes, pos); 970 } while(--length>1); 971 append(out, bytes[pos]&0xff); 972 } 973 private static void append(Appendable out, int c) { 974 try { 975 out.append((char)c); 976 } catch(IOException e) { 977 throw new ICUUncheckedIOException(e); 978 } 979 } 980 981 // BytesTrie data structure 982 // 983 // The trie consists of a series of byte-serialized nodes for incremental 984 // string/byte sequence matching. The root node is at the beginning of the trie data. 985 // 986 // Types of nodes are distinguished by their node lead byte ranges. 987 // After each node, except a final-value node, another node follows to 988 // encode match values or continue matching further bytes. 989 // 990 // Node types: 991 // - Value node: Stores a 32-bit integer in a compact, variable-length format. 992 // The value is for the string/byte sequence so far. 993 // One node bit indicates whether the value is final or whether 994 // matching continues with the next node. 995 // - Linear-match node: Matches a number of bytes. 996 // - Branch node: Branches to other nodes according to the current input byte. 997 // The node byte is the length of the branch (number of bytes to select from) 998 // minus 1. It is followed by a sub-node: 999 // - If the length is at most kMaxBranchLinearSubNodeLength, then 1000 // there are length-1 (key, value) pairs and then one more comparison byte. 1001 // If one of the key bytes matches, then the value is either a final value for 1002 // the string/byte sequence so far, or a "jump" delta to the next node. 1003 // If the last byte matches, then matching continues with the next node. 1004 // (Values have the same encoding as value nodes.) 1005 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 1006 // there is one byte and one "jump" delta. 1007 // If the input byte is less than the sub-node byte, then "jump" by delta to 1008 // the next sub-node which will have a length of length/2. 1009 // (The delta has its own compact encoding.) 1010 // Otherwise, skip the "jump" delta to the next sub-node 1011 // which will have a length of length-length/2. 1012 1013 // Node lead byte values. 1014 1015 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 1016 // the length is one more than the next byte. 1017 1018 // For a branch sub-node with at most this many entries, we drop down 1019 // to a linear search. 1020 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 1021 1022 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 1023 /*package*/ static final int kMinLinearMatch=0x10; 1024 /*package*/ static final int kMaxLinearMatchLength=0x10; 1025 1026 // 20..ff: Variable-length value node. 1027 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 1028 // Then shift-right by 1 bit. 1029 // The remaining lead byte value indicates the number of following bytes (0..4) 1030 // and contains the value's top bits. 1031 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 1032 // It is a final value if bit 0 is set. 1033 private static final int kValueIsFinal=1; 1034 1035 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 1036 /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2; // 0x10 1037 /*package*/ static final int kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 1038 1039 /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 1040 /*package*/ static final int kMaxTwoByteValue=0x1aff; 1041 1042 /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 1043 /*package*/ static final int kFourByteValueLead=0x7e; 1044 1045 // A little more than Unicode code points. (0x11ffff) 1046 /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 1047 1048 /*package*/ static final int kFiveByteValueLead=0x7f; 1049 1050 // Compact delta integers. 1051 /*package*/ static final int kMaxOneByteDelta=0xbf; 1052 /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 1053 /*package*/ static final int kMinThreeByteDeltaLead=0xf0; 1054 /*package*/ static final int kFourByteDeltaLead=0xfe; 1055 /*package*/ static final int kFiveByteDeltaLead=0xff; 1056 1057 /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 1058 /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 1059 1060 // Fixed value referencing the BytesTrie bytes. 1061 private byte[] bytes_; 1062 private int root_; 1063 1064 // Iterator variables. 1065 1066 // Index of next trie byte to read. Negative if no more matches. 1067 private int pos_; 1068 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1069 private int remainingMatchLength_; 1070}; 1071