1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4******************************************************************************* 5* Copyright (C) 2010-2014, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* created on: 2010nov23 9* created by: Markus W. Scherer 10* ported from ICU4C bytestrie.h/.cpp 11*/ 12package com.ibm.icu.util; 13 14import java.io.IOException; 15import java.nio.ByteBuffer; 16import java.util.ArrayList; 17import java.util.NoSuchElementException; 18 19/** 20 * Light-weight, non-const reader class for a BytesTrie. 21 * Traverses a byte-serialized data structure with minimal state, 22 * for mapping byte sequences to non-negative integer values. 23 * 24 * <p>This class is not intended for public subclassing. 25 * 26 * @stable ICU 4.8 27 * @author Markus W. Scherer 28 */ 29public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> { 30 /** 31 * Constructs a BytesTrie reader instance. 32 * 33 * <p>The array must contain a copy of a byte sequence from the BytesTrieBuilder, 34 * with the offset indicating the first byte of that sequence. 35 * The BytesTrie object will not read more bytes than 36 * the BytesTrieBuilder generated in the corresponding build() call. 37 * 38 * <p>The array is not copied/cloned and must not be modified while 39 * the BytesTrie object is in use. 40 * 41 * @param trieBytes Bytes array that contains the serialized trie. 42 * @param offset Root offset of the trie in the array. 43 * @stable ICU 4.8 44 */ 45 public BytesTrie(byte[] trieBytes, int offset) { 46 bytes_=trieBytes; 47 pos_=root_=offset; 48 remainingMatchLength_=-1; 49 } 50 51 /** 52 * Clones this trie reader object and its state, 53 * but not the byte array which will be shared. 54 * @return A shallow clone of this trie. 55 * @stable ICU 4.8 56 */ 57 @Override 58 public Object clone() throws CloneNotSupportedException { 59 return super.clone(); // A shallow copy is just what we need. 60 } 61 62 /** 63 * Resets this trie to its initial state. 64 * @return this 65 * @stable ICU 4.8 66 */ 67 public BytesTrie reset() { 68 pos_=root_; 69 remainingMatchLength_=-1; 70 return this; 71 } 72 73 /** 74 * BytesTrie state object, for saving a trie's current state 75 * and resetting the trie back to this state later. 76 * @stable ICU 4.8 77 */ 78 public static final class State { 79 /** 80 * Constructs an empty State. 81 * @stable ICU 4.8 82 */ 83 public State() {} 84 private byte[] bytes; 85 private int root; 86 private int pos; 87 private int remainingMatchLength; 88 } 89 90 /** 91 * Saves the state of this trie. 92 * @param state The State object to hold the trie's state. 93 * @return this 94 * @see #resetToState 95 * @stable ICU 4.8 96 */ 97 public BytesTrie saveState(State state) /*const*/ { 98 state.bytes=bytes_; 99 state.root=root_; 100 state.pos=pos_; 101 state.remainingMatchLength=remainingMatchLength_; 102 return this; 103 } 104 105 /** 106 * Resets this trie to the saved state. 107 * @param state The State object which holds a saved trie state. 108 * @return this 109 * @throws IllegalArgumentException if the state object contains no state, 110 * or the state of a different trie 111 * @see #saveState 112 * @see #reset 113 * @stable ICU 4.8 114 */ 115 public BytesTrie resetToState(State state) { 116 if(bytes_==state.bytes && bytes_!=null && root_==state.root) { 117 pos_=state.pos; 118 remainingMatchLength_=state.remainingMatchLength; 119 } else { 120 throw new IllegalArgumentException("incompatible trie state"); 121 } 122 return this; 123 } 124 125 /** 126 * Return values for BytesTrie.next(), CharsTrie.next() and similar methods. 127 * @stable ICU 4.8 128 */ 129 public enum Result { 130 /** 131 * The input unit(s) did not continue a matching string. 132 * Once current()/next() return NO_MATCH, 133 * all further calls to current()/next() will also return NO_MATCH, 134 * until the trie is reset to its original state or to a saved state. 135 * @stable ICU 4.8 136 */ 137 NO_MATCH, 138 /** 139 * The input unit(s) continued a matching string 140 * but there is no value for the string so far. 141 * (It is a prefix of a longer string.) 142 * @stable ICU 4.8 143 */ 144 NO_VALUE, 145 /** 146 * The input unit(s) continued a matching string 147 * and there is a value for the string so far. 148 * This value will be returned by getValue(). 149 * No further input byte/unit can continue a matching string. 150 * @stable ICU 4.8 151 */ 152 FINAL_VALUE, 153 /** 154 * The input unit(s) continued a matching string 155 * and there is a value for the string so far. 156 * This value will be returned by getValue(). 157 * Another input byte/unit can continue a matching string. 158 * @stable ICU 4.8 159 */ 160 INTERMEDIATE_VALUE; 161 162 // Note: The following methods assume the particular order 163 // of enum constants, treating the ordinal() values like bit sets. 164 // Do not reorder the enum constants! 165 166 /** 167 * Same as (result!=NO_MATCH). 168 * @return true if the input bytes/units so far are part of a matching string/byte sequence. 169 * @stable ICU 4.8 170 */ 171 public boolean matches() { return this!=NO_MATCH; } 172 173 /** 174 * Equivalent to (result==INTERMEDIATE_VALUE || result==FINAL_VALUE). 175 * @return true if there is a value for the input bytes/units so far. 176 * @see #getValue 177 * @stable ICU 4.8 178 */ 179 public boolean hasValue() { return ordinal()>=2; } 180 181 /** 182 * Equivalent to (result==NO_VALUE || result==INTERMEDIATE_VALUE). 183 * @return true if another input byte/unit can continue a matching string. 184 * @stable ICU 4.8 185 */ 186 public boolean hasNext() { return (ordinal()&1)!=0; } 187 } 188 189 /** 190 * Determines whether the byte sequence so far matches, whether it has a value, 191 * and whether another input byte can continue a matching byte sequence. 192 * @return The match/value Result. 193 * @stable ICU 4.8 194 */ 195 public Result current() /*const*/ { 196 int pos=pos_; 197 if(pos<0) { 198 return Result.NO_MATCH; 199 } else { 200 int node; 201 return (remainingMatchLength_<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 202 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 203 } 204 } 205 206 /** 207 * Traverses the trie from the initial state for this input byte. 208 * Equivalent to reset().next(inByte). 209 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 210 * Values below -0x100 and above 0xff will never match. 211 * @return The match/value Result. 212 * @stable ICU 4.8 213 */ 214 public Result first(int inByte) { 215 remainingMatchLength_=-1; 216 if(inByte<0) { 217 inByte+=0x100; 218 } 219 return nextImpl(root_, inByte); 220 } 221 222 /** 223 * Traverses the trie from the current state for this input byte. 224 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 225 * Values below -0x100 and above 0xff will never match. 226 * @return The match/value Result. 227 * @stable ICU 4.8 228 */ 229 public Result next(int inByte) { 230 int pos=pos_; 231 if(pos<0) { 232 return Result.NO_MATCH; 233 } 234 if(inByte<0) { 235 inByte+=0x100; 236 } 237 int length=remainingMatchLength_; // Actual remaining match length minus 1. 238 if(length>=0) { 239 // Remaining part of a linear-match node. 240 if(inByte==(bytes_[pos++]&0xff)) { 241 remainingMatchLength_=--length; 242 pos_=pos; 243 int node; 244 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 245 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 246 } else { 247 stop(); 248 return Result.NO_MATCH; 249 } 250 } 251 return nextImpl(pos, inByte); 252 } 253 254 /** 255 * Traverses the trie from the current state for this byte sequence. 256 * Equivalent to 257 * <pre> 258 * Result result=current(); 259 * for(each c in s) 260 * if(!result.hasNext()) return Result.NO_MATCH; 261 * result=next(c); 262 * return result; 263 * </pre> 264 * @param s Contains a string or byte sequence. 265 * @param sIndex The start index of the byte sequence in s. 266 * @param sLimit The (exclusive) end index of the byte sequence in s. 267 * @return The match/value Result. 268 * @stable ICU 4.8 269 */ 270 public Result next(byte[] s, int sIndex, int sLimit) { 271 if(sIndex>=sLimit) { 272 // Empty input. 273 return current(); 274 } 275 int pos=pos_; 276 if(pos<0) { 277 return Result.NO_MATCH; 278 } 279 int length=remainingMatchLength_; // Actual remaining match length minus 1. 280 for(;;) { 281 // Fetch the next input byte, if there is one. 282 // Continue a linear-match node. 283 byte inByte; 284 for(;;) { 285 if(sIndex==sLimit) { 286 remainingMatchLength_=length; 287 pos_=pos; 288 int node; 289 return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ? 290 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 291 } 292 inByte=s[sIndex++]; 293 if(length<0) { 294 remainingMatchLength_=length; 295 break; 296 } 297 if(inByte!=bytes_[pos]) { 298 stop(); 299 return Result.NO_MATCH; 300 } 301 ++pos; 302 --length; 303 } 304 for(;;) { 305 int node=bytes_[pos++]&0xff; 306 if(node<kMinLinearMatch) { 307 Result result=branchNext(pos, node, inByte&0xff); 308 if(result==Result.NO_MATCH) { 309 return Result.NO_MATCH; 310 } 311 // Fetch the next input byte, if there is one. 312 if(sIndex==sLimit) { 313 return result; 314 } 315 if(result==Result.FINAL_VALUE) { 316 // No further matching bytes. 317 stop(); 318 return Result.NO_MATCH; 319 } 320 inByte=s[sIndex++]; 321 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 322 } else if(node<kMinValueLead) { 323 // Match length+1 bytes. 324 length=node-kMinLinearMatch; // Actual match length minus 1. 325 if(inByte!=bytes_[pos]) { 326 stop(); 327 return Result.NO_MATCH; 328 } 329 ++pos; 330 --length; 331 break; 332 } else if((node&kValueIsFinal)!=0) { 333 // No further matching bytes. 334 stop(); 335 return Result.NO_MATCH; 336 } else { 337 // Skip intermediate value. 338 pos=skipValue(pos, node); 339 // The next node must not also be a value node. 340 assert((bytes_[pos]&0xff)<kMinValueLead); 341 } 342 } 343 } 344 } 345 346 /** 347 * Returns a matching byte sequence's value if called immediately after 348 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 349 * getValue() can be called multiple times. 350 * 351 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 352 * @return The value for the byte sequence so far. 353 * @stable ICU 4.8 354 */ 355 public int getValue() /*const*/ { 356 int pos=pos_; 357 int leadByte=bytes_[pos++]&0xff; 358 assert(leadByte>=kMinValueLead); 359 return readValue(bytes_, pos, leadByte>>1); 360 } 361 362 /** 363 * Determines whether all byte sequences reachable from the current state 364 * map to the same value, and if so, returns that value. 365 * @return The unique value in bits 32..1 with bit 0 set, 366 * if all byte sequences reachable from the current state 367 * map to the same value; otherwise returns 0. 368 * @stable ICU 4.8 369 */ 370 public long getUniqueValue() /*const*/ { 371 int pos=pos_; 372 if(pos<0) { 373 return 0; 374 } 375 // Skip the rest of a pending linear-match node. 376 long uniqueValue=findUniqueValue(bytes_, pos+remainingMatchLength_+1, 0); 377 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 378 return (uniqueValue<<31)>>31; 379 } 380 381 /** 382 * Finds each byte which continues the byte sequence from the current state. 383 * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now. 384 * @param out Each next byte is 0-extended to a char and appended to this object. 385 * (Only uses the out.append(c) method.) 386 * @return The number of bytes which continue the byte sequence from here. 387 * @stable ICU 4.8 388 */ 389 public int getNextBytes(Appendable out) /*const*/ { 390 int pos=pos_; 391 if(pos<0) { 392 return 0; 393 } 394 if(remainingMatchLength_>=0) { 395 append(out, bytes_[pos]&0xff); // Next byte of a pending linear-match node. 396 return 1; 397 } 398 int node=bytes_[pos++]&0xff; 399 if(node>=kMinValueLead) { 400 if((node&kValueIsFinal)!=0) { 401 return 0; 402 } else { 403 pos=skipValue(pos, node); 404 node=bytes_[pos++]&0xff; 405 assert(node<kMinValueLead); 406 } 407 } 408 if(node<kMinLinearMatch) { 409 if(node==0) { 410 node=bytes_[pos++]&0xff; 411 } 412 getNextBranchBytes(bytes_, pos, ++node, out); 413 return node; 414 } else { 415 // First byte of the linear-match node. 416 append(out, bytes_[pos]&0xff); 417 return 1; 418 } 419 } 420 421 /** 422 * Iterates from the current state of this trie. 423 * @return A new BytesTrie.Iterator. 424 * @stable ICU 4.8 425 */ 426 @Override 427 public Iterator iterator() { 428 return new Iterator(bytes_, pos_, remainingMatchLength_, 0); 429 } 430 431 /** 432 * Iterates from the current state of this trie. 433 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 434 * Otherwise, the iterator returns strings with this maximum length. 435 * @return A new BytesTrie.Iterator. 436 * @stable ICU 4.8 437 */ 438 public Iterator iterator(int maxStringLength) { 439 return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength); 440 } 441 442 /** 443 * Iterates from the root of a byte-serialized BytesTrie. 444 * @param trieBytes Bytes array that contains the serialized trie. 445 * @param offset Root offset of the trie in the array. 446 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 447 * Otherwise, the iterator returns strings with this maximum length. 448 * @return A new BytesTrie.Iterator. 449 * @stable ICU 4.8 450 */ 451 public static Iterator iterator(byte[] trieBytes, int offset, int maxStringLength) { 452 return new Iterator(trieBytes, offset, -1, maxStringLength); 453 } 454 455 /** 456 * Return value type for the Iterator. 457 * @stable ICU 4.8 458 */ 459 public static final class Entry { 460 private Entry(int capacity) { 461 bytes=new byte[capacity]; 462 } 463 464 /** 465 * @return The length of the byte sequence. 466 * @stable ICU 4.8 467 */ 468 public int bytesLength() { return length; } 469 /** 470 * Returns a byte of the byte sequence. 471 * @param index An index into the byte sequence. 472 * @return The index-th byte sequence byte. 473 * @stable ICU 4.8 474 */ 475 public byte byteAt(int index) { return bytes[index]; } 476 /** 477 * Copies the byte sequence into a byte array. 478 * @param dest Destination byte array. 479 * @param destOffset Starting offset to where in dest the byte sequence is copied. 480 * @stable ICU 4.8 481 */ 482 public void copyBytesTo(byte[] dest, int destOffset) { 483 System.arraycopy(bytes, 0, dest, destOffset, length); 484 } 485 /** 486 * @return The byte sequence as a read-only ByteBuffer. 487 * @stable ICU 4.8 488 */ 489 public ByteBuffer bytesAsByteBuffer() { 490 return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer(); 491 } 492 493 /** 494 * The value associated with the byte sequence. 495 * @stable ICU 4.8 496 */ 497 public int value; 498 499 private void ensureCapacity(int len) { 500 if(bytes.length<len) { 501 byte[] newBytes=new byte[Math.min(2*bytes.length, 2*len)]; 502 System.arraycopy(bytes, 0, newBytes, 0, length); 503 bytes=newBytes; 504 } 505 } 506 private void append(byte b) { 507 ensureCapacity(length+1); 508 bytes[length++]=b; 509 } 510 private void append(byte[] b, int off, int len) { 511 ensureCapacity(length+len); 512 System.arraycopy(b, off, bytes, length, len); 513 length+=len; 514 } 515 private void truncateString(int newLength) { length=newLength; } 516 517 private byte[] bytes; 518 private int length; 519 } 520 521 /** 522 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 523 * @stable ICU 4.8 524 */ 525 public static final class Iterator implements java.util.Iterator<Entry> { 526 private Iterator(byte[] trieBytes, int offset, int remainingMatchLength, int maxStringLength) { 527 bytes_=trieBytes; 528 pos_=initialPos_=offset; 529 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 530 maxLength_=maxStringLength; 531 entry_=new Entry(maxLength_!=0 ? maxLength_ : 32); 532 int length=remainingMatchLength_; // Actual remaining match length minus 1. 533 if(length>=0) { 534 // Pending linear-match node, append remaining bytes to entry_. 535 ++length; 536 if(maxLength_>0 && length>maxLength_) { 537 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 538 } 539 entry_.append(bytes_, pos_, length); 540 pos_+=length; 541 remainingMatchLength_-=length; 542 } 543 } 544 545 /** 546 * Resets this iterator to its initial state. 547 * @return this 548 * @stable ICU 4.8 549 */ 550 public Iterator reset() { 551 pos_=initialPos_; 552 remainingMatchLength_=initialRemainingMatchLength_; 553 int length=remainingMatchLength_+1; // Remaining match length. 554 if(maxLength_>0 && length>maxLength_) { 555 length=maxLength_; 556 } 557 entry_.truncateString(length); 558 pos_+=length; 559 remainingMatchLength_-=length; 560 stack_.clear(); 561 return this; 562 } 563 564 /** 565 * @return true if there are more elements. 566 * @stable ICU 4.8 567 */ 568 @Override 569 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 570 571 /** 572 * Finds the next (byte sequence, value) pair if there is one. 573 * 574 * If the byte sequence is truncated to the maximum length and does not 575 * have a real value, then the value is set to -1. 576 * In this case, this "not a real value" is indistinguishable from 577 * a real value of -1. 578 * @return An Entry with the string and value of the next element. 579 * @throws NoSuchElementException - iteration has no more elements. 580 * @stable ICU 4.8 581 */ 582 @Override 583 public Entry next() { 584 int pos=pos_; 585 if(pos<0) { 586 if(stack_.isEmpty()) { 587 throw new NoSuchElementException(); 588 } 589 // Pop the state off the stack and continue with the next outbound edge of 590 // the branch node. 591 long top=stack_.remove(stack_.size()-1); 592 int length=(int)top; 593 pos=(int)(top>>32); 594 entry_.truncateString(length&0xffff); 595 length>>>=16; 596 if(length>1) { 597 pos=branchNext(pos, length); 598 if(pos<0) { 599 return entry_; // Reached a final value. 600 } 601 } else { 602 entry_.append(bytes_[pos++]); 603 } 604 } 605 if(remainingMatchLength_>=0) { 606 // We only get here if we started in a pending linear-match node 607 // with more than maxLength remaining bytes. 608 return truncateAndStop(); 609 } 610 for(;;) { 611 int node=bytes_[pos++]&0xff; 612 if(node>=kMinValueLead) { 613 // Deliver value for the byte sequence so far. 614 boolean isFinal=(node&kValueIsFinal)!=0; 615 entry_.value=readValue(bytes_, pos, node>>1); 616 if(isFinal || (maxLength_>0 && entry_.length==maxLength_)) { 617 pos_=-1; 618 } else { 619 pos_=skipValue(pos, node); 620 } 621 return entry_; 622 } 623 if(maxLength_>0 && entry_.length==maxLength_) { 624 return truncateAndStop(); 625 } 626 if(node<kMinLinearMatch) { 627 if(node==0) { 628 node=bytes_[pos++]&0xff; 629 } 630 pos=branchNext(pos, node+1); 631 if(pos<0) { 632 return entry_; // Reached a final value. 633 } 634 } else { 635 // Linear-match node, append length bytes to entry_. 636 int length=node-kMinLinearMatch+1; 637 if(maxLength_>0 && entry_.length+length>maxLength_) { 638 entry_.append(bytes_, pos, maxLength_-entry_.length); 639 return truncateAndStop(); 640 } 641 entry_.append(bytes_, pos, length); 642 pos+=length; 643 } 644 } 645 } 646 647 /** 648 * Iterator.remove() is not supported. 649 * @throws UnsupportedOperationException (always) 650 * @stable ICU 4.8 651 */ 652 @Override 653 public void remove() { 654 throw new UnsupportedOperationException(); 655 } 656 657 private Entry truncateAndStop() { 658 pos_=-1; 659 entry_.value=-1; // no real value for str 660 return entry_; 661 } 662 663 private int branchNext(int pos, int length) { 664 while(length>kMaxBranchLinearSubNodeLength) { 665 ++pos; // ignore the comparison byte 666 // Push state for the greater-or-equal edge. 667 stack_.add(((long)skipDelta(bytes_, pos)<<32)|((length-(length>>1))<<16)|entry_.length); 668 // Follow the less-than edge. 669 length>>=1; 670 pos=jumpByDelta(bytes_, pos); 671 } 672 // List of key-value pairs where values are either final values or jump deltas. 673 // Read the first (key, value) pair. 674 byte trieByte=bytes_[pos++]; 675 int node=bytes_[pos++]&0xff; 676 boolean isFinal=(node&kValueIsFinal)!=0; 677 int value=readValue(bytes_, pos, node>>1); 678 pos=skipValue(pos, node); 679 stack_.add(((long)pos<<32)|((length-1)<<16)|entry_.length); 680 entry_.append(trieByte); 681 if(isFinal) { 682 pos_=-1; 683 entry_.value=value; 684 return -1; 685 } else { 686 return pos+value; 687 } 688 } 689 690 private byte[] bytes_; 691 private int pos_; 692 private int initialPos_; 693 private int remainingMatchLength_; 694 private int initialRemainingMatchLength_; 695 696 private int maxLength_; 697 private Entry entry_; 698 699 // The stack stores longs for backtracking to another 700 // outbound edge of a branch node. 701 // Each long has the offset from bytes_ in bits 62..32, 702 // the entry_.stringLength() from before the node in bits 15..0, 703 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 704 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 705 // but the code looks more confusing that way.) 706 private ArrayList<Long> stack_=new ArrayList<Long>(); 707 } 708 709 private void stop() { 710 pos_=-1; 711 } 712 713 // Reads a compact 32-bit integer. 714 // pos is already after the leadByte, and the lead byte is already shifted right by 1. 715 private static int readValue(byte[] bytes, int pos, int leadByte) { 716 int value; 717 if(leadByte<kMinTwoByteValueLead) { 718 value=leadByte-kMinOneByteValueLead; 719 } else if(leadByte<kMinThreeByteValueLead) { 720 value=((leadByte-kMinTwoByteValueLead)<<8)|(bytes[pos]&0xff); 721 } else if(leadByte<kFourByteValueLead) { 722 value=((leadByte-kMinThreeByteValueLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 723 } else if(leadByte==kFourByteValueLead) { 724 value=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 725 } else { 726 value=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 727 } 728 return value; 729 } 730 private static int skipValue(int pos, int leadByte) { 731 assert(leadByte>=kMinValueLead); 732 if(leadByte>=(kMinTwoByteValueLead<<1)) { 733 if(leadByte<(kMinThreeByteValueLead<<1)) { 734 ++pos; 735 } else if(leadByte<(kFourByteValueLead<<1)) { 736 pos+=2; 737 } else { 738 pos+=3+((leadByte>>1)&1); 739 } 740 } 741 return pos; 742 } 743 private static int skipValue(byte[] bytes, int pos) { 744 int leadByte=bytes[pos++]&0xff; 745 return skipValue(pos, leadByte); 746 } 747 748 // Reads a jump delta and jumps. 749 private static int jumpByDelta(byte[] bytes, int pos) { 750 int delta=bytes[pos++]&0xff; 751 if(delta<kMinTwoByteDeltaLead) { 752 // nothing to do 753 } else if(delta<kMinThreeByteDeltaLead) { 754 delta=((delta-kMinTwoByteDeltaLead)<<8)|(bytes[pos++]&0xff); 755 } else if(delta<kFourByteDeltaLead) { 756 delta=((delta-kMinThreeByteDeltaLead)<<16)|((bytes[pos]&0xff)<<8)|(bytes[pos+1]&0xff); 757 pos+=2; 758 } else if(delta==kFourByteDeltaLead) { 759 delta=((bytes[pos]&0xff)<<16)|((bytes[pos+1]&0xff)<<8)|(bytes[pos+2]&0xff); 760 pos+=3; 761 } else { 762 delta=(bytes[pos]<<24)|((bytes[pos+1]&0xff)<<16)|((bytes[pos+2]&0xff)<<8)|(bytes[pos+3]&0xff); 763 pos+=4; 764 } 765 return pos+delta; 766 } 767 768 private static int skipDelta(byte[] bytes, int pos) { 769 int delta=bytes[pos++]&0xff; 770 if(delta>=kMinTwoByteDeltaLead) { 771 if(delta<kMinThreeByteDeltaLead) { 772 ++pos; 773 } else if(delta<kFourByteDeltaLead) { 774 pos+=2; 775 } else { 776 pos+=3+(delta&1); 777 } 778 } 779 return pos; 780 } 781 782 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 783 784 // Handles a branch node for both next(byte) and next(string). 785 private Result branchNext(int pos, int length, int inByte) { 786 // Branch according to the current byte. 787 if(length==0) { 788 length=bytes_[pos++]&0xff; 789 } 790 ++length; 791 // The length of the branch is the number of bytes to select from. 792 // The data structure encodes a binary search. 793 while(length>kMaxBranchLinearSubNodeLength) { 794 if(inByte<(bytes_[pos++]&0xff)) { 795 length>>=1; 796 pos=jumpByDelta(bytes_, pos); 797 } else { 798 length=length-(length>>1); 799 pos=skipDelta(bytes_, pos); 800 } 801 } 802 // Drop down to linear search for the last few bytes. 803 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 804 // and divides length by 2. 805 do { 806 if(inByte==(bytes_[pos++]&0xff)) { 807 Result result; 808 int node=bytes_[pos]&0xff; 809 assert(node>=kMinValueLead); 810 if((node&kValueIsFinal)!=0) { 811 // Leave the final value for getValue() to read. 812 result=Result.FINAL_VALUE; 813 } else { 814 // Use the non-final value as the jump delta. 815 ++pos; 816 // int delta=readValue(pos, node>>1); 817 node>>=1; 818 int delta; 819 if(node<kMinTwoByteValueLead) { 820 delta=node-kMinOneByteValueLead; 821 } else if(node<kMinThreeByteValueLead) { 822 delta=((node-kMinTwoByteValueLead)<<8)|(bytes_[pos++]&0xff); 823 } else if(node<kFourByteValueLead) { 824 delta=((node-kMinThreeByteValueLead)<<16)|((bytes_[pos]&0xff)<<8)|(bytes_[pos+1]&0xff); 825 pos+=2; 826 } else if(node==kFourByteValueLead) { 827 delta=((bytes_[pos]&0xff)<<16)|((bytes_[pos+1]&0xff)<<8)|(bytes_[pos+2]&0xff); 828 pos+=3; 829 } else { 830 delta=(bytes_[pos]<<24)|((bytes_[pos+1]&0xff)<<16)|((bytes_[pos+2]&0xff)<<8)|(bytes_[pos+3]&0xff); 831 pos+=4; 832 } 833 // end readValue() 834 pos+=delta; 835 node=bytes_[pos]&0xff; 836 result= node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 837 } 838 pos_=pos; 839 return result; 840 } 841 --length; 842 pos=skipValue(bytes_, pos); 843 } while(length>1); 844 if(inByte==(bytes_[pos++]&0xff)) { 845 pos_=pos; 846 int node=bytes_[pos]&0xff; 847 return node>=kMinValueLead ? valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 848 } else { 849 stop(); 850 return Result.NO_MATCH; 851 } 852 } 853 854 // Requires remainingLength_<0. 855 private Result nextImpl(int pos, int inByte) { 856 for(;;) { 857 int node=bytes_[pos++]&0xff; 858 if(node<kMinLinearMatch) { 859 return branchNext(pos, node, inByte); 860 } else if(node<kMinValueLead) { 861 // Match the first of length+1 bytes. 862 int length=node-kMinLinearMatch; // Actual match length minus 1. 863 if(inByte==(bytes_[pos++]&0xff)) { 864 remainingMatchLength_=--length; 865 pos_=pos; 866 return (length<0 && (node=bytes_[pos]&0xff)>=kMinValueLead) ? 867 valueResults_[node&kValueIsFinal] : Result.NO_VALUE; 868 } else { 869 // No match. 870 break; 871 } 872 } else if((node&kValueIsFinal)!=0) { 873 // No further matching bytes. 874 break; 875 } else { 876 // Skip intermediate value. 877 pos=skipValue(pos, node); 878 // The next node must not also be a value node. 879 assert((bytes_[pos]&0xff)<kMinValueLead); 880 } 881 } 882 stop(); 883 return Result.NO_MATCH; 884 } 885 886 // Helper functions for getUniqueValue(). 887 // Recursively finds a unique value (or whether there is not a unique one) 888 // from a branch. 889 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 890 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. 891 private static long findUniqueValueFromBranch(byte[] bytes, int pos, int length, 892 long uniqueValue) { 893 while(length>kMaxBranchLinearSubNodeLength) { 894 ++pos; // ignore the comparison byte 895 uniqueValue=findUniqueValueFromBranch(bytes, jumpByDelta(bytes, pos), length>>1, uniqueValue); 896 if(uniqueValue==0) { 897 return 0; 898 } 899 length=length-(length>>1); 900 pos=skipDelta(bytes, pos); 901 } 902 do { 903 ++pos; // ignore a comparison byte 904 // handle its value 905 int node=bytes[pos++]&0xff; 906 boolean isFinal=(node&kValueIsFinal)!=0; 907 int value=readValue(bytes, pos, node>>1); 908 pos=skipValue(pos, node); 909 if(isFinal) { 910 if(uniqueValue!=0) { 911 if(value!=(int)(uniqueValue>>1)) { 912 return 0; 913 } 914 } else { 915 uniqueValue=((long)value<<1)|1; 916 } 917 } else { 918 uniqueValue=findUniqueValue(bytes, pos+value, uniqueValue); 919 if(uniqueValue==0) { 920 return 0; 921 } 922 } 923 } while(--length>1); 924 // ignore the last comparison byte 925 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 926 } 927 // Recursively finds a unique value (or whether there is not a unique one) 928 // starting from a position on a node lead byte. 929 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 930 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. 931 private static long findUniqueValue(byte[] bytes, int pos, long uniqueValue) { 932 for(;;) { 933 int node=bytes[pos++]&0xff; 934 if(node<kMinLinearMatch) { 935 if(node==0) { 936 node=bytes[pos++]&0xff; 937 } 938 uniqueValue=findUniqueValueFromBranch(bytes, pos, node+1, uniqueValue); 939 if(uniqueValue==0) { 940 return 0; 941 } 942 pos=(int)(uniqueValue>>>33); 943 } else if(node<kMinValueLead) { 944 // linear-match node 945 pos+=node-kMinLinearMatch+1; // Ignore the match bytes. 946 } else { 947 boolean isFinal=(node&kValueIsFinal)!=0; 948 int value=readValue(bytes, pos, node>>1); 949 if(uniqueValue!=0) { 950 if(value!=(int)(uniqueValue>>1)) { 951 return 0; 952 } 953 } else { 954 uniqueValue=((long)value<<1)|1; 955 } 956 if(isFinal) { 957 return uniqueValue; 958 } 959 pos=skipValue(pos, node); 960 } 961 } 962 } 963 964 // Helper functions for getNextBytes(). 965 // getNextBytes() when pos is on a branch node. 966 private static void getNextBranchBytes(byte[] bytes, int pos, int length, Appendable out) { 967 while(length>kMaxBranchLinearSubNodeLength) { 968 ++pos; // ignore the comparison byte 969 getNextBranchBytes(bytes, jumpByDelta(bytes, pos), length>>1, out); 970 length=length-(length>>1); 971 pos=skipDelta(bytes, pos); 972 } 973 do { 974 append(out, bytes[pos++]&0xff); 975 pos=skipValue(bytes, pos); 976 } while(--length>1); 977 append(out, bytes[pos]&0xff); 978 } 979 private static void append(Appendable out, int c) { 980 try { 981 out.append((char)c); 982 } catch(IOException e) { 983 throw new ICUUncheckedIOException(e); 984 } 985 } 986 987 // BytesTrie data structure 988 // 989 // The trie consists of a series of byte-serialized nodes for incremental 990 // string/byte sequence matching. The root node is at the beginning of the trie data. 991 // 992 // Types of nodes are distinguished by their node lead byte ranges. 993 // After each node, except a final-value node, another node follows to 994 // encode match values or continue matching further bytes. 995 // 996 // Node types: 997 // - Value node: Stores a 32-bit integer in a compact, variable-length format. 998 // The value is for the string/byte sequence so far. 999 // One node bit indicates whether the value is final or whether 1000 // matching continues with the next node. 1001 // - Linear-match node: Matches a number of bytes. 1002 // - Branch node: Branches to other nodes according to the current input byte. 1003 // The node byte is the length of the branch (number of bytes to select from) 1004 // minus 1. It is followed by a sub-node: 1005 // - If the length is at most kMaxBranchLinearSubNodeLength, then 1006 // there are length-1 (key, value) pairs and then one more comparison byte. 1007 // If one of the key bytes matches, then the value is either a final value for 1008 // the string/byte sequence so far, or a "jump" delta to the next node. 1009 // If the last byte matches, then matching continues with the next node. 1010 // (Values have the same encoding as value nodes.) 1011 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 1012 // there is one byte and one "jump" delta. 1013 // If the input byte is less than the sub-node byte, then "jump" by delta to 1014 // the next sub-node which will have a length of length/2. 1015 // (The delta has its own compact encoding.) 1016 // Otherwise, skip the "jump" delta to the next sub-node 1017 // which will have a length of length-length/2. 1018 1019 // Node lead byte values. 1020 1021 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 1022 // the length is one more than the next byte. 1023 1024 // For a branch sub-node with at most this many entries, we drop down 1025 // to a linear search. 1026 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 1027 1028 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 1029 /*package*/ static final int kMinLinearMatch=0x10; 1030 /*package*/ static final int kMaxLinearMatchLength=0x10; 1031 1032 // 20..ff: Variable-length value node. 1033 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 1034 // Then shift-right by 1 bit. 1035 // The remaining lead byte value indicates the number of following bytes (0..4) 1036 // and contains the value's top bits. 1037 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 1038 // It is a final value if bit 0 is set. 1039 private static final int kValueIsFinal=1; 1040 1041 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 1042 /*package*/ static final int kMinOneByteValueLead=kMinValueLead/2; // 0x10 1043 /*package*/ static final int kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 1044 1045 /*package*/ static final int kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 1046 /*package*/ static final int kMaxTwoByteValue=0x1aff; 1047 1048 /*package*/ static final int kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 1049 /*package*/ static final int kFourByteValueLead=0x7e; 1050 1051 // A little more than Unicode code points. (0x11ffff) 1052 /*package*/ static final int kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 1053 1054 /*package*/ static final int kFiveByteValueLead=0x7f; 1055 1056 // Compact delta integers. 1057 /*package*/ static final int kMaxOneByteDelta=0xbf; 1058 /*package*/ static final int kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 1059 /*package*/ static final int kMinThreeByteDeltaLead=0xf0; 1060 /*package*/ static final int kFourByteDeltaLead=0xfe; 1061 /*package*/ static final int kFiveByteDeltaLead=0xff; 1062 1063 /*package*/ static final int kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 1064 /*package*/ static final int kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 1065 1066 // Fixed value referencing the BytesTrie bytes. 1067 private byte[] bytes_; 1068 private int root_; 1069 1070 // Iterator variables. 1071 1072 // Index of next trie byte to read. Negative if no more matches. 1073 private int pos_; 1074 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1075 private int remainingMatchLength_; 1076}; 1077