1/* GENERATED SOURCE. DO NOT MODIFY. */ 2/* 3 ******************************************************************************* 4 * Copyright (C) 2009-2015, International Business Machines Corporation and 5 * others. All Rights Reserved. 6 ******************************************************************************* 7 */ 8 9package android.icu.impl; 10 11import java.io.DataOutputStream; 12import java.io.IOException; 13import java.io.InputStream; 14import java.nio.ByteBuffer; 15import java.nio.ByteOrder; 16import java.util.Iterator; 17import java.util.NoSuchElementException; 18 19 20/** 21 * This is the interface and common implementation of a Unicode Trie2. 22 * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff) 23 * to 16- or 32-bit integer values. It works best when there are ranges of 24 * characters with the same value, which is generally the case with Unicode 25 * character properties. 26 * 27 * This is the second common version of a Unicode trie (hence the name Trie2). 28 * @hide Only a subset of ICU is exposed in Android 29 * 30 */ 31public abstract class Trie2 implements Iterable<Trie2.Range> { 32 33 34 /** 35 * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). 36 * 37 * Reads from the current position and leaves the buffer after the end of the trie. 38 * 39 * The serialized format is identical between ICU4C and ICU4J, so this function 40 * will work with serialized Trie2s from either. 41 * 42 * The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending 43 * on the width of the data. 44 * 45 * To obtain the width of the Trie2, check the actual class type of the returned Trie2. 46 * Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will 47 * return only Tries of their specific type/size. 48 * 49 * The serialized Trie2 on the stream may be in either little or big endian byte order. 50 * This allows using serialized Tries from ICU4C without needing to consider the 51 * byte order of the system that created them. 52 * 53 * @param bytes a byte buffer to the serialized form of a UTrie2. 54 * @return An unserialized Trie2, ready for use. 55 * @throws IllegalArgumentException if the stream does not contain a serialized Trie2. 56 * @throws IOException if a read error occurs in the buffer. 57 * 58 */ 59 public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException { 60 // From ICU4C utrie2_impl.h 61 // * Trie2 data structure in serialized form: 62 // * 63 // * UTrie2Header header; 64 // * uint16_t index[header.index2Length]; 65 // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] 66 // * @internal 67 // */ 68 // typedef struct UTrie2Header { 69 // /** "Tri2" in big-endian US-ASCII (0x54726932) */ 70 // uint32_t signature; 71 72 // /** 73 // * options bit field: 74 // * 15.. 4 reserved (0) 75 // * 3.. 0 UTrie2ValueBits valueBits 76 // */ 77 // uint16_t options; 78 // 79 // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ 80 // uint16_t indexLength; 81 // 82 // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ 83 // uint16_t shiftedDataLength; 84 // 85 // /** Null index and data blocks, not shifted. */ 86 // uint16_t index2NullOffset, dataNullOffset; 87 // 88 // /** 89 // * First code point of the single-value range ending with U+10ffff, 90 // * rounded up and then shifted right by UTRIE2_SHIFT_1. 91 // */ 92 // uint16_t shiftedHighStart; 93 // } UTrie2Header; 94 95 ByteOrder outerByteOrder = bytes.order(); 96 try { 97 UTrie2Header header = new UTrie2Header(); 98 99 /* check the signature */ 100 header.signature = bytes.getInt(); 101 switch (header.signature) { 102 case 0x54726932: 103 // The buffer is already set to the trie data byte order. 104 break; 105 case 0x32697254: 106 // Temporarily reverse the byte order. 107 boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; 108 bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); 109 header.signature = 0x54726932; 110 break; 111 default: 112 throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2"); 113 } 114 115 header.options = bytes.getChar(); 116 header.indexLength = bytes.getChar(); 117 header.shiftedDataLength = bytes.getChar(); 118 header.index2NullOffset = bytes.getChar(); 119 header.dataNullOffset = bytes.getChar(); 120 header.shiftedHighStart = bytes.getChar(); 121 122 // Trie2 data width - 0: 16 bits 123 // 1: 32 bits 124 if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) > 1) { 125 throw new IllegalArgumentException("UTrie2 serialized format error."); 126 } 127 ValueWidth width; 128 Trie2 This; 129 if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) == 0) { 130 width = ValueWidth.BITS_16; 131 This = new Trie2_16(); 132 } else { 133 width = ValueWidth.BITS_32; 134 This = new Trie2_32(); 135 } 136 This.header = header; 137 138 /* get the length values and offsets */ 139 This.indexLength = header.indexLength; 140 This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT; 141 This.index2NullOffset = header.index2NullOffset; 142 This.dataNullOffset = header.dataNullOffset; 143 This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1; 144 This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY; 145 if (width == ValueWidth.BITS_16) { 146 This.highValueIndex += This.indexLength; 147 } 148 149 // Allocate the Trie2 index array. If the data width is 16 bits, the array also 150 // includes the space for the data. 151 152 int indexArraySize = This.indexLength; 153 if (width == ValueWidth.BITS_16) { 154 indexArraySize += This.dataLength; 155 } 156 157 /* Read in the index */ 158 This.index = ICUBinary.getChars(bytes, indexArraySize, 0); 159 160 /* Read in the data. 16 bit data goes in the same array as the index. 161 * 32 bit data goes in its own separate data array. 162 */ 163 if (width == ValueWidth.BITS_16) { 164 This.data16 = This.indexLength; 165 } else { 166 This.data32 = ICUBinary.getInts(bytes, This.dataLength, 0); 167 } 168 169 switch(width) { 170 case BITS_16: 171 This.data32 = null; 172 This.initialValue = This.index[This.dataNullOffset]; 173 This.errorValue = This.index[This.data16+UTRIE2_BAD_UTF8_DATA_OFFSET]; 174 break; 175 case BITS_32: 176 This.data16=0; 177 This.initialValue = This.data32[This.dataNullOffset]; 178 This.errorValue = This.data32[UTRIE2_BAD_UTF8_DATA_OFFSET]; 179 break; 180 default: 181 throw new IllegalArgumentException("UTrie2 serialized format error."); 182 } 183 184 return This; 185 } finally { 186 bytes.order(outerByteOrder); 187 } 188 } 189 190 /** 191 * Get the UTrie version from an InputStream containing the serialized form 192 * of either a Trie (version 1) or a Trie2 (version 2). 193 * 194 * @param is an InputStream containing the serialized form 195 * of a UTrie, version 1 or 2. The stream must support mark() and reset(). 196 * The position of the input stream will be left unchanged. 197 * @param littleEndianOk If FALSE, only big-endian (Java native) serialized forms are recognized. 198 * If TRUE, little-endian serialized forms are recognized as well. 199 * @return the Trie version of the serialized form, or 0 if it is not 200 * recognized as a serialized UTrie 201 * @throws IOException on errors in reading from the input stream. 202 */ 203 public static int getVersion(InputStream is, boolean littleEndianOk) throws IOException { 204 if (! is.markSupported()) { 205 throw new IllegalArgumentException("Input stream must support mark()."); 206 } 207 is.mark(4); 208 byte sig[] = new byte[4]; 209 int read = is.read(sig); 210 is.reset(); 211 212 if (read != sig.length) { 213 return 0; 214 } 215 216 if (sig[0]=='T' && sig[1]=='r' && sig[2]=='i' && sig[3]=='e') { 217 return 1; 218 } 219 if (sig[0]=='T' && sig[1]=='r' && sig[2]=='i' && sig[3]=='2') { 220 return 2; 221 } 222 if (littleEndianOk) { 223 if (sig[0]=='e' && sig[1]=='i' && sig[2]=='r' && sig[3]=='T') { 224 return 1; 225 } 226 if (sig[0]=='2' && sig[1]=='i' && sig[2]=='r' && sig[3]=='T') { 227 return 2; 228 } 229 } 230 return 0; 231 } 232 233 /** 234 * Get the value for a code point as stored in the Trie2. 235 * 236 * @param codePoint the code point 237 * @return the value 238 */ 239 abstract public int get(int codePoint); 240 241 242 /** 243 * Get the trie value for a UTF-16 code unit. 244 * 245 * A Trie2 stores two distinct values for input in the lead surrogate 246 * range, one for lead surrogates, which is the value that will be 247 * returned by this function, and a second value that is returned 248 * by Trie2.get(). 249 * 250 * For code units outside of the lead surrogate range, this function 251 * returns the same result as Trie2.get(). 252 * 253 * This function, together with the alternate value for lead surrogates, 254 * makes possible very efficient processing of UTF-16 strings without 255 * first converting surrogate pairs to their corresponding 32 bit code point 256 * values. 257 * 258 * At build-time, enumerate the contents of the Trie2 to see if there 259 * is non-trivial (non-initialValue) data for any of the supplementary 260 * code points associated with a lead surrogate. 261 * If so, then set a special (application-specific) value for the 262 * lead surrogate code _unit_, with Trie2Writable.setForLeadSurrogateCodeUnit(). 263 * 264 * At runtime, use Trie2.getFromU16SingleLead(). If there is non-trivial 265 * data and the code unit is a lead surrogate, then check if a trail surrogate 266 * follows. If so, assemble the supplementary code point and look up its value 267 * with Trie2.get(); otherwise reset the lead 268 * surrogate's value or do a code point lookup for it. 269 * 270 * If there is only trivial data for lead and trail surrogates, then processing 271 * can often skip them. For example, in normalization or case mapping 272 * all characters that do not have any mappings are simply copied as is. 273 * 274 * @param c the code point or lead surrogate value. 275 * @return the value 276 */ 277 abstract public int getFromU16SingleLead(char c); 278 279 280 /** 281 * Equals function. Two Tries are equal if their contents are equal. 282 * The type need not be the same, so a Trie2Writable will be equal to 283 * (read-only) Trie2_16 or Trie2_32 so long as they are storing the same values. 284 * 285 */ 286 public final boolean equals(Object other) { 287 if(!(other instanceof Trie2)) { 288 return false; 289 } 290 Trie2 OtherTrie = (Trie2)other; 291 Range rangeFromOther; 292 293 Iterator<Trie2.Range> otherIter = OtherTrie.iterator(); 294 for (Trie2.Range rangeFromThis: this) { 295 if (otherIter.hasNext() == false) { 296 return false; 297 } 298 rangeFromOther = otherIter.next(); 299 if (!rangeFromThis.equals(rangeFromOther)) { 300 return false; 301 } 302 } 303 if (otherIter.hasNext()) { 304 return false; 305 } 306 307 if (errorValue != OtherTrie.errorValue || 308 initialValue != OtherTrie.initialValue) { 309 return false; 310 } 311 312 return true; 313 } 314 315 316 public int hashCode() { 317 if (fHash == 0) { 318 int hash = initHash(); 319 for (Range r: this) { 320 hash = hashInt(hash, r.hashCode()); 321 } 322 if (hash == 0) { 323 hash = 1; 324 } 325 fHash = hash; 326 } 327 return fHash; 328 } 329 330 /** 331 * When iterating over the contents of a Trie2, Elements of this type are produced. 332 * The iterator will return one item for each contiguous range of codepoints having the same value. 333 * 334 * When iterating, the same Trie2EnumRange object will be reused and returned for each range. 335 * If you need to retain complete iteration results, clone each returned Trie2EnumRange, 336 * or save the range in some other way, before advancing to the next iteration step. 337 */ 338 public static class Range { 339 public int startCodePoint; 340 public int endCodePoint; // Inclusive. 341 public int value; 342 public boolean leadSurrogate; 343 344 public boolean equals(Object other) { 345 if (other == null || !(other.getClass().equals(getClass()))) { 346 return false; 347 } 348 Range tother = (Range)other; 349 return this.startCodePoint == tother.startCodePoint && 350 this.endCodePoint == tother.endCodePoint && 351 this.value == tother.value && 352 this.leadSurrogate == tother.leadSurrogate; 353 } 354 355 356 public int hashCode() { 357 int h = initHash(); 358 h = hashUChar32(h, startCodePoint); 359 h = hashUChar32(h, endCodePoint); 360 h = hashInt(h, value); 361 h = hashByte(h, leadSurrogate? 1: 0); 362 return h; 363 } 364 } 365 366 367 /** 368 * Create an iterator over the value ranges in this Trie2. 369 * Values from the Trie2 are not remapped or filtered, but are returned as they 370 * are stored in the Trie2. 371 * 372 * @return an Iterator 373 */ 374 public Iterator<Range> iterator() { 375 return iterator(defaultValueMapper); 376 } 377 378 private static ValueMapper defaultValueMapper = new ValueMapper() { 379 public int map(int in) { 380 return in; 381 } 382 }; 383 384 /** 385 * Create an iterator over the value ranges from this Trie2. 386 * Values from the Trie2 are passed through a caller-supplied remapping function, 387 * and it is the remapped values that determine the ranges that 388 * will be produced by the iterator. 389 * 390 * 391 * @param mapper provides a function to remap values obtained from the Trie2. 392 * @return an Iterator 393 */ 394 public Iterator<Range> iterator(ValueMapper mapper) { 395 return new Trie2Iterator(mapper); 396 } 397 398 399 /** 400 * Create an iterator over the Trie2 values for the 1024=0x400 code points 401 * corresponding to a given lead surrogate. 402 * For example, for the lead surrogate U+D87E it will enumerate the values 403 * for [U+2F800..U+2FC00[. 404 * Used by data builder code that sets special lead surrogate code unit values 405 * for optimized UTF-16 string processing. 406 * 407 * Do not modify the Trie2 during the iteration. 408 * 409 * Except for the limited code point range, this functions just like Trie2.iterator(). 410 * 411 */ 412 public Iterator<Range> iteratorForLeadSurrogate(char lead, ValueMapper mapper) { 413 return new Trie2Iterator(lead, mapper); 414 } 415 416 /** 417 * Create an iterator over the Trie2 values for the 1024=0x400 code points 418 * corresponding to a given lead surrogate. 419 * For example, for the lead surrogate U+D87E it will enumerate the values 420 * for [U+2F800..U+2FC00[. 421 * Used by data builder code that sets special lead surrogate code unit values 422 * for optimized UTF-16 string processing. 423 * 424 * Do not modify the Trie2 during the iteration. 425 * 426 * Except for the limited code point range, this functions just like Trie2.iterator(). 427 * 428 */ 429 public Iterator<Range> iteratorForLeadSurrogate(char lead) { 430 return new Trie2Iterator(lead, defaultValueMapper); 431 } 432 433 /** 434 * When iterating over the contents of a Trie2, an instance of TrieValueMapper may 435 * be used to remap the values from the Trie2. The remapped values will be used 436 * both in determining the ranges of codepoints and as the value to be returned 437 * for each range. 438 * 439 * Example of use, with an anonymous subclass of TrieValueMapper: 440 * 441 * 442 * ValueMapper m = new ValueMapper() { 443 * int map(int in) {return in & 0x1f;}; 444 * } 445 * for (Iterator<Trie2EnumRange> iter = trie.iterator(m); i.hasNext(); ) { 446 * Trie2EnumRange r = i.next(); 447 * ... // Do something with the range r. 448 * } 449 * 450 */ 451 public interface ValueMapper { 452 public int map(int originalVal); 453 } 454 455 456 /** 457 * Serialize a trie2 Header and Index onto an OutputStream. This is 458 * common code used for both the Trie2_16 and Trie2_32 serialize functions. 459 * @param dos the stream to which the serialized Trie2 data will be written. 460 * @return the number of bytes written. 461 */ 462 protected int serializeHeader(DataOutputStream dos) throws IOException { 463 // Write the header. It is already set and ready to use, having been 464 // created when the Trie2 was unserialized or when it was frozen. 465 int bytesWritten = 0; 466 467 dos.writeInt(header.signature); 468 dos.writeShort(header.options); 469 dos.writeShort(header.indexLength); 470 dos.writeShort(header.shiftedDataLength); 471 dos.writeShort(header.index2NullOffset); 472 dos.writeShort(header.dataNullOffset); 473 dos.writeShort(header.shiftedHighStart); 474 bytesWritten += 16; 475 476 // Write the index 477 int i; 478 for (i=0; i< header.indexLength; i++) { 479 dos.writeChar(index[i]); 480 } 481 bytesWritten += header.indexLength; 482 return bytesWritten; 483 } 484 485 486 /** 487 * Struct-like class for holding the results returned by a UTrie2 CharSequence iterator. 488 * The iteration walks over a CharSequence, and for each Unicode code point therein 489 * returns the character and its associated Trie2 value. 490 */ 491 public static class CharSequenceValues { 492 /** string index of the current code point. */ 493 public int index; 494 /** The code point at index. */ 495 public int codePoint; 496 /** The Trie2 value for the current code point */ 497 public int value; 498 } 499 500 501 /** 502 * Create an iterator that will produce the values from the Trie2 for 503 * the sequence of code points in an input text. 504 * 505 * @param text A text string to be iterated over. 506 * @param index The starting iteration position within the input text. 507 * @return the CharSequenceIterator 508 */ 509 public CharSequenceIterator charSequenceIterator(CharSequence text, int index) { 510 return new CharSequenceIterator(text, index); 511 } 512 513 // TODO: Survey usage of the equivalent of CharSequenceIterator in ICU4C 514 // and if there is none, remove it from here. 515 // Don't waste time testing and maintaining unused code. 516 517 /** 518 * An iterator that operates over an input CharSequence, and for each Unicode code point 519 * in the input returns the associated value from the Trie2. 520 * 521 * The iterator can move forwards or backwards, and can be reset to an arbitrary index. 522 * 523 * Note that Trie2_16 and Trie2_32 subclass Trie2.CharSequenceIterator. This is done 524 * only for performance reasons. It does require that any changes made here be propagated 525 * into the corresponding code in the subclasses. 526 */ 527 public class CharSequenceIterator implements Iterator<CharSequenceValues> { 528 /** 529 * Internal constructor. 530 */ 531 CharSequenceIterator(CharSequence t, int index) { 532 text = t; 533 textLength = text.length(); 534 set(index); 535 } 536 537 private CharSequence text; 538 private int textLength; 539 private int index; 540 private Trie2.CharSequenceValues fResults = new Trie2.CharSequenceValues(); 541 542 543 public void set(int i) { 544 if (i < 0 || i > textLength) { 545 throw new IndexOutOfBoundsException(); 546 } 547 index = i; 548 } 549 550 551 public final boolean hasNext() { 552 return index<textLength; 553 } 554 555 556 public final boolean hasPrevious() { 557 return index>0; 558 } 559 560 561 public Trie2.CharSequenceValues next() { 562 int c = Character.codePointAt(text, index); 563 int val = get(c); 564 565 fResults.index = index; 566 fResults.codePoint = c; 567 fResults.value = val; 568 index++; 569 if (c >= 0x10000) { 570 index++; 571 } 572 return fResults; 573 } 574 575 576 public Trie2.CharSequenceValues previous() { 577 int c = Character.codePointBefore(text, index); 578 int val = get(c); 579 index--; 580 if (c >= 0x10000) { 581 index--; 582 } 583 fResults.index = index; 584 fResults.codePoint = c; 585 fResults.value = val; 586 return fResults; 587 } 588 589 /** 590 * Iterator.remove() is not supported by Trie2.CharSequenceIterator. 591 * @throws UnsupportedOperationException Always thrown because this operation is not supported 592 * @see java.util.Iterator#remove() 593 */ 594 public void remove() { 595 throw new UnsupportedOperationException("Trie2.CharSequenceIterator does not support remove()."); 596 } 597 } 598 599 600 //-------------------------------------------------------------------------------- 601 // 602 // Below this point are internal implementation items. No further public API. 603 // 604 //-------------------------------------------------------------------------------- 605 606 607 /** 608 * Selectors for the width of a UTrie2 data value. 609 */ 610 enum ValueWidth { 611 BITS_16, 612 BITS_32 613 } 614 615 /** 616 * Trie2 data structure in serialized form: 617 * 618 * UTrie2Header header; 619 * uint16_t index[header.index2Length]; 620 * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] 621 * 622 * For Java, this is read from the stream into an instance of UTrie2Header. 623 * (The C version just places a struct over the raw serialized data.) 624 * 625 * @hide draft / provisional / internal are hidden on Android 626 */ 627 static class UTrie2Header { 628 /** "Tri2" in big-endian US-ASCII (0x54726932) */ 629 int signature; 630 631 /** 632 * options bit field (uint16_t): 633 * 15.. 4 reserved (0) 634 * 3.. 0 UTrie2ValueBits valueBits 635 */ 636 int options; 637 638 /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */ 639 int indexLength; 640 641 /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */ 642 int shiftedDataLength; 643 644 /** Null index and data blocks, not shifted. (uint16_t) */ 645 int index2NullOffset, dataNullOffset; 646 647 /** 648 * First code point of the single-value range ending with U+10ffff, 649 * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t) 650 */ 651 int shiftedHighStart; 652 } 653 654 // 655 // Data members of UTrie2. 656 // 657 UTrie2Header header; 658 char index[]; // Index array. Includes data for 16 bit Tries. 659 int data16; // Offset to data portion of the index array, if 16 bit data. 660 // zero if 32 bit data. 661 int data32[]; // NULL if 16b data is used via index 662 663 int indexLength; 664 int dataLength; 665 int index2NullOffset; // 0xffff if there is no dedicated index-2 null block 666 int initialValue; 667 668 /** Value returned for out-of-range code points and illegal UTF-8. */ 669 int errorValue; 670 671 /* Start of the last range which ends at U+10ffff, and its value. */ 672 int highStart; 673 int highValueIndex; 674 675 int dataNullOffset; 676 677 int fHash; // Zero if not yet computed. 678 // Shared by Trie2Writable, Trie2_16, Trie2_32. 679 // Thread safety: if two racing threads compute 680 // the same hash on a frozen Trie2, no damage is done. 681 682 683 /** 684 * Trie2 constants, defining shift widths, index array lengths, etc. 685 * 686 * These are needed for the runtime macros but users can treat these as 687 * implementation details and skip to the actual public API further below. 688 */ 689 690 static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f; 691 692 693 /** Shift size for getting the index-1 table offset. */ 694 static final int UTRIE2_SHIFT_1=6+5; 695 696 /** Shift size for getting the index-2 table offset. */ 697 static final int UTRIE2_SHIFT_2=5; 698 699 /** 700 * Difference between the two shift sizes, 701 * for getting an index-1 offset from an index-2 offset. 6=11-5 702 */ 703 static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2; 704 705 /** 706 * Number of index-1 entries for the BMP. 32=0x20 707 * This part of the index-1 table is omitted from the serialized form. 708 */ 709 static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1; 710 711 /** Number of code points per index-1 table entry. 2048=0x800 */ 712 static final int UTRIE2_CP_PER_INDEX_1_ENTRY=1<<UTRIE2_SHIFT_1; 713 714 /** Number of entries in an index-2 block. 64=0x40 */ 715 static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2; 716 717 /** Mask for getting the lower bits for the in-index-2-block offset. */ 718 static final int UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1; 719 720 /** Number of entries in a data block. 32=0x20 */ 721 static final int UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2; 722 723 /** Mask for getting the lower bits for the in-data-block offset. */ 724 static final int UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1; 725 726 /** 727 * Shift size for shifting left the index array values. 728 * Increases possible data size with 16-bit index values at the cost 729 * of compactability. 730 * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY. 731 */ 732 static final int UTRIE2_INDEX_SHIFT=2; 733 734 /** The alignment size of a data block. Also the granularity for compaction. */ 735 static final int UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT; 736 737 /* Fixed layout of the first part of the index array. ------------------- */ 738 739 /** 740 * The BMP part of the index-2 table is fixed and linear and starts at offset 0. 741 * Length=2048=0x800=0x10000>>UTRIE2_SHIFT_2. 742 */ 743 static final int UTRIE2_INDEX_2_OFFSET=0; 744 745 /** 746 * The part of the index-2 table for U+D800..U+DBFF stores values for 747 * lead surrogate code _units_ not code _points_. 748 * Values for lead surrogate code _points_ are indexed with this portion of the table. 749 * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.) 750 */ 751 static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2; 752 static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2; 753 754 /** Count the lengths of both BMP pieces. 2080=0x820 */ 755 static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH; 756 757 /** 758 * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. 759 * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. 760 */ 761 static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH; 762 static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */ 763 764 /** 765 * The index-1 table, only used for supplementary code points, at offset 2112=0x840. 766 * Variable length, for code points up to highStart, where the last single-value range starts. 767 * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. 768 * (For 0x100000 supplementary code points U+10000..U+10ffff.) 769 * 770 * The part of the index-2 table for supplementary code points starts 771 * after this index-1 table. 772 * 773 * Both the index-1 table and the following part of the index-2 table 774 * are omitted completely if there is only BMP data. 775 */ 776 static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH; 777 static final int UTRIE2_MAX_INDEX_1_LENGTH=0x100000>>UTRIE2_SHIFT_1; 778 779 /* 780 * Fixed layout of the first part of the data array. ----------------------- 781 * Starts with 4 blocks (128=0x80 entries) for ASCII. 782 */ 783 784 /** 785 * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. 786 * Used with linear access for single bytes 0..0xbf for simple error handling. 787 * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. 788 */ 789 static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80; 790 791 /** The start of non-linear-ASCII data blocks, at offset 192=0xc0. */ 792 static final int UTRIE2_DATA_START_OFFSET=0xc0; 793 794 /* Building a Trie2 ---------------------------------------------------------- */ 795 796 /* 797 * These definitions are mostly needed by utrie2_builder.c, but also by 798 * utrie2_get32() and utrie2_enum(). 799 */ 800 801 /* 802 * At build time, leave a gap in the index-2 table, 803 * at least as long as the maximum lengths of the 2-byte UTF-8 index-2 table 804 * and the supplementary index-1 table. 805 * Round up to UTRIE2_INDEX_2_BLOCK_LENGTH for proper compacting. 806 */ 807 static final int UNEWTRIE2_INDEX_GAP_OFFSET = UTRIE2_INDEX_2_BMP_LENGTH; 808 static final int UNEWTRIE2_INDEX_GAP_LENGTH = 809 ((UTRIE2_UTF8_2B_INDEX_2_LENGTH + UTRIE2_MAX_INDEX_1_LENGTH) + UTRIE2_INDEX_2_MASK) & 810 ~UTRIE2_INDEX_2_MASK; 811 812 /** 813 * Maximum length of the build-time index-2 array. 814 * Maximum number of Unicode code points (0x110000) shifted right by UTRIE2_SHIFT_2, 815 * plus the part of the index-2 table for lead surrogate code points, 816 * plus the build-time index gap, 817 * plus the null index-2 block. 818 */ 819 static final int UNEWTRIE2_MAX_INDEX_2_LENGTH= 820 (0x110000>>UTRIE2_SHIFT_2)+ 821 UTRIE2_LSCP_INDEX_2_LENGTH+ 822 UNEWTRIE2_INDEX_GAP_LENGTH+ 823 UTRIE2_INDEX_2_BLOCK_LENGTH; 824 825 static final int UNEWTRIE2_INDEX_1_LENGTH = 0x110000>>UTRIE2_SHIFT_1; 826 827 /** 828 * Maximum length of the build-time data array. 829 * One entry per 0x110000 code points, plus the illegal-UTF-8 block and the null block, 830 * plus values for the 0x400 surrogate code units. 831 */ 832 static final int UNEWTRIE2_MAX_DATA_LENGTH = (0x110000+0x40+0x40+0x400); 833 834 835 836 /** 837 * Implementation class for an iterator over a Trie2. 838 * 839 * Iteration over a Trie2 first returns all of the ranges that are indexed by code points, 840 * then returns the special alternate values for the lead surrogates 841 * 842 * @hide draft / provisional / internal are hidden on Android 843 */ 844 class Trie2Iterator implements Iterator<Range> { 845 // The normal constructor that configures the iterator to cover the complete 846 // contents of the Trie2 847 Trie2Iterator(ValueMapper vm) { 848 mapper = vm; 849 nextStart = 0; 850 limitCP = 0x110000; 851 doLeadSurrogates = true; 852 } 853 854 // An alternate constructor that configures the iterator to cover only the 855 // code points corresponding to a particular Lead Surrogate value. 856 Trie2Iterator(char leadSurrogate, ValueMapper vm) { 857 if (leadSurrogate < 0xd800 || leadSurrogate > 0xdbff) { 858 throw new IllegalArgumentException("Bad lead surrogate value."); 859 } 860 mapper = vm; 861 nextStart = (leadSurrogate - 0xd7c0) << 10; 862 limitCP = nextStart + 0x400; 863 doLeadSurrogates = false; // Do not iterate over lead the special lead surrogate 864 // values after completing iteration over code points. 865 } 866 867 /** 868 * The main next() function for Trie2 iterators 869 * 870 */ 871 public Range next() { 872 if (!hasNext()) { 873 throw new NoSuchElementException(); 874 } 875 if (nextStart >= limitCP) { 876 // Switch over from iterating normal code point values to 877 // doing the alternate lead-surrogate values. 878 doingCodePoints = false; 879 nextStart = 0xd800; 880 } 881 int endOfRange = 0; 882 int val = 0; 883 int mappedVal = 0; 884 885 if (doingCodePoints) { 886 // Iteration over code point values. 887 val = get(nextStart); 888 mappedVal = mapper.map(val); 889 endOfRange = rangeEnd(nextStart, limitCP, val); 890 // Loop once for each range in the Trie2 with the same raw (unmapped) value. 891 // Loop continues so long as the mapped values are the same. 892 for (;;) { 893 if (endOfRange >= limitCP-1) { 894 break; 895 } 896 val = get(endOfRange+1); 897 if (mapper.map(val) != mappedVal) { 898 break; 899 } 900 endOfRange = rangeEnd(endOfRange+1, limitCP, val); 901 } 902 } else { 903 // Iteration over the alternate lead surrogate values. 904 val = getFromU16SingleLead((char)nextStart); 905 mappedVal = mapper.map(val); 906 endOfRange = rangeEndLS((char)nextStart); 907 // Loop once for each range in the Trie2 with the same raw (unmapped) value. 908 // Loop continues so long as the mapped values are the same. 909 for (;;) { 910 if (endOfRange >= 0xdbff) { 911 break; 912 } 913 val = getFromU16SingleLead((char)(endOfRange+1)); 914 if (mapper.map(val) != mappedVal) { 915 break; 916 } 917 endOfRange = rangeEndLS((char)(endOfRange+1)); 918 } 919 } 920 returnValue.startCodePoint = nextStart; 921 returnValue.endCodePoint = endOfRange; 922 returnValue.value = mappedVal; 923 returnValue.leadSurrogate = !doingCodePoints; 924 nextStart = endOfRange+1; 925 return returnValue; 926 } 927 928 /** 929 * 930 */ 931 public boolean hasNext() { 932 return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00; 933 } 934 935 public void remove() { 936 throw new UnsupportedOperationException(); 937 } 938 939 940 /** 941 * Find the last lead surrogate in a contiguous range with the 942 * same Trie2 value as the input character. 943 * 944 * Use the alternate Lead Surrogate values from the Trie2, 945 * not the code-point values. 946 * 947 * Note: Trie2_16 and Trie2_32 override this implementation with optimized versions, 948 * meaning that the implementation here is only being used with 949 * Trie2Writable. The code here is logically correct with any type 950 * of Trie2, however. 951 * 952 * @param c The character to begin with. 953 * @return The last contiguous character with the same value. 954 */ 955 private int rangeEndLS(char startingLS) { 956 if (startingLS >= 0xdbff) { 957 return 0xdbff; 958 } 959 960 int c; 961 int val = getFromU16SingleLead(startingLS); 962 for (c = startingLS+1; c <= 0x0dbff; c++) { 963 if (getFromU16SingleLead((char)c) != val) { 964 break; 965 } 966 } 967 return c-1; 968 } 969 970 // 971 // Iteration State Variables 972 // 973 private ValueMapper mapper; 974 private Range returnValue = new Range(); 975 // The starting code point for the next range to be returned. 976 private int nextStart; 977 // The upper limit for the last normal range to be returned. Normally 0x110000, but 978 // may be lower when iterating over the code points for a single lead surrogate. 979 private int limitCP; 980 981 // True while iterating over the the Trie2 values for code points. 982 // False while iterating over the alternate values for lead surrogates. 983 private boolean doingCodePoints = true; 984 985 // True if the iterator should iterate the special values for lead surrogates in 986 // addition to the normal values for code points. 987 private boolean doLeadSurrogates = true; 988 } 989 990 /** 991 * Find the last character in a contiguous range of characters with the 992 * same Trie2 value as the input character. 993 * 994 * @param c The character to begin with. 995 * @return The last contiguous character with the same value. 996 */ 997 int rangeEnd(int start, int limitp, int val) { 998 int c; 999 int limit = Math.min(highStart, limitp); 1000 1001 for (c = start+1; c < limit; c++) { 1002 if (get(c) != val) { 1003 break; 1004 } 1005 } 1006 if (c >= highStart) { 1007 c = limitp; 1008 } 1009 return c - 1; 1010 } 1011 1012 1013 // 1014 // Hashing implementation functions. FNV hash. Respected public domain algorithm. 1015 // 1016 private static int initHash() { 1017 return 0x811c9DC5; // unsigned 2166136261 1018 } 1019 1020 private static int hashByte(int h, int b) { 1021 h = h * 16777619; 1022 h = h ^ b; 1023 return h; 1024 } 1025 1026 private static int hashUChar32(int h, int c) { 1027 h = Trie2.hashByte(h, c & 255); 1028 h = Trie2.hashByte(h, (c>>8) & 255); 1029 h = Trie2.hashByte(h, c>>16); 1030 return h; 1031 } 1032 1033 private static int hashInt(int h, int i) { 1034 h = Trie2.hashByte(h, i & 255); 1035 h = Trie2.hashByte(h, (i>>8) & 255); 1036 h = Trie2.hashByte(h, (i>>16) & 255); 1037 h = Trie2.hashByte(h, (i>>24) & 255); 1038 return h; 1039 } 1040 1041} 1042