1/* GENERATED SOURCE. DO NOT MODIFY. */ 2/* 3 ****************************************************************************** 4 * Copyright (C) 1996-2015, International Business Machines Corporation and 5 * others. All Rights Reserved. 6 ****************************************************************************** 7 */ 8 9package android.icu.impl; 10 11import java.nio.ByteBuffer; 12import java.util.Arrays; 13 14import android.icu.lang.UCharacter; 15import android.icu.text.UTF16; 16 17/** 18 * <p>A trie is a kind of compressed, serializable table of values 19 * associated with Unicode code points (0..0x10ffff).</p> 20 * <p>This class defines the basic structure of a trie and provides methods 21 * to <b>retrieve the offsets to the actual data</b>.</p> 22 * <p>Data will be the form of an array of basic types, char or int.</p> 23 * <p>The actual data format will have to be specified by the user in the 24 * inner static interface android.icu.impl.Trie.DataManipulate.</p> 25 * <p>This trie implementation is optimized for getting offset while walking 26 * forward through a UTF-16 string. 27 * Therefore, the simplest and fastest access macros are the 28 * fromLead() and fromOffsetTrail() methods. 29 * The fromBMP() method are a little more complicated; they get offsets even 30 * for lead surrogate codepoints, while the fromLead() method get special 31 * "folded" offsets for lead surrogate code units if there is relevant data 32 * associated with them. 33 * From such a folded offsets, an offset needs to be extracted to supply 34 * to the fromOffsetTrail() methods. 35 * To handle such supplementary codepoints, some offset information are kept 36 * in the data.</p> 37 * <p>Methods in android.icu.impl.Trie.DataManipulate are called to retrieve 38 * that offset from the folded value for the lead surrogate unit.</p> 39 * <p>For examples of use, see android.icu.impl.CharTrie or 40 * android.icu.impl.IntTrie.</p> 41 * @author synwee 42 * @see android.icu.impl.CharTrie 43 * @see android.icu.impl.IntTrie 44 * @hide Only a subset of ICU is exposed in Android 45 */ 46public abstract class Trie 47{ 48 // public class declaration ---------------------------------------- 49 50 /** 51 * Character data in com.ibm.impl.Trie have different user-specified format 52 * for different purposes. 53 * This interface specifies methods to be implemented in order for 54 * com.ibm.impl.Trie, to surrogate offset information encapsulated within 55 * the data. 56 */ 57 public static interface DataManipulate 58 { 59 /** 60 * Called by android.icu.impl.Trie to extract from a lead surrogate's 61 * data 62 * the index array offset of the indexes for that lead surrogate. 63 * @param value data value for a surrogate from the trie, including the 64 * folding offset 65 * @return data offset or 0 if there is no data for the lead surrogate 66 */ 67 public int getFoldingOffset(int value); 68 } 69 70 // default implementation 71 private static class DefaultGetFoldingOffset implements DataManipulate { 72 public int getFoldingOffset(int value) { 73 return value; 74 } 75 } 76 77 // public methods -------------------------------------------------- 78 79 /** 80 * Determines if this trie has a linear latin 1 array 81 * @return true if this trie has a linear latin 1 array, false otherwise 82 */ 83 public final boolean isLatin1Linear() 84 { 85 return m_isLatin1Linear_; 86 } 87 88 /** 89 * Checks if the argument Trie has the same data as this Trie. 90 * Attributes are checked but not the index data. 91 * @param other Trie to check 92 * @return true if the argument Trie has the same data as this Trie, false 93 * otherwise 94 */ 95 ///CLOVER:OFF 96 public boolean equals(Object other) 97 { 98 if (other == this) { 99 return true; 100 } 101 if (!(other instanceof Trie)) { 102 return false; 103 } 104 Trie othertrie = (Trie)other; 105 return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_ 106 && m_options_ == othertrie.m_options_ 107 && m_dataLength_ == othertrie.m_dataLength_ 108 && Arrays.equals(m_index_, othertrie.m_index_); 109 } 110 111 public int hashCode() { 112 assert false : "hashCode not designed"; 113 return 42; 114 } 115 ///CLOVER:ON 116 117 /** 118 * Gets the serialized data file size of the Trie. This is used during 119 * trie data reading for size checking purposes. 120 * @return size size of serialized trie data file in terms of the number 121 * of bytes 122 */ 123 public int getSerializedDataSize() 124 { 125 // includes signature, option, dataoffset and datalength output 126 int result = (4 << 2); 127 result += (m_dataOffset_ << 1); 128 if (isCharTrie()) { 129 result += (m_dataLength_ << 1); 130 } 131 else if (isIntTrie()) { 132 result += (m_dataLength_ << 2); 133 } 134 return result; 135 } 136 137 // protected constructor ------------------------------------------- 138 139 /** 140 * Trie constructor for CharTrie use. 141 * @param bytes data of an ICU data file, containing the trie 142 * @param dataManipulate object containing the information to parse the 143 * trie data 144 */ 145 protected Trie(ByteBuffer bytes, DataManipulate dataManipulate) 146 { 147 // Magic number to authenticate the data. 148 int signature = bytes.getInt(); 149 m_options_ = bytes.getInt(); 150 151 if (!checkHeader(signature)) { 152 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); 153 } 154 155 if(dataManipulate != null) { 156 m_dataManipulate_ = dataManipulate; 157 } else { 158 m_dataManipulate_ = new DefaultGetFoldingOffset(); 159 } 160 m_isLatin1Linear_ = (m_options_ & 161 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 162 m_dataOffset_ = bytes.getInt(); 163 m_dataLength_ = bytes.getInt(); 164 unserialize(bytes); 165 } 166 167 /** 168 * Trie constructor 169 * @param index array to be used for index 170 * @param options used by the trie 171 * @param dataManipulate object containing the information to parse the 172 * trie data 173 */ 174 protected Trie(char index[], int options, DataManipulate dataManipulate) 175 { 176 m_options_ = options; 177 if(dataManipulate != null) { 178 m_dataManipulate_ = dataManipulate; 179 } else { 180 m_dataManipulate_ = new DefaultGetFoldingOffset(); 181 } 182 m_isLatin1Linear_ = (m_options_ & 183 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 184 m_index_ = index; 185 m_dataOffset_ = m_index_.length; 186 } 187 188 189 // protected data members ------------------------------------------ 190 191 /** 192 * Lead surrogate code points' index displacement in the index array. 193 * 0x10000-0xd800=0x2800 194 * 0x2800 >> INDEX_STAGE_1_SHIFT_ 195 */ 196 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; 197 /** 198 * Shift size for shifting right the input index. 1..9 199 */ 200 protected static final int INDEX_STAGE_1_SHIFT_ = 5; 201 /** 202 * Shift size for shifting left the index array values. 203 * Increases possible data size with 16-bit index values at the cost 204 * of compactability. 205 * This requires blocks of stage 2 data to be aligned by 206 * DATA_GRANULARITY. 207 * 0..INDEX_STAGE_1_SHIFT 208 */ 209 protected static final int INDEX_STAGE_2_SHIFT_ = 2; 210 /** 211 * Number of data values in a stage 2 (data array) block. 212 */ 213 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_; 214 /** 215 * Mask for getting the lower bits from the input index. 216 * DATA_BLOCK_LENGTH - 1. 217 */ 218 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1; 219 /** Number of bits of a trail surrogate that are used in index table lookups. */ 220 protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_; 221 /** 222 * Number of index (stage 1) entries per lead surrogate. 223 * Same as number of index entries for 1024 trail surrogates, 224 * ==0x400>>INDEX_STAGE_1_SHIFT_ 225 */ 226 protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS); 227 /** Length of the BMP portion of the index (stage 1) array. */ 228 protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_; 229 /** 230 * Surrogate mask to use when shifting offset to retrieve supplementary 231 * values 232 */ 233 protected static final int SURROGATE_MASK_ = 0x3FF; 234 /** 235 * Index or UTF16 characters 236 */ 237 protected char m_index_[]; 238 /** 239 * Internal TrieValue which handles the parsing of the data value. 240 * This class is to be implemented by the user 241 */ 242 protected DataManipulate m_dataManipulate_; 243 /** 244 * Start index of the data portion of the trie. CharTrie combines 245 * index and data into a char array, so this is used to indicate the 246 * initial offset to the data portion. 247 * Note this index always points to the initial value. 248 */ 249 protected int m_dataOffset_; 250 /** 251 * Length of the data array 252 */ 253 protected int m_dataLength_; 254 255 // protected methods ----------------------------------------------- 256 257 /** 258 * Gets the offset to the data which the surrogate pair points to. 259 * @param lead lead surrogate 260 * @param trail trailing surrogate 261 * @return offset to data 262 */ 263 protected abstract int getSurrogateOffset(char lead, char trail); 264 265 /** 266 * Gets the value at the argument index 267 * @param index value at index will be retrieved 268 * @return 32 bit value 269 */ 270 protected abstract int getValue(int index); 271 272 /** 273 * Gets the default initial value 274 * @return 32 bit value 275 */ 276 protected abstract int getInitialValue(); 277 278 /** 279 * Gets the offset to the data which the index ch after variable offset 280 * points to. 281 * Note for locating a non-supplementary character data offset, calling 282 * <p> 283 * getRawOffset(0, ch); 284 * </p> 285 * will do. Otherwise if it is a supplementary character formed by 286 * surrogates lead and trail. Then we would have to call getRawOffset() 287 * with getFoldingIndexOffset(). See getSurrogateOffset(). 288 * @param offset index offset which ch is to start from 289 * @param ch index to be used after offset 290 * @return offset to the data 291 */ 292 protected final int getRawOffset(int offset, char ch) 293 { 294 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] 295 << INDEX_STAGE_2_SHIFT_) 296 + (ch & INDEX_STAGE_3_MASK_); 297 } 298 299 /** 300 * Gets the offset to data which the BMP character points to 301 * Treats a lead surrogate as a normal code point. 302 * @param ch BMP character 303 * @return offset to data 304 */ 305 protected final int getBMPOffset(char ch) 306 { 307 return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE 308 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) 309 ? getRawOffset(LEAD_INDEX_OFFSET_, ch) 310 : getRawOffset(0, ch); 311 // using a getRawOffset(ch) makes no diff 312 } 313 314 /** 315 * Gets the offset to the data which this lead surrogate character points 316 * to. 317 * Data at the returned offset may contain folding offset information for 318 * the next trailing surrogate character. 319 * @param ch lead surrogate character 320 * @return offset to data 321 */ 322 protected final int getLeadOffset(char ch) 323 { 324 return getRawOffset(0, ch); 325 } 326 327 /** 328 * Internal trie getter from a code point. 329 * Could be faster(?) but longer with 330 * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); } 331 * Gets the offset to data which the codepoint points to 332 * @param ch codepoint 333 * @return offset to data 334 */ 335 protected final int getCodePointOffset(int ch) 336 { 337 // if ((ch >> 16) == 0) slower 338 if (ch < 0) { 339 return -1; 340 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { 341 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works 342 return getRawOffset(0, (char)ch); 343 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { 344 // BMP codepoint 345 return getBMPOffset((char)ch); 346 } else if (ch <= UCharacter.MAX_VALUE) { 347 // look at the construction of supplementary characters 348 // trail forms the ends of it. 349 return getSurrogateOffset(UTF16.getLeadSurrogate(ch), 350 (char)(ch & SURROGATE_MASK_)); 351 } else { 352 // return -1 if there is an error, in this case we return 353 return -1; 354 } 355 } 356 357 /** 358 * <p>Parses the byte buffer and creates the trie index with it.</p> 359 * <p>The position of the input ByteBuffer must be right after the trie header.</p> 360 * <p>This is overwritten by the child classes. 361 * @param bytes buffer containing trie data 362 */ 363 protected void unserialize(ByteBuffer bytes) 364 { 365 m_index_ = ICUBinary.getChars(bytes, m_dataOffset_, 0); 366 } 367 368 /** 369 * Determines if this is a 32 bit trie 370 * @return true if options specifies this is a 32 bit trie 371 */ 372 protected final boolean isIntTrie() 373 { 374 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0; 375 } 376 377 /** 378 * Determines if this is a 16 bit trie 379 * @return true if this is a 16 bit trie 380 */ 381 protected final boolean isCharTrie() 382 { 383 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; 384 } 385 386 // private data members -------------------------------------------- 387 388 // struct UTrieHeader { 389 // int32_t signature; 390 // int32_t options (a bit field) 391 // int32_t indexLength 392 // int32_t dataLength 393 394 /** 395 * Size of Trie header in bytes 396 */ 397 protected static final int HEADER_LENGTH_ = 4 * 4; 398 /** 399 * Latin 1 option mask 400 */ 401 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; 402 /** 403 * Constant number to authenticate the byte block 404 */ 405 protected static final int HEADER_SIGNATURE_ = 0x54726965; 406 /** 407 * Header option formatting 408 */ 409 private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; 410 protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; 411 protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; 412 413 /** 414 * Flag indicator for Latin quick access data block 415 */ 416 private boolean m_isLatin1Linear_; 417 418 /** 419 * <p>Trie options field.</p> 420 * <p>options bit field:<br> 421 * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> 422 * 8 0 = 16-bit data, 1=32-bit data<br> 423 * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> 424 * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> 425 */ 426 private int m_options_; 427 428 // private methods --------------------------------------------------- 429 430 /** 431 * Authenticates raw data header. 432 * Checking the header information, signature and options. 433 * @param signature This contains the options and type of a Trie 434 * @return true if the header is authenticated valid 435 */ 436 private final boolean checkHeader(int signature) 437 { 438 // check the signature 439 // Trie in big-endian US-ASCII (0x54726965). 440 // Magic number to authenticate the data. 441 if (signature != HEADER_SIGNATURE_) { 442 return false; 443 } 444 445 if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != 446 INDEX_STAGE_1_SHIFT_ || 447 ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & 448 HEADER_OPTIONS_SHIFT_MASK_) 449 != INDEX_STAGE_2_SHIFT_) { 450 return false; 451 } 452 return true; 453 } 454} 455