Trie.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
1/* 2 ****************************************************************************** 3 * Copyright (C) 1996-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ****************************************************************************** 6 */ 7 8package com.ibm.icu.impl; 9 10import java.nio.ByteBuffer; 11import java.util.Arrays; 12 13import com.ibm.icu.lang.UCharacter; 14import com.ibm.icu.text.UTF16; 15 16/** 17 * <p>A trie is a kind of compressed, serializable table of values 18 * associated with Unicode code points (0..0x10ffff).</p> 19 * <p>This class defines the basic structure of a trie and provides methods 20 * to <b>retrieve the offsets to the actual data</b>.</p> 21 * <p>Data will be the form of an array of basic types, char or int.</p> 22 * <p>The actual data format will have to be specified by the user in the 23 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p> 24 * <p>This trie implementation is optimized for getting offset while walking 25 * forward through a UTF-16 string. 26 * Therefore, the simplest and fastest access macros are the 27 * fromLead() and fromOffsetTrail() methods. 28 * The fromBMP() method are a little more complicated; they get offsets even 29 * for lead surrogate codepoints, while the fromLead() method get special 30 * "folded" offsets for lead surrogate code units if there is relevant data 31 * associated with them. 32 * From such a folded offsets, an offset needs to be extracted to supply 33 * to the fromOffsetTrail() methods. 34 * To handle such supplementary codepoints, some offset information are kept 35 * in the data.</p> 36 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve 37 * that offset from the folded value for the lead surrogate unit.</p> 38 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or 39 * com.ibm.icu.impl.IntTrie.</p> 40 * @author synwee 41 * @see com.ibm.icu.impl.CharTrie 42 * @see com.ibm.icu.impl.IntTrie 43 * @since release 2.1, Jan 01 2002 44 */ 45public abstract class Trie 46{ 47 // public class declaration ---------------------------------------- 48 49 /** 50 * Character data in com.ibm.impl.Trie have different user-specified format 51 * for different purposes. 52 * This interface specifies methods to be implemented in order for 53 * com.ibm.impl.Trie, to surrogate offset information encapsulated within 54 * the data. 55 */ 56 public static interface DataManipulate 57 { 58 /** 59 * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's 60 * data 61 * the index array offset of the indexes for that lead surrogate. 62 * @param value data value for a surrogate from the trie, including the 63 * folding offset 64 * @return data offset or 0 if there is no data for the lead surrogate 65 */ 66 public int getFoldingOffset(int value); 67 } 68 69 // default implementation 70 private static class DefaultGetFoldingOffset implements DataManipulate { 71 public int getFoldingOffset(int value) { 72 return value; 73 } 74 } 75 76 // public methods -------------------------------------------------- 77 78 /** 79 * Determines if this trie has a linear latin 1 array 80 * @return true if this trie has a linear latin 1 array, false otherwise 81 */ 82 public final boolean isLatin1Linear() 83 { 84 return m_isLatin1Linear_; 85 } 86 87 /** 88 * Checks if the argument Trie has the same data as this Trie. 89 * Attributes are checked but not the index data. 90 * @param other Trie to check 91 * @return true if the argument Trie has the same data as this Trie, false 92 * otherwise 93 */ 94 ///CLOVER:OFF 95 public boolean equals(Object other) 96 { 97 if (other == this) { 98 return true; 99 } 100 if (!(other instanceof Trie)) { 101 return false; 102 } 103 Trie othertrie = (Trie)other; 104 return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_ 105 && m_options_ == othertrie.m_options_ 106 && m_dataLength_ == othertrie.m_dataLength_ 107 && Arrays.equals(m_index_, othertrie.m_index_); 108 } 109 110 public int hashCode() { 111 assert false : "hashCode not designed"; 112 return 42; 113 } 114 ///CLOVER:ON 115 116 /** 117 * Gets the serialized data file size of the Trie. This is used during 118 * trie data reading for size checking purposes. 119 * @return size size of serialized trie data file in terms of the number 120 * of bytes 121 */ 122 public int getSerializedDataSize() 123 { 124 // includes signature, option, dataoffset and datalength output 125 int result = (4 << 2); 126 result += (m_dataOffset_ << 1); 127 if (isCharTrie()) { 128 result += (m_dataLength_ << 1); 129 } 130 else if (isIntTrie()) { 131 result += (m_dataLength_ << 2); 132 } 133 return result; 134 } 135 136 // protected constructor ------------------------------------------- 137 138 /** 139 * Trie constructor for CharTrie use. 140 * @param bytes data of an ICU data file, containing the trie 141 * @param dataManipulate object containing the information to parse the 142 * trie data 143 */ 144 protected Trie(ByteBuffer bytes, DataManipulate dataManipulate) 145 { 146 // Magic number to authenticate the data. 147 int signature = bytes.getInt(); 148 m_options_ = bytes.getInt(); 149 150 if (!checkHeader(signature)) { 151 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); 152 } 153 154 if(dataManipulate != null) { 155 m_dataManipulate_ = dataManipulate; 156 } else { 157 m_dataManipulate_ = new DefaultGetFoldingOffset(); 158 } 159 m_isLatin1Linear_ = (m_options_ & 160 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 161 m_dataOffset_ = bytes.getInt(); 162 m_dataLength_ = bytes.getInt(); 163 unserialize(bytes); 164 } 165 166 /** 167 * Trie constructor 168 * @param index array to be used for index 169 * @param options used by the trie 170 * @param dataManipulate object containing the information to parse the 171 * trie data 172 */ 173 protected Trie(char index[], int options, DataManipulate dataManipulate) 174 { 175 m_options_ = options; 176 if(dataManipulate != null) { 177 m_dataManipulate_ = dataManipulate; 178 } else { 179 m_dataManipulate_ = new DefaultGetFoldingOffset(); 180 } 181 m_isLatin1Linear_ = (m_options_ & 182 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 183 m_index_ = index; 184 m_dataOffset_ = m_index_.length; 185 } 186 187 188 // protected data members ------------------------------------------ 189 190 /** 191 * Lead surrogate code points' index displacement in the index array. 192 * 0x10000-0xd800=0x2800 193 * 0x2800 >> INDEX_STAGE_1_SHIFT_ 194 */ 195 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; 196 /** 197 * Shift size for shifting right the input index. 1..9 198 */ 199 protected static final int INDEX_STAGE_1_SHIFT_ = 5; 200 /** 201 * Shift size for shifting left the index array values. 202 * Increases possible data size with 16-bit index values at the cost 203 * of compactability. 204 * This requires blocks of stage 2 data to be aligned by 205 * DATA_GRANULARITY. 206 * 0..INDEX_STAGE_1_SHIFT 207 */ 208 protected static final int INDEX_STAGE_2_SHIFT_ = 2; 209 /** 210 * Number of data values in a stage 2 (data array) block. 211 */ 212 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_; 213 /** 214 * Mask for getting the lower bits from the input index. 215 * DATA_BLOCK_LENGTH - 1. 216 */ 217 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1; 218 /** Number of bits of a trail surrogate that are used in index table lookups. */ 219 protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_; 220 /** 221 * Number of index (stage 1) entries per lead surrogate. 222 * Same as number of index entries for 1024 trail surrogates, 223 * ==0x400>>INDEX_STAGE_1_SHIFT_ 224 */ 225 protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS); 226 /** Length of the BMP portion of the index (stage 1) array. */ 227 protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_; 228 /** 229 * Surrogate mask to use when shifting offset to retrieve supplementary 230 * values 231 */ 232 protected static final int SURROGATE_MASK_ = 0x3FF; 233 /** 234 * Index or UTF16 characters 235 */ 236 protected char m_index_[]; 237 /** 238 * Internal TrieValue which handles the parsing of the data value. 239 * This class is to be implemented by the user 240 */ 241 protected DataManipulate m_dataManipulate_; 242 /** 243 * Start index of the data portion of the trie. CharTrie combines 244 * index and data into a char array, so this is used to indicate the 245 * initial offset to the data portion. 246 * Note this index always points to the initial value. 247 */ 248 protected int m_dataOffset_; 249 /** 250 * Length of the data array 251 */ 252 protected int m_dataLength_; 253 254 // protected methods ----------------------------------------------- 255 256 /** 257 * Gets the offset to the data which the surrogate pair points to. 258 * @param lead lead surrogate 259 * @param trail trailing surrogate 260 * @return offset to data 261 */ 262 protected abstract int getSurrogateOffset(char lead, char trail); 263 264 /** 265 * Gets the value at the argument index 266 * @param index value at index will be retrieved 267 * @return 32 bit value 268 */ 269 protected abstract int getValue(int index); 270 271 /** 272 * Gets the default initial value 273 * @return 32 bit value 274 */ 275 protected abstract int getInitialValue(); 276 277 /** 278 * Gets the offset to the data which the index ch after variable offset 279 * points to. 280 * Note for locating a non-supplementary character data offset, calling 281 * <p> 282 * getRawOffset(0, ch); 283 * </p> 284 * will do. Otherwise if it is a supplementary character formed by 285 * surrogates lead and trail. Then we would have to call getRawOffset() 286 * with getFoldingIndexOffset(). See getSurrogateOffset(). 287 * @param offset index offset which ch is to start from 288 * @param ch index to be used after offset 289 * @return offset to the data 290 */ 291 protected final int getRawOffset(int offset, char ch) 292 { 293 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] 294 << INDEX_STAGE_2_SHIFT_) 295 + (ch & INDEX_STAGE_3_MASK_); 296 } 297 298 /** 299 * Gets the offset to data which the BMP character points to 300 * Treats a lead surrogate as a normal code point. 301 * @param ch BMP character 302 * @return offset to data 303 */ 304 protected final int getBMPOffset(char ch) 305 { 306 return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE 307 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) 308 ? getRawOffset(LEAD_INDEX_OFFSET_, ch) 309 : getRawOffset(0, ch); 310 // using a getRawOffset(ch) makes no diff 311 } 312 313 /** 314 * Gets the offset to the data which this lead surrogate character points 315 * to. 316 * Data at the returned offset may contain folding offset information for 317 * the next trailing surrogate character. 318 * @param ch lead surrogate character 319 * @return offset to data 320 */ 321 protected final int getLeadOffset(char ch) 322 { 323 return getRawOffset(0, ch); 324 } 325 326 /** 327 * Internal trie getter from a code point. 328 * Could be faster(?) but longer with 329 * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); } 330 * Gets the offset to data which the codepoint points to 331 * @param ch codepoint 332 * @return offset to data 333 */ 334 protected final int getCodePointOffset(int ch) 335 { 336 // if ((ch >> 16) == 0) slower 337 if (ch < 0) { 338 return -1; 339 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { 340 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works 341 return getRawOffset(0, (char)ch); 342 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { 343 // BMP codepoint 344 return getBMPOffset((char)ch); 345 } else if (ch <= UCharacter.MAX_VALUE) { 346 // look at the construction of supplementary characters 347 // trail forms the ends of it. 348 return getSurrogateOffset(UTF16.getLeadSurrogate(ch), 349 (char)(ch & SURROGATE_MASK_)); 350 } else { 351 // return -1 if there is an error, in this case we return 352 return -1; 353 } 354 } 355 356 /** 357 * <p>Parses the byte buffer and creates the trie index with it.</p> 358 * <p>The position of the input ByteBuffer must be right after the trie header.</p> 359 * <p>This is overwritten by the child classes. 360 * @param bytes buffer containing trie data 361 */ 362 protected void unserialize(ByteBuffer bytes) 363 { 364 m_index_ = new char[m_dataOffset_]; 365 for (int i = 0; i < m_dataOffset_; i ++) { 366 m_index_[i] = bytes.getChar(); 367 } 368 } 369 370 /** 371 * Determines if this is a 32 bit trie 372 * @return true if options specifies this is a 32 bit trie 373 */ 374 protected final boolean isIntTrie() 375 { 376 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0; 377 } 378 379 /** 380 * Determines if this is a 16 bit trie 381 * @return true if this is a 16 bit trie 382 */ 383 protected final boolean isCharTrie() 384 { 385 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; 386 } 387 388 // private data members -------------------------------------------- 389 390 // struct UTrieHeader { 391 // int32_t signature; 392 // int32_t options (a bit field) 393 // int32_t indexLength 394 // int32_t dataLength 395 396 /** 397 * Size of Trie header in bytes 398 */ 399 protected static final int HEADER_LENGTH_ = 4 * 4; 400 /** 401 * Latin 1 option mask 402 */ 403 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; 404 /** 405 * Constant number to authenticate the byte block 406 */ 407 protected static final int HEADER_SIGNATURE_ = 0x54726965; 408 /** 409 * Header option formatting 410 */ 411 private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; 412 protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; 413 protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; 414 415 /** 416 * Flag indicator for Latin quick access data block 417 */ 418 private boolean m_isLatin1Linear_; 419 420 /** 421 * <p>Trie options field.</p> 422 * <p>options bit field:<br> 423 * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> 424 * 8 0 = 16-bit data, 1=32-bit data<br> 425 * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> 426 * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> 427 */ 428 private int m_options_; 429 430 // private methods --------------------------------------------------- 431 432 /** 433 * Authenticates raw data header. 434 * Checking the header information, signature and options. 435 * @param signature This contains the options and type of a Trie 436 * @return true if the header is authenticated valid 437 */ 438 private final boolean checkHeader(int signature) 439 { 440 // check the signature 441 // Trie in big-endian US-ASCII (0x54726965). 442 // Magic number to authenticate the data. 443 if (signature != HEADER_SIGNATURE_) { 444 return false; 445 } 446 447 if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != 448 INDEX_STAGE_1_SHIFT_ || 449 ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & 450 HEADER_OPTIONS_SHIFT_MASK_) 451 != INDEX_STAGE_2_SHIFT_) { 452 return false; 453 } 454 return true; 455 } 456} 457