1/* 2****************************************************************************** 3* Copyright (C) 1996-2015, International Business Machines Corporation and 4* others. All Rights Reserved. 5****************************************************************************** 6*/ 7 8package com.ibm.icu.impl; 9 10import java.util.NoSuchElementException; 11 12import com.ibm.icu.lang.UCharacter; 13import com.ibm.icu.text.UTF16; 14import com.ibm.icu.util.RangeValueIterator; 15 16/** 17 * <p>Class enabling iteration of the values in a Trie.</p> 18 * 19 * <p>2015-sep-03 TODO: Only used in test code, move there. 20 * 21 * <p>Result of each iteration contains the interval of codepoints that have 22 * the same value type and the value type itself.</p> 23 * <p>The comparison of each codepoint value is done via extract(), which the 24 * default implementation is to return the value as it is.</p> 25 * <p>Method extract() can be overwritten to perform manipulations on 26 * codepoint values in order to perform specialized comparison.</p> 27 * <p>TrieIterator is designed to be a generic iterator for the CharTrie 28 * and the IntTrie, hence to accommodate both types of data, the return 29 * result will be in terms of int (32 bit) values.</p> 30 * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p> 31 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br> 32 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java 33 * sense, the caller will have to pass a object with a callback function 34 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, 35 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of 36 * codepoints with the same value as determined by 37 * UTrieEnumValue(const void *context, uint32_t value). for each range, 38 * utrie_enum calls the callback function to perform a task. In this way, 39 * icu4c performs the iteration within utrie_enum. 40 * To follow the JDK model, icu4j is slightly different from icu4c. 41 * Instead of requesting the caller to implement an object for a callback. 42 * The caller will have to implement a subclass of TrieIterator, fleshing out 43 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j, 44 * the caller will have to code his own iteration and flesh out the task 45 * (equivalent to UTrieEnumRange) to be performed in the iteration loop. 46 * </p> 47 * <p>There are basically 3 usage scenarios for porting:</p> 48 * <p>1) UTrieEnumValue is the only implemented callback then just implement a 49 * subclass of TrieIterator and override the extract(int) method. The 50 * extract(int) method is analogus to UTrieEnumValue callback. 51 * </p> 52 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement 53 * a subclass of TrieIterator, override the extract method and iterate, e.g 54 * </p> 55 * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange, 56 * set);<br> 57 * In Java :<br> 58 * <pre> 59 * class TrieIteratorImpl extends TrieIterator{ 60 * public TrieIteratorImpl(Trie data){ 61 * super(data); 62 * } 63 * public int extract(int value){ 64 * // port the implementation of _enumPropertyStartsValue here 65 * } 66 * } 67 * .... 68 * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie); 69 * while(fcdIter.next(result)) { 70 * // port the implementation of _enumPropertyStartsRange 71 * } 72 * </pre> 73 * </p> 74 * <p>3) UTrieEnumRange is the only implemented callback then just implement 75 * the while loop, when utrie_enum is called 76 * <pre> 77 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); 78 * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie); 79 * while(fcdIter.next(result)){ 80 * set.add(result.start); 81 * } 82 * </p> 83 * @author synwee 84 * @see com.ibm.icu.impl.Trie 85 * @since release 2.1, Jan 17 2002 86 */ 87public class TrieIterator implements RangeValueIterator 88 89{ 90 // public constructor --------------------------------------------- 91 92 /** 93 * TrieEnumeration constructor 94 * @param trie to be used 95 * @exception IllegalArgumentException throw when argument is null. 96 */ 97 public TrieIterator(Trie trie) 98 { 99 if (trie == null) { 100 throw new IllegalArgumentException( 101 "Argument trie cannot be null"); 102 } 103 m_trie_ = trie; 104 // synwee: check that extract belongs to the child class 105 m_initialValue_ = extract(m_trie_.getInitialValue()); 106 reset(); 107 } 108 109 // public methods ------------------------------------------------- 110 111 /** 112 * <p>Returns true if we are not at the end of the iteration, false 113 * otherwise.</p> 114 * <p>The next set of codepoints with the same value type will be 115 * calculated during this call and returned in the arguement element.</p> 116 * @param element return result 117 * @return true if we are not at the end of the iteration, false otherwise. 118 * @exception NoSuchElementException - if no more elements exist. 119 * @see com.ibm.icu.util.RangeValueIterator.Element 120 */ 121 public final boolean next(Element element) 122 { 123 if (m_nextCodepoint_ > UCharacter.MAX_VALUE) { 124 return false; 125 } 126 if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE && 127 calculateNextBMPElement(element)) { 128 return true; 129 } 130 calculateNextSupplementaryElement(element); 131 return true; 132 } 133 134 /** 135 * Resets the iterator to the beginning of the iteration 136 */ 137 public final void reset() 138 { 139 m_currentCodepoint_ = 0; 140 m_nextCodepoint_ = 0; 141 m_nextIndex_ = 0; 142 m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_; 143 if (m_nextBlock_ == m_trie_.m_dataOffset_) { 144 m_nextValue_ = m_initialValue_; 145 } 146 else { 147 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_)); 148 } 149 m_nextBlockIndex_ = 0; 150 m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_; 151 } 152 153 // protected methods ---------------------------------------------- 154 155 /** 156 * Called by next() to extracts a 32 bit value from a trie value 157 * used for comparison. 158 * This method is to be overwritten if special manipulation is to be done 159 * to retrieve a relevant comparison. 160 * The default function is to return the value as it is. 161 * @param value a value from the trie 162 * @return extracted value 163 */ 164 protected int extract(int value) 165 { 166 return value; 167 } 168 169 // private methods ------------------------------------------------ 170 171 /** 172 * Set the result values 173 * @param element return result object 174 * @param start codepoint of range 175 * @param limit (end + 1) codepoint of range 176 * @param value common value of range 177 */ 178 private final void setResult(Element element, int start, int limit, 179 int value) 180 { 181 element.start = start; 182 element.limit = limit; 183 element.value = value; 184 } 185 186 /** 187 * Finding the next element. 188 * This method is called just before returning the result of 189 * next(). 190 * We always store the next element before it is requested. 191 * In the case that we have to continue calculations into the 192 * supplementary planes, a false will be returned. 193 * @param element return result object 194 * @return true if the next range is found, false if we have to proceed to 195 * the supplementary range. 196 */ 197 private final boolean calculateNextBMPElement(Element element) 198 { 199 int currentValue = m_nextValue_; 200 m_currentCodepoint_ = m_nextCodepoint_; 201 m_nextCodepoint_ ++; 202 m_nextBlockIndex_ ++; 203 if (!checkBlockDetail(currentValue)) { 204 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 205 currentValue); 206 return true; 207 } 208 // synwee check that next block index == 0 here 209 // enumerate BMP - the main loop enumerates data blocks 210 while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) { 211 // because of the way the character is split to form the index 212 // the lead surrogate and trail surrogate can not be in the 213 // mid of a block 214 if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) { 215 // skip lead surrogate code units, 216 // go to lead surrogate codepoints 217 m_nextIndex_ = BMP_INDEX_LENGTH_; 218 } 219 else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) { 220 // go back to regular BMP code points 221 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_; 222 } else { 223 m_nextIndex_ ++; 224 } 225 226 m_nextBlockIndex_ = 0; 227 if (!checkBlock(currentValue)) { 228 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 229 currentValue); 230 return true; 231 } 232 } 233 m_nextCodepoint_ --; // step one back since this value has not been 234 m_nextBlockIndex_ --; // retrieved yet. 235 return false; 236 } 237 238 /** 239 * Finds the next supplementary element. 240 * For each entry in the trie, the value to be delivered is passed through 241 * extract(). 242 * We always store the next element before it is requested. 243 * Called after calculateNextBMP() completes its round of BMP characters. 244 * There is a slight difference in the usage of m_currentCodepoint_ 245 * here as compared to calculateNextBMP(). Though both represents the 246 * lower bound of the next element, in calculateNextBMP() it gets set 247 * at the start of any loop, where-else, in calculateNextSupplementary() 248 * since m_currentCodepoint_ already contains the lower bound of the 249 * next element (passed down from calculateNextBMP()), we keep it till 250 * the end before resetting it to the new value. 251 * Note, if there are no more iterations, it will never get to here. 252 * Blocked out by next(). 253 * @param element return result object 254 */ 255 private final void calculateNextSupplementaryElement(Element element) 256 { 257 int currentValue = m_nextValue_; 258 m_nextCodepoint_ ++; 259 m_nextBlockIndex_ ++; 260 261 if (UTF16.getTrailSurrogate(m_nextCodepoint_) 262 != UTF16.TRAIL_SURROGATE_MIN_VALUE) { 263 // this piece is only called when we are in the middle of a lead 264 // surrogate block 265 if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) { 266 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 267 currentValue); 268 m_currentCodepoint_ = m_nextCodepoint_; 269 return; 270 } 271 // we have cleared one block 272 m_nextIndex_ ++; 273 m_nextTrailIndexOffset_ ++; 274 if (!checkTrailBlock(currentValue)) { 275 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 276 currentValue); 277 m_currentCodepoint_ = m_nextCodepoint_; 278 return; 279 } 280 } 281 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); 282 // enumerate supplementary code points 283 while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) { 284 // lead surrogate access 285 final int leadBlock = 286 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << 287 Trie.INDEX_STAGE_2_SHIFT_; 288 if (leadBlock == m_trie_.m_dataOffset_) { 289 // no entries for a whole block of lead surrogates 290 if (currentValue != m_initialValue_) { 291 m_nextValue_ = m_initialValue_; 292 m_nextBlock_ = leadBlock; // == m_trie_.m_dataOffset_ 293 m_nextBlockIndex_ = 0; 294 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 295 currentValue); 296 m_currentCodepoint_ = m_nextCodepoint_; 297 return; 298 } 299 300 nextLead += DATA_BLOCK_LENGTH_; 301 // number of total affected supplementary codepoints in one 302 // block 303 // this is not a simple addition of 304 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider 305 // that we might have moved some of the codepoints 306 m_nextCodepoint_ = Character.toCodePoint((char)nextLead, (char)UTF16.TRAIL_SURROGATE_MIN_VALUE); 307 continue; 308 } 309 if (m_trie_.m_dataManipulate_ == null) { 310 throw new NullPointerException( 311 "The field DataManipulate in this Trie is null"); 312 } 313 // enumerate trail surrogates for this lead surrogate 314 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( 315 m_trie_.getValue(leadBlock + 316 (nextLead & Trie.INDEX_STAGE_3_MASK_))); 317 if (m_nextIndex_ <= 0) { 318 // no data for this lead surrogate 319 if (currentValue != m_initialValue_) { 320 m_nextValue_ = m_initialValue_; 321 m_nextBlock_ = m_trie_.m_dataOffset_; 322 m_nextBlockIndex_ = 0; 323 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 324 currentValue); 325 m_currentCodepoint_ = m_nextCodepoint_; 326 return; 327 } 328 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_; 329 } else { 330 m_nextTrailIndexOffset_ = 0; 331 if (!checkTrailBlock(currentValue)) { 332 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 333 currentValue); 334 m_currentCodepoint_ = m_nextCodepoint_; 335 return; 336 } 337 } 338 nextLead ++; 339 } 340 341 // deliver last range 342 setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1, 343 currentValue); 344 } 345 346 /** 347 * Internal block value calculations 348 * Performs calculations on a data block to find codepoints in m_nextBlock_ 349 * after the index m_nextBlockIndex_ that has the same value. 350 * Note m_*_ variables at this point is the next codepoint whose value 351 * has not been calculated. 352 * But when returned with false, it will be the last codepoint whose 353 * value has been calculated. 354 * @param currentValue the value which other codepoints are tested against 355 * @return true if the whole block has the same value as currentValue or if 356 * the whole block has been calculated, false otherwise. 357 */ 358 private final boolean checkBlockDetail(int currentValue) 359 { 360 while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) { 361 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ + 362 m_nextBlockIndex_)); 363 if (m_nextValue_ != currentValue) { 364 return false; 365 } 366 ++ m_nextBlockIndex_; 367 ++ m_nextCodepoint_; 368 } 369 return true; 370 } 371 372 /** 373 * Internal block value calculations 374 * Performs calculations on a data block to find codepoints in m_nextBlock_ 375 * that has the same value. 376 * Will call checkBlockDetail() if highlevel check fails. 377 * Note m_*_ variables at this point is the next codepoint whose value 378 * has not been calculated. 379 * @param currentBlock the initial block containing all currentValue 380 * @param currentValue the value which other codepoints are tested against 381 * @return true if the whole block has the same value as currentValue or if 382 * the whole block has been calculated, false otherwise. 383 */ 384 private final boolean checkBlock(int currentValue) 385 { 386 int currentBlock = m_nextBlock_; 387 m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] << 388 Trie.INDEX_STAGE_2_SHIFT_; 389 if (m_nextBlock_ == currentBlock && 390 (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) { 391 // the block is the same as the previous one, filled with 392 // currentValue 393 m_nextCodepoint_ += DATA_BLOCK_LENGTH_; 394 } 395 else if (m_nextBlock_ == m_trie_.m_dataOffset_) { 396 // this is the all-initial-value block 397 if (currentValue != m_initialValue_) { 398 m_nextValue_ = m_initialValue_; 399 m_nextBlockIndex_ = 0; 400 return false; 401 } 402 m_nextCodepoint_ += DATA_BLOCK_LENGTH_; 403 } 404 else { 405 if (!checkBlockDetail(currentValue)) { 406 return false; 407 } 408 } 409 return true; 410 } 411 412 /** 413 * Internal block value calculations 414 * Performs calculations on multiple data blocks for a set of trail 415 * surrogates to find codepoints in m_nextBlock_ that has the same value. 416 * Will call checkBlock() for internal block checks. 417 * Note m_*_ variables at this point is the next codepoint whose value 418 * has not been calculated. 419 * @param currentValue the value which other codepoints are tested against 420 * @return true if the whole block has the same value as currentValue or if 421 * the whole block has been calculated, false otherwise. 422 */ 423 private final boolean checkTrailBlock(int currentValue) 424 { 425 // enumerate code points for this lead surrogate 426 while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_) 427 { 428 // if we ever reach here, we are at the start of a new block 429 m_nextBlockIndex_ = 0; 430 // copy of most of the body of the BMP loop 431 if (!checkBlock(currentValue)) { 432 return false; 433 } 434 m_nextTrailIndexOffset_ ++; 435 m_nextIndex_ ++; 436 } 437 return true; 438 } 439 440 /** 441 * Checks if we are beginning at the start of a initial block. 442 * If we are then the rest of the codepoints in this initial block 443 * has the same values. 444 * We increment m_nextCodepoint_ and relevant data members if so. 445 * This is used only in for the supplementary codepoints because 446 * the offset to the trail indexes could be 0. 447 * @return true if we are at the start of a initial block. 448 */ 449 private final boolean checkNullNextTrailIndex() 450 { 451 if (m_nextIndex_ <= 0) { 452 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1; 453 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); 454 int leadBlock = 455 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << 456 Trie.INDEX_STAGE_2_SHIFT_; 457 if (m_trie_.m_dataManipulate_ == null) { 458 throw new NullPointerException( 459 "The field DataManipulate in this Trie is null"); 460 } 461 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( 462 m_trie_.getValue(leadBlock + 463 (nextLead & Trie.INDEX_STAGE_3_MASK_))); 464 m_nextIndex_ --; 465 m_nextBlockIndex_ = DATA_BLOCK_LENGTH_; 466 return true; 467 } 468 return false; 469 } 470 471 // private data members -------------------------------------------- 472 473 /** 474 * Size of the stage 1 BMP indexes 475 */ 476 private static final int BMP_INDEX_LENGTH_ = 477 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_; 478 /** 479 * Lead surrogate minimum value 480 */ 481 private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800; 482 /** 483 * Trail surrogate minimum value 484 */ 485 private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00; 486 /* 487 * Trail surrogate maximum value 488 */ 489 //private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF; 490 /** 491 * Number of trail surrogate 492 */ 493 private static final int TRAIL_SURROGATE_COUNT_ = 0x400; 494 /** 495 * Number of stage 1 indexes for supplementary calculations that maps to 496 * each lead surrogate character. 497 * See second pass into getRawOffset for the trail surrogate character. 498 * 10 for significant number of bits for trail surrogates, 5 for what we 499 * discard during shifting. 500 */ 501 private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ = 502 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_); 503 /** 504 * Number of data values in a stage 2 (data array) block. 505 */ 506 private static final int DATA_BLOCK_LENGTH_ = 507 1 << Trie.INDEX_STAGE_1_SHIFT_; 508// /** 509// * Number of codepoints in a stage 2 block 510// */ 511// private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ = 512// DATA_BLOCK_LENGTH_ << 10; 513 /** 514 * Trie instance 515 */ 516 private Trie m_trie_; 517 /** 518 * Initial value for trie values 519 */ 520 private int m_initialValue_; 521 /** 522 * Next element results and data. 523 */ 524 private int m_currentCodepoint_; 525 private int m_nextCodepoint_; 526 private int m_nextValue_; 527 private int m_nextIndex_; 528 private int m_nextBlock_; 529 private int m_nextBlockIndex_; 530 private int m_nextTrailIndexOffset_; 531} 532