1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3/* 4****************************************************************************** 5* Copyright (C) 1996-2015, International Business Machines Corporation and 6* others. All Rights Reserved. 7****************************************************************************** 8*/ 9 10package com.ibm.icu.impl; 11 12import java.util.NoSuchElementException; 13 14import com.ibm.icu.lang.UCharacter; 15import com.ibm.icu.text.UTF16; 16import com.ibm.icu.util.RangeValueIterator; 17 18/** 19 * <p>Class enabling iteration of the values in a Trie.</p> 20 * 21 * <p>2015-sep-03 TODO: Only used in test code, move there. 22 * 23 * <p>Result of each iteration contains the interval of codepoints that have 24 * the same value type and the value type itself.</p> 25 * <p>The comparison of each codepoint value is done via extract(), which the 26 * default implementation is to return the value as it is.</p> 27 * <p>Method extract() can be overwritten to perform manipulations on 28 * codepoint values in order to perform specialized comparison.</p> 29 * <p>TrieIterator is designed to be a generic iterator for the CharTrie 30 * and the IntTrie, hence to accommodate both types of data, the return 31 * result will be in terms of int (32 bit) values.</p> 32 * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p> 33 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br> 34 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java 35 * sense, the caller will have to pass a object with a callback function 36 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, 37 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of 38 * codepoints with the same value as determined by 39 * UTrieEnumValue(const void *context, uint32_t value). for each range, 40 * utrie_enum calls the callback function to perform a task. In this way, 41 * icu4c performs the iteration within utrie_enum. 42 * To follow the JDK model, icu4j is slightly different from icu4c. 43 * Instead of requesting the caller to implement an object for a callback. 44 * The caller will have to implement a subclass of TrieIterator, fleshing out 45 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j, 46 * the caller will have to code his own iteration and flesh out the task 47 * (equivalent to UTrieEnumRange) to be performed in the iteration loop. 48 * </p> 49 * <p>There are basically 3 usage scenarios for porting:</p> 50 * <p>1) UTrieEnumValue is the only implemented callback then just implement a 51 * subclass of TrieIterator and override the extract(int) method. The 52 * extract(int) method is analogus to UTrieEnumValue callback. 53 * </p> 54 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement 55 * a subclass of TrieIterator, override the extract method and iterate, e.g 56 * </p> 57 * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange, 58 * set);<br> 59 * In Java :<br> 60 * <pre> 61 * class TrieIteratorImpl extends TrieIterator{ 62 * public TrieIteratorImpl(Trie data){ 63 * super(data); 64 * } 65 * public int extract(int value){ 66 * // port the implementation of _enumPropertyStartsValue here 67 * } 68 * } 69 * .... 70 * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie); 71 * while(fcdIter.next(result)) { 72 * // port the implementation of _enumPropertyStartsRange 73 * } 74 * </pre> 75 * </p> 76 * <p>3) UTrieEnumRange is the only implemented callback then just implement 77 * the while loop, when utrie_enum is called 78 * <pre> 79 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); 80 * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie); 81 * while(fcdIter.next(result)){ 82 * set.add(result.start); 83 * } 84 * </p> 85 * @author synwee 86 * @see com.ibm.icu.impl.Trie 87 * @since release 2.1, Jan 17 2002 88 */ 89public class TrieIterator implements RangeValueIterator 90 91{ 92 // public constructor --------------------------------------------- 93 94 /** 95 * TrieEnumeration constructor 96 * @param trie to be used 97 * @exception IllegalArgumentException throw when argument is null. 98 */ 99 public TrieIterator(Trie trie) 100 { 101 if (trie == null) { 102 throw new IllegalArgumentException( 103 "Argument trie cannot be null"); 104 } 105 m_trie_ = trie; 106 // synwee: check that extract belongs to the child class 107 m_initialValue_ = extract(m_trie_.getInitialValue()); 108 reset(); 109 } 110 111 // public methods ------------------------------------------------- 112 113 /** 114 * <p>Returns true if we are not at the end of the iteration, false 115 * otherwise.</p> 116 * <p>The next set of codepoints with the same value type will be 117 * calculated during this call and returned in the arguement element.</p> 118 * @param element return result 119 * @return true if we are not at the end of the iteration, false otherwise. 120 * @exception NoSuchElementException - if no more elements exist. 121 * @see com.ibm.icu.util.RangeValueIterator.Element 122 */ 123 @Override 124 public final boolean next(Element element) 125 { 126 if (m_nextCodepoint_ > UCharacter.MAX_VALUE) { 127 return false; 128 } 129 if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE && 130 calculateNextBMPElement(element)) { 131 return true; 132 } 133 calculateNextSupplementaryElement(element); 134 return true; 135 } 136 137 /** 138 * Resets the iterator to the beginning of the iteration 139 */ 140 @Override 141 public final void reset() 142 { 143 m_currentCodepoint_ = 0; 144 m_nextCodepoint_ = 0; 145 m_nextIndex_ = 0; 146 m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_; 147 if (m_nextBlock_ == m_trie_.m_dataOffset_) { 148 m_nextValue_ = m_initialValue_; 149 } 150 else { 151 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_)); 152 } 153 m_nextBlockIndex_ = 0; 154 m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_; 155 } 156 157 // protected methods ---------------------------------------------- 158 159 /** 160 * Called by next() to extracts a 32 bit value from a trie value 161 * used for comparison. 162 * This method is to be overwritten if special manipulation is to be done 163 * to retrieve a relevant comparison. 164 * The default function is to return the value as it is. 165 * @param value a value from the trie 166 * @return extracted value 167 */ 168 protected int extract(int value) 169 { 170 return value; 171 } 172 173 // private methods ------------------------------------------------ 174 175 /** 176 * Set the result values 177 * @param element return result object 178 * @param start codepoint of range 179 * @param limit (end + 1) codepoint of range 180 * @param value common value of range 181 */ 182 private final void setResult(Element element, int start, int limit, 183 int value) 184 { 185 element.start = start; 186 element.limit = limit; 187 element.value = value; 188 } 189 190 /** 191 * Finding the next element. 192 * This method is called just before returning the result of 193 * next(). 194 * We always store the next element before it is requested. 195 * In the case that we have to continue calculations into the 196 * supplementary planes, a false will be returned. 197 * @param element return result object 198 * @return true if the next range is found, false if we have to proceed to 199 * the supplementary range. 200 */ 201 private final boolean calculateNextBMPElement(Element element) 202 { 203 int currentValue = m_nextValue_; 204 m_currentCodepoint_ = m_nextCodepoint_; 205 m_nextCodepoint_ ++; 206 m_nextBlockIndex_ ++; 207 if (!checkBlockDetail(currentValue)) { 208 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 209 currentValue); 210 return true; 211 } 212 // synwee check that next block index == 0 here 213 // enumerate BMP - the main loop enumerates data blocks 214 while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) { 215 // because of the way the character is split to form the index 216 // the lead surrogate and trail surrogate can not be in the 217 // mid of a block 218 if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) { 219 // skip lead surrogate code units, 220 // go to lead surrogate codepoints 221 m_nextIndex_ = BMP_INDEX_LENGTH_; 222 } 223 else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) { 224 // go back to regular BMP code points 225 m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_; 226 } else { 227 m_nextIndex_ ++; 228 } 229 230 m_nextBlockIndex_ = 0; 231 if (!checkBlock(currentValue)) { 232 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 233 currentValue); 234 return true; 235 } 236 } 237 m_nextCodepoint_ --; // step one back since this value has not been 238 m_nextBlockIndex_ --; // retrieved yet. 239 return false; 240 } 241 242 /** 243 * Finds the next supplementary element. 244 * For each entry in the trie, the value to be delivered is passed through 245 * extract(). 246 * We always store the next element before it is requested. 247 * Called after calculateNextBMP() completes its round of BMP characters. 248 * There is a slight difference in the usage of m_currentCodepoint_ 249 * here as compared to calculateNextBMP(). Though both represents the 250 * lower bound of the next element, in calculateNextBMP() it gets set 251 * at the start of any loop, where-else, in calculateNextSupplementary() 252 * since m_currentCodepoint_ already contains the lower bound of the 253 * next element (passed down from calculateNextBMP()), we keep it till 254 * the end before resetting it to the new value. 255 * Note, if there are no more iterations, it will never get to here. 256 * Blocked out by next(). 257 * @param element return result object 258 */ 259 private final void calculateNextSupplementaryElement(Element element) 260 { 261 int currentValue = m_nextValue_; 262 m_nextCodepoint_ ++; 263 m_nextBlockIndex_ ++; 264 265 if (UTF16.getTrailSurrogate(m_nextCodepoint_) 266 != UTF16.TRAIL_SURROGATE_MIN_VALUE) { 267 // this piece is only called when we are in the middle of a lead 268 // surrogate block 269 if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) { 270 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 271 currentValue); 272 m_currentCodepoint_ = m_nextCodepoint_; 273 return; 274 } 275 // we have cleared one block 276 m_nextIndex_ ++; 277 m_nextTrailIndexOffset_ ++; 278 if (!checkTrailBlock(currentValue)) { 279 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 280 currentValue); 281 m_currentCodepoint_ = m_nextCodepoint_; 282 return; 283 } 284 } 285 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); 286 // enumerate supplementary code points 287 while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) { 288 // lead surrogate access 289 final int leadBlock = 290 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << 291 Trie.INDEX_STAGE_2_SHIFT_; 292 if (leadBlock == m_trie_.m_dataOffset_) { 293 // no entries for a whole block of lead surrogates 294 if (currentValue != m_initialValue_) { 295 m_nextValue_ = m_initialValue_; 296 m_nextBlock_ = leadBlock; // == m_trie_.m_dataOffset_ 297 m_nextBlockIndex_ = 0; 298 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 299 currentValue); 300 m_currentCodepoint_ = m_nextCodepoint_; 301 return; 302 } 303 304 nextLead += DATA_BLOCK_LENGTH_; 305 // number of total affected supplementary codepoints in one 306 // block 307 // this is not a simple addition of 308 // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider 309 // that we might have moved some of the codepoints 310 m_nextCodepoint_ = Character.toCodePoint((char)nextLead, (char)UTF16.TRAIL_SURROGATE_MIN_VALUE); 311 continue; 312 } 313 if (m_trie_.m_dataManipulate_ == null) { 314 throw new NullPointerException( 315 "The field DataManipulate in this Trie is null"); 316 } 317 // enumerate trail surrogates for this lead surrogate 318 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( 319 m_trie_.getValue(leadBlock + 320 (nextLead & Trie.INDEX_STAGE_3_MASK_))); 321 if (m_nextIndex_ <= 0) { 322 // no data for this lead surrogate 323 if (currentValue != m_initialValue_) { 324 m_nextValue_ = m_initialValue_; 325 m_nextBlock_ = m_trie_.m_dataOffset_; 326 m_nextBlockIndex_ = 0; 327 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 328 currentValue); 329 m_currentCodepoint_ = m_nextCodepoint_; 330 return; 331 } 332 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_; 333 } else { 334 m_nextTrailIndexOffset_ = 0; 335 if (!checkTrailBlock(currentValue)) { 336 setResult(element, m_currentCodepoint_, m_nextCodepoint_, 337 currentValue); 338 m_currentCodepoint_ = m_nextCodepoint_; 339 return; 340 } 341 } 342 nextLead ++; 343 } 344 345 // deliver last range 346 setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1, 347 currentValue); 348 } 349 350 /** 351 * Internal block value calculations 352 * Performs calculations on a data block to find codepoints in m_nextBlock_ 353 * after the index m_nextBlockIndex_ that has the same value. 354 * Note m_*_ variables at this point is the next codepoint whose value 355 * has not been calculated. 356 * But when returned with false, it will be the last codepoint whose 357 * value has been calculated. 358 * @param currentValue the value which other codepoints are tested against 359 * @return true if the whole block has the same value as currentValue or if 360 * the whole block has been calculated, false otherwise. 361 */ 362 private final boolean checkBlockDetail(int currentValue) 363 { 364 while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) { 365 m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ + 366 m_nextBlockIndex_)); 367 if (m_nextValue_ != currentValue) { 368 return false; 369 } 370 ++ m_nextBlockIndex_; 371 ++ m_nextCodepoint_; 372 } 373 return true; 374 } 375 376 /** 377 * Internal block value calculations 378 * Performs calculations on a data block to find codepoints in m_nextBlock_ 379 * that has the same value. 380 * Will call checkBlockDetail() if highlevel check fails. 381 * Note m_*_ variables at this point is the next codepoint whose value 382 * has not been calculated. 383 * @param currentBlock the initial block containing all currentValue 384 * @param currentValue the value which other codepoints are tested against 385 * @return true if the whole block has the same value as currentValue or if 386 * the whole block has been calculated, false otherwise. 387 */ 388 private final boolean checkBlock(int currentValue) 389 { 390 int currentBlock = m_nextBlock_; 391 m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] << 392 Trie.INDEX_STAGE_2_SHIFT_; 393 if (m_nextBlock_ == currentBlock && 394 (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) { 395 // the block is the same as the previous one, filled with 396 // currentValue 397 m_nextCodepoint_ += DATA_BLOCK_LENGTH_; 398 } 399 else if (m_nextBlock_ == m_trie_.m_dataOffset_) { 400 // this is the all-initial-value block 401 if (currentValue != m_initialValue_) { 402 m_nextValue_ = m_initialValue_; 403 m_nextBlockIndex_ = 0; 404 return false; 405 } 406 m_nextCodepoint_ += DATA_BLOCK_LENGTH_; 407 } 408 else { 409 if (!checkBlockDetail(currentValue)) { 410 return false; 411 } 412 } 413 return true; 414 } 415 416 /** 417 * Internal block value calculations 418 * Performs calculations on multiple data blocks for a set of trail 419 * surrogates to find codepoints in m_nextBlock_ that has the same value. 420 * Will call checkBlock() for internal block checks. 421 * Note m_*_ variables at this point is the next codepoint whose value 422 * has not been calculated. 423 * @param currentValue the value which other codepoints are tested against 424 * @return true if the whole block has the same value as currentValue or if 425 * the whole block has been calculated, false otherwise. 426 */ 427 private final boolean checkTrailBlock(int currentValue) 428 { 429 // enumerate code points for this lead surrogate 430 while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_) 431 { 432 // if we ever reach here, we are at the start of a new block 433 m_nextBlockIndex_ = 0; 434 // copy of most of the body of the BMP loop 435 if (!checkBlock(currentValue)) { 436 return false; 437 } 438 m_nextTrailIndexOffset_ ++; 439 m_nextIndex_ ++; 440 } 441 return true; 442 } 443 444 /** 445 * Checks if we are beginning at the start of a initial block. 446 * If we are then the rest of the codepoints in this initial block 447 * has the same values. 448 * We increment m_nextCodepoint_ and relevant data members if so. 449 * This is used only in for the supplementary codepoints because 450 * the offset to the trail indexes could be 0. 451 * @return true if we are at the start of a initial block. 452 */ 453 private final boolean checkNullNextTrailIndex() 454 { 455 if (m_nextIndex_ <= 0) { 456 m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1; 457 int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); 458 int leadBlock = 459 m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << 460 Trie.INDEX_STAGE_2_SHIFT_; 461 if (m_trie_.m_dataManipulate_ == null) { 462 throw new NullPointerException( 463 "The field DataManipulate in this Trie is null"); 464 } 465 m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( 466 m_trie_.getValue(leadBlock + 467 (nextLead & Trie.INDEX_STAGE_3_MASK_))); 468 m_nextIndex_ --; 469 m_nextBlockIndex_ = DATA_BLOCK_LENGTH_; 470 return true; 471 } 472 return false; 473 } 474 475 // private data members -------------------------------------------- 476 477 /** 478 * Size of the stage 1 BMP indexes 479 */ 480 private static final int BMP_INDEX_LENGTH_ = 481 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_; 482 /** 483 * Lead surrogate minimum value 484 */ 485 private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800; 486 /** 487 * Trail surrogate minimum value 488 */ 489 private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00; 490 /* 491 * Trail surrogate maximum value 492 */ 493 //private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF; 494 /** 495 * Number of trail surrogate 496 */ 497 private static final int TRAIL_SURROGATE_COUNT_ = 0x400; 498 /** 499 * Number of stage 1 indexes for supplementary calculations that maps to 500 * each lead surrogate character. 501 * See second pass into getRawOffset for the trail surrogate character. 502 * 10 for significant number of bits for trail surrogates, 5 for what we 503 * discard during shifting. 504 */ 505 private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ = 506 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_); 507 /** 508 * Number of data values in a stage 2 (data array) block. 509 */ 510 private static final int DATA_BLOCK_LENGTH_ = 511 1 << Trie.INDEX_STAGE_1_SHIFT_; 512// /** 513// * Number of codepoints in a stage 2 block 514// */ 515// private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ = 516// DATA_BLOCK_LENGTH_ << 10; 517 /** 518 * Trie instance 519 */ 520 private Trie m_trie_; 521 /** 522 * Initial value for trie values 523 */ 524 private int m_initialValue_; 525 /** 526 * Next element results and data. 527 */ 528 private int m_currentCodepoint_; 529 private int m_nextCodepoint_; 530 private int m_nextValue_; 531 private int m_nextIndex_; 532 private int m_nextBlock_; 533 private int m_nextBlockIndex_; 534 private int m_nextTrailIndexOffset_; 535} 536