1/*
2******************************************************************************
3* Copyright (C) 1996-2015, International Business Machines Corporation and
4* others. All Rights Reserved.
5******************************************************************************
6*/
7
8package com.ibm.icu.impl;
9
10import java.util.NoSuchElementException;
11
12import com.ibm.icu.lang.UCharacter;
13import com.ibm.icu.text.UTF16;
14import com.ibm.icu.util.RangeValueIterator;
15
16/**
17 * <p>Class enabling iteration of the values in a Trie.</p>
18 *
19 * <p>2015-sep-03 TODO: Only used in test code, move there.
20 *
21 * <p>Result of each iteration contains the interval of codepoints that have
22 * the same value type and the value type itself.</p>
23 * <p>The comparison of each codepoint value is done via extract(), which the
24 * default implementation is to return the value as it is.</p>
25 * <p>Method extract() can be overwritten to perform manipulations on
26 * codepoint values in order to perform specialized comparison.</p>
27 * <p>TrieIterator is designed to be a generic iterator for the CharTrie
28 * and the IntTrie, hence to accommodate both types of data, the return
29 * result will be in terms of int (32 bit) values.</p>
30 * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
31 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
32 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
33 * sense, the caller will have to pass a object with a callback function
34 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
35 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
36 * codepoints with the same value as determined by
37 * UTrieEnumValue(const void *context, uint32_t value). for each range,
38 * utrie_enum calls the callback function to perform a task. In this way,
39 * icu4c performs the iteration within utrie_enum.
40 * To follow the JDK model, icu4j is slightly different from icu4c.
41 * Instead of requesting the caller to implement an object for a callback.
42 * The caller will have to implement a subclass of TrieIterator, fleshing out
43 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
44 * the caller will have to code his own iteration and flesh out the task
45 * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
46 * </p>
47 * <p>There are basically 3 usage scenarios for porting:</p>
48 * <p>1) UTrieEnumValue is the only implemented callback then just implement a
49 * subclass of TrieIterator and override the extract(int) method. The
50 * extract(int) method is analogus to UTrieEnumValue callback.
51 * </p>
52 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
53 * a subclass of TrieIterator, override the extract method and iterate, e.g
54 * </p>
55 * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
56 *               set);<br>
57 * In Java :<br>
58 * <pre>
59 * class TrieIteratorImpl extends TrieIterator{
60 *     public TrieIteratorImpl(Trie data){
61 *         super(data);
62 *     }
63 *     public int extract(int value){
64 *         // port the implementation of _enumPropertyStartsValue here
65 *     }
66 * }
67 * ....
68 * TrieIterator fcdIter  = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
69 * while(fcdIter.next(result)) {
70 *     // port the implementation of _enumPropertyStartsRange
71 * }
72 * </pre>
73 * </p>
74 * <p>3) UTrieEnumRange is the only implemented callback then just implement
75 * the while loop, when utrie_enum is called
76 * <pre>
77 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
78 * TrieIterator fcdIter  = new TrieIterator(fcdTrieImpl.fcdTrie);
79 * while(fcdIter.next(result)){
80 *     set.add(result.start);
81 * }
82 * </p>
83 * @author synwee
84 * @see com.ibm.icu.impl.Trie
85 * @since release 2.1, Jan 17 2002
86 */
87public class TrieIterator implements RangeValueIterator
88
89{
90    // public constructor ---------------------------------------------
91
92    /**
93    * TrieEnumeration constructor
94    * @param trie to be used
95    * @exception IllegalArgumentException throw when argument is null.
96    */
97    public TrieIterator(Trie trie)
98    {
99        if (trie == null) {
100            throw new IllegalArgumentException(
101                                          "Argument trie cannot be null");
102        }
103        m_trie_             = trie;
104        // synwee: check that extract belongs to the child class
105        m_initialValue_     = extract(m_trie_.getInitialValue());
106        reset();
107    }
108
109    // public methods -------------------------------------------------
110
111    /**
112    * <p>Returns true if we are not at the end of the iteration, false
113    * otherwise.</p>
114    * <p>The next set of codepoints with the same value type will be
115    * calculated during this call and returned in the arguement element.</p>
116    * @param element return result
117    * @return true if we are not at the end of the iteration, false otherwise.
118    * @exception NoSuchElementException - if no more elements exist.
119    * @see com.ibm.icu.util.RangeValueIterator.Element
120    */
121    public final boolean next(Element element)
122    {
123        if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
124            return false;
125        }
126        if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
127            calculateNextBMPElement(element)) {
128            return true;
129        }
130        calculateNextSupplementaryElement(element);
131        return true;
132    }
133
134    /**
135    * Resets the iterator to the beginning of the iteration
136    */
137    public final void reset()
138    {
139        m_currentCodepoint_ = 0;
140        m_nextCodepoint_    = 0;
141        m_nextIndex_        = 0;
142        m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
143        if (m_nextBlock_ == m_trie_.m_dataOffset_) {
144            m_nextValue_ = m_initialValue_;
145        }
146        else {
147            m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
148        }
149        m_nextBlockIndex_ = 0;
150        m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
151    }
152
153    // protected methods ----------------------------------------------
154
155    /**
156    * Called by next() to extracts a 32 bit value from a trie value
157    * used for comparison.
158    * This method is to be overwritten if special manipulation is to be done
159    * to retrieve a relevant comparison.
160    * The default function is to return the value as it is.
161    * @param value a value from the trie
162    * @return extracted value
163    */
164    protected int extract(int value)
165    {
166        return value;
167    }
168
169    // private methods ------------------------------------------------
170
171    /**
172    * Set the result values
173    * @param element return result object
174    * @param start codepoint of range
175    * @param limit (end + 1) codepoint of range
176    * @param value common value of range
177    */
178    private final void setResult(Element element, int start, int limit,
179                                 int value)
180    {
181        element.start = start;
182        element.limit = limit;
183        element.value = value;
184    }
185
186    /**
187    * Finding the next element.
188    * This method is called just before returning the result of
189    * next().
190    * We always store the next element before it is requested.
191    * In the case that we have to continue calculations into the
192    * supplementary planes, a false will be returned.
193    * @param element return result object
194    * @return true if the next range is found, false if we have to proceed to
195    *         the supplementary range.
196    */
197    private final boolean calculateNextBMPElement(Element element)
198    {
199        int currentValue    = m_nextValue_;
200        m_currentCodepoint_ = m_nextCodepoint_;
201        m_nextCodepoint_ ++;
202        m_nextBlockIndex_ ++;
203        if (!checkBlockDetail(currentValue)) {
204            setResult(element, m_currentCodepoint_, m_nextCodepoint_,
205                      currentValue);
206            return true;
207        }
208        // synwee check that next block index == 0 here
209        // enumerate BMP - the main loop enumerates data blocks
210        while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
211            // because of the way the character is split to form the index
212            // the lead surrogate and trail surrogate can not be in the
213            // mid of a block
214            if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
215                // skip lead surrogate code units,
216                // go to lead surrogate codepoints
217                m_nextIndex_ = BMP_INDEX_LENGTH_;
218            }
219            else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
220                // go back to regular BMP code points
221                m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
222            } else {
223                m_nextIndex_ ++;
224            }
225
226            m_nextBlockIndex_ = 0;
227            if (!checkBlock(currentValue)) {
228                setResult(element, m_currentCodepoint_, m_nextCodepoint_,
229                          currentValue);
230                return true;
231            }
232        }
233        m_nextCodepoint_ --;   // step one back since this value has not been
234        m_nextBlockIndex_ --;  // retrieved yet.
235        return false;
236    }
237
238    /**
239    * Finds the next supplementary element.
240    * For each entry in the trie, the value to be delivered is passed through
241    * extract().
242    * We always store the next element before it is requested.
243    * Called after calculateNextBMP() completes its round of BMP characters.
244    * There is a slight difference in the usage of m_currentCodepoint_
245    * here as compared to calculateNextBMP(). Though both represents the
246    * lower bound of the next element, in calculateNextBMP() it gets set
247    * at the start of any loop, where-else, in calculateNextSupplementary()
248    * since m_currentCodepoint_ already contains the lower bound of the
249    * next element (passed down from calculateNextBMP()), we keep it till
250    * the end before resetting it to the new value.
251    * Note, if there are no more iterations, it will never get to here.
252    * Blocked out by next().
253    * @param element return result object
254    */
255    private final void calculateNextSupplementaryElement(Element element)
256    {
257        int currentValue = m_nextValue_;
258        m_nextCodepoint_ ++;
259        m_nextBlockIndex_ ++;
260
261        if (UTF16.getTrailSurrogate(m_nextCodepoint_)
262                                        != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
263            // this piece is only called when we are in the middle of a lead
264            // surrogate block
265            if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
266                setResult(element, m_currentCodepoint_, m_nextCodepoint_,
267                          currentValue);
268                m_currentCodepoint_ = m_nextCodepoint_;
269                return;
270            }
271            // we have cleared one block
272            m_nextIndex_ ++;
273            m_nextTrailIndexOffset_ ++;
274            if (!checkTrailBlock(currentValue)) {
275                setResult(element, m_currentCodepoint_, m_nextCodepoint_,
276                          currentValue);
277                m_currentCodepoint_ = m_nextCodepoint_;
278                return;
279            }
280        }
281        int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
282        // enumerate supplementary code points
283        while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
284            // lead surrogate access
285            final int leadBlock =
286                   m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
287                                                   Trie.INDEX_STAGE_2_SHIFT_;
288            if (leadBlock == m_trie_.m_dataOffset_) {
289                // no entries for a whole block of lead surrogates
290                if (currentValue != m_initialValue_) {
291                    m_nextValue_      = m_initialValue_;
292                    m_nextBlock_      = leadBlock;  // == m_trie_.m_dataOffset_
293                    m_nextBlockIndex_ = 0;
294                    setResult(element, m_currentCodepoint_, m_nextCodepoint_,
295                              currentValue);
296                    m_currentCodepoint_ = m_nextCodepoint_;
297                    return;
298                }
299
300                nextLead += DATA_BLOCK_LENGTH_;
301                // number of total affected supplementary codepoints in one
302                // block
303                // this is not a simple addition of
304                // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
305                // that we might have moved some of the codepoints
306                m_nextCodepoint_ = Character.toCodePoint((char)nextLead, (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
307                continue;
308            }
309            if (m_trie_.m_dataManipulate_ == null) {
310                throw new NullPointerException(
311                            "The field DataManipulate in this Trie is null");
312            }
313            // enumerate trail surrogates for this lead surrogate
314            m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
315                               m_trie_.getValue(leadBlock +
316                                   (nextLead & Trie.INDEX_STAGE_3_MASK_)));
317            if (m_nextIndex_ <= 0) {
318                // no data for this lead surrogate
319                if (currentValue != m_initialValue_) {
320                    m_nextValue_      = m_initialValue_;
321                    m_nextBlock_      = m_trie_.m_dataOffset_;
322                    m_nextBlockIndex_ = 0;
323                    setResult(element, m_currentCodepoint_, m_nextCodepoint_,
324                              currentValue);
325                    m_currentCodepoint_ = m_nextCodepoint_;
326                    return;
327                }
328                m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
329            } else {
330                m_nextTrailIndexOffset_ = 0;
331                if (!checkTrailBlock(currentValue)) {
332                    setResult(element, m_currentCodepoint_, m_nextCodepoint_,
333                              currentValue);
334                    m_currentCodepoint_ = m_nextCodepoint_;
335                    return;
336                }
337            }
338            nextLead ++;
339         }
340
341         // deliver last range
342         setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
343                   currentValue);
344    }
345
346    /**
347    * Internal block value calculations
348    * Performs calculations on a data block to find codepoints in m_nextBlock_
349    * after the index m_nextBlockIndex_ that has the same value.
350    * Note m_*_ variables at this point is the next codepoint whose value
351    * has not been calculated.
352    * But when returned with false, it will be the last codepoint whose
353    * value has been calculated.
354    * @param currentValue the value which other codepoints are tested against
355    * @return true if the whole block has the same value as currentValue or if
356    *              the whole block has been calculated, false otherwise.
357    */
358    private final boolean checkBlockDetail(int currentValue)
359    {
360        while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
361            m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
362                                                    m_nextBlockIndex_));
363            if (m_nextValue_ != currentValue) {
364                return false;
365            }
366            ++ m_nextBlockIndex_;
367            ++ m_nextCodepoint_;
368        }
369        return true;
370    }
371
372    /**
373    * Internal block value calculations
374    * Performs calculations on a data block to find codepoints in m_nextBlock_
375    * that has the same value.
376    * Will call checkBlockDetail() if highlevel check fails.
377    * Note m_*_ variables at this point is the next codepoint whose value
378    * has not been calculated.
379    * @param currentBlock the initial block containing all currentValue
380    * @param currentValue the value which other codepoints are tested against
381    * @return true if the whole block has the same value as currentValue or if
382    *              the whole block has been calculated, false otherwise.
383    */
384    private final boolean checkBlock(int currentValue)
385    {
386        int currentBlock = m_nextBlock_;
387        m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
388                                                  Trie.INDEX_STAGE_2_SHIFT_;
389        if (m_nextBlock_ == currentBlock &&
390            (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
391            // the block is the same as the previous one, filled with
392            // currentValue
393            m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
394        }
395        else if (m_nextBlock_ == m_trie_.m_dataOffset_) {
396            // this is the all-initial-value block
397            if (currentValue != m_initialValue_) {
398                m_nextValue_      = m_initialValue_;
399                m_nextBlockIndex_ = 0;
400                return false;
401            }
402            m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
403        }
404        else {
405            if (!checkBlockDetail(currentValue)) {
406                return false;
407            }
408        }
409        return true;
410    }
411
412    /**
413    * Internal block value calculations
414    * Performs calculations on multiple data blocks for a set of trail
415    * surrogates to find codepoints in m_nextBlock_ that has the same value.
416    * Will call checkBlock() for internal block checks.
417    * Note m_*_ variables at this point is the next codepoint whose value
418    * has not been calculated.
419    * @param currentValue the value which other codepoints are tested against
420    * @return true if the whole block has the same value as currentValue or if
421    *              the whole block has been calculated, false otherwise.
422    */
423    private final boolean checkTrailBlock(int currentValue)
424    {
425        // enumerate code points for this lead surrogate
426        while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
427        {
428            // if we ever reach here, we are at the start of a new block
429            m_nextBlockIndex_ = 0;
430            // copy of most of the body of the BMP loop
431            if (!checkBlock(currentValue)) {
432                return false;
433            }
434            m_nextTrailIndexOffset_ ++;
435            m_nextIndex_ ++;
436        }
437        return true;
438    }
439
440    /**
441    * Checks if we are beginning at the start of a initial block.
442    * If we are then the rest of the codepoints in this initial block
443    * has the same values.
444    * We increment m_nextCodepoint_ and relevant data members if so.
445    * This is used only in for the supplementary codepoints because
446    * the offset to the trail indexes could be 0.
447    * @return true if we are at the start of a initial block.
448    */
449    private final boolean checkNullNextTrailIndex()
450    {
451        if (m_nextIndex_ <= 0) {
452            m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
453            int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
454            int leadBlock =
455                   m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
456                                                   Trie.INDEX_STAGE_2_SHIFT_;
457            if (m_trie_.m_dataManipulate_ == null) {
458                throw new NullPointerException(
459                            "The field DataManipulate in this Trie is null");
460            }
461            m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
462                               m_trie_.getValue(leadBlock +
463                                   (nextLead & Trie.INDEX_STAGE_3_MASK_)));
464            m_nextIndex_ --;
465            m_nextBlockIndex_ =  DATA_BLOCK_LENGTH_;
466            return true;
467        }
468        return false;
469    }
470
471    // private data members --------------------------------------------
472
473    /**
474    * Size of the stage 1 BMP indexes
475    */
476    private static final int BMP_INDEX_LENGTH_ =
477                                        0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
478    /**
479    * Lead surrogate minimum value
480    */
481    private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
482    /**
483    * Trail surrogate minimum value
484    */
485    private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
486    /*
487    * Trail surrogate maximum value
488    */
489    //private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
490    /**
491    * Number of trail surrogate
492    */
493    private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
494    /**
495    * Number of stage 1 indexes for supplementary calculations that maps to
496    * each lead surrogate character.
497    * See second pass into getRawOffset for the trail surrogate character.
498    * 10 for significant number of bits for trail surrogates, 5 for what we
499    * discard during shifting.
500    */
501    private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
502                                    1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
503    /**
504    * Number of data values in a stage 2 (data array) block.
505    */
506    private static final int DATA_BLOCK_LENGTH_ =
507                                              1 << Trie.INDEX_STAGE_1_SHIFT_;
508//    /**
509//    * Number of codepoints in a stage 2 block
510//    */
511//    private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
512//                                                     DATA_BLOCK_LENGTH_ << 10;
513    /**
514    * Trie instance
515    */
516    private Trie m_trie_;
517    /**
518    * Initial value for trie values
519    */
520    private int m_initialValue_;
521    /**
522    * Next element results and data.
523    */
524    private int m_currentCodepoint_;
525    private int m_nextCodepoint_;
526    private int m_nextValue_;
527    private int m_nextIndex_;
528    private int m_nextBlock_;
529    private int m_nextBlockIndex_;
530    private int m_nextTrailIndexOffset_;
531}
532