1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4******************************************************************************
5* Copyright (C) 1996-2015, International Business Machines Corporation and
6* others. All Rights Reserved.
7******************************************************************************
8*/
9
10package com.ibm.icu.impl;
11
12import java.util.NoSuchElementException;
13
14import com.ibm.icu.lang.UCharacter;
15import com.ibm.icu.text.UTF16;
16import com.ibm.icu.util.RangeValueIterator;
17
18/**
19 * <p>Class enabling iteration of the values in a Trie.</p>
20 *
21 * <p>2015-sep-03 TODO: Only used in test code, move there.
22 *
23 * <p>Result of each iteration contains the interval of codepoints that have
24 * the same value type and the value type itself.</p>
25 * <p>The comparison of each codepoint value is done via extract(), which the
26 * default implementation is to return the value as it is.</p>
27 * <p>Method extract() can be overwritten to perform manipulations on
28 * codepoint values in order to perform specialized comparison.</p>
29 * <p>TrieIterator is designed to be a generic iterator for the CharTrie
30 * and the IntTrie, hence to accommodate both types of data, the return
31 * result will be in terms of int (32 bit) values.</p>
32 * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p>
33 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
34 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
35 * sense, the caller will have to pass a object with a callback function
36 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
37 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
38 * codepoints with the same value as determined by
39 * UTrieEnumValue(const void *context, uint32_t value). for each range,
40 * utrie_enum calls the callback function to perform a task. In this way,
41 * icu4c performs the iteration within utrie_enum.
42 * To follow the JDK model, icu4j is slightly different from icu4c.
43 * Instead of requesting the caller to implement an object for a callback.
44 * The caller will have to implement a subclass of TrieIterator, fleshing out
45 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
46 * the caller will have to code his own iteration and flesh out the task
47 * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
48 * </p>
49 * <p>There are basically 3 usage scenarios for porting:</p>
50 * <p>1) UTrieEnumValue is the only implemented callback then just implement a
51 * subclass of TrieIterator and override the extract(int) method. The
52 * extract(int) method is analogus to UTrieEnumValue callback.
53 * </p>
54 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
55 * a subclass of TrieIterator, override the extract method and iterate, e.g
56 * </p>
57 * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
58 *               set);<br>
59 * In Java :<br>
60 * <pre>
61 * class TrieIteratorImpl extends TrieIterator{
62 *     public TrieIteratorImpl(Trie data){
63 *         super(data);
64 *     }
65 *     public int extract(int value){
66 *         // port the implementation of _enumPropertyStartsValue here
67 *     }
68 * }
69 * ....
70 * TrieIterator fcdIter  = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
71 * while(fcdIter.next(result)) {
72 *     // port the implementation of _enumPropertyStartsRange
73 * }
74 * </pre>
75 * </p>
76 * <p>3) UTrieEnumRange is the only implemented callback then just implement
77 * the while loop, when utrie_enum is called
78 * <pre>
79 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
80 * TrieIterator fcdIter  = new TrieIterator(fcdTrieImpl.fcdTrie);
81 * while(fcdIter.next(result)){
82 *     set.add(result.start);
83 * }
84 * </p>
85 * @author synwee
86 * @see com.ibm.icu.impl.Trie
87 * @since release 2.1, Jan 17 2002
88 */
89public class TrieIterator implements RangeValueIterator
90
91{
92    // public constructor ---------------------------------------------
93
94    /**
95    * TrieEnumeration constructor
96    * @param trie to be used
97    * @exception IllegalArgumentException throw when argument is null.
98    */
99    public TrieIterator(Trie trie)
100    {
101        if (trie == null) {
102            throw new IllegalArgumentException(
103                                          "Argument trie cannot be null");
104        }
105        m_trie_             = trie;
106        // synwee: check that extract belongs to the child class
107        m_initialValue_     = extract(m_trie_.getInitialValue());
108        reset();
109    }
110
111    // public methods -------------------------------------------------
112
113    /**
114    * <p>Returns true if we are not at the end of the iteration, false
115    * otherwise.</p>
116    * <p>The next set of codepoints with the same value type will be
117    * calculated during this call and returned in the arguement element.</p>
118    * @param element return result
119    * @return true if we are not at the end of the iteration, false otherwise.
120    * @exception NoSuchElementException - if no more elements exist.
121    * @see com.ibm.icu.util.RangeValueIterator.Element
122    */
123    @Override
124    public final boolean next(Element element)
125    {
126        if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
127            return false;
128        }
129        if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
130            calculateNextBMPElement(element)) {
131            return true;
132        }
133        calculateNextSupplementaryElement(element);
134        return true;
135    }
136
137    /**
138    * Resets the iterator to the beginning of the iteration
139    */
140    @Override
141    public final void reset()
142    {
143        m_currentCodepoint_ = 0;
144        m_nextCodepoint_    = 0;
145        m_nextIndex_        = 0;
146        m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
147        if (m_nextBlock_ == m_trie_.m_dataOffset_) {
148            m_nextValue_ = m_initialValue_;
149        }
150        else {
151            m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
152        }
153        m_nextBlockIndex_ = 0;
154        m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
155    }
156
157    // protected methods ----------------------------------------------
158
159    /**
160    * Called by next() to extracts a 32 bit value from a trie value
161    * used for comparison.
162    * This method is to be overwritten if special manipulation is to be done
163    * to retrieve a relevant comparison.
164    * The default function is to return the value as it is.
165    * @param value a value from the trie
166    * @return extracted value
167    */
168    protected int extract(int value)
169    {
170        return value;
171    }
172
173    // private methods ------------------------------------------------
174
175    /**
176    * Set the result values
177    * @param element return result object
178    * @param start codepoint of range
179    * @param limit (end + 1) codepoint of range
180    * @param value common value of range
181    */
182    private final void setResult(Element element, int start, int limit,
183                                 int value)
184    {
185        element.start = start;
186        element.limit = limit;
187        element.value = value;
188    }
189
190    /**
191    * Finding the next element.
192    * This method is called just before returning the result of
193    * next().
194    * We always store the next element before it is requested.
195    * In the case that we have to continue calculations into the
196    * supplementary planes, a false will be returned.
197    * @param element return result object
198    * @return true if the next range is found, false if we have to proceed to
199    *         the supplementary range.
200    */
201    private final boolean calculateNextBMPElement(Element element)
202    {
203        int currentValue    = m_nextValue_;
204        m_currentCodepoint_ = m_nextCodepoint_;
205        m_nextCodepoint_ ++;
206        m_nextBlockIndex_ ++;
207        if (!checkBlockDetail(currentValue)) {
208            setResult(element, m_currentCodepoint_, m_nextCodepoint_,
209                      currentValue);
210            return true;
211        }
212        // synwee check that next block index == 0 here
213        // enumerate BMP - the main loop enumerates data blocks
214        while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
215            // because of the way the character is split to form the index
216            // the lead surrogate and trail surrogate can not be in the
217            // mid of a block
218            if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
219                // skip lead surrogate code units,
220                // go to lead surrogate codepoints
221                m_nextIndex_ = BMP_INDEX_LENGTH_;
222            }
223            else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
224                // go back to regular BMP code points
225                m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
226            } else {
227                m_nextIndex_ ++;
228            }
229
230            m_nextBlockIndex_ = 0;
231            if (!checkBlock(currentValue)) {
232                setResult(element, m_currentCodepoint_, m_nextCodepoint_,
233                          currentValue);
234                return true;
235            }
236        }
237        m_nextCodepoint_ --;   // step one back since this value has not been
238        m_nextBlockIndex_ --;  // retrieved yet.
239        return false;
240    }
241
242    /**
243    * Finds the next supplementary element.
244    * For each entry in the trie, the value to be delivered is passed through
245    * extract().
246    * We always store the next element before it is requested.
247    * Called after calculateNextBMP() completes its round of BMP characters.
248    * There is a slight difference in the usage of m_currentCodepoint_
249    * here as compared to calculateNextBMP(). Though both represents the
250    * lower bound of the next element, in calculateNextBMP() it gets set
251    * at the start of any loop, where-else, in calculateNextSupplementary()
252    * since m_currentCodepoint_ already contains the lower bound of the
253    * next element (passed down from calculateNextBMP()), we keep it till
254    * the end before resetting it to the new value.
255    * Note, if there are no more iterations, it will never get to here.
256    * Blocked out by next().
257    * @param element return result object
258    */
259    private final void calculateNextSupplementaryElement(Element element)
260    {
261        int currentValue = m_nextValue_;
262        m_nextCodepoint_ ++;
263        m_nextBlockIndex_ ++;
264
265        if (UTF16.getTrailSurrogate(m_nextCodepoint_)
266                                        != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
267            // this piece is only called when we are in the middle of a lead
268            // surrogate block
269            if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
270                setResult(element, m_currentCodepoint_, m_nextCodepoint_,
271                          currentValue);
272                m_currentCodepoint_ = m_nextCodepoint_;
273                return;
274            }
275            // we have cleared one block
276            m_nextIndex_ ++;
277            m_nextTrailIndexOffset_ ++;
278            if (!checkTrailBlock(currentValue)) {
279                setResult(element, m_currentCodepoint_, m_nextCodepoint_,
280                          currentValue);
281                m_currentCodepoint_ = m_nextCodepoint_;
282                return;
283            }
284        }
285        int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
286        // enumerate supplementary code points
287        while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
288            // lead surrogate access
289            final int leadBlock =
290                   m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
291                                                   Trie.INDEX_STAGE_2_SHIFT_;
292            if (leadBlock == m_trie_.m_dataOffset_) {
293                // no entries for a whole block of lead surrogates
294                if (currentValue != m_initialValue_) {
295                    m_nextValue_      = m_initialValue_;
296                    m_nextBlock_      = leadBlock;  // == m_trie_.m_dataOffset_
297                    m_nextBlockIndex_ = 0;
298                    setResult(element, m_currentCodepoint_, m_nextCodepoint_,
299                              currentValue);
300                    m_currentCodepoint_ = m_nextCodepoint_;
301                    return;
302                }
303
304                nextLead += DATA_BLOCK_LENGTH_;
305                // number of total affected supplementary codepoints in one
306                // block
307                // this is not a simple addition of
308                // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
309                // that we might have moved some of the codepoints
310                m_nextCodepoint_ = Character.toCodePoint((char)nextLead, (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
311                continue;
312            }
313            if (m_trie_.m_dataManipulate_ == null) {
314                throw new NullPointerException(
315                            "The field DataManipulate in this Trie is null");
316            }
317            // enumerate trail surrogates for this lead surrogate
318            m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
319                               m_trie_.getValue(leadBlock +
320                                   (nextLead & Trie.INDEX_STAGE_3_MASK_)));
321            if (m_nextIndex_ <= 0) {
322                // no data for this lead surrogate
323                if (currentValue != m_initialValue_) {
324                    m_nextValue_      = m_initialValue_;
325                    m_nextBlock_      = m_trie_.m_dataOffset_;
326                    m_nextBlockIndex_ = 0;
327                    setResult(element, m_currentCodepoint_, m_nextCodepoint_,
328                              currentValue);
329                    m_currentCodepoint_ = m_nextCodepoint_;
330                    return;
331                }
332                m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
333            } else {
334                m_nextTrailIndexOffset_ = 0;
335                if (!checkTrailBlock(currentValue)) {
336                    setResult(element, m_currentCodepoint_, m_nextCodepoint_,
337                              currentValue);
338                    m_currentCodepoint_ = m_nextCodepoint_;
339                    return;
340                }
341            }
342            nextLead ++;
343         }
344
345         // deliver last range
346         setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
347                   currentValue);
348    }
349
350    /**
351    * Internal block value calculations
352    * Performs calculations on a data block to find codepoints in m_nextBlock_
353    * after the index m_nextBlockIndex_ that has the same value.
354    * Note m_*_ variables at this point is the next codepoint whose value
355    * has not been calculated.
356    * But when returned with false, it will be the last codepoint whose
357    * value has been calculated.
358    * @param currentValue the value which other codepoints are tested against
359    * @return true if the whole block has the same value as currentValue or if
360    *              the whole block has been calculated, false otherwise.
361    */
362    private final boolean checkBlockDetail(int currentValue)
363    {
364        while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
365            m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
366                                                    m_nextBlockIndex_));
367            if (m_nextValue_ != currentValue) {
368                return false;
369            }
370            ++ m_nextBlockIndex_;
371            ++ m_nextCodepoint_;
372        }
373        return true;
374    }
375
376    /**
377    * Internal block value calculations
378    * Performs calculations on a data block to find codepoints in m_nextBlock_
379    * that has the same value.
380    * Will call checkBlockDetail() if highlevel check fails.
381    * Note m_*_ variables at this point is the next codepoint whose value
382    * has not been calculated.
383    * @param currentBlock the initial block containing all currentValue
384    * @param currentValue the value which other codepoints are tested against
385    * @return true if the whole block has the same value as currentValue or if
386    *              the whole block has been calculated, false otherwise.
387    */
388    private final boolean checkBlock(int currentValue)
389    {
390        int currentBlock = m_nextBlock_;
391        m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
392                                                  Trie.INDEX_STAGE_2_SHIFT_;
393        if (m_nextBlock_ == currentBlock &&
394            (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
395            // the block is the same as the previous one, filled with
396            // currentValue
397            m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
398        }
399        else if (m_nextBlock_ == m_trie_.m_dataOffset_) {
400            // this is the all-initial-value block
401            if (currentValue != m_initialValue_) {
402                m_nextValue_      = m_initialValue_;
403                m_nextBlockIndex_ = 0;
404                return false;
405            }
406            m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
407        }
408        else {
409            if (!checkBlockDetail(currentValue)) {
410                return false;
411            }
412        }
413        return true;
414    }
415
416    /**
417    * Internal block value calculations
418    * Performs calculations on multiple data blocks for a set of trail
419    * surrogates to find codepoints in m_nextBlock_ that has the same value.
420    * Will call checkBlock() for internal block checks.
421    * Note m_*_ variables at this point is the next codepoint whose value
422    * has not been calculated.
423    * @param currentValue the value which other codepoints are tested against
424    * @return true if the whole block has the same value as currentValue or if
425    *              the whole block has been calculated, false otherwise.
426    */
427    private final boolean checkTrailBlock(int currentValue)
428    {
429        // enumerate code points for this lead surrogate
430        while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
431        {
432            // if we ever reach here, we are at the start of a new block
433            m_nextBlockIndex_ = 0;
434            // copy of most of the body of the BMP loop
435            if (!checkBlock(currentValue)) {
436                return false;
437            }
438            m_nextTrailIndexOffset_ ++;
439            m_nextIndex_ ++;
440        }
441        return true;
442    }
443
444    /**
445    * Checks if we are beginning at the start of a initial block.
446    * If we are then the rest of the codepoints in this initial block
447    * has the same values.
448    * We increment m_nextCodepoint_ and relevant data members if so.
449    * This is used only in for the supplementary codepoints because
450    * the offset to the trail indexes could be 0.
451    * @return true if we are at the start of a initial block.
452    */
453    private final boolean checkNullNextTrailIndex()
454    {
455        if (m_nextIndex_ <= 0) {
456            m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
457            int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
458            int leadBlock =
459                   m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
460                                                   Trie.INDEX_STAGE_2_SHIFT_;
461            if (m_trie_.m_dataManipulate_ == null) {
462                throw new NullPointerException(
463                            "The field DataManipulate in this Trie is null");
464            }
465            m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
466                               m_trie_.getValue(leadBlock +
467                                   (nextLead & Trie.INDEX_STAGE_3_MASK_)));
468            m_nextIndex_ --;
469            m_nextBlockIndex_ =  DATA_BLOCK_LENGTH_;
470            return true;
471        }
472        return false;
473    }
474
475    // private data members --------------------------------------------
476
477    /**
478    * Size of the stage 1 BMP indexes
479    */
480    private static final int BMP_INDEX_LENGTH_ =
481                                        0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
482    /**
483    * Lead surrogate minimum value
484    */
485    private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
486    /**
487    * Trail surrogate minimum value
488    */
489    private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
490    /*
491    * Trail surrogate maximum value
492    */
493    //private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
494    /**
495    * Number of trail surrogate
496    */
497    private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
498    /**
499    * Number of stage 1 indexes for supplementary calculations that maps to
500    * each lead surrogate character.
501    * See second pass into getRawOffset for the trail surrogate character.
502    * 10 for significant number of bits for trail surrogates, 5 for what we
503    * discard during shifting.
504    */
505    private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
506                                    1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
507    /**
508    * Number of data values in a stage 2 (data array) block.
509    */
510    private static final int DATA_BLOCK_LENGTH_ =
511                                              1 << Trie.INDEX_STAGE_1_SHIFT_;
512//    /**
513//    * Number of codepoints in a stage 2 block
514//    */
515//    private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
516//                                                     DATA_BLOCK_LENGTH_ << 10;
517    /**
518    * Trie instance
519    */
520    private Trie m_trie_;
521    /**
522    * Initial value for trie values
523    */
524    private int m_initialValue_;
525    /**
526    * Next element results and data.
527    */
528    private int m_currentCodepoint_;
529    private int m_nextCodepoint_;
530    private int m_nextValue_;
531    private int m_nextIndex_;
532    private int m_nextBlock_;
533    private int m_nextBlockIndex_;
534    private int m_nextTrailIndexOffset_;
535}
536