TrieIterator.java revision f86f25d102340da66b9c7cb6b2d5ecdc0de43ecf
1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5******************************************************************************
6* Copyright (C) 1996-2015, International Business Machines Corporation and
7* others. All Rights Reserved.
8******************************************************************************
9*/
10
11package android.icu.impl;
12
13import java.util.NoSuchElementException;
14
15import android.icu.lang.UCharacter;
16import android.icu.text.UTF16;
17import android.icu.util.RangeValueIterator;
18
19/**
20 * <p>Class enabling iteration of the values in a Trie.</p>
21 *
22 * <p>2015-sep-03 TODO: Only used in test code, move there.
23 *
24 * <p>Result of each iteration contains the interval of codepoints that have
25 * the same value type and the value type itself.</p>
26 * <p>The comparison of each codepoint value is done via extract(), which the
27 * default implementation is to return the value as it is.</p>
28 * <p>Method extract() can be overwritten to perform manipulations on
29 * codepoint values in order to perform specialized comparison.</p>
30 * <p>TrieIterator is designed to be a generic iterator for the CharTrie
31 * and the IntTrie, hence to accommodate both types of data, the return
32 * result will be in terms of int (32 bit) values.</p>
33 * <p>See android.icu.text.UCharacterTypeIterator for examples of use.</p>
34 * <p>Notes for porting utrie_enum from icu4c to icu4j:<br>
35 * Internally, icu4c's utrie_enum performs all iterations in its body. In Java
36 * sense, the caller will have to pass a object with a callback function
37 * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit,
38 * uint32_t value) into utrie_enum. utrie_enum will then find ranges of
39 * codepoints with the same value as determined by
40 * UTrieEnumValue(const void *context, uint32_t value). for each range,
41 * utrie_enum calls the callback function to perform a task. In this way,
42 * icu4c performs the iteration within utrie_enum.
43 * To follow the JDK model, icu4j is slightly different from icu4c.
44 * Instead of requesting the caller to implement an object for a callback.
45 * The caller will have to implement a subclass of TrieIterator, fleshing out
46 * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j,
47 * the caller will have to code his own iteration and flesh out the task
48 * (equivalent to UTrieEnumRange) to be performed in the iteration loop.
49 * </p>
50 * <p>There are basically 3 usage scenarios for porting:</p>
51 * <p>1) UTrieEnumValue is the only implemented callback then just implement a
52 * subclass of TrieIterator and override the extract(int) method. The
53 * extract(int) method is analogus to UTrieEnumValue callback.
54 * </p>
55 * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement
56 * a subclass of TrieIterator, override the extract method and iterate, e.g
57 * </p>
58 * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange,
59 *               set);<br>
60 * In Java :<br>
61 * <pre>
62 * class TrieIteratorImpl extends TrieIterator{
63 *     public TrieIteratorImpl(Trie data){
64 *         super(data);
65 *     }
66 *     public int extract(int value){
67 *         // port the implementation of _enumPropertyStartsValue here
68 *     }
69 * }
70 * ....
71 * TrieIterator fcdIter  = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
72 * while(fcdIter.next(result)) {
73 *     // port the implementation of _enumPropertyStartsRange
74 * }
75 * </pre>
76 * </p>
77 * <p>3) UTrieEnumRange is the only implemented callback then just implement
78 * the while loop, when utrie_enum is called
79 * <pre>
80 * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
81 * TrieIterator fcdIter  = new TrieIterator(fcdTrieImpl.fcdTrie);
82 * while(fcdIter.next(result)){
83 *     set.add(result.start);
84 * }
85 * </p>
86 * @author synwee
87 * @see android.icu.impl.Trie
88 * @hide Only a subset of ICU is exposed in Android
89 */
90public class TrieIterator implements RangeValueIterator
91
92{
93    // public constructor ---------------------------------------------
94
95    /**
96    * TrieEnumeration constructor
97    * @param trie to be used
98    * @exception IllegalArgumentException throw when argument is null.
99    */
100    public TrieIterator(Trie trie)
101    {
102        if (trie == null) {
103            throw new IllegalArgumentException(
104                                          "Argument trie cannot be null");
105        }
106        m_trie_             = trie;
107        // synwee: check that extract belongs to the child class
108        m_initialValue_     = extract(m_trie_.getInitialValue());
109        reset();
110    }
111
112    // public methods -------------------------------------------------
113
114    /**
115    * <p>Returns true if we are not at the end of the iteration, false
116    * otherwise.</p>
117    * <p>The next set of codepoints with the same value type will be
118    * calculated during this call and returned in the arguement element.</p>
119    * @param element return result
120    * @return true if we are not at the end of the iteration, false otherwise.
121    * @exception NoSuchElementException - if no more elements exist.
122    * @see android.icu.util.RangeValueIterator.Element
123    */
124    @Override
125    public final boolean next(Element element)
126    {
127        if (m_nextCodepoint_ > UCharacter.MAX_VALUE) {
128            return false;
129        }
130        if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE &&
131            calculateNextBMPElement(element)) {
132            return true;
133        }
134        calculateNextSupplementaryElement(element);
135        return true;
136    }
137
138    /**
139    * Resets the iterator to the beginning of the iteration
140    */
141    @Override
142    public final void reset()
143    {
144        m_currentCodepoint_ = 0;
145        m_nextCodepoint_    = 0;
146        m_nextIndex_        = 0;
147        m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_;
148        if (m_nextBlock_ == m_trie_.m_dataOffset_) {
149            m_nextValue_ = m_initialValue_;
150        }
151        else {
152            m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_));
153        }
154        m_nextBlockIndex_ = 0;
155        m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_;
156    }
157
158    // protected methods ----------------------------------------------
159
160    /**
161    * Called by next() to extracts a 32 bit value from a trie value
162    * used for comparison.
163    * This method is to be overwritten if special manipulation is to be done
164    * to retrieve a relevant comparison.
165    * The default function is to return the value as it is.
166    * @param value a value from the trie
167    * @return extracted value
168    */
169    protected int extract(int value)
170    {
171        return value;
172    }
173
174    // private methods ------------------------------------------------
175
176    /**
177    * Set the result values
178    * @param element return result object
179    * @param start codepoint of range
180    * @param limit (end + 1) codepoint of range
181    * @param value common value of range
182    */
183    private final void setResult(Element element, int start, int limit,
184                                 int value)
185    {
186        element.start = start;
187        element.limit = limit;
188        element.value = value;
189    }
190
191    /**
192    * Finding the next element.
193    * This method is called just before returning the result of
194    * next().
195    * We always store the next element before it is requested.
196    * In the case that we have to continue calculations into the
197    * supplementary planes, a false will be returned.
198    * @param element return result object
199    * @return true if the next range is found, false if we have to proceed to
200    *         the supplementary range.
201    */
202    private final boolean calculateNextBMPElement(Element element)
203    {
204        int currentValue    = m_nextValue_;
205        m_currentCodepoint_ = m_nextCodepoint_;
206        m_nextCodepoint_ ++;
207        m_nextBlockIndex_ ++;
208        if (!checkBlockDetail(currentValue)) {
209            setResult(element, m_currentCodepoint_, m_nextCodepoint_,
210                      currentValue);
211            return true;
212        }
213        // synwee check that next block index == 0 here
214        // enumerate BMP - the main loop enumerates data blocks
215        while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) {
216            // because of the way the character is split to form the index
217            // the lead surrogate and trail surrogate can not be in the
218            // mid of a block
219            if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) {
220                // skip lead surrogate code units,
221                // go to lead surrogate codepoints
222                m_nextIndex_ = BMP_INDEX_LENGTH_;
223            }
224            else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) {
225                // go back to regular BMP code points
226                m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_;
227            } else {
228                m_nextIndex_ ++;
229            }
230
231            m_nextBlockIndex_ = 0;
232            if (!checkBlock(currentValue)) {
233                setResult(element, m_currentCodepoint_, m_nextCodepoint_,
234                          currentValue);
235                return true;
236            }
237        }
238        m_nextCodepoint_ --;   // step one back since this value has not been
239        m_nextBlockIndex_ --;  // retrieved yet.
240        return false;
241    }
242
243    /**
244    * Finds the next supplementary element.
245    * For each entry in the trie, the value to be delivered is passed through
246    * extract().
247    * We always store the next element before it is requested.
248    * Called after calculateNextBMP() completes its round of BMP characters.
249    * There is a slight difference in the usage of m_currentCodepoint_
250    * here as compared to calculateNextBMP(). Though both represents the
251    * lower bound of the next element, in calculateNextBMP() it gets set
252    * at the start of any loop, where-else, in calculateNextSupplementary()
253    * since m_currentCodepoint_ already contains the lower bound of the
254    * next element (passed down from calculateNextBMP()), we keep it till
255    * the end before resetting it to the new value.
256    * Note, if there are no more iterations, it will never get to here.
257    * Blocked out by next().
258    * @param element return result object
259    */
260    private final void calculateNextSupplementaryElement(Element element)
261    {
262        int currentValue = m_nextValue_;
263        m_nextCodepoint_ ++;
264        m_nextBlockIndex_ ++;
265
266        if (UTF16.getTrailSurrogate(m_nextCodepoint_)
267                                        != UTF16.TRAIL_SURROGATE_MIN_VALUE) {
268            // this piece is only called when we are in the middle of a lead
269            // surrogate block
270            if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) {
271                setResult(element, m_currentCodepoint_, m_nextCodepoint_,
272                          currentValue);
273                m_currentCodepoint_ = m_nextCodepoint_;
274                return;
275            }
276            // we have cleared one block
277            m_nextIndex_ ++;
278            m_nextTrailIndexOffset_ ++;
279            if (!checkTrailBlock(currentValue)) {
280                setResult(element, m_currentCodepoint_, m_nextCodepoint_,
281                          currentValue);
282                m_currentCodepoint_ = m_nextCodepoint_;
283                return;
284            }
285        }
286        int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
287        // enumerate supplementary code points
288        while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) {
289            // lead surrogate access
290            final int leadBlock =
291                   m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
292                                                   Trie.INDEX_STAGE_2_SHIFT_;
293            if (leadBlock == m_trie_.m_dataOffset_) {
294                // no entries for a whole block of lead surrogates
295                if (currentValue != m_initialValue_) {
296                    m_nextValue_      = m_initialValue_;
297                    m_nextBlock_      = leadBlock;  // == m_trie_.m_dataOffset_
298                    m_nextBlockIndex_ = 0;
299                    setResult(element, m_currentCodepoint_, m_nextCodepoint_,
300                              currentValue);
301                    m_currentCodepoint_ = m_nextCodepoint_;
302                    return;
303                }
304
305                nextLead += DATA_BLOCK_LENGTH_;
306                // number of total affected supplementary codepoints in one
307                // block
308                // this is not a simple addition of
309                // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider
310                // that we might have moved some of the codepoints
311                m_nextCodepoint_ = Character.toCodePoint((char)nextLead, (char)UTF16.TRAIL_SURROGATE_MIN_VALUE);
312                continue;
313            }
314            if (m_trie_.m_dataManipulate_ == null) {
315                throw new NullPointerException(
316                            "The field DataManipulate in this Trie is null");
317            }
318            // enumerate trail surrogates for this lead surrogate
319            m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
320                               m_trie_.getValue(leadBlock +
321                                   (nextLead & Trie.INDEX_STAGE_3_MASK_)));
322            if (m_nextIndex_ <= 0) {
323                // no data for this lead surrogate
324                if (currentValue != m_initialValue_) {
325                    m_nextValue_      = m_initialValue_;
326                    m_nextBlock_      = m_trie_.m_dataOffset_;
327                    m_nextBlockIndex_ = 0;
328                    setResult(element, m_currentCodepoint_, m_nextCodepoint_,
329                              currentValue);
330                    m_currentCodepoint_ = m_nextCodepoint_;
331                    return;
332                }
333                m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_;
334            } else {
335                m_nextTrailIndexOffset_ = 0;
336                if (!checkTrailBlock(currentValue)) {
337                    setResult(element, m_currentCodepoint_, m_nextCodepoint_,
338                              currentValue);
339                    m_currentCodepoint_ = m_nextCodepoint_;
340                    return;
341                }
342            }
343            nextLead ++;
344         }
345
346         // deliver last range
347         setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1,
348                   currentValue);
349    }
350
351    /**
352    * Internal block value calculations
353    * Performs calculations on a data block to find codepoints in m_nextBlock_
354    * after the index m_nextBlockIndex_ that has the same value.
355    * Note m_*_ variables at this point is the next codepoint whose value
356    * has not been calculated.
357    * But when returned with false, it will be the last codepoint whose
358    * value has been calculated.
359    * @param currentValue the value which other codepoints are tested against
360    * @return true if the whole block has the same value as currentValue or if
361    *              the whole block has been calculated, false otherwise.
362    */
363    private final boolean checkBlockDetail(int currentValue)
364    {
365        while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) {
366            m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ +
367                                                    m_nextBlockIndex_));
368            if (m_nextValue_ != currentValue) {
369                return false;
370            }
371            ++ m_nextBlockIndex_;
372            ++ m_nextCodepoint_;
373        }
374        return true;
375    }
376
377    /**
378    * Internal block value calculations
379    * Performs calculations on a data block to find codepoints in m_nextBlock_
380    * that has the same value.
381    * Will call checkBlockDetail() if highlevel check fails.
382    * Note m_*_ variables at this point is the next codepoint whose value
383    * has not been calculated.
384    * @param currentBlock the initial block containing all currentValue
385    * @param currentValue the value which other codepoints are tested against
386    * @return true if the whole block has the same value as currentValue or if
387    *              the whole block has been calculated, false otherwise.
388    */
389    private final boolean checkBlock(int currentValue)
390    {
391        int currentBlock = m_nextBlock_;
392        m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] <<
393                                                  Trie.INDEX_STAGE_2_SHIFT_;
394        if (m_nextBlock_ == currentBlock &&
395            (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) {
396            // the block is the same as the previous one, filled with
397            // currentValue
398            m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
399        }
400        else if (m_nextBlock_ == m_trie_.m_dataOffset_) {
401            // this is the all-initial-value block
402            if (currentValue != m_initialValue_) {
403                m_nextValue_      = m_initialValue_;
404                m_nextBlockIndex_ = 0;
405                return false;
406            }
407            m_nextCodepoint_ += DATA_BLOCK_LENGTH_;
408        }
409        else {
410            if (!checkBlockDetail(currentValue)) {
411                return false;
412            }
413        }
414        return true;
415    }
416
417    /**
418    * Internal block value calculations
419    * Performs calculations on multiple data blocks for a set of trail
420    * surrogates to find codepoints in m_nextBlock_ that has the same value.
421    * Will call checkBlock() for internal block checks.
422    * Note m_*_ variables at this point is the next codepoint whose value
423    * has not been calculated.
424    * @param currentValue the value which other codepoints are tested against
425    * @return true if the whole block has the same value as currentValue or if
426    *              the whole block has been calculated, false otherwise.
427    */
428    private final boolean checkTrailBlock(int currentValue)
429    {
430        // enumerate code points for this lead surrogate
431        while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_)
432        {
433            // if we ever reach here, we are at the start of a new block
434            m_nextBlockIndex_ = 0;
435            // copy of most of the body of the BMP loop
436            if (!checkBlock(currentValue)) {
437                return false;
438            }
439            m_nextTrailIndexOffset_ ++;
440            m_nextIndex_ ++;
441        }
442        return true;
443    }
444
445    /**
446    * Checks if we are beginning at the start of a initial block.
447    * If we are then the rest of the codepoints in this initial block
448    * has the same values.
449    * We increment m_nextCodepoint_ and relevant data members if so.
450    * This is used only in for the supplementary codepoints because
451    * the offset to the trail indexes could be 0.
452    * @return true if we are at the start of a initial block.
453    */
454    private final boolean checkNullNextTrailIndex()
455    {
456        if (m_nextIndex_ <= 0) {
457            m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1;
458            int nextLead  = UTF16.getLeadSurrogate(m_nextCodepoint_);
459            int leadBlock =
460                   m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] <<
461                                                   Trie.INDEX_STAGE_2_SHIFT_;
462            if (m_trie_.m_dataManipulate_ == null) {
463                throw new NullPointerException(
464                            "The field DataManipulate in this Trie is null");
465            }
466            m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset(
467                               m_trie_.getValue(leadBlock +
468                                   (nextLead & Trie.INDEX_STAGE_3_MASK_)));
469            m_nextIndex_ --;
470            m_nextBlockIndex_ =  DATA_BLOCK_LENGTH_;
471            return true;
472        }
473        return false;
474    }
475
476    // private data members --------------------------------------------
477
478    /**
479    * Size of the stage 1 BMP indexes
480    */
481    private static final int BMP_INDEX_LENGTH_ =
482                                        0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
483    /**
484    * Lead surrogate minimum value
485    */
486    private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
487    /**
488    * Trail surrogate minimum value
489    */
490    private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
491    /*
492    * Trail surrogate maximum value
493    */
494    //private static final int TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;
495    /**
496    * Number of trail surrogate
497    */
498    private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
499    /**
500    * Number of stage 1 indexes for supplementary calculations that maps to
501    * each lead surrogate character.
502    * See second pass into getRawOffset for the trail surrogate character.
503    * 10 for significant number of bits for trail surrogates, 5 for what we
504    * discard during shifting.
505    */
506    private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ =
507                                    1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
508    /**
509    * Number of data values in a stage 2 (data array) block.
510    */
511    private static final int DATA_BLOCK_LENGTH_ =
512                                              1 << Trie.INDEX_STAGE_1_SHIFT_;
513//    /**
514//    * Number of codepoints in a stage 2 block
515//    */
516//    private static final int DATA_BLOCK_SUPPLEMENTARY_LENGTH_ =
517//                                                     DATA_BLOCK_LENGTH_ << 10;
518    /**
519    * Trie instance
520    */
521    private Trie m_trie_;
522    /**
523    * Initial value for trie values
524    */
525    private int m_initialValue_;
526    /**
527    * Next element results and data.
528    */
529    private int m_currentCodepoint_;
530    private int m_nextCodepoint_;
531    private int m_nextValue_;
532    private int m_nextIndex_;
533    private int m_nextBlock_;
534    private int m_nextBlockIndex_;
535    private int m_nextTrailIndexOffset_;
536}
537