1/*
2*******************************************************************************
3*   Copyright (C) 2011-2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   created on: 2011jan06
7*   created by: Markus W. Scherer
8*   ported from ICU4C ucharstrie.h/.cpp
9*/
10
11package com.ibm.icu.util;
12
13import java.io.IOException;
14import java.util.ArrayList;
15import java.util.NoSuchElementException;
16
17import com.ibm.icu.text.UTF16;
18import com.ibm.icu.util.BytesTrie.Result;
19
20/**
21 * Light-weight, non-const reader class for a CharsTrie.
22 * Traverses a char-serialized data structure with minimal state,
23 * for mapping strings (16-bit-unit sequences) to non-negative integer values.
24 *
25 * <p>This class is not intended for public subclassing.
26 *
27 * @stable ICU 4.8
28 * @author Markus W. Scherer
29 */
30public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> {
31    /**
32     * Constructs a CharsTrie reader instance.
33     *
34     * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder,
35     * with the offset indicating the first char of that sequence.
36     * The CharsTrie object will not read more chars than
37     * the CharsTrieBuilder generated in the corresponding build() call.
38     *
39     * <p>The CharSequence is not copied/cloned and must not be modified while
40     * the CharsTrie object is in use.
41     *
42     * @param trieChars CharSequence that contains the serialized trie.
43     * @param offset Root offset of the trie in the CharSequence.
44     * @stable ICU 4.8
45     */
46    public CharsTrie(CharSequence trieChars, int offset) {
47        chars_=trieChars;
48        pos_=root_=offset;
49        remainingMatchLength_=-1;
50    }
51
52    /**
53     * Clones this trie reader object and its state,
54     * but not the char array which will be shared.
55     * @return A shallow clone of this trie.
56     * @stable ICU 4.8
57     */
58    @Override
59    public Object clone() throws CloneNotSupportedException {
60        return super.clone();  // A shallow copy is just what we need.
61    }
62
63    /**
64     * Resets this trie to its initial state.
65     * @return this
66     * @stable ICU 4.8
67     */
68    public CharsTrie reset() {
69        pos_=root_;
70        remainingMatchLength_=-1;
71        return this;
72    }
73
74    /**
75     * CharsTrie state object, for saving a trie's current state
76     * and resetting the trie back to this state later.
77     * @stable ICU 4.8
78     */
79    public static final class State {
80        /**
81         * Constructs an empty State.
82         * @stable ICU 4.8
83         */
84        public State() {}
85        private CharSequence chars;
86        private int root;
87        private int pos;
88        private int remainingMatchLength;
89    }
90
91    /**
92     * Saves the state of this trie.
93     * @param state The State object to hold the trie's state.
94     * @return this
95     * @see #resetToState
96     * @stable ICU 4.8
97     */
98    public CharsTrie saveState(State state) /*const*/ {
99        state.chars=chars_;
100        state.root=root_;
101        state.pos=pos_;
102        state.remainingMatchLength=remainingMatchLength_;
103        return this;
104    }
105
106    /**
107     * Resets this trie to the saved state.
108     * @param state The State object which holds a saved trie state.
109     * @return this
110     * @throws IllegalArgumentException if the state object contains no state,
111     *         or the state of a different trie
112     * @see #saveState
113     * @see #reset
114     * @stable ICU 4.8
115     */
116    public CharsTrie resetToState(State state) {
117        if(chars_==state.chars && chars_!=null && root_==state.root) {
118            pos_=state.pos;
119            remainingMatchLength_=state.remainingMatchLength;
120        } else {
121            throw new IllegalArgumentException("incompatible trie state");
122        }
123        return this;
124    }
125
126    /**
127     * Determines whether the string so far matches, whether it has a value,
128     * and whether another input char can continue a matching string.
129     * @return The match/value Result.
130     * @stable ICU 4.8
131     */
132    public Result current() /*const*/ {
133        int pos=pos_;
134        if(pos<0) {
135            return Result.NO_MATCH;
136        } else {
137            int node;
138            return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
139                    valueResults_[node>>15] : Result.NO_VALUE;
140        }
141    }
142
143    /**
144     * Traverses the trie from the initial state for this input char.
145     * Equivalent to reset().next(inUnit).
146     * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
147     * @return The match/value Result.
148     * @stable ICU 4.8
149     */
150    public Result first(int inUnit) {
151        remainingMatchLength_=-1;
152        return nextImpl(root_, inUnit);
153    }
154
155    /**
156     * Traverses the trie from the initial state for the
157     * one or two UTF-16 code units for this input code point.
158     * Equivalent to reset().nextForCodePoint(cp).
159     * @param cp A Unicode code point 0..0x10ffff.
160     * @return The match/value Result.
161     * @stable ICU 4.8
162     */
163    public Result firstForCodePoint(int cp) {
164        return cp<=0xffff ?
165            first(cp) :
166            (first(UTF16.getLeadSurrogate(cp)).hasNext() ?
167                next(UTF16.getTrailSurrogate(cp)) :
168                Result.NO_MATCH);
169    }
170
171    /**
172     * Traverses the trie from the current state for this input char.
173     * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
174     * @return The match/value Result.
175     * @stable ICU 4.8
176     */
177    public Result next(int inUnit) {
178        int pos=pos_;
179        if(pos<0) {
180            return Result.NO_MATCH;
181        }
182        int length=remainingMatchLength_;  // Actual remaining match length minus 1.
183        if(length>=0) {
184            // Remaining part of a linear-match node.
185            if(inUnit==chars_.charAt(pos++)) {
186                remainingMatchLength_=--length;
187                pos_=pos;
188                int node;
189                return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
190                        valueResults_[node>>15] : Result.NO_VALUE;
191            } else {
192                stop();
193                return Result.NO_MATCH;
194            }
195        }
196        return nextImpl(pos, inUnit);
197    }
198
199    /**
200     * Traverses the trie from the current state for the
201     * one or two UTF-16 code units for this input code point.
202     * @param cp A Unicode code point 0..0x10ffff.
203     * @return The match/value Result.
204     * @stable ICU 4.8
205     */
206    public Result nextForCodePoint(int cp) {
207        return cp<=0xffff ?
208            next(cp) :
209            (next(UTF16.getLeadSurrogate(cp)).hasNext() ?
210                next(UTF16.getTrailSurrogate(cp)) :
211                Result.NO_MATCH);
212    }
213
214    /**
215     * Traverses the trie from the current state for this string.
216     * Equivalent to
217     * <pre>
218     * Result result=current();
219     * for(each c in s)
220     *   if(!result.hasNext()) return Result.NO_MATCH;
221     *   result=next(c);
222     * return result;
223     * </pre>
224     * @param s Contains a string.
225     * @param sIndex The start index of the string in s.
226     * @param sLimit The (exclusive) end index of the string in s.
227     * @return The match/value Result.
228     * @stable ICU 4.8
229     */
230    public Result next(CharSequence s, int sIndex, int sLimit) {
231        if(sIndex>=sLimit) {
232            // Empty input.
233            return current();
234        }
235        int pos=pos_;
236        if(pos<0) {
237            return Result.NO_MATCH;
238        }
239        int length=remainingMatchLength_;  // Actual remaining match length minus 1.
240        for(;;) {
241            // Fetch the next input unit, if there is one.
242            // Continue a linear-match node.
243            char inUnit;
244            for(;;) {
245                if(sIndex==sLimit) {
246                    remainingMatchLength_=length;
247                    pos_=pos;
248                    int node;
249                    return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
250                            valueResults_[node>>15] : Result.NO_VALUE;
251                }
252                inUnit=s.charAt(sIndex++);
253                if(length<0) {
254                    remainingMatchLength_=length;
255                    break;
256                }
257                if(inUnit!=chars_.charAt(pos)) {
258                    stop();
259                    return Result.NO_MATCH;
260                }
261                ++pos;
262                --length;
263            }
264            int node=chars_.charAt(pos++);
265            for(;;) {
266                if(node<kMinLinearMatch) {
267                    Result result=branchNext(pos, node, inUnit);
268                    if(result==Result.NO_MATCH) {
269                        return Result.NO_MATCH;
270                    }
271                    // Fetch the next input unit, if there is one.
272                    if(sIndex==sLimit) {
273                        return result;
274                    }
275                    if(result==Result.FINAL_VALUE) {
276                        // No further matching units.
277                        stop();
278                        return Result.NO_MATCH;
279                    }
280                    inUnit=s.charAt(sIndex++);
281                    pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
282                    node=chars_.charAt(pos++);
283                } else if(node<kMinValueLead) {
284                    // Match length+1 units.
285                    length=node-kMinLinearMatch;  // Actual match length minus 1.
286                    if(inUnit!=chars_.charAt(pos)) {
287                        stop();
288                        return Result.NO_MATCH;
289                    }
290                    ++pos;
291                    --length;
292                    break;
293                } else if((node&kValueIsFinal)!=0) {
294                    // No further matching units.
295                    stop();
296                    return Result.NO_MATCH;
297                } else {
298                    // Skip intermediate value.
299                    pos=skipNodeValue(pos, node);
300                    node&=kNodeTypeMask;
301                }
302            }
303        }
304    }
305
306    /**
307     * Returns a matching string's value if called immediately after
308     * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE.
309     * getValue() can be called multiple times.
310     *
311     * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
312     * @return The value for the string so far.
313     * @stable ICU 4.8
314     */
315    public int getValue() /*const*/ {
316        int pos=pos_;
317        int leadUnit=chars_.charAt(pos++);
318        assert(leadUnit>=kMinValueLead);
319        return (leadUnit&kValueIsFinal)!=0 ?
320            readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit);
321    }
322
323    /**
324     * Determines whether all strings reachable from the current state
325     * map to the same value, and if so, returns that value.
326     * @return The unique value in bits 32..1 with bit 0 set,
327     *         if all strings reachable from the current state
328     *         map to the same value; otherwise returns 0.
329     * @stable ICU 4.8
330     */
331    public long getUniqueValue() /*const*/ {
332        int pos=pos_;
333        if(pos<0) {
334            return 0;
335        }
336        // Skip the rest of a pending linear-match node.
337        long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0);
338        // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
339        return (uniqueValue<<31)>>31;
340    }
341
342    /**
343     * Finds each char which continues the string from the current state.
344     * That is, each char c for which it would be next(c)!=Result.NO_MATCH now.
345     * @param out Each next char is appended to this object.
346     *            (Only uses the out.append(c) method.)
347     * @return The number of chars which continue the string from here.
348     * @stable ICU 4.8
349     */
350    public int getNextChars(Appendable out) /*const*/ {
351        int pos=pos_;
352        if(pos<0) {
353            return 0;
354        }
355        if(remainingMatchLength_>=0) {
356            append(out, chars_.charAt(pos));  // Next unit of a pending linear-match node.
357            return 1;
358        }
359        int node=chars_.charAt(pos++);
360        if(node>=kMinValueLead) {
361            if((node&kValueIsFinal)!=0) {
362                return 0;
363            } else {
364                pos=skipNodeValue(pos, node);
365                node&=kNodeTypeMask;
366            }
367        }
368        if(node<kMinLinearMatch) {
369            if(node==0) {
370                node=chars_.charAt(pos++);
371            }
372            getNextBranchChars(chars_, pos, ++node, out);
373            return node;
374        } else {
375            // First unit of the linear-match node.
376            append(out, chars_.charAt(pos));
377            return 1;
378        }
379    }
380
381    /**
382     * Iterates from the current state of this trie.
383     * @return A new CharsTrie.Iterator.
384     * @stable ICU 4.8
385     */
386    public Iterator iterator() {
387        return new Iterator(chars_, pos_, remainingMatchLength_, 0);
388    }
389
390    /**
391     * Iterates from the current state of this trie.
392     * @param maxStringLength If 0, the iterator returns full strings.
393     *                        Otherwise, the iterator returns strings with this maximum length.
394     * @return A new CharsTrie.Iterator.
395     * @stable ICU 4.8
396     */
397    public Iterator iterator(int maxStringLength) {
398        return new Iterator(chars_, pos_, remainingMatchLength_, maxStringLength);
399    }
400
401    /**
402     * Iterates from the root of a char-serialized BytesTrie.
403     * @param trieChars CharSequence that contains the serialized trie.
404     * @param offset Root offset of the trie in the CharSequence.
405     * @param maxStringLength If 0, the iterator returns full strings.
406     *                        Otherwise, the iterator returns strings with this maximum length.
407     * @return A new CharsTrie.Iterator.
408     * @stable ICU 4.8
409     */
410    public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) {
411        return new Iterator(trieChars, offset, -1, maxStringLength);
412    }
413
414    /**
415     * Return value type for the Iterator.
416     * @stable ICU 4.8
417     */
418    public static final class Entry {
419        /**
420         * The string.
421         * @stable ICU 4.8
422         */
423        public CharSequence chars;
424        /**
425         * The value associated with the string.
426         * @stable ICU 4.8
427         */
428        public int value;
429
430        private Entry() {
431        }
432    }
433
434    /**
435     * Iterator for all of the (string, value) pairs in a CharsTrie.
436     * @stable ICU 4.8
437     */
438    public static final class Iterator implements java.util.Iterator<Entry> {
439        private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) {
440            chars_=trieChars;
441            pos_=initialPos_=offset;
442            remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength;
443            maxLength_=maxStringLength;
444            int length=remainingMatchLength_;  // Actual remaining match length minus 1.
445            if(length>=0) {
446                // Pending linear-match node, append remaining bytes to str_.
447                ++length;
448                if(maxLength_>0 && length>maxLength_) {
449                    length=maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
450                }
451                str_.append(chars_, pos_, pos_+length);
452                pos_+=length;
453                remainingMatchLength_-=length;
454            }
455        }
456
457        /**
458         * Resets this iterator to its initial state.
459         * @return this
460         * @stable ICU 4.8
461         */
462        public Iterator reset() {
463            pos_=initialPos_;
464            remainingMatchLength_=initialRemainingMatchLength_;
465            skipValue_=false;
466            int length=remainingMatchLength_+1;  // Remaining match length.
467            if(maxLength_>0 && length>maxLength_) {
468                length=maxLength_;
469            }
470            str_.setLength(length);
471            pos_+=length;
472            remainingMatchLength_-=length;
473            stack_.clear();
474            return this;
475        }
476
477        /**
478         * @return true if there are more elements.
479         * @stable ICU 4.8
480         */
481        public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); }
482
483        /**
484         * Finds the next (string, value) pair if there is one.
485         *
486         * If the string is truncated to the maximum length and does not
487         * have a real value, then the value is set to -1.
488         * In this case, this "not a real value" is indistinguishable from
489         * a real value of -1.
490         * @return An Entry with the string and value of the next element.
491         * @throws NoSuchElementException - iteration has no more elements.
492         * @stable ICU 4.8
493         */
494        public Entry next() {
495            int pos=pos_;
496            if(pos<0) {
497                if(stack_.isEmpty()) {
498                    throw new NoSuchElementException();
499                }
500                // Pop the state off the stack and continue with the next outbound edge of
501                // the branch node.
502                long top=stack_.remove(stack_.size()-1);
503                int length=(int)top;
504                pos=(int)(top>>32);
505                str_.setLength(length&0xffff);
506                length>>>=16;
507                if(length>1) {
508                    pos=branchNext(pos, length);
509                    if(pos<0) {
510                        return entry_;  // Reached a final value.
511                    }
512                } else {
513                    str_.append(chars_.charAt(pos++));
514                }
515            }
516            if(remainingMatchLength_>=0) {
517                // We only get here if we started in a pending linear-match node
518                // with more than maxLength remaining units.
519                return truncateAndStop();
520            }
521            for(;;) {
522                int node=chars_.charAt(pos++);
523                if(node>=kMinValueLead) {
524                    if(skipValue_) {
525                        pos=skipNodeValue(pos, node);
526                        node&=kNodeTypeMask;
527                        skipValue_=false;
528                    } else {
529                        // Deliver value for the string so far.
530                        boolean isFinal=(node&kValueIsFinal)!=0;
531                        if(isFinal) {
532                            entry_.value=readValue(chars_, pos, node&0x7fff);
533                        } else {
534                            entry_.value=readNodeValue(chars_, pos, node);
535                        }
536                        if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
537                            pos_=-1;
538                        } else {
539                            // We cannot skip the value right here because it shares its
540                            // lead unit with a match node which we have to evaluate
541                            // next time.
542                            // Instead, keep pos_ on the node lead unit itself.
543                            pos_=pos-1;
544                            skipValue_=true;
545                        }
546                        entry_.chars=str_;
547                        return entry_;
548                    }
549                }
550                if(maxLength_>0 && str_.length()==maxLength_) {
551                    return truncateAndStop();
552                }
553                if(node<kMinLinearMatch) {
554                    if(node==0) {
555                        node=chars_.charAt(pos++);
556                    }
557                    pos=branchNext(pos, node+1);
558                    if(pos<0) {
559                        return entry_;  // Reached a final value.
560                    }
561                } else {
562                    // Linear-match node, append length units to str_.
563                    int length=node-kMinLinearMatch+1;
564                    if(maxLength_>0 && str_.length()+length>maxLength_) {
565                        str_.append(chars_, pos, pos+maxLength_-str_.length());
566                        return truncateAndStop();
567                    }
568                    str_.append(chars_, pos, pos+length);
569                    pos+=length;
570                }
571            }
572        }
573
574        /**
575         * Iterator.remove() is not supported.
576         * @throws UnsupportedOperationException (always)
577         * @stable ICU 4.8
578         */
579        public void remove() {
580            throw new UnsupportedOperationException();
581        }
582
583        private Entry truncateAndStop() {
584            pos_=-1;
585            // We reset entry_.chars every time we return entry_
586            // just because the caller might have modified the Entry.
587            entry_.chars=str_;
588            entry_.value=-1;  // no real value for str
589            return entry_;
590        }
591
592        private int branchNext(int pos, int length) {
593            while(length>kMaxBranchLinearSubNodeLength) {
594                ++pos;  // ignore the comparison unit
595                // Push state for the greater-or-equal edge.
596                stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length());
597                // Follow the less-than edge.
598                length>>=1;
599                pos=jumpByDelta(chars_, pos);
600            }
601            // List of key-value pairs where values are either final values or jump deltas.
602            // Read the first (key, value) pair.
603            char trieUnit=chars_.charAt(pos++);
604            int node=chars_.charAt(pos++);
605            boolean isFinal=(node&kValueIsFinal)!=0;
606            int value=readValue(chars_, pos, node&=0x7fff);
607            pos=skipValue(pos, node);
608            stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length());
609            str_.append(trieUnit);
610            if(isFinal) {
611                pos_=-1;
612                entry_.chars=str_;
613                entry_.value=value;
614                return -1;
615            } else {
616                return pos+value;
617            }
618        }
619
620        private CharSequence chars_;
621        private int pos_;
622        private int initialPos_;
623        private int remainingMatchLength_;
624        private int initialRemainingMatchLength_;
625        private boolean skipValue_;  // Skip intermediate value which was already delivered.
626
627        private StringBuilder str_=new StringBuilder();
628        private int maxLength_;
629        private Entry entry_=new Entry();
630
631        // The stack stores longs for backtracking to another
632        // outbound edge of a branch node.
633        // Each long has the offset in chars_ in bits 62..32,
634        // the str_.length() from before the node in bits 15..0,
635        // and the remaining branch length in bits 31..16.
636        // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31,
637        // but the code looks more confusing that way.)
638        private ArrayList<Long> stack_=new ArrayList<Long>();
639    }
640
641    private void stop() {
642        pos_=-1;
643    }
644
645    // Reads a compact 32-bit integer.
646    // pos is already after the leadUnit, and the lead unit has bit 15 reset.
647    private static int readValue(CharSequence chars, int pos, int leadUnit) {
648        int value;
649        if(leadUnit<kMinTwoUnitValueLead) {
650            value=leadUnit;
651        } else if(leadUnit<kThreeUnitValueLead) {
652            value=((leadUnit-kMinTwoUnitValueLead)<<16)|chars.charAt(pos);
653        } else {
654            value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
655        }
656        return value;
657    }
658    private static int skipValue(int pos, int leadUnit) {
659        if(leadUnit>=kMinTwoUnitValueLead) {
660            if(leadUnit<kThreeUnitValueLead) {
661                ++pos;
662            } else {
663                pos+=2;
664            }
665        }
666        return pos;
667    }
668    private static int skipValue(CharSequence chars, int pos) {
669        int leadUnit=chars.charAt(pos++);
670        return skipValue(pos, leadUnit&0x7fff);
671    }
672
673    private static int readNodeValue(CharSequence chars, int pos, int leadUnit) {
674        assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
675        int value;
676        if(leadUnit<kMinTwoUnitNodeValueLead) {
677            value=(leadUnit>>6)-1;
678        } else if(leadUnit<kThreeUnitNodeValueLead) {
679            value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|chars.charAt(pos);
680        } else {
681            value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
682        }
683        return value;
684    }
685    private static int skipNodeValue(int pos, int leadUnit) {
686        assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
687        if(leadUnit>=kMinTwoUnitNodeValueLead) {
688            if(leadUnit<kThreeUnitNodeValueLead) {
689                ++pos;
690            } else {
691                pos+=2;
692            }
693        }
694        return pos;
695    }
696
697    private static int jumpByDelta(CharSequence chars, int pos) {
698        int delta=chars.charAt(pos++);
699        if(delta>=kMinTwoUnitDeltaLead) {
700            if(delta==kThreeUnitDeltaLead) {
701                delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
702                pos+=2;
703            } else {
704                delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++);
705            }
706        }
707        return pos+delta;
708    }
709
710    private static int skipDelta(CharSequence chars, int pos) {
711        int delta=chars.charAt(pos++);
712        if(delta>=kMinTwoUnitDeltaLead) {
713            if(delta==kThreeUnitDeltaLead) {
714                pos+=2;
715            } else {
716                ++pos;
717            }
718        }
719        return pos;
720    }
721
722    private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };
723
724    // Handles a branch node for both next(unit) and next(string).
725    private Result branchNext(int pos, int length, int inUnit) {
726        // Branch according to the current unit.
727        if(length==0) {
728            length=chars_.charAt(pos++);
729        }
730        ++length;
731        // The length of the branch is the number of units to select from.
732        // The data structure encodes a binary search.
733        while(length>kMaxBranchLinearSubNodeLength) {
734            if(inUnit<chars_.charAt(pos++)) {
735                length>>=1;
736                pos=jumpByDelta(chars_, pos);
737            } else {
738                length=length-(length>>1);
739                pos=skipDelta(chars_, pos);
740            }
741        }
742        // Drop down to linear search for the last few units.
743        // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
744        // and divides length by 2.
745        do {
746            if(inUnit==chars_.charAt(pos++)) {
747                Result result;
748                int node=chars_.charAt(pos);
749                if((node&kValueIsFinal)!=0) {
750                    // Leave the final value for getValue() to read.
751                    result=Result.FINAL_VALUE;
752                } else {
753                    // Use the non-final value as the jump delta.
754                    ++pos;
755                    // int delta=readValue(pos, node);
756                    int delta;
757                    if(node<kMinTwoUnitValueLead) {
758                        delta=node;
759                    } else if(node<kThreeUnitValueLead) {
760                        delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++);
761                    } else {
762                        delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1);
763                        pos+=2;
764                    }
765                    // end readValue()
766                    pos+=delta;
767                    node=chars_.charAt(pos);
768                    result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
769                }
770                pos_=pos;
771                return result;
772            }
773            --length;
774            pos=skipValue(chars_, pos);
775        } while(length>1);
776        if(inUnit==chars_.charAt(pos++)) {
777            pos_=pos;
778            int node=chars_.charAt(pos);
779            return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
780        } else {
781            stop();
782            return Result.NO_MATCH;
783        }
784    }
785
786    // Requires remainingLength_<0.
787    private Result nextImpl(int pos, int inUnit) {
788        int node=chars_.charAt(pos++);
789        for(;;) {
790            if(node<kMinLinearMatch) {
791                return branchNext(pos, node, inUnit);
792            } else if(node<kMinValueLead) {
793                // Match the first of length+1 units.
794                int length=node-kMinLinearMatch;  // Actual match length minus 1.
795                if(inUnit==chars_.charAt(pos++)) {
796                    remainingMatchLength_=--length;
797                    pos_=pos;
798                    return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
799                            valueResults_[node>>15] : Result.NO_VALUE;
800                } else {
801                    // No match.
802                    break;
803                }
804            } else if((node&kValueIsFinal)!=0) {
805                // No further matching units.
806                break;
807            } else {
808                // Skip intermediate value.
809                pos=skipNodeValue(pos, node);
810                node&=kNodeTypeMask;
811            }
812        }
813        stop();
814        return Result.NO_MATCH;
815    }
816
817    // Helper functions for getUniqueValue().
818    // Recursively finds a unique value (or whether there is not a unique one)
819    // from a branch.
820    // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
821    // On return, if not 0, then bits 63..33 contain the updated non-negative pos.
822    private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length,
823                                                  long uniqueValue) {
824        while(length>kMaxBranchLinearSubNodeLength) {
825            ++pos;  // ignore the comparison unit
826            uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue);
827            if(uniqueValue==0) {
828                return 0;
829            }
830            length=length-(length>>1);
831            pos=skipDelta(chars, pos);
832        }
833        do {
834            ++pos;  // ignore a comparison unit
835            // handle its value
836            int node=chars.charAt(pos++);
837            boolean isFinal=(node&kValueIsFinal)!=0;
838            node&=0x7fff;
839            int value=readValue(chars, pos, node);
840            pos=skipValue(pos, node);
841            if(isFinal) {
842                if(uniqueValue!=0) {
843                    if(value!=(int)(uniqueValue>>1)) {
844                        return 0;
845                    }
846                } else {
847                    uniqueValue=((long)value<<1)|1;
848                }
849            } else {
850                uniqueValue=findUniqueValue(chars, pos+value, uniqueValue);
851                if(uniqueValue==0) {
852                    return 0;
853                }
854            }
855        } while(--length>1);
856        // ignore the last comparison byte
857        return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
858    }
859    // Recursively finds a unique value (or whether there is not a unique one)
860    // starting from a position on a node lead unit.
861    // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
862    // Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
863    private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) {
864        int node=chars.charAt(pos++);
865        for(;;) {
866            if(node<kMinLinearMatch) {
867                if(node==0) {
868                    node=chars.charAt(pos++);
869                }
870                uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue);
871                if(uniqueValue==0) {
872                    return 0;
873                }
874                pos=(int)(uniqueValue>>>33);
875                node=chars.charAt(pos++);
876            } else if(node<kMinValueLead) {
877                // linear-match node
878                pos+=node-kMinLinearMatch+1;  // Ignore the match units.
879                node=chars.charAt(pos++);
880            } else {
881                boolean isFinal=(node&kValueIsFinal)!=0;
882                int value;
883                if(isFinal) {
884                    value=readValue(chars, pos, node&0x7fff);
885                } else {
886                    value=readNodeValue(chars, pos, node);
887                }
888                if(uniqueValue!=0) {
889                    if(value!=(int)(uniqueValue>>1)) {
890                        return 0;
891                    }
892                } else {
893                    uniqueValue=((long)value<<1)|1;
894                }
895                if(isFinal) {
896                    return uniqueValue;
897                }
898                pos=skipNodeValue(pos, node);
899                node&=kNodeTypeMask;
900            }
901        }
902    }
903
904    // Helper functions for getNextChars().
905    // getNextChars() when pos is on a branch node.
906    private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) {
907        while(length>kMaxBranchLinearSubNodeLength) {
908            ++pos;  // ignore the comparison unit
909            getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out);
910            length=length-(length>>1);
911            pos=skipDelta(chars, pos);
912        }
913        do {
914            append(out, chars.charAt(pos++));
915            pos=skipValue(chars, pos);
916        } while(--length>1);
917        append(out, chars.charAt(pos));
918    }
919    private static void append(Appendable out, int c) {
920        try {
921            out.append((char)c);
922        } catch(IOException e) {
923            throw new ICUUncheckedIOException(e);
924        }
925    }
926
927    // CharsTrie data structure
928    //
929    // The trie consists of a series of char-serialized nodes for incremental
930    // Unicode string/char sequence matching. (char=16-bit unsigned integer)
931    // The root node is at the beginning of the trie data.
932    //
933    // Types of nodes are distinguished by their node lead unit ranges.
934    // After each node, except a final-value node, another node follows to
935    // encode match values or continue matching further units.
936    //
937    // Node types:
938    //  - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
939    //    The value is for the string/char sequence so far.
940    //  - Match node, optionally with an intermediate value in a different compact format.
941    //    The value, if present, is for the string/char sequence so far.
942    //
943    //  Aside from the value, which uses the node lead unit's high bits:
944    //
945    //  - Linear-match node: Matches a number of units.
946    //  - Branch node: Branches to other nodes according to the current input unit.
947    //    The node unit is the length of the branch (number of units to select from)
948    //    minus 1. It is followed by a sub-node:
949    //    - If the length is at most kMaxBranchLinearSubNodeLength, then
950    //      there are length-1 (key, value) pairs and then one more comparison unit.
951    //      If one of the key units matches, then the value is either a final value for
952    //      the string so far, or a "jump" delta to the next node.
953    //      If the last unit matches, then matching continues with the next node.
954    //      (Values have the same encoding as final-value nodes.)
955    //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
956    //      there is one unit and one "jump" delta.
957    //      If the input unit is less than the sub-node unit, then "jump" by delta to
958    //      the next sub-node which will have a length of length/2.
959    //      (The delta has its own compact encoding.)
960    //      Otherwise, skip the "jump" delta to the next sub-node
961    //      which will have a length of length-length/2.
962
963    // Match-node lead unit values, after masking off intermediate-value bits:
964
965    // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
966    // the length is one more than the next unit.
967
968    // For a branch sub-node with at most this many entries, we drop down
969    // to a linear search.
970    /*package*/ static final int kMaxBranchLinearSubNodeLength=5;
971
972    // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
973    /*package*/ static final int kMinLinearMatch=0x30;
974    /*package*/ static final int kMaxLinearMatchLength=0x10;
975
976    // Match-node lead unit bits 14..6 for the optional intermediate value.
977    // If these bits are 0, then there is no intermediate value.
978    // Otherwise, see the *NodeValue* constants below.
979    /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x0040
980    /*package*/ static final int kNodeTypeMask=kMinValueLead-1;  // 0x003f
981
982    // A final-value node has bit 15 set.
983    /*package*/ static final int kValueIsFinal=0x8000;
984
985    // Compact value: After testing and masking off bit 15, use the following thresholds.
986    /*package*/ static final int kMaxOneUnitValue=0x3fff;
987
988    /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1;  // 0x4000
989    /*package*/ static final int kThreeUnitValueLead=0x7fff;
990
991    /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1;  // 0x3ffeffff
992
993    // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
994    /*package*/ static final int kMaxOneUnitNodeValue=0xff;
995    /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6);  // 0x4040
996    /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0;
997
998    /*package*/ static final int kMaxTwoUnitNodeValue=
999        ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1;  // 0xfdffff
1000
1001    // Compact delta integers.
1002    /*package*/ static final int kMaxOneUnitDelta=0xfbff;
1003    /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1;  // 0xfc00
1004    /*package*/ static final int kThreeUnitDeltaLead=0xffff;
1005
1006    /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1;  // 0x03feffff
1007
1008    // Fixed value referencing the CharsTrie words.
1009    private CharSequence chars_;
1010    private int root_;
1011
1012    // Iterator variables.
1013
1014    // Pointer to next trie unit to read. NULL if no more matches.
1015    private int pos_;
1016    // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
1017    private int remainingMatchLength_;
1018}
1019