1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5*******************************************************************************
6* Copyright (C) 2010-2014, International Business Machines
7* Corporation and others.  All Rights Reserved.
8*******************************************************************************
9* CollationIterator.java, ported from collationiterator.h/.cpp
10*
11* C++ version created on: 2010oct27
12* created by: Markus W. Scherer
13*/
14
15package android.icu.impl.coll;
16
17import android.icu.impl.Normalizer2Impl.Hangul;
18import android.icu.impl.Trie2_32;
19import android.icu.util.BytesTrie;
20import android.icu.util.CharsTrie;
21import android.icu.util.ICUException;
22
23/**
24 * Collation element iterator and abstract character iterator.
25 *
26 * When a method returns a code point value, it must be in 0..10FFFF,
27 * except it can be negative as a sentinel value.
28 * @hide Only a subset of ICU is exposed in Android
29 */
30public abstract class CollationIterator {
31    private static final class CEBuffer {
32        /** Large enough for CEs of most short strings. */
33        private static final int INITIAL_CAPACITY = 40;
34
35        CEBuffer() {}
36
37        void append(long ce) {
38            if(length >= INITIAL_CAPACITY) {
39                ensureAppendCapacity(1);
40            }
41            buffer[length++] = ce;
42        }
43
44        void appendUnsafe(long ce) {
45            buffer[length++] = ce;
46        }
47
48        void ensureAppendCapacity(int appCap) {
49            int capacity = buffer.length;
50            if((length + appCap) <= capacity) { return; }
51            do {
52                if(capacity < 1000) {
53                    capacity *= 4;
54                } else {
55                    capacity *= 2;
56                }
57            } while(capacity < (length + appCap));
58            long[] newBuffer = new long[capacity];
59            System.arraycopy(buffer, 0, newBuffer, 0, length);
60            buffer = newBuffer;
61        }
62
63        void incLength() {
64            // Use INITIAL_CAPACITY for a very simple fastpath.
65            // (Rather than buffer.getCapacity().)
66            if(length >= INITIAL_CAPACITY) {
67                ensureAppendCapacity(1);
68            }
69            ++length;
70        }
71
72        long set(int i, long ce) {
73            return buffer[i] = ce;
74        }
75        long get(int i) { return buffer[i]; }
76
77        long[] getCEs() { return buffer; }
78
79        int length = 0;
80
81        private long[] buffer = new long[INITIAL_CAPACITY];
82    }
83
84    // State of combining marks skipped in discontiguous contraction.
85    // We create a state object on first use and keep it around deactivated between uses.
86    private static final class SkippedState {
87        // Born active but empty.
88        SkippedState() {}
89        void clear() {
90            oldBuffer.setLength(0);
91            pos = 0;
92            // The newBuffer is reset by setFirstSkipped().
93        }
94
95        boolean isEmpty() { return oldBuffer.length() == 0; }
96
97        boolean hasNext() { return pos < oldBuffer.length(); }
98
99        // Requires hasNext().
100        int next() {
101            int c = oldBuffer.codePointAt(pos);
102            pos += Character.charCount(c);
103            return c;
104        }
105
106        // Accounts for one more input code point read beyond the end of the marks buffer.
107        void incBeyond() {
108            assert(!hasNext());
109            ++pos;
110        }
111
112        // Goes backward through the skipped-marks buffer.
113        // Returns the number of code points read beyond the skipped marks
114        // that need to be backtracked through normal input.
115        int backwardNumCodePoints(int n) {
116            int length = oldBuffer.length();
117            int beyond = pos - length;
118            if(beyond > 0) {
119                if(beyond >= n) {
120                    // Not back far enough to re-enter the oldBuffer.
121                    pos -= n;
122                    return n;
123                } else {
124                    // Back out all beyond-oldBuffer code points and re-enter the buffer.
125                    pos = oldBuffer.offsetByCodePoints(length, beyond - n);
126                    return beyond;
127                }
128            } else {
129                // Go backwards from inside the oldBuffer.
130                pos = oldBuffer.offsetByCodePoints(pos, -n);
131                return 0;
132            }
133        }
134
135        void setFirstSkipped(int c) {
136            skipLengthAtMatch = 0;
137            newBuffer.setLength(0);
138            newBuffer.appendCodePoint(c);
139        }
140
141        void skip(int c) {
142            newBuffer.appendCodePoint(c);
143        }
144
145        void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
146
147        // Replaces the characters we consumed with the newly skipped ones.
148        void replaceMatch() {
149            // Note: UnicodeString.replace() pins pos to at most length().
150            int oldLength = oldBuffer.length();
151            if(pos > oldLength) { pos = oldLength; }
152            oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch);
153            pos = 0;
154        }
155
156        void saveTrieState(CharsTrie trie) { trie.saveState(state); }
157        void resetToTrieState(CharsTrie trie) { trie.resetToState(state); }
158
159        // Combining marks skipped in previous discontiguous-contraction matching.
160        // After that discontiguous contraction was completed, we start reading them from here.
161        private final StringBuilder oldBuffer = new StringBuilder();
162        // Combining marks newly skipped in current discontiguous-contraction matching.
163        // These might have been read from the normal text or from the oldBuffer.
164        private final StringBuilder newBuffer = new StringBuilder();
165        // Reading index in oldBuffer,
166        // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
167        private int pos;
168        // newBuffer.length() at the time of the last matching character.
169        // When a partial match fails, we back out skipped and partial-matching input characters.
170        private int skipLengthAtMatch;
171        // We save the trie state before we attempt to match a character,
172        // so that we can skip it and try the next one.
173        private CharsTrie.State state = new CharsTrie.State();
174    };
175
176    /**
177     * Partially constructs the iterator.
178     * In Java, we cache partially constructed iterators
179     * and finish their setup when starting to work on text
180     * (via reset(boolean) and the setText(numeric, ...) methods of subclasses).
181     * This avoids memory allocations for iterators that remain unused.
182     *
183     * <p>In C++, there is only one constructor, and iterators are
184     * stack-allocated as needed.
185     */
186    public CollationIterator(CollationData d) {
187        trie = d.trie;
188        data = d;
189        numCpFwd = -1;
190        isNumeric = false;
191        ceBuffer = null;
192    }
193
194    public CollationIterator(CollationData d, boolean numeric) {
195        trie = d.trie;
196        data = d;
197        numCpFwd = -1;
198        isNumeric = numeric;
199        ceBuffer = new CEBuffer();
200    }
201
202    @Override
203    public boolean equals(Object other) {
204        // Subclasses: Call this method and then add more specific checks.
205        // Compare the iterator state but not the collation data (trie & data fields):
206        // Assume that the caller compares the data.
207        // Ignore skipped since that should be unused between calls to nextCE().
208        // (It only stays around to avoid another memory allocation.)
209        if(other == null) { return false; }
210        if(!this.getClass().equals(other.getClass())) { return false; }
211        CollationIterator o = (CollationIterator)other;
212        if(!(ceBuffer.length == o.ceBuffer.length &&
213                cesIndex == o.cesIndex &&
214                numCpFwd == o.numCpFwd &&
215                isNumeric == o.isNumeric)) {
216            return false;
217        }
218        for(int i = 0; i < ceBuffer.length; ++i) {
219            if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; }
220        }
221        return true;
222    }
223
224    @Override
225    public int hashCode() {
226        // Dummy return to prevent compile warnings.
227        return 0;
228    }
229
230    /**
231     * Resets the iterator state and sets the position to the specified offset.
232     * Subclasses must implement, and must call the parent class method,
233     * or CollationIterator.reset().
234     */
235    public abstract void resetToOffset(int newOffset);
236
237    public abstract int getOffset();
238
239    /**
240     * Returns the next collation element.
241     */
242    public final long nextCE() {
243        if(cesIndex < ceBuffer.length) {
244            // Return the next buffered CE.
245            return ceBuffer.get(cesIndex++);
246        }
247        assert cesIndex == ceBuffer.length;
248        ceBuffer.incLength();
249        long cAndCE32 = handleNextCE32();
250        int c = (int)(cAndCE32 >> 32);
251        int ce32 = (int)cAndCE32;
252        int t = ce32 & 0xff;
253        if(t < Collation.SPECIAL_CE32_LOW_BYTE) {  // Forced-inline of isSpecialCE32(ce32).
254            // Normal CE from the main data.
255            // Forced-inline of ceFromSimpleCE32(ce32).
256            return ceBuffer.set(cesIndex++,
257                    ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
258        }
259        CollationData d;
260        // The compiler should be able to optimize the previous and the following
261        // comparisons of t with the same constant.
262        if(t == Collation.SPECIAL_CE32_LOW_BYTE) {
263            if(c < 0) {
264                return ceBuffer.set(cesIndex++, Collation.NO_CE);
265            }
266            d = data.base;
267            ce32 = d.getCE32(c);
268            t = ce32 & 0xff;
269            if(t < Collation.SPECIAL_CE32_LOW_BYTE) {
270                // Normal CE from the base data.
271                return ceBuffer.set(cesIndex++,
272                        ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
273            }
274        } else {
275            d = data;
276        }
277        if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) {
278            // Forced-inline of ceFromLongPrimaryCE32(ce32).
279            return ceBuffer.set(cesIndex++,
280                    ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE);
281        }
282        return nextCEFromCE32(d, c, ce32);
283    }
284
285    /**
286     * Fetches all CEs.
287     * @return getCEsLength()
288     */
289    public final int fetchCEs() {
290        while(nextCE() != Collation.NO_CE) {
291            // No need to loop for each expansion CE.
292            cesIndex = ceBuffer.length;
293        }
294        return ceBuffer.length;
295    }
296
297    /**
298     * Overwrites the current CE (the last one returned by nextCE()).
299     */
300    final void setCurrentCE(long ce) {
301        assert cesIndex > 0;
302        ceBuffer.set(cesIndex - 1, ce);
303    }
304
305    /**
306     * Returns the previous collation element.
307     */
308    public final long previousCE(UVector32 offsets) {
309        if(ceBuffer.length > 0) {
310            // Return the previous buffered CE.
311            return ceBuffer.get(--ceBuffer.length);
312        }
313        offsets.removeAllElements();
314        int limitOffset = getOffset();
315        int c = previousCodePoint();
316        if(c < 0) { return Collation.NO_CE; }
317        if(data.isUnsafeBackward(c, isNumeric)) {
318            return previousCEUnsafe(c, offsets);
319        }
320        // Simple, safe-backwards iteration:
321        // Get a CE going backwards, handle prefixes but no contractions.
322        int ce32 = data.getCE32(c);
323        CollationData d;
324        if(ce32 == Collation.FALLBACK_CE32) {
325            d = data.base;
326            ce32 = d.getCE32(c);
327        } else {
328            d = data;
329        }
330        if(Collation.isSimpleOrLongCE32(ce32)) {
331            return Collation.ceFromCE32(ce32);
332        }
333        appendCEsFromCE32(d, c, ce32, false);
334        if(ceBuffer.length > 1) {
335            offsets.addElement(getOffset());
336            // For an expansion, the offset of each non-initial CE is the limit offset,
337            // consistent with forward iteration.
338            while(offsets.size() <= ceBuffer.length) {
339                offsets.addElement(limitOffset);
340            };
341        }
342        return ceBuffer.get(--ceBuffer.length);
343    }
344
345    public final int getCEsLength() {
346        return ceBuffer.length;
347    }
348
349    public final long getCE(int i) {
350        return ceBuffer.get(i);
351    }
352
353    public final long[] getCEs() {
354        return ceBuffer.getCEs();
355    }
356
357    final void clearCEs() {
358        cesIndex = ceBuffer.length = 0;
359    }
360
361    public final void clearCEsIfNoneRemaining() {
362        if(cesIndex == ceBuffer.length) { clearCEs(); }
363    }
364
365    /**
366     * Returns the next code point (with post-increment).
367     * Public for identical-level comparison and for testing.
368     */
369    public abstract int nextCodePoint();
370
371    /**
372     * Returns the previous code point (with pre-decrement).
373     * Public for identical-level comparison and for testing.
374     */
375    public abstract int previousCodePoint();
376
377    protected final void reset() {
378        cesIndex = ceBuffer.length = 0;
379        if(skipped != null) { skipped.clear(); }
380    }
381    /**
382     * Resets the state as well as the numeric setting,
383     * and completes the initialization.
384     * Only exists in Java where we reset cached CollationIterator instances
385     * rather than stack-allocating temporary ones.
386     * (See also the constructor comments.)
387     */
388    protected final void reset(boolean numeric) {
389        if(ceBuffer == null) {
390            ceBuffer = new CEBuffer();
391        }
392        reset();
393        isNumeric = numeric;
394    }
395
396    /**
397     * Returns the next code point and its local CE32 value.
398     * Returns Collation.FALLBACK_CE32 at the end of the text (c<0)
399     * or when c's CE32 value is to be looked up in the base data (fallback).
400     *
401     * The code point is used for fallbacks, context and implicit weights.
402     * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32).
403     *
404     * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0.
405     */
406    protected long handleNextCE32() {
407        int c = nextCodePoint();
408        if(c < 0) { return NO_CP_AND_CE32; }
409        return makeCodePointAndCE32Pair(c, data.getCE32(c));
410    }
411    protected long makeCodePointAndCE32Pair(int c, int ce32) {
412        return ((long)c << 32) | (ce32 & 0xffffffffL);
413    }
414    protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL);
415
416    /**
417     * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit.
418     * Returns the trail surrogate in that case and advances past it,
419     * if a trail surrogate follows the lead surrogate.
420     * Otherwise returns any other code unit and does not advance.
421     */
422    protected char handleGetTrailSurrogate() {
423        return 0;
424    }
425
426    /**
427     * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator.
428     * (Not needed in Java.)
429     */
430    /*protected boolean foundNULTerminator() {
431        return false;
432    }*/
433
434    /**
435     * @return false if surrogate code points U+D800..U+DFFF
436     *         map to their own implicit primary weights (for UTF-16),
437     *         or true if they map to CE(U+FFFD) (for UTF-8)
438     */
439    protected boolean forbidSurrogateCodePoints() {
440        return false;
441    }
442
443    protected abstract void forwardNumCodePoints(int num);
444
445    protected abstract void backwardNumCodePoints(int num);
446
447    /**
448     * Returns the CE32 from the data trie.
449     * Normally the same as data.getCE32(), but overridden in the builder.
450     * Call this only when the faster data.getCE32() cannot be used.
451     */
452    protected int getDataCE32(int c) {
453        return data.getCE32(c);
454    }
455
456    protected int getCE32FromBuilderData(int ce32) {
457        throw new ICUException("internal program error: should be unreachable");
458    }
459
460    protected final void appendCEsFromCE32(CollationData d, int c, int ce32,
461                           boolean forward) {
462        while(Collation.isSpecialCE32(ce32)) {
463            switch(Collation.tagFromCE32(ce32)) {
464            case Collation.FALLBACK_TAG:
465            case Collation.RESERVED_TAG_3:
466                throw new ICUException("internal program error: should be unreachable");
467            case Collation.LONG_PRIMARY_TAG:
468                ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32));
469                return;
470            case Collation.LONG_SECONDARY_TAG:
471                ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32));
472                return;
473            case Collation.LATIN_EXPANSION_TAG:
474                ceBuffer.ensureAppendCapacity(2);
475                ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32));
476                ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32));
477                ceBuffer.length += 2;
478                return;
479            case Collation.EXPANSION32_TAG: {
480                int index = Collation.indexFromCE32(ce32);
481                int length = Collation.lengthFromCE32(ce32);
482                ceBuffer.ensureAppendCapacity(length);
483                do {
484                    ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++]));
485                } while(--length > 0);
486                return;
487            }
488            case Collation.EXPANSION_TAG: {
489                int index = Collation.indexFromCE32(ce32);
490                int length = Collation.lengthFromCE32(ce32);
491                ceBuffer.ensureAppendCapacity(length);
492                do {
493                    ceBuffer.appendUnsafe(d.ces[index++]);
494                } while(--length > 0);
495                return;
496            }
497            case Collation.BUILDER_DATA_TAG:
498                ce32 = getCE32FromBuilderData(ce32);
499                if(ce32 == Collation.FALLBACK_CE32) {
500                    d = data.base;
501                    ce32 = d.getCE32(c);
502                }
503                break;
504            case Collation.PREFIX_TAG:
505                if(forward) { backwardNumCodePoints(1); }
506                ce32 = getCE32FromPrefix(d, ce32);
507                if(forward) { forwardNumCodePoints(1); }
508                break;
509            case Collation.CONTRACTION_TAG: {
510                int index = Collation.indexFromCE32(ce32);
511                int defaultCE32 = d.getCE32FromContexts(index);  // Default if no suffix match.
512                if(!forward) {
513                    // Backward contractions are handled by previousCEUnsafe().
514                    // c has contractions but they were not found.
515                    ce32 = defaultCE32;
516                    break;
517                }
518                int nextCp;
519                if(skipped == null && numCpFwd < 0) {
520                    // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
521                    // avoiding the function call and the nextSkippedCodePoint() overhead.
522                    nextCp = nextCodePoint();
523                    if(nextCp < 0) {
524                        // No more text.
525                        ce32 = defaultCE32;
526                        break;
527                    } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
528                            !CollationFCD.mayHaveLccc(nextCp)) {
529                        // All contraction suffixes start with characters with lccc!=0
530                        // but the next code point has lccc==0.
531                        backwardNumCodePoints(1);
532                        ce32 = defaultCE32;
533                        break;
534                    }
535                } else {
536                    nextCp = nextSkippedCodePoint();
537                    if(nextCp < 0) {
538                        // No more text.
539                        ce32 = defaultCE32;
540                        break;
541                    } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
542                            !CollationFCD.mayHaveLccc(nextCp)) {
543                        // All contraction suffixes start with characters with lccc!=0
544                        // but the next code point has lccc==0.
545                        backwardNumSkipped(1);
546                        ce32 = defaultCE32;
547                        break;
548                    }
549                }
550                ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp);
551                if(ce32 == Collation.NO_CE32) {
552                    // CEs from a discontiguous contraction plus the skipped combining marks
553                    // have been appended already.
554                    return;
555                }
556                break;
557            }
558            case Collation.DIGIT_TAG:
559                if(isNumeric) {
560                    appendNumericCEs(ce32, forward);
561                    return;
562                } else {
563                    // Fetch the non-numeric-collation CE32 and continue.
564                    ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
565                    break;
566                }
567            case Collation.U0000_TAG:
568                assert(c == 0);
569                // NUL-terminated input not supported in Java.
570                // Fetch the normal ce32 for U+0000 and continue.
571                ce32 = d.ce32s[0];
572                break;
573            case Collation.HANGUL_TAG: {
574                int[] jamoCE32s = d.jamoCE32s;
575                c -= Hangul.HANGUL_BASE;
576                int t = c % Hangul.JAMO_T_COUNT;
577                c /= Hangul.JAMO_T_COUNT;
578                int v = c % Hangul.JAMO_V_COUNT;
579                c /= Hangul.JAMO_V_COUNT;
580                if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) {
581                    // None of the Jamo CE32s are isSpecialCE32().
582                    // Avoid recursive function calls and per-Jamo tests.
583                    ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3);
584                    ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c]));
585                    ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v]));
586                    ceBuffer.length += 2;
587                    if(t != 0) {
588                        ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t]));
589                    }
590                    return;
591                } else {
592                    // We should not need to compute each Jamo code point.
593                    // In particular, there should be no offset or implicit ce32.
594                    appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward);
595                    appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward);
596                    if(t == 0) { return; }
597                    // offset 39 = 19 + 21 - 1:
598                    // 19 = JAMO_L_COUNT
599                    // 21 = JAMO_T_COUNT
600                    // -1 = omit t==0
601                    ce32 = jamoCE32s[39 + t];
602                    c = Collation.SENTINEL_CP;
603                    break;
604                }
605            }
606            case Collation.LEAD_SURROGATE_TAG: {
607                assert(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
608                assert(isLeadSurrogate(c));
609                char trail;
610                if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) {
611                    c = Character.toCodePoint((char)c, trail);
612                    ce32 &= Collation.LEAD_TYPE_MASK;
613                    if(ce32 == Collation.LEAD_ALL_UNASSIGNED) {
614                        ce32 = Collation.UNASSIGNED_CE32;  // unassigned-implicit
615                    } else if(ce32 == Collation.LEAD_ALL_FALLBACK ||
616                            (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) {
617                        // fall back to the base data
618                        d = d.base;
619                        ce32 = d.getCE32FromSupplementary(c);
620                    }
621                } else {
622                    // c is an unpaired surrogate.
623                    ce32 = Collation.UNASSIGNED_CE32;
624                }
625                break;
626            }
627            case Collation.OFFSET_TAG:
628                assert(c >= 0);
629                ceBuffer.append(d.getCEFromOffsetCE32(c, ce32));
630                return;
631            case Collation.IMPLICIT_TAG:
632                assert(c >= 0);
633                if(isSurrogate(c) && forbidSurrogateCodePoints()) {
634                    ce32 = Collation.FFFD_CE32;
635                    break;
636                } else {
637                    ceBuffer.append(Collation.unassignedCEFromCodePoint(c));
638                    return;
639                }
640            }
641        }
642        ceBuffer.append(Collation.ceFromSimpleCE32(ce32));
643    }
644
645    // TODO: Propose widening the UTF16 method.
646    private static final boolean isSurrogate(int c) {
647        return (c & 0xfffff800) == 0xd800;
648    }
649
650    // TODO: Propose widening the UTF16 method.
651    protected static final boolean isLeadSurrogate(int c) {
652        return (c & 0xfffffc00) == 0xd800;
653    }
654
655    // TODO: Propose widening the UTF16 method.
656    protected static final boolean isTrailSurrogate(int c) {
657        return (c & 0xfffffc00) == 0xdc00;
658    }
659
660    // Main lookup trie of the data object.
661    protected final Trie2_32 trie;
662    protected final CollationData data;
663
664    private final long nextCEFromCE32(CollationData d, int c, int ce32) {
665        --ceBuffer.length;  // Undo ceBuffer.incLength().
666        appendCEsFromCE32(d, c, ce32, true);
667        return ceBuffer.get(cesIndex++);
668    }
669
670    private final int getCE32FromPrefix(CollationData d, int ce32) {
671        int index = Collation.indexFromCE32(ce32);
672        ce32 = d.getCE32FromContexts(index);  // Default if no prefix match.
673        index += 2;
674        // Number of code points read before the original code point.
675        int lookBehind = 0;
676        CharsTrie prefixes = new CharsTrie(d.contexts, index);
677        for(;;) {
678            int c = previousCodePoint();
679            if(c < 0) { break; }
680            ++lookBehind;
681            BytesTrie.Result match = prefixes.nextForCodePoint(c);
682            if(match.hasValue()) {
683                ce32 = prefixes.getValue();
684            }
685            if(!match.hasNext()) { break; }
686        }
687        forwardNumCodePoints(lookBehind);
688        return ce32;
689    }
690
691    private final int nextSkippedCodePoint() {
692        if(skipped != null && skipped.hasNext()) { return skipped.next(); }
693        if(numCpFwd == 0) { return Collation.SENTINEL_CP; }
694        int c = nextCodePoint();
695        if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); }
696        if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
697        return c;
698    }
699
700    private final void backwardNumSkipped(int n) {
701        if(skipped != null && !skipped.isEmpty()) {
702            n = skipped.backwardNumCodePoints(n);
703        }
704        backwardNumCodePoints(n);
705        if(numCpFwd >= 0) { numCpFwd += n; }
706    }
707
708    private final int nextCE32FromContraction(
709            CollationData d, int contractionCE32,
710            CharSequence trieChars, int trieOffset, int ce32, int c) {
711        // c: next code point after the original one
712
713        // Number of code points read beyond the original code point.
714        // Needed for discontiguous contraction matching.
715        int lookAhead = 1;
716        // Number of code points read since the last match (initially only c).
717        int sinceMatch = 1;
718        // Normally we only need a contiguous match,
719        // and therefore need not remember the suffixes state from before a mismatch for retrying.
720        // If we are already processing skipped combining marks, then we do track the state.
721        CharsTrie suffixes = new CharsTrie(trieChars, trieOffset);
722        if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
723        BytesTrie.Result match = suffixes.firstForCodePoint(c);
724        for(;;) {
725            int nextCp;
726            if(match.hasValue()) {
727                ce32 = suffixes.getValue();
728                if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) {
729                    return ce32;
730                }
731                if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
732                sinceMatch = 1;
733            } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) {
734                // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text.
735                // Back up if necessary, and try a discontiguous contraction.
736                if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 &&
737                        // Discontiguous contraction matching extends an existing match.
738                        // If there is no match yet, then there is nothing to do.
739                        ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
740                            sinceMatch < lookAhead)) {
741                    // The last character of at least one suffix has lccc!=0,
742                    // allowing for discontiguous contractions.
743                    // UCA S2.1.1 only processes non-starters immediately following
744                    // "a match in the table" (sinceMatch=1).
745                    if(sinceMatch > 1) {
746                        // Return to the state after the last match.
747                        // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
748                        backwardNumSkipped(sinceMatch);
749                        c = nextSkippedCodePoint();
750                        lookAhead -= sinceMatch - 1;
751                        sinceMatch = 1;
752                    }
753                    if(d.getFCD16(c) > 0xff) {
754                        return nextCE32FromDiscontiguousContraction(
755                            d, suffixes, ce32, lookAhead, c);
756                    }
757                }
758                break;
759            } else {
760                // Continue after partial match (BytesTrie.Result.NO_VALUE) for c.
761                // It does not have a result value, therefore it is not itself "a match in the table".
762                // If a partially-matched c has ccc!=0 then
763                // it might be skipped in discontiguous contraction.
764                c = nextCp;
765                ++sinceMatch;
766            }
767            ++lookAhead;
768            match = suffixes.nextForCodePoint(c);
769        }
770        backwardNumSkipped(sinceMatch);
771        return ce32;
772    }
773
774    private final int nextCE32FromDiscontiguousContraction(
775            CollationData d, CharsTrie suffixes, int ce32,
776            int lookAhead, int c) {
777        // UCA section 3.3.2 Contractions:
778        // Contractions that end with non-starter characters
779        // are known as discontiguous contractions.
780        // ... discontiguous contractions must be detected in input text
781        // whenever the final sequence of non-starter characters could be rearranged
782        // so as to make a contiguous matching sequence that is canonically equivalent.
783
784        // UCA: http://www.unicode.org/reports/tr10/#S2.1
785        // S2.1 Find the longest initial substring S at each point that has a match in the table.
786        // S2.1.1 If there are any non-starters following S, process each non-starter C.
787        // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
788        //     Note: A non-starter in a string is called blocked
789        //     if there is another non-starter of the same canonical combining class or zero
790        //     between it and the last character of canonical combining class 0.
791        // S2.1.3 If there is a match, replace S by S + C, and remove C.
792
793        // First: Is a discontiguous contraction even possible?
794        int fcd16 = d.getFCD16(c);
795        assert(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
796        int nextCp = nextSkippedCodePoint();
797        if(nextCp < 0) {
798            // No further text.
799            backwardNumSkipped(1);
800            return ce32;
801        }
802        ++lookAhead;
803        int prevCC = fcd16 & 0xff;
804        fcd16 = d.getFCD16(nextCp);
805        if(fcd16 <= 0xff) {
806            // The next code point after c is a starter (S2.1.1 "process each non-starter").
807            backwardNumSkipped(2);
808            return ce32;
809        }
810
811        // We have read and matched (lookAhead-2) code points,
812        // read non-matching c and peeked ahead at nextCp.
813        // Return to the state before the mismatch and continue matching with nextCp.
814        if(skipped == null || skipped.isEmpty()) {
815            if(skipped == null) {
816                skipped = new SkippedState();
817            }
818            suffixes.reset();
819            if(lookAhead > 2) {
820                // Replay the partial match so far.
821                backwardNumCodePoints(lookAhead);
822                suffixes.firstForCodePoint(nextCodePoint());
823                for(int i = 3; i < lookAhead; ++i) {
824                    suffixes.nextForCodePoint(nextCodePoint());
825                }
826                // Skip c (which did not match) and nextCp (which we will try now).
827                forwardNumCodePoints(2);
828            }
829            skipped.saveTrieState(suffixes);
830        } else {
831            // Reset to the trie state before the failed match of c.
832            skipped.resetToTrieState(suffixes);
833        }
834
835        skipped.setFirstSkipped(c);
836        // Number of code points read since the last match (at this point: c and nextCp).
837        int sinceMatch = 2;
838        c = nextCp;
839        for(;;) {
840            BytesTrie.Result match;
841            // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
842            if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) {
843                // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
844                // Keep prevCC unchanged.
845                ce32 = suffixes.getValue();
846                sinceMatch = 0;
847                skipped.recordMatch();
848                if(!match.hasNext()) { break; }
849                skipped.saveTrieState(suffixes);
850            } else {
851                // No match for "S + C", skip C.
852                skipped.skip(c);
853                skipped.resetToTrieState(suffixes);
854                prevCC = fcd16 & 0xff;
855            }
856            if((c = nextSkippedCodePoint()) < 0) { break; }
857            ++sinceMatch;
858            fcd16 = d.getFCD16(c);
859            if(fcd16 <= 0xff) {
860                // The next code point after c is a starter (S2.1.1 "process each non-starter").
861                break;
862            }
863        }
864        backwardNumSkipped(sinceMatch);
865        boolean isTopDiscontiguous = skipped.isEmpty();
866        skipped.replaceMatch();
867        if(isTopDiscontiguous && !skipped.isEmpty()) {
868            // We did get a match after skipping one or more combining marks,
869            // and we are not in a recursive discontiguous contraction.
870            // Append CEs from the contraction ce32
871            // and then from the combining marks that we skipped before the match.
872            c = Collation.SENTINEL_CP;
873            for(;;) {
874                appendCEsFromCE32(d, c, ce32, true);
875                // Fetch CE32s for skipped combining marks from the normal data, with fallback,
876                // rather than from the CollationData where we found the contraction.
877                if(!skipped.hasNext()) { break; }
878                c = skipped.next();
879                ce32 = getDataCE32(c);
880                if(ce32 == Collation.FALLBACK_CE32) {
881                    d = data.base;
882                    ce32 = d.getCE32(c);
883                } else {
884                    d = data;
885                }
886                // Note: A nested discontiguous-contraction match
887                // replaces consumed combining marks with newly skipped ones
888                // and resets the reading position to the beginning.
889            }
890            skipped.clear();
891            ce32 = Collation.NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
892        }
893        return ce32;
894    }
895
896    /**
897     * Returns the previous CE when data.isUnsafeBackward(c, isNumeric).
898     */
899    private final long previousCEUnsafe(int c, UVector32 offsets) {
900        // We just move through the input counting safe and unsafe code points
901        // without collecting the unsafe-backward substring into a buffer and
902        // switching to it.
903        // This is to keep the logic simple. Otherwise we would have to handle
904        // prefix matching going before the backward buffer, switching
905        // to iteration and back, etc.
906        // In the most important case of iterating over a normal string,
907        // reading from the string itself is already maximally fast.
908        // The only drawback there is that after getting the CEs we always
909        // skip backward to the safe character rather than switching out
910        // of a backwardBuffer.
911        // But this should not be the common case for previousCE(),
912        // and correctness and maintainability are more important than
913        // complex optimizations.
914        // Find the first safe character before c.
915        int numBackward = 1;
916        while((c = previousCodePoint()) >= 0) {
917            ++numBackward;
918            if(!data.isUnsafeBackward(c, isNumeric)) {
919                break;
920            }
921        }
922        // Set the forward iteration limit.
923        // Note: This counts code points.
924        // We cannot enforce a limit in the middle of a surrogate pair or similar.
925        numCpFwd = numBackward;
926        // Reset the forward iterator.
927        cesIndex = 0;
928        assert(ceBuffer.length == 0);
929        // Go forward and collect the CEs.
930        int offset = getOffset();
931        while(numCpFwd > 0) {
932            // nextCE() normally reads one code point.
933            // Contraction matching and digit specials read more and check numCpFwd.
934            --numCpFwd;
935            // Append one or more CEs to the ceBuffer.
936            nextCE();
937            assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE);
938            // No need to loop for getting each expansion CE from nextCE().
939            cesIndex = ceBuffer.length;
940            // However, we need to write an offset for each CE.
941            // This is for CollationElementIterator.getOffset() to return
942            // intermediate offsets from the unsafe-backwards segment.
943            assert(offsets.size() < ceBuffer.length);
944            offsets.addElement(offset);
945            // For an expansion, the offset of each non-initial CE is the limit offset,
946            // consistent with forward iteration.
947            offset = getOffset();
948            while(offsets.size() < ceBuffer.length) {
949                offsets.addElement(offset);
950            };
951        }
952        assert(offsets.size() == ceBuffer.length);
953        // End offset corresponding to just after the unsafe-backwards segment.
954        offsets.addElement(offset);
955        // Reset the forward iteration limit
956        // and move backward to before the segment for which we fetched CEs.
957        numCpFwd = -1;
958        backwardNumCodePoints(numBackward);
959        // Use the collected CEs and return the last one.
960        cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.
961        return ceBuffer.get(--ceBuffer.length);
962    }
963
964    /**
965     * Turns a string of digits (bytes 0..9)
966     * into a sequence of CEs that will sort in numeric order.
967     *
968     * Starts from this ce32's digit value and consumes the following/preceding digits.
969     * The digits string must not be empty and must not have leading zeros.
970     */
971    private final void appendNumericCEs(int ce32, boolean forward) {
972        // Collect digits.
973        // TODO: Use some kind of a byte buffer? We only store values 0..9.
974        StringBuilder digits = new StringBuilder();
975        if(forward) {
976            for(;;) {
977                char digit = Collation.digitFromCE32(ce32);
978                digits.append(digit);
979                if(numCpFwd == 0) { break; }
980                int c = nextCodePoint();
981                if(c < 0) { break; }
982                ce32 = data.getCE32(c);
983                if(ce32 == Collation.FALLBACK_CE32) {
984                    ce32 = data.base.getCE32(c);
985                }
986                if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
987                    backwardNumCodePoints(1);
988                    break;
989                }
990                if(numCpFwd > 0) { --numCpFwd; }
991            }
992        } else {
993            for(;;) {
994                char digit = Collation.digitFromCE32(ce32);
995                digits.append(digit);
996                int c = previousCodePoint();
997                if(c < 0) { break; }
998                ce32 = data.getCE32(c);
999                if(ce32 == Collation.FALLBACK_CE32) {
1000                    ce32 = data.base.getCE32(c);
1001                }
1002                if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
1003                    forwardNumCodePoints(1);
1004                    break;
1005                }
1006            }
1007            // Reverse the digit string.
1008            digits.reverse();
1009        }
1010        int pos = 0;
1011        do {
1012            // Skip leading zeros.
1013            while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; }
1014            // Write a sequence of CEs for at most 254 digits at a time.
1015            int segmentLength = digits.length() - pos;
1016            if(segmentLength > 254) { segmentLength = 254; }
1017            appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength));
1018            pos += segmentLength;
1019        } while(pos < digits.length());
1020    }
1021
1022    /**
1023     * Turns 1..254 digits into a sequence of CEs.
1024     * Called by appendNumericCEs() for each segment of at most 254 digits.
1025     */
1026    private final void appendNumericSegmentCEs(CharSequence digits) {
1027        int length = digits.length();
1028        assert(1 <= length && length <= 254);
1029        assert(length == 1 || digits.charAt(0) != 0);
1030        long numericPrimary = data.numericPrimary;
1031        // Note: We use primary byte values 2..255: digits are not compressible.
1032        if(length <= 7) {
1033            // Very dense encoding for small numbers.
1034            int value = digits.charAt(0);
1035            for(int i = 1; i < length; ++i) {
1036                value = value * 10 + digits.charAt(i);
1037            }
1038            // Primary weight second byte values:
1039            //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
1040            //     40 byte values  76..115 for medium numbers in three-byte primary weights.
1041            //     16 byte values 116..131 for large numbers in four-byte primary weights.
1042            //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
1043            int firstByte = 2;
1044            int numBytes = 74;
1045            if(value < numBytes) {
1046                // Two-byte primary for 0..73, good for day & month numbers etc.
1047                long primary = numericPrimary | ((firstByte + value) << 16);
1048                ceBuffer.append(Collation.makeCE(primary));
1049                return;
1050            }
1051            value -= numBytes;
1052            firstByte += numBytes;
1053            numBytes = 40;
1054            if(value < numBytes * 254) {
1055                // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
1056                long primary = numericPrimary |
1057                    ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
1058                ceBuffer.append(Collation.makeCE(primary));
1059                return;
1060            }
1061            value -= numBytes * 254;
1062            firstByte += numBytes;
1063            numBytes = 16;
1064            if(value < numBytes * 254 * 254) {
1065                // Four-byte primary for 10234..1042489=10234+16*254*254-1.
1066                long primary = numericPrimary | (2 + value % 254);
1067                value /= 254;
1068                primary |= (2 + value % 254) << 8;
1069                value /= 254;
1070                primary |= (firstByte + value % 254) << 16;
1071                ceBuffer.append(Collation.makeCE(primary));
1072                return;
1073            }
1074            // original value > 1042489
1075        }
1076        assert(length >= 7);
1077
1078        // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
1079        // then we generate primary bytes with those pairs.
1080        // Omit trailing 00 pairs.
1081        // Decrement the value for the last pair.
1082
1083        // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255.
1084        int numPairs = (length + 1) / 2;
1085        long primary = numericPrimary | ((132 - 4 + numPairs) << 16);
1086        // Find the length without trailing 00 pairs.
1087        while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) {
1088            length -= 2;
1089        }
1090        // Read the first pair.
1091        int pair;
1092        int pos;
1093        if((length & 1) != 0) {
1094            // Only "half a pair" if we have an odd number of digits.
1095            pair = digits.charAt(0);
1096            pos = 1;
1097        } else {
1098            pair = digits.charAt(0) * 10 + digits.charAt(1);
1099            pos = 2;
1100        }
1101        pair = 11 + 2 * pair;
1102        // Add the pairs of digits between pos and length.
1103        int shift = 8;
1104        while(pos < length) {
1105            if(shift == 0) {
1106                // Every three pairs/bytes we need to store a 4-byte-primary CE
1107                // and start with a new CE with the '0' primary lead byte.
1108                primary |= pair;
1109                ceBuffer.append(Collation.makeCE(primary));
1110                primary = numericPrimary;
1111                shift = 16;
1112            } else {
1113                primary |= pair << shift;
1114                shift -= 8;
1115            }
1116            pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1));
1117            pos += 2;
1118        }
1119        primary |= (pair - 1) << shift;
1120        ceBuffer.append(Collation.makeCE(primary));
1121    }
1122
1123    private CEBuffer ceBuffer;
1124    private int cesIndex;
1125
1126    private SkippedState skipped;
1127
1128    // Number of code points to read forward, or -1.
1129    // Used as a forward iteration limit in previousCEUnsafe().
1130    private int numCpFwd;
1131    // Numeric collation (CollationSettings.NUMERIC).
1132    private boolean isNumeric;
1133}
1134