12d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// © 2016 and later: Unicode, Inc. and others.
22d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License
37935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/*
47935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*******************************************************************************
57935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Copyright (C) 2010-2014, International Business Machines
67935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* Corporation and others.  All Rights Reserved.
77935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*******************************************************************************
87935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* CollationIterator.java, ported from collationiterator.h/.cpp
97935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*
107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* C++ version created on: 2010oct27
117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert* created by: Markus W. Scherer
127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert*/
137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpackage com.ibm.icu.impl.coll;
157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Normalizer2Impl.Hangul;
177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.impl.Trie2_32;
187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.BytesTrie;
197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.CharsTrie;
207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertimport com.ibm.icu.util.ICUException;
217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert/**
237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * Collation element iterator and abstract character iterator.
247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert *
257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * When a method returns a code point value, it must be in 0..10FFFF,
267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert * except it can be negative as a sentinel value.
277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert */
287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubertpublic abstract class CollationIterator {
297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final class CEBuffer {
307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        /** Large enough for CEs of most short strings. */
317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private static final int INITIAL_CAPACITY = 40;
327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CEBuffer() {}
347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void append(long ce) {
367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(length >= INITIAL_CAPACITY) {
377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ensureAppendCapacity(1);
387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            buffer[length++] = ce;
407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void appendUnsafe(long ce) {
437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            buffer[length++] = ce;
447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void ensureAppendCapacity(int appCap) {
477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int capacity = buffer.length;
487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if((length + appCap) <= capacity) { return; }
497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            do {
507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(capacity < 1000) {
517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    capacity *= 4;
527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    capacity *= 2;
547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } while(capacity < (length + appCap));
567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            long[] newBuffer = new long[capacity];
577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            System.arraycopy(buffer, 0, newBuffer, 0, length);
587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            buffer = newBuffer;
597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void incLength() {
627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Use INITIAL_CAPACITY for a very simple fastpath.
637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // (Rather than buffer.getCapacity().)
647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(length >= INITIAL_CAPACITY) {
657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ensureAppendCapacity(1);
667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ++length;
687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        long set(int i, long ce) {
717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return buffer[i] = ce;
727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        long get(int i) { return buffer[i]; }
747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        long[] getCEs() { return buffer; }
767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int length = 0;
787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private long[] buffer = new long[INITIAL_CAPACITY];
807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // State of combining marks skipped in discontiguous contraction.
837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // We create a state object on first use and keep it around deactivated between uses.
847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final class SkippedState {
857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Born active but empty.
867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        SkippedState() {}
877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void clear() {
887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            oldBuffer.setLength(0);
897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pos = 0;
907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // The newBuffer is reset by setFirstSkipped().
917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        boolean isEmpty() { return oldBuffer.length() == 0; }
947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        boolean hasNext() { return pos < oldBuffer.length(); }
967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Requires hasNext().
987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int next() {
997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int c = oldBuffer.codePointAt(pos);
1007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pos += Character.charCount(c);
1017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return c;
1027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Accounts for one more input code point read beyond the end of the marks buffer.
1057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void incBeyond() {
1067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            assert(!hasNext());
1077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ++pos;
1087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Goes backward through the skipped-marks buffer.
1117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Returns the number of code points read beyond the skipped marks
1127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // that need to be backtracked through normal input.
1137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int backwardNumCodePoints(int n) {
1147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int length = oldBuffer.length();
1157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int beyond = pos - length;
1167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(beyond > 0) {
1177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(beyond >= n) {
1187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Not back far enough to re-enter the oldBuffer.
1197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    pos -= n;
1207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    return n;
1217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
1227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Back out all beyond-oldBuffer code points and re-enter the buffer.
1237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    pos = oldBuffer.offsetByCodePoints(length, beyond - n);
1247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    return beyond;
1257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
1267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else {
1277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Go backwards from inside the oldBuffer.
1287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                pos = oldBuffer.offsetByCodePoints(pos, -n);
1297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return 0;
1307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
1317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void setFirstSkipped(int c) {
1347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            skipLengthAtMatch = 0;
1357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            newBuffer.setLength(0);
1367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            newBuffer.appendCodePoint(c);
1377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void skip(int c) {
1407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            newBuffer.appendCodePoint(c);
1417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
1447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Replaces the characters we consumed with the newly skipped ones.
1467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void replaceMatch() {
1477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Note: UnicodeString.replace() pins pos to at most length().
1487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int oldLength = oldBuffer.length();
1497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(pos > oldLength) { pos = oldLength; }
1507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch);
1517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pos = 0;
1527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
1537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void saveTrieState(CharsTrie trie) { trie.saveState(state); }
1557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        void resetToTrieState(CharsTrie trie) { trie.resetToState(state); }
1567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Combining marks skipped in previous discontiguous-contraction matching.
1587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // After that discontiguous contraction was completed, we start reading them from here.
1597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private final StringBuilder oldBuffer = new StringBuilder();
1607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Combining marks newly skipped in current discontiguous-contraction matching.
1617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // These might have been read from the normal text or from the oldBuffer.
1627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private final StringBuilder newBuffer = new StringBuilder();
1637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Reading index in oldBuffer,
1647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
1657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private int pos;
1667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // newBuffer.length() at the time of the last matching character.
1677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // When a partial match fails, we back out skipped and partial-matching input characters.
1687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private int skipLengthAtMatch;
1697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // We save the trie state before we attempt to match a character,
1707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // so that we can skip it and try the next one.
1717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        private CharsTrie.State state = new CharsTrie.State();
1727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    };
1737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
1757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Partially constructs the iterator.
1767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * In Java, we cache partially constructed iterators
1777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * and finish their setup when starting to work on text
1787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * (via reset(boolean) and the setText(numeric, ...) methods of subclasses).
1797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * This avoids memory allocations for iterators that remain unused.
1807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
1817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * <p>In C++, there is only one constructor, and iterators are
1827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * stack-allocated as needed.
1837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
1847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public CollationIterator(CollationData d) {
1857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        trie = d.trie;
1867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        data = d;
1877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        numCpFwd = -1;
1887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        isNumeric = false;
1897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        ceBuffer = null;
1907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
1927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public CollationIterator(CollationData d, boolean numeric) {
1937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        trie = d.trie;
1947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        data = d;
1957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        numCpFwd = -1;
1967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        isNumeric = numeric;
1977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        ceBuffer = new CEBuffer();
1987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
1997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    @Override
2017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public boolean equals(Object other) {
2027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Subclasses: Call this method and then add more specific checks.
2037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Compare the iterator state but not the collation data (trie & data fields):
2047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Assume that the caller compares the data.
2057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Ignore skipped since that should be unused between calls to nextCE().
2067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // (It only stays around to avoid another memory allocation.)
2077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(other == null) { return false; }
2087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(!this.getClass().equals(other.getClass())) { return false; }
2097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CollationIterator o = (CollationIterator)other;
2107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(!(ceBuffer.length == o.ceBuffer.length &&
2117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                cesIndex == o.cesIndex &&
2127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                numCpFwd == o.numCpFwd &&
2137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                isNumeric == o.isNumeric)) {
2147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return false;
2157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for(int i = 0; i < ceBuffer.length; ++i) {
2177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; }
2187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return true;
2207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2222d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert    @Override
2232d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert    public int hashCode() {
2242d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        // Dummy return to prevent compile warnings.
2252d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert        return 0;
2262d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert    }
2272d2bb24f747c65578da13d5b13b82f0669690461Fredrik Roubert
2287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
2297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Resets the iterator state and sets the position to the specified offset.
2307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Subclasses must implement, and must call the parent class method,
2317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * or CollationIterator.reset().
2327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
2337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public abstract void resetToOffset(int newOffset);
2347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public abstract int getOffset();
2367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
2387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns the next collation element.
2397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
2407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public final long nextCE() {
2417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(cesIndex < ceBuffer.length) {
2427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Return the next buffered CE.
2437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return ceBuffer.get(cesIndex++);
2447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        assert cesIndex == ceBuffer.length;
2467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        ceBuffer.incLength();
2477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        long cAndCE32 = handleNextCE32();
2487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int c = (int)(cAndCE32 >> 32);
2497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int ce32 = (int)cAndCE32;
2507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int t = ce32 & 0xff;
2517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(t < Collation.SPECIAL_CE32_LOW_BYTE) {  // Forced-inline of isSpecialCE32(ce32).
2527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Normal CE from the main data.
2537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Forced-inline of ceFromSimpleCE32(ce32).
2547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return ceBuffer.set(cesIndex++,
2557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
2567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CollationData d;
2587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // The compiler should be able to optimize the previous and the following
2597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // comparisons of t with the same constant.
2607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(t == Collation.SPECIAL_CE32_LOW_BYTE) {
2617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(c < 0) {
2627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return ceBuffer.set(cesIndex++, Collation.NO_CE);
2637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            d = data.base;
2657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ce32 = d.getCE32(c);
2667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            t = ce32 & 0xff;
2677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(t < Collation.SPECIAL_CE32_LOW_BYTE) {
2687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Normal CE from the base data.
2697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return ceBuffer.set(cesIndex++,
2707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
2717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
2727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else {
2737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            d = data;
2747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) {
2767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Forced-inline of ceFromLongPrimaryCE32(ce32).
2777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return ceBuffer.set(cesIndex++,
2787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE);
2797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return nextCEFromCE32(d, c, ce32);
2817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
2847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Fetches all CEs.
2857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return getCEsLength()
2867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
2877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public final int fetchCEs() {
2887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while(nextCE() != Collation.NO_CE) {
2897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // No need to loop for each expansion CE.
2907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            cesIndex = ceBuffer.length;
2917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
2927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ceBuffer.length;
2937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
2947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
2957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
2967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Overwrites the current CE (the last one returned by nextCE()).
2977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
2987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    final void setCurrentCE(long ce) {
2997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        assert cesIndex > 0;
3007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        ceBuffer.set(cesIndex - 1, ce);
3017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
3047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns the previous collation element.
3057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
3067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public final long previousCE(UVector32 offsets) {
3077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(ceBuffer.length > 0) {
3087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Return the previous buffered CE.
3097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return ceBuffer.get(--ceBuffer.length);
3107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        offsets.removeAllElements();
3127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int limitOffset = getOffset();
3137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int c = previousCodePoint();
3147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(c < 0) { return Collation.NO_CE; }
3157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(data.isUnsafeBackward(c, isNumeric)) {
3167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return previousCEUnsafe(c, offsets);
3177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Simple, safe-backwards iteration:
3197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Get a CE going backwards, handle prefixes but no contractions.
3207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int ce32 = data.getCE32(c);
3217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CollationData d;
3227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(ce32 == Collation.FALLBACK_CE32) {
3237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            d = data.base;
3247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ce32 = d.getCE32(c);
3257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else {
3267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            d = data;
3277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(Collation.isSimpleOrLongCE32(ce32)) {
3297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return Collation.ceFromCE32(ce32);
3307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        appendCEsFromCE32(d, c, ce32, false);
3327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(ceBuffer.length > 1) {
3337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            offsets.addElement(getOffset());
3347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // For an expansion, the offset of each non-initial CE is the limit offset,
3357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // consistent with forward iteration.
3367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            while(offsets.size() <= ceBuffer.length) {
3377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                offsets.addElement(limitOffset);
3387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            };
3397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ceBuffer.get(--ceBuffer.length);
3417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public final int getCEsLength() {
3447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ceBuffer.length;
3457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public final long getCE(int i) {
3487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ceBuffer.get(i);
3497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public final long[] getCEs() {
3527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ceBuffer.getCEs();
3537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    final void clearCEs() {
3567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        cesIndex = ceBuffer.length = 0;
3577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public final void clearCEsIfNoneRemaining() {
3607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(cesIndex == ceBuffer.length) { clearCEs(); }
3617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
3647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns the next code point (with post-increment).
3657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Public for identical-level comparison and for testing.
3667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
3677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public abstract int nextCodePoint();
3687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
3707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns the previous code point (with pre-decrement).
3717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Public for identical-level comparison and for testing.
3727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
3737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    public abstract int previousCodePoint();
3747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected final void reset() {
3767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        cesIndex = ceBuffer.length = 0;
3777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(skipped != null) { skipped.clear(); }
3787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
3807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Resets the state as well as the numeric setting,
3817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * and completes the initialization.
3827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Only exists in Java where we reset cached CollationIterator instances
3837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * rather than stack-allocating temporary ones.
3847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * (See also the constructor comments.)
3857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
3867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected final void reset(boolean numeric) {
3877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(ceBuffer == null) {
3887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ceBuffer = new CEBuffer();
3897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
3907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        reset();
3917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        isNumeric = numeric;
3927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
3937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
3947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
3957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns the next code point and its local CE32 value.
3967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns Collation.FALLBACK_CE32 at the end of the text (c<0)
3977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * or when c's CE32 value is to be looked up in the base data (fallback).
3987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
3997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * The code point is used for fallbacks, context and implicit weights.
4007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32).
4017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
4027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0.
4037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
4047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected long handleNextCE32() {
4057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int c = nextCodePoint();
4067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(c < 0) { return NO_CP_AND_CE32; }
4077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return makeCodePointAndCE32Pair(c, data.getCE32(c));
4087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected long makeCodePointAndCE32Pair(int c, int ce32) {
4107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ((long)c << 32) | (ce32 & 0xffffffffL);
4117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL);
4137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
4157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit.
4167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns the trail surrogate in that case and advances past it,
4177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * if a trail surrogate follows the lead surrogate.
4187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Otherwise returns any other code unit and does not advance.
4197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
4207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected char handleGetTrailSurrogate() {
4217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return 0;
4227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
4257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator.
4267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * (Not needed in Java.)
4277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
4287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /*protected boolean foundNULTerminator() {
4297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return false;
4307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }*/
4317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
4337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * @return false if surrogate code points U+D800..U+DFFF
4347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *         map to their own implicit primary weights (for UTF-16),
4357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *         or true if they map to CE(U+FFFD) (for UTF-8)
4367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
4377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected boolean forbidSurrogateCodePoints() {
4387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return false;
4397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected abstract void forwardNumCodePoints(int num);
4427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected abstract void backwardNumCodePoints(int num);
4447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
4467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns the CE32 from the data trie.
4477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Normally the same as data.getCE32(), but overridden in the builder.
4487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Call this only when the faster data.getCE32() cannot be used.
4497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
4507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected int getDataCE32(int c) {
4517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return data.getCE32(c);
4527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected int getCE32FromBuilderData(int ce32) {
4557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        throw new ICUException("internal program error: should be unreachable");
4567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
4577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
4587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected final void appendCEsFromCE32(CollationData d, int c, int ce32,
4597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                           boolean forward) {
4607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while(Collation.isSpecialCE32(ce32)) {
4617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            switch(Collation.tagFromCE32(ce32)) {
4627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.FALLBACK_TAG:
4637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.RESERVED_TAG_3:
4647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                throw new ICUException("internal program error: should be unreachable");
4657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.LONG_PRIMARY_TAG:
4667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32));
4677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return;
4687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.LONG_SECONDARY_TAG:
4697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32));
4707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return;
4717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.LATIN_EXPANSION_TAG:
4727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.ensureAppendCapacity(2);
4737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32));
4747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32));
4757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.length += 2;
4767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return;
4777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.EXPANSION32_TAG: {
4787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int index = Collation.indexFromCE32(ce32);
4797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int length = Collation.lengthFromCE32(ce32);
4807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.ensureAppendCapacity(length);
4817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                do {
4827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++]));
4837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } while(--length > 0);
4847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return;
4857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
4867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.EXPANSION_TAG: {
4877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int index = Collation.indexFromCE32(ce32);
4887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int length = Collation.lengthFromCE32(ce32);
4897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.ensureAppendCapacity(length);
4907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                do {
4917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ceBuffer.appendUnsafe(d.ces[index++]);
4927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } while(--length > 0);
4937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return;
4947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
4957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.BUILDER_DATA_TAG:
4967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = getCE32FromBuilderData(ce32);
4977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(ce32 == Collation.FALLBACK_CE32) {
4987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    d = data.base;
4997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 = d.getCE32(c);
5007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
5017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
5027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.PREFIX_TAG:
5037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(forward) { backwardNumCodePoints(1); }
5047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = getCE32FromPrefix(d, ce32);
5057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(forward) { forwardNumCodePoints(1); }
5067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
5077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.CONTRACTION_TAG: {
5087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int index = Collation.indexFromCE32(ce32);
5097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int defaultCE32 = d.getCE32FromContexts(index);  // Default if no suffix match.
5107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(!forward) {
5117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Backward contractions are handled by previousCEUnsafe().
5127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // c has contractions but they were not found.
5137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 = defaultCE32;
5147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
5157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
5167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int nextCp;
5177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(skipped == null && numCpFwd < 0) {
5187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
5197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // avoiding the function call and the nextSkippedCodePoint() overhead.
5207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    nextCp = nextCodePoint();
5217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if(nextCp < 0) {
5227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // No more text.
5237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        ce32 = defaultCE32;
5247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        break;
5257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
5267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            !CollationFCD.mayHaveLccc(nextCp)) {
5277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // All contraction suffixes start with characters with lccc!=0
5287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // but the next code point has lccc==0.
5297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        backwardNumCodePoints(1);
5307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        ce32 = defaultCE32;
5317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        break;
5327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
5337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
5347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    nextCp = nextSkippedCodePoint();
5357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if(nextCp < 0) {
5367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // No more text.
5377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        ce32 = defaultCE32;
5387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        break;
5397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
5407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            !CollationFCD.mayHaveLccc(nextCp)) {
5417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // All contraction suffixes start with characters with lccc!=0
5427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // but the next code point has lccc==0.
5437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        backwardNumSkipped(1);
5447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        ce32 = defaultCE32;
5457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        break;
5467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
5477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
5487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp);
5497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(ce32 == Collation.NO_CE32) {
5507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // CEs from a discontiguous contraction plus the skipped combining marks
5517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // have been appended already.
5527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    return;
5537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
5547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
5557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
5567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.DIGIT_TAG:
5577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(isNumeric) {
5587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    appendNumericCEs(ce32, forward);
5597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    return;
5607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
5617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Fetch the non-numeric-collation CE32 and continue.
5627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
5637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
5647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
5657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.U0000_TAG:
5667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                assert(c == 0);
5677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // NUL-terminated input not supported in Java.
5687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Fetch the normal ce32 for U+0000 and continue.
5697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = d.ce32s[0];
5707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
5717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.HANGUL_TAG: {
5727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int[] jamoCE32s = d.jamoCE32s;
5737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                c -= Hangul.HANGUL_BASE;
5747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int t = c % Hangul.JAMO_T_COUNT;
5757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                c /= Hangul.JAMO_T_COUNT;
5767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int v = c % Hangul.JAMO_V_COUNT;
5777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                c /= Hangul.JAMO_V_COUNT;
5787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) {
5797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // None of the Jamo CE32s are isSpecialCE32().
5807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // Avoid recursive function calls and per-Jamo tests.
5817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3);
5827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c]));
5837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v]));
5847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ceBuffer.length += 2;
5857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if(t != 0) {
5867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t]));
5877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
5887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    return;
5897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
5907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // We should not need to compute each Jamo code point.
5917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // In particular, there should be no offset or implicit ce32.
5927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward);
5937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward);
5947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if(t == 0) { return; }
5957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // offset 39 = 19 + 21 - 1:
5967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // 19 = JAMO_L_COUNT
5977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // 21 = JAMO_T_COUNT
5987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // -1 = omit t==0
5997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 = jamoCE32s[39 + t];
6007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    c = Collation.SENTINEL_CP;
6017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
6027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
6037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
6047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.LEAD_SURROGATE_TAG: {
6057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                assert(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
6067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                assert(isLeadSurrogate(c));
6077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                char trail;
6087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) {
6097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    c = Character.toCodePoint((char)c, trail);
6107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 &= Collation.LEAD_TYPE_MASK;
6117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if(ce32 == Collation.LEAD_ALL_UNASSIGNED) {
6127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        ce32 = Collation.UNASSIGNED_CE32;  // unassigned-implicit
6137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    } else if(ce32 == Collation.LEAD_ALL_FALLBACK ||
6147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) {
6157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // fall back to the base data
6167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        d = d.base;
6177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        ce32 = d.getCE32FromSupplementary(c);
6187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
6197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
6207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // c is an unpaired surrogate.
6217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 = Collation.UNASSIGNED_CE32;
6227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
6237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
6247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
6257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.OFFSET_TAG:
6267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                assert(c >= 0);
6277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.append(d.getCEFromOffsetCE32(c, ce32));
6287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return;
6297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            case Collation.IMPLICIT_TAG:
6307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                assert(c >= 0);
6317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(isSurrogate(c) && forbidSurrogateCodePoints()) {
6327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 = Collation.FFFD_CE32;
6337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
6347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
6357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ceBuffer.append(Collation.unassignedCEFromCodePoint(c));
6367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    return;
6377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
6387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
6397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
6407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        ceBuffer.append(Collation.ceFromSimpleCE32(ce32));
6417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
6427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
6437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // TODO: Propose widening the UTF16 method.
6447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private static final boolean isSurrogate(int c) {
6457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return (c & 0xfffff800) == 0xd800;
6467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
6477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
6487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // TODO: Propose widening the UTF16 method.
6497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected static final boolean isLeadSurrogate(int c) {
6507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return (c & 0xfffffc00) == 0xd800;
6517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
6527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
6537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // TODO: Propose widening the UTF16 method.
6547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected static final boolean isTrailSurrogate(int c) {
6557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return (c & 0xfffffc00) == 0xdc00;
6567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
6577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
6587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Main lookup trie of the data object.
6597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected final Trie2_32 trie;
6607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    protected final CollationData data;
6617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
6627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final long nextCEFromCE32(CollationData d, int c, int ce32) {
6637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        --ceBuffer.length;  // Undo ceBuffer.incLength().
6647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        appendCEsFromCE32(d, c, ce32, true);
6657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ceBuffer.get(cesIndex++);
6667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
6677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
6687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final int getCE32FromPrefix(CollationData d, int ce32) {
6697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int index = Collation.indexFromCE32(ce32);
6707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        ce32 = d.getCE32FromContexts(index);  // Default if no prefix match.
6717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        index += 2;
6727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Number of code points read before the original code point.
6737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int lookBehind = 0;
6747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CharsTrie prefixes = new CharsTrie(d.contexts, index);
6757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for(;;) {
6767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int c = previousCodePoint();
6777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(c < 0) { break; }
6787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ++lookBehind;
6797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            BytesTrie.Result match = prefixes.nextForCodePoint(c);
6807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(match.hasValue()) {
6817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = prefixes.getValue();
6827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
6837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(!match.hasNext()) { break; }
6847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
6857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        forwardNumCodePoints(lookBehind);
6867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ce32;
6877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
6887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
6897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final int nextSkippedCodePoint() {
6907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(skipped != null && skipped.hasNext()) { return skipped.next(); }
6917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(numCpFwd == 0) { return Collation.SENTINEL_CP; }
6927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int c = nextCodePoint();
6937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); }
6947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
6957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return c;
6967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
6977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
6987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final void backwardNumSkipped(int n) {
6997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(skipped != null && !skipped.isEmpty()) {
7007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            n = skipped.backwardNumCodePoints(n);
7017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
7027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        backwardNumCodePoints(n);
7037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(numCpFwd >= 0) { numCpFwd += n; }
7047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
7057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
7067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final int nextCE32FromContraction(
7077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            CollationData d, int contractionCE32,
7087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            CharSequence trieChars, int trieOffset, int ce32, int c) {
7097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // c: next code point after the original one
7107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
7117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Number of code points read beyond the original code point.
7127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Needed for discontiguous contraction matching.
7137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int lookAhead = 1;
7147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Number of code points read since the last match (initially only c).
7157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int sinceMatch = 1;
7167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Normally we only need a contiguous match,
7177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // and therefore need not remember the suffixes state from before a mismatch for retrying.
7187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // If we are already processing skipped combining marks, then we do track the state.
7197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        CharsTrie suffixes = new CharsTrie(trieChars, trieOffset);
7207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
7217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        BytesTrie.Result match = suffixes.firstForCodePoint(c);
7227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for(;;) {
7237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int nextCp;
7247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(match.hasValue()) {
7257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = suffixes.getValue();
7267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) {
7277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    return ce32;
7287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
7297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
7307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                sinceMatch = 1;
7317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) {
7327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text.
7337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Back up if necessary, and try a discontiguous contraction.
7347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 &&
7357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // Discontiguous contraction matching extends an existing match.
7367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // If there is no match yet, then there is nothing to do.
7377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
7387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            sinceMatch < lookAhead)) {
7397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // The last character of at least one suffix has lccc!=0,
7407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // allowing for discontiguous contractions.
7417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // UCA S2.1.1 only processes non-starters immediately following
7427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    // "a match in the table" (sinceMatch=1).
7437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if(sinceMatch > 1) {
7447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // Return to the state after the last match.
7457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
7467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        backwardNumSkipped(sinceMatch);
7477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        c = nextSkippedCodePoint();
7487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        lookAhead -= sinceMatch - 1;
7497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        sinceMatch = 1;
7507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
7517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    if(d.getFCD16(c) > 0xff) {
7527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                        return nextCE32FromDiscontiguousContraction(
7537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                            d, suffixes, ce32, lookAhead, c);
7547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    }
7557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
7567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
7577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else {
7587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Continue after partial match (BytesTrie.Result.NO_VALUE) for c.
7597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // It does not have a result value, therefore it is not itself "a match in the table".
7607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // If a partially-matched c has ccc!=0 then
7617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // it might be skipped in discontiguous contraction.
7627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                c = nextCp;
7637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ++sinceMatch;
7647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
7657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ++lookAhead;
7667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            match = suffixes.nextForCodePoint(c);
7677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
7687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        backwardNumSkipped(sinceMatch);
7697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ce32;
7707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
7717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
7727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final int nextCE32FromDiscontiguousContraction(
7737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            CollationData d, CharsTrie suffixes, int ce32,
7747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int lookAhead, int c) {
7757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // UCA section 3.3.2 Contractions:
7767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Contractions that end with non-starter characters
7777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // are known as discontiguous contractions.
7787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // ... discontiguous contractions must be detected in input text
7797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // whenever the final sequence of non-starter characters could be rearranged
7807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // so as to make a contiguous matching sequence that is canonically equivalent.
7817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
7827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // UCA: http://www.unicode.org/reports/tr10/#S2.1
7837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // S2.1 Find the longest initial substring S at each point that has a match in the table.
7847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // S2.1.1 If there are any non-starters following S, process each non-starter C.
7857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
7867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //     Note: A non-starter in a string is called blocked
7877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //     if there is another non-starter of the same canonical combining class or zero
7887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        //     between it and the last character of canonical combining class 0.
7897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // S2.1.3 If there is a match, replace S by S + C, and remove C.
7907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
7917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // First: Is a discontiguous contraction even possible?
7927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int fcd16 = d.getFCD16(c);
7937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        assert(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
7947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int nextCp = nextSkippedCodePoint();
7957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(nextCp < 0) {
7967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // No further text.
7977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            backwardNumSkipped(1);
7987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return ce32;
7997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
8007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        ++lookAhead;
8017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int prevCC = fcd16 & 0xff;
8027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        fcd16 = d.getFCD16(nextCp);
8037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(fcd16 <= 0xff) {
8047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // The next code point after c is a starter (S2.1.1 "process each non-starter").
8057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            backwardNumSkipped(2);
8067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            return ce32;
8077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
8087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
8097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // We have read and matched (lookAhead-2) code points,
8107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // read non-matching c and peeked ahead at nextCp.
8117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Return to the state before the mismatch and continue matching with nextCp.
8127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(skipped == null || skipped.isEmpty()) {
8137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(skipped == null) {
8147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                skipped = new SkippedState();
8157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
8167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            suffixes.reset();
8177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(lookAhead > 2) {
8187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Replay the partial match so far.
8197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                backwardNumCodePoints(lookAhead);
8207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                suffixes.firstForCodePoint(nextCodePoint());
8217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                for(int i = 3; i < lookAhead; ++i) {
8227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    suffixes.nextForCodePoint(nextCodePoint());
8237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
8247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Skip c (which did not match) and nextCp (which we will try now).
8257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                forwardNumCodePoints(2);
8267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
8277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            skipped.saveTrieState(suffixes);
8287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else {
8297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Reset to the trie state before the failed match of c.
8307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            skipped.resetToTrieState(suffixes);
8317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
8327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
8337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        skipped.setFirstSkipped(c);
8347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Number of code points read since the last match (at this point: c and nextCp).
8357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int sinceMatch = 2;
8367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        c = nextCp;
8377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        for(;;) {
8387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            BytesTrie.Result match;
8397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
8407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) {
8417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
8427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Keep prevCC unchanged.
8437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = suffixes.getValue();
8447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                sinceMatch = 0;
8457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                skipped.recordMatch();
8467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(!match.hasNext()) { break; }
8477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                skipped.saveTrieState(suffixes);
8487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else {
8497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // No match for "S + C", skip C.
8507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                skipped.skip(c);
8517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                skipped.resetToTrieState(suffixes);
8527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                prevCC = fcd16 & 0xff;
8537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
8547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if((c = nextSkippedCodePoint()) < 0) { break; }
8557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ++sinceMatch;
8567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            fcd16 = d.getFCD16(c);
8577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(fcd16 <= 0xff) {
8587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // The next code point after c is a starter (S2.1.1 "process each non-starter").
8597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
8607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
8617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
8627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        backwardNumSkipped(sinceMatch);
8637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        boolean isTopDiscontiguous = skipped.isEmpty();
8647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        skipped.replaceMatch();
8657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(isTopDiscontiguous && !skipped.isEmpty()) {
8667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // We did get a match after skipping one or more combining marks,
8677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // and we are not in a recursive discontiguous contraction.
8687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Append CEs from the contraction ce32
8697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // and then from the combining marks that we skipped before the match.
8707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            c = Collation.SENTINEL_CP;
8717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for(;;) {
8727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                appendCEsFromCE32(d, c, ce32, true);
8737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Fetch CE32s for skipped combining marks from the normal data, with fallback,
8747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // rather than from the CollationData where we found the contraction.
8757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(!skipped.hasNext()) { break; }
8767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                c = skipped.next();
8777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = getDataCE32(c);
8787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(ce32 == Collation.FALLBACK_CE32) {
8797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    d = data.base;
8807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 = d.getCE32(c);
8817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                } else {
8827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    d = data;
8837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
8847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Note: A nested discontiguous-contraction match
8857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // replaces consumed combining marks with newly skipped ones
8867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // and resets the reading position to the beginning.
8877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
8887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            skipped.clear();
8897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ce32 = Collation.NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
8907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
8917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ce32;
8927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
8937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
8947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
8957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Returns the previous CE when data.isUnsafeBackward(c, isNumeric).
8967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
8977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final long previousCEUnsafe(int c, UVector32 offsets) {
8987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // We just move through the input counting safe and unsafe code points
8997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // without collecting the unsafe-backward substring into a buffer and
9007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // switching to it.
9017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // This is to keep the logic simple. Otherwise we would have to handle
9027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // prefix matching going before the backward buffer, switching
9037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // to iteration and back, etc.
9047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // In the most important case of iterating over a normal string,
9057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // reading from the string itself is already maximally fast.
9067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // The only drawback there is that after getting the CEs we always
9077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // skip backward to the safe character rather than switching out
9087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // of a backwardBuffer.
9097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // But this should not be the common case for previousCE(),
9107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // and correctness and maintainability are more important than
9117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // complex optimizations.
9127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Find the first safe character before c.
9137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int numBackward = 1;
9147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while((c = previousCodePoint()) >= 0) {
9157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            ++numBackward;
9167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(!data.isUnsafeBackward(c, isNumeric)) {
9177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                break;
9187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
9197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
9207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Set the forward iteration limit.
9217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Note: This counts code points.
9227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // We cannot enforce a limit in the middle of a surrogate pair or similar.
9237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        numCpFwd = numBackward;
9247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Reset the forward iterator.
9257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        cesIndex = 0;
9267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        assert(ceBuffer.length == 0);
9277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Go forward and collect the CEs.
9287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int offset = getOffset();
9297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while(numCpFwd > 0) {
9307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // nextCE() normally reads one code point.
9317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Contraction matching and digit specials read more and check numCpFwd.
9327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            --numCpFwd;
9337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Append one or more CEs to the ceBuffer.
9347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            nextCE();
9357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE);
9367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // No need to loop for getting each expansion CE from nextCE().
9377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            cesIndex = ceBuffer.length;
9387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // However, we need to write an offset for each CE.
9397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // This is for CollationElementIterator.getOffset() to return
9407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // intermediate offsets from the unsafe-backwards segment.
9417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            assert(offsets.size() < ceBuffer.length);
9427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            offsets.addElement(offset);
9437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // For an expansion, the offset of each non-initial CE is the limit offset,
9447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // consistent with forward iteration.
9457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            offset = getOffset();
9467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            while(offsets.size() < ceBuffer.length) {
9477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                offsets.addElement(offset);
9487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            };
9497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
9507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        assert(offsets.size() == ceBuffer.length);
9517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // End offset corresponding to just after the unsafe-backwards segment.
9527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        offsets.addElement(offset);
9537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Reset the forward iteration limit
9547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // and move backward to before the segment for which we fetched CEs.
9557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        numCpFwd = -1;
9567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        backwardNumCodePoints(numBackward);
9577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Use the collected CEs and return the last one.
9587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.
9597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        return ceBuffer.get(--ceBuffer.length);
9607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
9617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
9627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
9637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Turns a string of digits (bytes 0..9)
9647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * into a sequence of CEs that will sort in numeric order.
9657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     *
9667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Starts from this ce32's digit value and consumes the following/preceding digits.
9677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * The digits string must not be empty and must not have leading zeros.
9687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
9697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final void appendNumericCEs(int ce32, boolean forward) {
9707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Collect digits.
9717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // TODO: Use some kind of a byte buffer? We only store values 0..9.
9727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        StringBuilder digits = new StringBuilder();
9737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(forward) {
9747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for(;;) {
9757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                char digit = Collation.digitFromCE32(ce32);
9767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                digits.append(digit);
9777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(numCpFwd == 0) { break; }
9787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int c = nextCodePoint();
9797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(c < 0) { break; }
9807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = data.getCE32(c);
9817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(ce32 == Collation.FALLBACK_CE32) {
9827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 = data.base.getCE32(c);
9837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
9847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
9857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    backwardNumCodePoints(1);
9867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
9877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
9887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(numCpFwd > 0) { --numCpFwd; }
9897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
9907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else {
9917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for(;;) {
9927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                char digit = Collation.digitFromCE32(ce32);
9937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                digits.append(digit);
9947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                int c = previousCodePoint();
9957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(c < 0) { break; }
9967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ce32 = data.getCE32(c);
9977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(ce32 == Collation.FALLBACK_CE32) {
9987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ce32 = data.base.getCE32(c);
9997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
10007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
10017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    forwardNumCodePoints(1);
10027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    break;
10037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                }
10047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
10057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Reverse the digit string.
10067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            digits.reverse();
10077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
10087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int pos = 0;
10097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        do {
10107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Skip leading zeros.
10117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; }
10127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Write a sequence of CEs for at most 254 digits at a time.
10137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int segmentLength = digits.length() - pos;
10147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(segmentLength > 254) { segmentLength = 254; }
10157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength));
10167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pos += segmentLength;
10177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } while(pos < digits.length());
10187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
10197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
10207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    /**
10217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Turns 1..254 digits into a sequence of CEs.
10227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     * Called by appendNumericCEs() for each segment of at most 254 digits.
10237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert     */
10247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private final void appendNumericSegmentCEs(CharSequence digits) {
10257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int length = digits.length();
10267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        assert(1 <= length && length <= 254);
10277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        assert(length == 1 || digits.charAt(0) != 0);
10287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        long numericPrimary = data.numericPrimary;
10297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Note: We use primary byte values 2..255: digits are not compressible.
10307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if(length <= 7) {
10317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Very dense encoding for small numbers.
10327935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int value = digits.charAt(0);
10337935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            for(int i = 1; i < length; ++i) {
10347935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                value = value * 10 + digits.charAt(i);
10357935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
10367935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Primary weight second byte values:
10377935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
10387935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            //     40 byte values  76..115 for medium numbers in three-byte primary weights.
10397935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            //     16 byte values 116..131 for large numbers in four-byte primary weights.
10407935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
10417935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int firstByte = 2;
10427935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            int numBytes = 74;
10437935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(value < numBytes) {
10447935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Two-byte primary for 0..73, good for day & month numbers etc.
10457935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                long primary = numericPrimary | ((firstByte + value) << 16);
10467935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.append(Collation.makeCE(primary));
10477935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return;
10487935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
10497935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            value -= numBytes;
10507935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            firstByte += numBytes;
10517935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            numBytes = 40;
10527935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(value < numBytes * 254) {
10537935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
10547935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                long primary = numericPrimary |
10557935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                    ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
10567935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.append(Collation.makeCE(primary));
10577935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return;
10587935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
10597935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            value -= numBytes * 254;
10607935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            firstByte += numBytes;
10617935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            numBytes = 16;
10627935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(value < numBytes * 254 * 254) {
10637935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Four-byte primary for 10234..1042489=10234+16*254*254-1.
10647935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                long primary = numericPrimary | (2 + value % 254);
10657935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                value /= 254;
10667935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                primary |= (2 + value % 254) << 8;
10677935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                value /= 254;
10687935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                primary |= (firstByte + value % 254) << 16;
10697935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.append(Collation.makeCE(primary));
10707935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                return;
10717935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
10727935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // original value > 1042489
10737935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
10747935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        assert(length >= 7);
10757935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
10767935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
10777935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // then we generate primary bytes with those pairs.
10787935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Omit trailing 00 pairs.
10797935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Decrement the value for the last pair.
10807935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
10817935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255.
10827935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int numPairs = (length + 1) / 2;
10837935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        long primary = numericPrimary | ((132 - 4 + numPairs) << 16);
10847935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Find the length without trailing 00 pairs.
10857935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) {
10867935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            length -= 2;
10877935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
10887935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Read the first pair.
10897935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int pair;
10907935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int pos;
10917935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        if((length & 1) != 0) {
10927935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            // Only "half a pair" if we have an odd number of digits.
10937935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pair = digits.charAt(0);
10947935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pos = 1;
10957935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        } else {
10967935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pair = digits.charAt(0) * 10 + digits.charAt(1);
10977935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pos = 2;
10987935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
10997935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        pair = 11 + 2 * pair;
11007935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        // Add the pairs of digits between pos and length.
11017935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        int shift = 8;
11027935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        while(pos < length) {
11037935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            if(shift == 0) {
11047935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // Every three pairs/bytes we need to store a 4-byte-primary CE
11057935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                // and start with a new CE with the '0' primary lead byte.
11067935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                primary |= pair;
11077935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                ceBuffer.append(Collation.makeCE(primary));
11087935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                primary = numericPrimary;
11097935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                shift = 16;
11107935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            } else {
11117935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                primary |= pair << shift;
11127935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert                shift -= 8;
11137935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            }
11147935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1));
11157935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert            pos += 2;
11167935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        }
11177935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        primary |= (pair - 1) << shift;
11187935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert        ceBuffer.append(Collation.makeCE(primary));
11197935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    }
11207935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
11217935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private CEBuffer ceBuffer;
11227935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private int cesIndex;
11237935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
11247935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private SkippedState skipped;
11257935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert
11267935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Number of code points to read forward, or -1.
11277935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Used as a forward iteration limit in previousCEUnsafe().
11287935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private int numCpFwd;
11297935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    // Numeric collation (CollationSettings.NUMERIC).
11307935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert    private boolean isNumeric;
11317935b1839a081ed19ae0d33029ad3c09632a2caaFredrik Roubert}
1132