1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5*******************************************************************************
6* Copyright (C) 2012-2015, International Business Machines
7* Corporation and others.  All Rights Reserved.
8*******************************************************************************
9* CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp
10*
11* C++ version created on: 2012apr01
12* created by: Markus W. Scherer
13*/
14
15package android.icu.impl.coll;
16
17import java.util.ArrayList;
18import java.util.Arrays;
19import java.util.Iterator;
20
21import android.icu.impl.Norm2AllModes;
22import android.icu.impl.Normalizer2Impl;
23import android.icu.impl.Normalizer2Impl.Hangul;
24import android.icu.impl.Trie2;
25import android.icu.impl.Trie2Writable;
26import android.icu.lang.UCharacter;
27import android.icu.text.UnicodeSet;
28import android.icu.text.UnicodeSetIterator;
29import android.icu.util.CharsTrie;
30import android.icu.util.CharsTrieBuilder;
31import android.icu.util.StringTrieBuilder;
32
33/**
34 * Low-level CollationData builder.
35 * Takes (character, CE) pairs and builds them into runtime data structures.
36 * Supports characters with context prefixes and contraction suffixes.
37 */
38final class CollationDataBuilder {  // not final in C++
39    /**
40     * Collation element modifier. Interface class for a modifier
41     * that changes a tailoring builder's temporary CEs to final CEs.
42     * Called for every non-special CE32 and every expansion CE.
43     */
44    interface CEModifier {
45        /** Returns a new CE to replace the non-special input CE32, or else Collation.NO_CE. */
46        long modifyCE32(int ce32);
47        /** Returns a new CE to replace the input CE, or else Collation.NO_CE. */
48        long modifyCE(long ce);
49    }
50
51    CollationDataBuilder() {
52        nfcImpl = Norm2AllModes.getNFCInstance().impl;
53        base = null;
54        baseSettings = null;
55        trie = null;
56        ce32s = new UVector32();
57        ce64s = new UVector64();
58        conditionalCE32s = new ArrayList<ConditionalCE32>();
59        modified = false;
60        fastLatinEnabled = false;
61        fastLatinBuilder = null;
62        collIter = null;
63        // Reserve the first CE32 for U+0000.
64        ce32s.addElement(0);
65    }
66
67    void initForTailoring(CollationData b) {
68        if(trie != null) {
69            throw new IllegalStateException("attempt to reuse a CollationDataBuilder");
70        }
71        if(b == null) {
72            throw new IllegalArgumentException("null CollationData");
73        }
74        base = b;
75
76        // For a tailoring, the default is to fall back to the base.
77        trie = new Trie2Writable(Collation.FALLBACK_CE32, Collation.FFFD_CE32);
78
79        // Set the Latin-1 letters block so that it is allocated first in the data array,
80        // to try to improve locality of reference when sorting Latin-1 text.
81        // Do not use utrie2_setRange32() since that will not actually allocate blocks
82        // that are filled with the default value.
83        // ASCII (0..7F) is already preallocated anyway.
84        for(int c = 0xc0; c <= 0xff; ++c) {
85            trie.set(c, Collation.FALLBACK_CE32);
86        }
87
88        // Hangul syllables are not tailorable (except via tailoring Jamos).
89        // Always set the Hangul tag to help performance.
90        // Do this here, rather than in buildMappings(),
91        // so that we see the HANGUL_TAG in various assertions.
92        int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
93        trie.setRange(Hangul.HANGUL_BASE, Hangul.HANGUL_END, hangulCE32, true);
94
95        // Copy the set contents but don't copy/clone the set as a whole because
96        // that would copy the isFrozen state too.
97        unsafeBackwardSet.addAll(b.unsafeBackwardSet);
98    }
99
100    boolean isCompressibleLeadByte(int b) {
101        return base.isCompressibleLeadByte(b);
102    }
103
104    boolean isCompressiblePrimary(long p) {
105        return isCompressibleLeadByte((int)p >>> 24);
106    }
107
108    /**
109     * @return true if this builder has mappings (e.g., add() has been called)
110     */
111    boolean hasMappings() { return modified; }
112
113    /**
114     * @return true if c has CEs in this builder
115     */
116    boolean isAssigned(int c) {
117        return Collation.isAssignedCE32(trie.get(c));
118    }
119
120    void add(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
121        int ce32 = encodeCEs(ces, cesLength);
122        addCE32(prefix, s, ce32);
123    }
124
125    /**
126     * Encodes the ces as either the returned ce32 by itself,
127     * or by storing an expansion, with the returned ce32 referring to that.
128     *
129     * <p>add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength))
130     */
131    int encodeCEs(long ces[], int cesLength) {
132        if(cesLength < 0 || cesLength > Collation.MAX_EXPANSION_LENGTH) {
133            throw new IllegalArgumentException("mapping to too many CEs");
134        }
135        if(!isMutable()) {
136            throw new IllegalStateException("attempt to add mappings after build()");
137        }
138        if(cesLength == 0) {
139            // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE.
140            // Do this here so that callers need not do it.
141            return encodeOneCEAsCE32(0);
142        } else if(cesLength == 1) {
143            return encodeOneCE(ces[0]);
144        } else if(cesLength == 2) {
145            // Try to encode two CEs as one CE32.
146            long ce0 = ces[0];
147            long ce1 = ces[1];
148            long p0 = ce0 >>> 32;
149            if((ce0 & 0xffffffffff00ffL) == Collation.COMMON_SECONDARY_CE &&
150                    (ce1 & 0xffffffff00ffffffL) == Collation.COMMON_TERTIARY_CE &&
151                    p0 != 0) {
152                // Latin mini expansion
153                return
154                    (int)p0 |
155                    (((int)ce0 & 0xff00) << 8) |
156                    (((int)ce1 >> 16) & 0xff00) |
157                    Collation.SPECIAL_CE32_LOW_BYTE |
158                    Collation.LATIN_EXPANSION_TAG;
159            }
160        }
161        // Try to encode two or more CEs as CE32s.
162        int[] newCE32s = new int[Collation.MAX_EXPANSION_LENGTH];  // TODO: instance field?
163        for(int i = 0;; ++i) {
164            if(i == cesLength) {
165                return encodeExpansion32(newCE32s, 0, cesLength);
166            }
167            int ce32 = encodeOneCEAsCE32(ces[i]);
168            if(ce32 == Collation.NO_CE32) { break; }
169            newCE32s[i] = ce32;
170        }
171        return encodeExpansion(ces, 0, cesLength);
172    }
173
174    void addCE32(CharSequence prefix, CharSequence s, int ce32) {
175        if(s.length() == 0) {
176            throw new IllegalArgumentException("mapping from empty string");
177        }
178        if(!isMutable()) {
179            throw new IllegalStateException("attempt to add mappings after build()");
180        }
181        int c = Character.codePointAt(s, 0);
182        int cLength = Character.charCount(c);
183        int oldCE32 = trie.get(c);
184        boolean hasContext = prefix.length() != 0|| s.length() > cLength;
185        if(oldCE32 == Collation.FALLBACK_CE32) {
186            // First tailoring for c.
187            // If c has contextual base mappings or if we add a contextual mapping,
188            // then copy the base mappings.
189            // Otherwise we just override the base mapping.
190            int baseCE32 = base.getFinalCE32(base.getCE32(c));
191            if(hasContext || Collation.ce32HasContext(baseCE32)) {
192                oldCE32 = copyFromBaseCE32(c, baseCE32, true);
193                trie.set(c, oldCE32);
194            }
195        }
196        if(!hasContext) {
197            // No prefix, no contraction.
198            if(!isBuilderContextCE32(oldCE32)) {
199                trie.set(c, ce32);
200            } else {
201                ConditionalCE32 cond = getConditionalCE32ForCE32(oldCE32);
202                cond.builtCE32 = Collation.NO_CE32;
203                cond.ce32 = ce32;
204            }
205        } else {
206            ConditionalCE32 cond;
207            if(!isBuilderContextCE32(oldCE32)) {
208                // Replace the simple oldCE32 with a builder context CE32
209                // pointing to a new ConditionalCE32 list head.
210                int index = addConditionalCE32("\0", oldCE32);
211                int contextCE32 = makeBuilderContextCE32(index);
212                trie.set(c, contextCE32);
213                contextChars.add(c);
214                cond = getConditionalCE32(index);
215            } else {
216                cond = getConditionalCE32ForCE32(oldCE32);
217                cond.builtCE32 = Collation.NO_CE32;
218            }
219            CharSequence suffix = s.subSequence(cLength, s.length());
220            String context = new StringBuilder().append((char)prefix.length()).
221                    append(prefix).append(suffix).toString();
222            unsafeBackwardSet.addAll(suffix);
223            for(;;) {
224                // invariant: context > cond.context
225                int next = cond.next;
226                if(next < 0) {
227                    // Append a new ConditionalCE32 after cond.
228                    int index = addConditionalCE32(context, ce32);
229                    cond.next = index;
230                    break;
231                }
232                ConditionalCE32 nextCond = getConditionalCE32(next);
233                int cmp = context.compareTo(nextCond.context);
234                if(cmp < 0) {
235                    // Insert a new ConditionalCE32 between cond and nextCond.
236                    int index = addConditionalCE32(context, ce32);
237                    cond.next = index;
238                    getConditionalCE32(index).next = next;
239                    break;
240                } else if(cmp == 0) {
241                    // Same context as before, overwrite its ce32.
242                    nextCond.ce32 = ce32;
243                    break;
244                }
245                cond = nextCond;
246            }
247        }
248        modified = true;
249    }
250
251    /**
252     * Copies all mappings from the src builder, with modifications.
253     * This builder here must not be built yet, and should be empty.
254     */
255    void copyFrom(CollationDataBuilder src, CEModifier modifier) {
256        if(!isMutable()) {
257            throw new IllegalStateException("attempt to copyFrom() after build()");
258        }
259        CopyHelper helper = new CopyHelper(src, this, modifier);
260        Iterator<Trie2.Range> trieIterator = src.trie.iterator();
261        Trie2.Range range;
262        while(trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
263            enumRangeForCopy(range.startCodePoint, range.endCodePoint, range.value, helper);
264        }
265        // Update the contextChars and the unsafeBackwardSet while copying,
266        // in case a character had conditional mappings in the source builder
267        // and they were removed later.
268        modified |= src.modified;
269    }
270
271    void optimize(UnicodeSet set) {
272        if(set.isEmpty()) { return; }
273        UnicodeSetIterator iter = new UnicodeSetIterator(set);
274        while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) {
275            int c = iter.codepoint;
276            int ce32 = trie.get(c);
277            if(ce32 == Collation.FALLBACK_CE32) {
278                ce32 = base.getFinalCE32(base.getCE32(c));
279                ce32 = copyFromBaseCE32(c, ce32, true);
280                trie.set(c, ce32);
281            }
282        }
283        modified = true;
284    }
285
286    void suppressContractions(UnicodeSet set) {
287        if(set.isEmpty()) { return; }
288        UnicodeSetIterator iter = new UnicodeSetIterator(set);
289        while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) {
290            int c = iter.codepoint;
291            int ce32 = trie.get(c);
292            if(ce32 == Collation.FALLBACK_CE32) {
293                ce32 = base.getFinalCE32(base.getCE32(c));
294                if(Collation.ce32HasContext(ce32)) {
295                    ce32 = copyFromBaseCE32(c, ce32, false /* without context */);
296                    trie.set(c, ce32);
297                }
298            } else if(isBuilderContextCE32(ce32)) {
299                ce32 = getConditionalCE32ForCE32(ce32).ce32;
300                // Simply abandon the list of ConditionalCE32.
301                // The caller will copy this builder in the end,
302                // eliminating unreachable data.
303                trie.set(c, ce32);
304                contextChars.remove(c);
305            }
306        }
307        modified = true;
308    }
309
310    void enableFastLatin() { fastLatinEnabled = true; }
311    void build(CollationData data) {
312        buildMappings(data);
313        if(base != null) {
314            data.numericPrimary = base.numericPrimary;
315            data.compressibleBytes = base.compressibleBytes;
316            data.numScripts = base.numScripts;
317            data.scriptsIndex = base.scriptsIndex;
318            data.scriptStarts = base.scriptStarts;
319        }
320        buildFastLatinTable(data);
321    }
322
323    /**
324     * Looks up CEs for s and appends them to the ces array.
325     * Does not handle normalization: s should be in FCD form.
326     *
327     * Does not write completely ignorable CEs.
328     * Does not write beyond Collation.MAX_EXPANSION_LENGTH.
329     *
330     * @return incremented cesLength
331     */
332    int getCEs(CharSequence s, long ces[], int cesLength) {
333        return getCEs(s, 0, ces, cesLength);
334    }
335
336    int getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength) {
337        int prefixLength = prefix.length();
338        if(prefixLength == 0) {
339            return getCEs(s, 0, ces, cesLength);
340        } else {
341            return getCEs(new StringBuilder(prefix).append(s), prefixLength, ces, cesLength);
342        }
343    }
344
345    /**
346     * Build-time context and CE32 for a code point.
347     * If a code point has contextual mappings, then the default (no-context) mapping
348     * and all conditional mappings are stored in a singly-linked list
349     * of ConditionalCE32, sorted by context strings.
350     *
351     * Context strings sort by prefix length, then by prefix, then by contraction suffix.
352     * Context strings must be unique and in ascending order.
353     */
354    private static final class ConditionalCE32 {
355        ConditionalCE32(String ct, int ce) {
356            context = ct;
357            ce32 = ce;
358            defaultCE32 = Collation.NO_CE32;
359            builtCE32 = Collation.NO_CE32;
360            next = -1;
361        }
362
363        boolean hasContext() { return context.length() > 1; }
364        int prefixLength() { return context.charAt(0); }
365
366        /**
367         * "\0" for the first entry for any code point, with its default CE32.
368         *
369         * Otherwise one unit with the length of the prefix string,
370         * then the prefix string, then the contraction suffix.
371         */
372        String context;
373        /**
374         * CE32 for the code point and its context.
375         * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag).
376         */
377        int ce32;
378        /**
379         * Default CE32 for all contexts with this same prefix.
380         * Initially NO_CE32. Set only while building runtime data structures,
381         * and only on one of the nodes of a sub-list with the same prefix.
382         */
383        int defaultCE32;
384        /**
385         * CE32 for the built contexts.
386         * When fetching CEs from the builder, the contexts are built into their runtime form
387         * so that the normal collation implementation can process them.
388         * The result is cached in the list head. It is reset when the contexts are modified.
389         */
390        int builtCE32;
391        /**
392         * Index of the next ConditionalCE32.
393         * Negative for the end of the list.
394         */
395        int next;
396    }
397
398    protected int getCE32FromOffsetCE32(boolean fromBase, int c, int ce32) {
399        int i = Collation.indexFromCE32(ce32);
400        long dataCE = fromBase ? base.ces[i] : ce64s.elementAti(i);
401        long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
402        return Collation.makeLongPrimaryCE32(p);
403    }
404
405    protected int addCE(long ce) {
406        int length = ce64s.size();
407        for(int i = 0; i < length; ++i) {
408            if(ce == ce64s.elementAti(i)) { return i; }
409        }
410        ce64s.addElement(ce);
411        return length;
412    }
413
414    protected int addCE32(int ce32) {
415        int length = ce32s.size();
416        for(int i = 0; i < length; ++i) {
417            if(ce32 == ce32s.elementAti(i)) { return i; }
418        }
419        ce32s.addElement(ce32);
420        return length;
421    }
422
423    protected int addConditionalCE32(String context, int ce32) {
424        assert(context.length() != 0);
425        int index = conditionalCE32s.size();
426        if(index > Collation.MAX_INDEX) {
427            throw new IndexOutOfBoundsException("too many context-sensitive mappings");
428            // BufferOverflowException is a better fit
429            // but cannot be constructed with a message string.
430        }
431        ConditionalCE32 cond = new ConditionalCE32(context, ce32);
432        conditionalCE32s.add(cond);
433        return index;
434    }
435
436    protected ConditionalCE32 getConditionalCE32(int index) {
437        return conditionalCE32s.get(index);
438    }
439    protected ConditionalCE32 getConditionalCE32ForCE32(int ce32) {
440        return getConditionalCE32(Collation.indexFromCE32(ce32));
441    }
442
443    protected static int makeBuilderContextCE32(int index) {
444        return Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, index);
445    }
446    protected static boolean isBuilderContextCE32(int ce32) {
447        return Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG);
448    }
449
450    protected static int encodeOneCEAsCE32(long ce) {
451        long p = ce >>> 32;
452        int lower32 = (int)ce;
453        int t = lower32 & 0xffff;
454        assert((t & 0xc000) != 0xc000);  // Impossible case bits 11 mark special CE32s.
455        if((ce & 0xffff00ff00ffL) == 0) {
456            // normal form ppppsstt
457            return (int)p | (lower32 >>> 16) | (t >> 8);
458        } else if((ce & 0xffffffffffL) == Collation.COMMON_SEC_AND_TER_CE) {
459            // long-primary form ppppppC1
460            return Collation.makeLongPrimaryCE32(p);
461        } else if(p == 0 && (t & 0xff) == 0) {
462            // long-secondary form ssssttC2
463            return Collation.makeLongSecondaryCE32(lower32);
464        }
465        return Collation.NO_CE32;
466    }
467
468    protected int encodeOneCE(long ce) {
469        // Try to encode one CE as one CE32.
470        int ce32 = encodeOneCEAsCE32(ce);
471        if(ce32 != Collation.NO_CE32) { return ce32; }
472        int index = addCE(ce);
473        if(index > Collation.MAX_INDEX) {
474            throw new IndexOutOfBoundsException("too many mappings");
475            // BufferOverflowException is a better fit
476            // but cannot be constructed with a message string.
477        }
478        return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, index, 1);
479    }
480
481    protected int encodeExpansion(long ces[], int start, int length) {
482        // See if this sequence of CEs has already been stored.
483        long first = ces[start];
484        int ce64sMax = ce64s.size() - length;
485        for(int i = 0; i <= ce64sMax; ++i) {
486            if(first == ce64s.elementAti(i)) {
487                if(i > Collation.MAX_INDEX) {
488                    throw new IndexOutOfBoundsException("too many mappings");
489                    // BufferOverflowException is a better fit
490                    // but cannot be constructed with a message string.
491                }
492                for(int j = 1;; ++j) {
493                    if(j == length) {
494                        return Collation.makeCE32FromTagIndexAndLength(
495                                Collation.EXPANSION_TAG, i, length);
496                    }
497                    if(ce64s.elementAti(i + j) != ces[start + j]) { break; }
498                }
499            }
500        }
501        // Store the new sequence.
502        int i = ce64s.size();
503        if(i > Collation.MAX_INDEX) {
504            throw new IndexOutOfBoundsException("too many mappings");
505            // BufferOverflowException is a better fit
506            // but cannot be constructed with a message string.
507        }
508        for(int j = 0; j < length; ++j) {
509            ce64s.addElement(ces[start + j]);
510        }
511        return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, i, length);
512    }
513
514    protected int encodeExpansion32(int newCE32s[], int start, int length) {
515        // See if this sequence of CE32s has already been stored.
516        int first = newCE32s[start];
517        int ce32sMax = ce32s.size() - length;
518        for(int i = 0; i <= ce32sMax; ++i) {
519            if(first == ce32s.elementAti(i)) {
520                if(i > Collation.MAX_INDEX) {
521                    throw new IndexOutOfBoundsException("too many mappings");
522                    // BufferOverflowException is a better fit
523                    // but cannot be constructed with a message string.
524                }
525                for(int j = 1;; ++j) {
526                    if(j == length) {
527                        return Collation.makeCE32FromTagIndexAndLength(
528                                Collation.EXPANSION32_TAG, i, length);
529                    }
530                    if(ce32s.elementAti(i + j) != newCE32s[start + j]) { break; }
531                }
532            }
533        }
534        // Store the new sequence.
535        int i = ce32s.size();
536        if(i > Collation.MAX_INDEX) {
537            throw new IndexOutOfBoundsException("too many mappings");
538            // BufferOverflowException is a better fit
539            // but cannot be constructed with a message string.
540        }
541        for(int j = 0; j < length; ++j) {
542            ce32s.addElement(newCE32s[start + j]);
543        }
544        return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION32_TAG, i, length);
545    }
546
547    protected int copyFromBaseCE32(int c, int ce32, boolean withContext) {
548        if(!Collation.isSpecialCE32(ce32)) { return ce32; }
549        switch(Collation.tagFromCE32(ce32)) {
550        case Collation.LONG_PRIMARY_TAG:
551        case Collation.LONG_SECONDARY_TAG:
552        case Collation.LATIN_EXPANSION_TAG:
553            // copy as is
554            break;
555        case Collation.EXPANSION32_TAG: {
556            int index = Collation.indexFromCE32(ce32);
557            int length = Collation.lengthFromCE32(ce32);
558            ce32 = encodeExpansion32(base.ce32s, index, length);
559            break;
560        }
561        case Collation.EXPANSION_TAG: {
562            int index = Collation.indexFromCE32(ce32);
563            int length = Collation.lengthFromCE32(ce32);
564            ce32 = encodeExpansion(base.ces, index, length);
565            break;
566        }
567        case Collation.PREFIX_TAG: {
568            // Flatten prefixes and nested suffixes (contractions)
569            // into a linear list of ConditionalCE32.
570            int trieIndex = Collation.indexFromCE32(ce32);
571            ce32 = base.getCE32FromContexts(trieIndex);  // Default if no prefix match.
572            if(!withContext) {
573                return copyFromBaseCE32(c, ce32, false);
574            }
575            ConditionalCE32 head = new ConditionalCE32("", 0);
576            StringBuilder context = new StringBuilder("\0");
577            int index;
578            if(Collation.isContractionCE32(ce32)) {
579                index = copyContractionsFromBaseCE32(context, c, ce32, head);
580            } else {
581                ce32 = copyFromBaseCE32(c, ce32, true);
582                head.next = index = addConditionalCE32(context.toString(), ce32);
583            }
584            ConditionalCE32 cond = getConditionalCE32(index);  // the last ConditionalCE32 so far
585            CharsTrie.Iterator prefixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
586            while(prefixes.hasNext()) {
587                CharsTrie.Entry entry = prefixes.next();
588                context.setLength(0);
589                context.append(entry.chars).reverse().insert(0, (char)entry.chars.length());
590                ce32 = entry.value;
591                if(Collation.isContractionCE32(ce32)) {
592                    index = copyContractionsFromBaseCE32(context, c, ce32, cond);
593                } else {
594                    ce32 = copyFromBaseCE32(c, ce32, true);
595                    cond.next = index = addConditionalCE32(context.toString(), ce32);
596                }
597                cond = getConditionalCE32(index);
598            }
599            ce32 = makeBuilderContextCE32(head.next);
600            contextChars.add(c);
601            break;
602        }
603        case Collation.CONTRACTION_TAG: {
604            if(!withContext) {
605                int index = Collation.indexFromCE32(ce32);
606                ce32 = base.getCE32FromContexts(index);  // Default if no suffix match.
607                return copyFromBaseCE32(c, ce32, false);
608            }
609            ConditionalCE32 head = new ConditionalCE32("", 0);
610            StringBuilder context = new StringBuilder("\0");
611            copyContractionsFromBaseCE32(context, c, ce32, head);
612            ce32 = makeBuilderContextCE32(head.next);
613            contextChars.add(c);
614            break;
615        }
616        case Collation.HANGUL_TAG:
617            throw new UnsupportedOperationException("We forbid tailoring of Hangul syllables.");
618        case Collation.OFFSET_TAG:
619            ce32 = getCE32FromOffsetCE32(true, c, ce32);
620            break;
621        case Collation.IMPLICIT_TAG:
622            ce32 = encodeOneCE(Collation.unassignedCEFromCodePoint(c));
623            break;
624        default:
625            throw new AssertionError("copyFromBaseCE32(c, ce32, withContext) " +
626                    "requires ce32 == base.getFinalCE32(ce32)");
627        }
628        return ce32;
629    }
630
631    /**
632     * Copies base contractions to a list of ConditionalCE32.
633     * Sets cond.next to the index of the first new item
634     * and returns the index of the last new item.
635     */
636    protected int copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32,
637            ConditionalCE32 cond) {
638        int trieIndex = Collation.indexFromCE32(ce32);
639        int index;
640        if((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
641            // No match on the single code point.
642            // We are underneath a prefix, and the default mapping is just
643            // a fallback to the mappings for a shorter prefix.
644            assert(context.length() > 1);
645            index = -1;
646        } else {
647            ce32 = base.getCE32FromContexts(trieIndex);  // Default if no suffix match.
648            assert(!Collation.isContractionCE32(ce32));
649            ce32 = copyFromBaseCE32(c, ce32, true);
650            cond.next = index = addConditionalCE32(context.toString(), ce32);
651            cond = getConditionalCE32(index);
652        }
653
654        int suffixStart = context.length();
655        CharsTrie.Iterator suffixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0);
656        while(suffixes.hasNext()) {
657            CharsTrie.Entry entry = suffixes.next();
658            context.append(entry.chars);
659            ce32 = copyFromBaseCE32(c, entry.value, true);
660            cond.next = index = addConditionalCE32(context.toString(), ce32);
661            // No need to update the unsafeBackwardSet because the tailoring set
662            // is already a copy of the base set.
663            cond = getConditionalCE32(index);
664            context.setLength(suffixStart);
665        }
666        assert(index >= 0);
667        return index;
668    }
669
670    private static final class CopyHelper {
671        CopyHelper(CollationDataBuilder s, CollationDataBuilder d,
672                  CollationDataBuilder.CEModifier m) {
673            src = s;
674            dest = d;
675            modifier = m;
676        }
677
678        void copyRangeCE32(int start, int end, int ce32) {
679            ce32 = copyCE32(ce32);
680            dest.trie.setRange(start, end, ce32, true);
681            if(CollationDataBuilder.isBuilderContextCE32(ce32)) {
682                dest.contextChars.add(start, end);
683            }
684        }
685
686        int copyCE32(int ce32) {
687            if(!Collation.isSpecialCE32(ce32)) {
688                long ce = modifier.modifyCE32(ce32);
689                if(ce != Collation.NO_CE) {
690                    ce32 = dest.encodeOneCE(ce);
691                }
692            } else {
693                int tag = Collation.tagFromCE32(ce32);
694                if(tag == Collation.EXPANSION32_TAG) {
695                    int[] srcCE32s = src.ce32s.getBuffer();
696                    int srcIndex = Collation.indexFromCE32(ce32);
697                    int length = Collation.lengthFromCE32(ce32);
698                    // Inspect the source CE32s. Just copy them if none are modified.
699                    // Otherwise copy to modifiedCEs, with modifications.
700                    boolean isModified = false;
701                    for(int i = 0; i < length; ++i) {
702                        ce32 = srcCE32s[srcIndex + i];
703                        long ce;
704                        if(Collation.isSpecialCE32(ce32) ||
705                                (ce = modifier.modifyCE32(ce32)) == Collation.NO_CE) {
706                            if(isModified) {
707                                modifiedCEs[i] = Collation.ceFromCE32(ce32);
708                            }
709                        } else {
710                            if(!isModified) {
711                                for(int j = 0; j < i; ++j) {
712                                    modifiedCEs[j] = Collation.ceFromCE32(srcCE32s[srcIndex + j]);
713                                }
714                                isModified = true;
715                            }
716                            modifiedCEs[i] = ce;
717                        }
718                    }
719                    if(isModified) {
720                        ce32 = dest.encodeCEs(modifiedCEs, length);
721                    } else {
722                        ce32 = dest.encodeExpansion32(srcCE32s, srcIndex, length);
723                    }
724                } else if(tag == Collation.EXPANSION_TAG) {
725                    long[] srcCEs = src.ce64s.getBuffer();
726                    int srcIndex = Collation.indexFromCE32(ce32);
727                    int length = Collation.lengthFromCE32(ce32);
728                    // Inspect the source CEs. Just copy them if none are modified.
729                    // Otherwise copy to modifiedCEs, with modifications.
730                    boolean isModified = false;
731                    for(int i = 0; i < length; ++i) {
732                        long srcCE = srcCEs[srcIndex + i];
733                        long ce = modifier.modifyCE(srcCE);
734                        if(ce == Collation.NO_CE) {
735                            if(isModified) {
736                                modifiedCEs[i] = srcCE;
737                            }
738                        } else {
739                            if(!isModified) {
740                                for(int j = 0; j < i; ++j) {
741                                    modifiedCEs[j] = srcCEs[srcIndex + j];
742                                }
743                                isModified = true;
744                            }
745                            modifiedCEs[i] = ce;
746                        }
747                    }
748                    if(isModified) {
749                        ce32 = dest.encodeCEs(modifiedCEs, length);
750                    } else {
751                        ce32 = dest.encodeExpansion(srcCEs, srcIndex, length);
752                    }
753                } else if(tag == Collation.BUILDER_DATA_TAG) {
754                    // Copy the list of ConditionalCE32.
755                    ConditionalCE32 cond = src.getConditionalCE32ForCE32(ce32);
756                    assert(!cond.hasContext());
757                    int destIndex = dest.addConditionalCE32(
758                            cond.context, copyCE32(cond.ce32));
759                    ce32 = CollationDataBuilder.makeBuilderContextCE32(destIndex);
760                    while(cond.next >= 0) {
761                        cond = src.getConditionalCE32(cond.next);
762                        ConditionalCE32 prevDestCond = dest.getConditionalCE32(destIndex);
763                        destIndex = dest.addConditionalCE32(
764                                cond.context, copyCE32(cond.ce32));
765                        int suffixStart = cond.prefixLength() + 1;
766                        dest.unsafeBackwardSet.addAll(cond.context.substring(suffixStart));
767                        prevDestCond.next = destIndex;
768                    }
769                } else {
770                    // Just copy long CEs and Latin mini expansions (and other expected values) as is,
771                    // assuming that the modifier would not modify them.
772                    assert(tag == Collation.LONG_PRIMARY_TAG ||
773                            tag == Collation.LONG_SECONDARY_TAG ||
774                            tag == Collation.LATIN_EXPANSION_TAG ||
775                            tag == Collation.HANGUL_TAG);
776                }
777            }
778            return ce32;
779        }
780
781        CollationDataBuilder src;
782        CollationDataBuilder dest;
783        CollationDataBuilder.CEModifier modifier;
784        long[] modifiedCEs = new long[Collation.MAX_EXPANSION_LENGTH];
785    }
786
787    private static void
788    enumRangeForCopy(int start, int end, int value, CopyHelper helper) {
789        if(value != Collation.UNASSIGNED_CE32 && value != Collation.FALLBACK_CE32) {
790            helper.copyRangeCE32(start, end, value);
791        }
792    }
793
794    protected boolean getJamoCE32s(int jamoCE32s[]) {
795        boolean anyJamoAssigned = base == null;  // always set jamoCE32s in the base data
796        boolean needToCopyFromBase = false;
797        for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
798            int jamo = jamoCpFromIndex(j);
799            boolean fromBase = false;
800            int ce32 = trie.get(jamo);
801            anyJamoAssigned |= Collation.isAssignedCE32(ce32);
802            // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned.
803            // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.)
804            if(ce32 == Collation.FALLBACK_CE32) {
805                fromBase = true;
806                ce32 = base.getCE32(jamo);
807            }
808            if(Collation.isSpecialCE32(ce32)) {
809                switch(Collation.tagFromCE32(ce32)) {
810                case Collation.LONG_PRIMARY_TAG:
811                case Collation.LONG_SECONDARY_TAG:
812                case Collation.LATIN_EXPANSION_TAG:
813                    // Copy the ce32 as-is.
814                    break;
815                case Collation.EXPANSION32_TAG:
816                case Collation.EXPANSION_TAG:
817                case Collation.PREFIX_TAG:
818                case Collation.CONTRACTION_TAG:
819                    if(fromBase) {
820                        // Defer copying until we know if anyJamoAssigned.
821                        ce32 = Collation.FALLBACK_CE32;
822                        needToCopyFromBase = true;
823                    }
824                    break;
825                case Collation.IMPLICIT_TAG:
826                    // An unassigned Jamo should only occur in tests with incomplete bases.
827                    assert(fromBase);
828                    ce32 = Collation.FALLBACK_CE32;
829                    needToCopyFromBase = true;
830                    break;
831                case Collation.OFFSET_TAG:
832                    ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32);
833                    break;
834                case Collation.FALLBACK_TAG:
835                case Collation.RESERVED_TAG_3:
836                case Collation.BUILDER_DATA_TAG:
837                case Collation.DIGIT_TAG:
838                case Collation.U0000_TAG:
839                case Collation.HANGUL_TAG:
840                case Collation.LEAD_SURROGATE_TAG:
841                    throw new AssertionError(String.format("unexpected special tag in ce32=0x%08x", ce32));
842                }
843            }
844            jamoCE32s[j] = ce32;
845        }
846        if(anyJamoAssigned && needToCopyFromBase) {
847            for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {
848                if(jamoCE32s[j] == Collation.FALLBACK_CE32) {
849                    int jamo = jamoCpFromIndex(j);
850                    jamoCE32s[j] = copyFromBaseCE32(jamo, base.getCE32(jamo),
851                                                    /*withContext=*/ true);
852                }
853            }
854        }
855        return anyJamoAssigned;
856    }
857
858    protected void setDigitTags() {
859        UnicodeSet digits = new UnicodeSet("[:Nd:]");
860        UnicodeSetIterator iter = new UnicodeSetIterator(digits);
861        while(iter.next()) {
862            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
863            int c = iter.codepoint;
864            int ce32 = trie.get(c);
865            if(ce32 != Collation.FALLBACK_CE32 && ce32 != Collation.UNASSIGNED_CE32) {
866                int index = addCE32(ce32);
867                if(index > Collation.MAX_INDEX) {
868                    throw new IndexOutOfBoundsException("too many mappings");
869                    // BufferOverflowException is a better fit
870                    // but cannot be constructed with a message string.
871                }
872                ce32 = Collation.makeCE32FromTagIndexAndLength(
873                        Collation.DIGIT_TAG, index, UCharacter.digit(c));  // u_charDigitValue(c)
874                trie.set(c, ce32);
875            }
876        }
877    }
878
879    protected void setLeadSurrogates() {
880        for(char lead = 0xd800; lead < 0xdc00; ++lead) {
881            int leadValue = -1;
882            // utrie2_enumForLeadSurrogate(trie, lead, null, , &value);
883            Iterator<Trie2.Range> trieIterator = trie.iteratorForLeadSurrogate(lead);
884            while(trieIterator.hasNext()) {
885                Trie2.Range range = trieIterator.next();
886                // The rest of this loop is equivalent to C++ enumRangeLeadValue().
887                int value = range.value;
888                if(value == Collation.UNASSIGNED_CE32) {
889                    value = Collation.LEAD_ALL_UNASSIGNED;
890                } else if(value == Collation.FALLBACK_CE32) {
891                    value = Collation.LEAD_ALL_FALLBACK;
892                } else {
893                    leadValue = Collation.LEAD_MIXED;
894                    break;
895                }
896                if(leadValue < 0) {
897                    leadValue = value;
898                } else if(leadValue != value) {
899                    leadValue = Collation.LEAD_MIXED;
900                    break;
901                }
902            }
903            trie.setForLeadSurrogateCodeUnit(lead,
904                    Collation.makeCE32FromTagAndIndex(Collation.LEAD_SURROGATE_TAG, 0) | leadValue);
905        }
906    }
907
908    protected void buildMappings(CollationData data) {
909        if(!isMutable()) {
910            throw new IllegalStateException("attempt to build() after build()");
911        }
912
913        buildContexts();
914
915        int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
916        int jamoIndex = -1;
917        if(getJamoCE32s(jamoCE32s)) {
918            jamoIndex = ce32s.size();
919            for(int i = 0; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
920                ce32s.addElement(jamoCE32s[i]);
921            }
922            // Small optimization: Use a bit in the Hangul ce32
923            // to indicate that none of the Jamo CE32s are isSpecialCE32()
924            // (as it should be in the root collator).
925            // It allows CollationIterator to avoid recursive function calls and per-Jamo tests.
926            // In order to still have good trie compression and keep this code simple,
927            // we only set this flag if a whole block of 588 Hangul syllables starting with
928            // a common leading consonant (Jamo L) has this property.
929            boolean isAnyJamoVTSpecial = false;
930            for(int i = Hangul.JAMO_L_COUNT; i < CollationData.JAMO_CE32S_LENGTH; ++i) {
931                if(Collation.isSpecialCE32(jamoCE32s[i])) {
932                    isAnyJamoVTSpecial = true;
933                    break;
934                }
935            }
936            int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0);
937            int c = Hangul.HANGUL_BASE;
938            for(int i = 0; i < Hangul.JAMO_L_COUNT; ++i) {  // iterate over the Jamo L
939                int ce32 = hangulCE32;
940                if(!isAnyJamoVTSpecial && !Collation.isSpecialCE32(jamoCE32s[i])) {
941                    ce32 |= Collation.HANGUL_NO_SPECIAL_JAMO;
942                }
943                int limit = c + Hangul.JAMO_VT_COUNT;
944                trie.setRange(c, limit - 1, ce32, true);
945                c = limit;
946            }
947        } else {
948            // Copy the Hangul CE32s from the base in blocks per Jamo L,
949            // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks.
950            for(int c = Hangul.HANGUL_BASE; c < Hangul.HANGUL_LIMIT;) {
951                int ce32 = base.getCE32(c);
952                assert(Collation.hasCE32Tag(ce32, Collation.HANGUL_TAG));
953                int limit = c + Hangul.JAMO_VT_COUNT;
954                trie.setRange(c, limit - 1, ce32, true);
955                c = limit;
956            }
957        }
958
959        setDigitTags();
960        setLeadSurrogates();
961
962        // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG.
963        ce32s.setElementAt(trie.get(0), 0);
964        trie.set(0, Collation.makeCE32FromTagAndIndex(Collation.U0000_TAG, 0));
965
966        data.trie = trie.toTrie2_32();
967
968        // Mark each lead surrogate as "unsafe"
969        // if any of its 1024 associated supplementary code points is "unsafe".
970        int c = 0x10000;
971        for(char lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
972            if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) {
973                unsafeBackwardSet.add(lead);
974            }
975        }
976        unsafeBackwardSet.freeze();
977
978        data.ce32s = ce32s.getBuffer();
979        data.ces = ce64s.getBuffer();
980        data.contexts = contexts.toString();
981
982        data.base = base;
983        if(jamoIndex >= 0) {
984            data.jamoCE32s = jamoCE32s;  // C++: data.ce32s + jamoIndex
985        } else {
986            data.jamoCE32s = base.jamoCE32s;
987        }
988        data.unsafeBackwardSet = unsafeBackwardSet;
989    }
990
991    protected void clearContexts() {
992        contexts.setLength(0);
993        UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
994        while(iter.next()) {
995            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
996            int ce32 = trie.get(iter.codepoint);
997            assert(isBuilderContextCE32(ce32));
998            getConditionalCE32ForCE32(ce32).builtCE32 = Collation.NO_CE32;
999        }
1000    }
1001
1002    protected void buildContexts() {
1003        // Ignore abandoned lists and the cached builtCE32,
1004        // and build all contexts from scratch.
1005        contexts.setLength(0);
1006        UnicodeSetIterator iter = new UnicodeSetIterator(contextChars);
1007        while(iter.next()) {
1008            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
1009            int c = iter.codepoint;
1010            int ce32 = trie.get(c);
1011            if(!isBuilderContextCE32(ce32)) {
1012                throw new AssertionError("Impossible: No context data for c in contextChars.");
1013            }
1014            ConditionalCE32 cond = getConditionalCE32ForCE32(ce32);
1015            ce32 = buildContext(cond);
1016            trie.set(c, ce32);
1017        }
1018    }
1019
1020    protected int buildContext(ConditionalCE32 head) {
1021        // The list head must have no context.
1022        assert(!head.hasContext());
1023        // The list head must be followed by one or more nodes that all do have context.
1024        assert(head.next >= 0);
1025        CharsTrieBuilder prefixBuilder = new CharsTrieBuilder();
1026        CharsTrieBuilder contractionBuilder = new CharsTrieBuilder();
1027        for(ConditionalCE32 cond = head;; cond = getConditionalCE32(cond.next)) {
1028            // After the list head, the prefix or suffix can be empty, but not both.
1029            assert(cond == head || cond.hasContext());
1030            int prefixLength = cond.prefixLength();
1031            StringBuilder prefix = new StringBuilder().append(cond.context, 0, prefixLength + 1);
1032            String prefixString = prefix.toString();
1033            // Collect all contraction suffixes for one prefix.
1034            ConditionalCE32 firstCond = cond;
1035            ConditionalCE32 lastCond = cond;
1036            while(cond.next >= 0 &&
1037                    (cond = getConditionalCE32(cond.next)).context.startsWith(prefixString)) {
1038                lastCond = cond;
1039            }
1040            int ce32;
1041            int suffixStart = prefixLength + 1;  // == prefix.length()
1042            if(lastCond.context.length() == suffixStart) {
1043                // One prefix without contraction suffix.
1044                assert(firstCond == lastCond);
1045                ce32 = lastCond.ce32;
1046                cond = lastCond;
1047            } else {
1048                // Build the contractions trie.
1049                contractionBuilder.clear();
1050                // Entry for an empty suffix, to be stored before the trie.
1051                int emptySuffixCE32 = Collation.NO_CE32;  // Will always be set to a real value.
1052                int flags = 0;
1053                if(firstCond.context.length() == suffixStart) {
1054                    // There is a mapping for the prefix and the single character c. (p|c)
1055                    // If no other suffix matches, then we return this value.
1056                    emptySuffixCE32 = firstCond.ce32;
1057                    cond = getConditionalCE32(firstCond.next);
1058                } else {
1059                    // There is no mapping for the prefix and just the single character.
1060                    // (There is no p|c, only p|cd, p|ce etc.)
1061                    flags |= Collation.CONTRACT_SINGLE_CP_NO_MATCH;
1062                    // When the prefix matches but none of the prefix-specific suffixes,
1063                    // then we fall back to the mappings with the next-longest prefix,
1064                    // and ultimately to mappings with no prefix.
1065                    // Each fallback might be another set of contractions.
1066                    // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c,
1067                    // then in text "pch" we find the ch contraction.
1068                    for(cond = head;; cond = getConditionalCE32(cond.next)) {
1069                        int length = cond.prefixLength();
1070                        if(length == prefixLength) { break; }
1071                        if(cond.defaultCE32 != Collation.NO_CE32 &&
1072                                (length==0 || prefixString.regionMatches(
1073                                        prefix.length() - length, cond.context, 1, length)
1074                                        /* C++: prefix.endsWith(cond.context, 1, length) */)) {
1075                            emptySuffixCE32 = cond.defaultCE32;
1076                        }
1077                    }
1078                    cond = firstCond;
1079                }
1080                // Optimization: Set a flag when
1081                // the first character of every contraction suffix has lccc!=0.
1082                // Short-circuits contraction matching when a normal letter follows.
1083                flags |= Collation.CONTRACT_NEXT_CCC;
1084                // Add all of the non-empty suffixes into the contraction trie.
1085                for(;;) {
1086                    String suffix = cond.context.substring(suffixStart);
1087                    int fcd16 = nfcImpl.getFCD16(suffix.codePointAt(0));
1088                    if(fcd16 <= 0xff) {
1089                        flags &= ~Collation.CONTRACT_NEXT_CCC;
1090                    }
1091                    fcd16 = nfcImpl.getFCD16(suffix.codePointBefore(suffix.length()));
1092                    if(fcd16 > 0xff) {
1093                        // The last suffix character has lccc!=0, allowing for discontiguous contractions.
1094                        flags |= Collation.CONTRACT_TRAILING_CCC;
1095                    }
1096                    contractionBuilder.add(suffix, cond.ce32);
1097                    if(cond == lastCond) { break; }
1098                    cond = getConditionalCE32(cond.next);
1099                }
1100                int index = addContextTrie(emptySuffixCE32, contractionBuilder);
1101                if(index > Collation.MAX_INDEX) {
1102                    throw new IndexOutOfBoundsException("too many context-sensitive mappings");
1103                    // BufferOverflowException is a better fit
1104                    // but cannot be constructed with a message string.
1105                }
1106                ce32 = Collation.makeCE32FromTagAndIndex(Collation.CONTRACTION_TAG, index) | flags;
1107            }
1108            assert(cond == lastCond);
1109            firstCond.defaultCE32 = ce32;
1110            if(prefixLength == 0) {
1111                if(cond.next < 0) {
1112                    // No non-empty prefixes, only contractions.
1113                    return ce32;
1114                }
1115            } else {
1116                prefix.delete(0, 1);  // Remove the length unit.
1117                prefix.reverse();
1118                prefixBuilder.add(prefix, ce32);
1119                if(cond.next < 0) { break; }
1120            }
1121        }
1122        assert(head.defaultCE32 != Collation.NO_CE32);
1123        int index = addContextTrie(head.defaultCE32, prefixBuilder);
1124        if(index > Collation.MAX_INDEX) {
1125            throw new IndexOutOfBoundsException("too many context-sensitive mappings");
1126            // BufferOverflowException is a better fit
1127            // but cannot be constructed with a message string.
1128        }
1129        return Collation.makeCE32FromTagAndIndex(Collation.PREFIX_TAG, index);
1130    }
1131
1132    protected int addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder) {
1133        StringBuilder context = new StringBuilder();
1134        context.append((char)(defaultCE32 >> 16)).append((char)defaultCE32);
1135        context.append(trieBuilder.buildCharSequence(StringTrieBuilder.Option.SMALL));
1136        int index = contexts.indexOf(context.toString());
1137        if(index < 0) {
1138            index = contexts.length();
1139            contexts.append(context);
1140        }
1141        return index;
1142    }
1143
1144    protected void buildFastLatinTable(CollationData data) {
1145        if(!fastLatinEnabled) { return; }
1146
1147        fastLatinBuilder = new CollationFastLatinBuilder();
1148        if(fastLatinBuilder.forData(data)) {
1149            char[] header = fastLatinBuilder.getHeader();
1150            char[] table = fastLatinBuilder.getTable();
1151            if(base != null &&
1152                    Arrays.equals(header, base.fastLatinTableHeader) &&
1153                    Arrays.equals(table, base.fastLatinTable)) {
1154                // Same fast Latin table as in the base, use that one instead.
1155                fastLatinBuilder = null;
1156                header = base.fastLatinTableHeader;
1157                table = base.fastLatinTable;
1158            }
1159            data.fastLatinTableHeader = header;
1160            data.fastLatinTable = table;
1161        } else {
1162            fastLatinBuilder = null;
1163        }
1164    }
1165
1166    protected int getCEs(CharSequence s, int start, long ces[], int cesLength) {
1167        if(collIter == null) {
1168            collIter = new DataBuilderCollationIterator(this, new CollationData(nfcImpl));
1169            if(collIter == null) { return 0; }
1170        }
1171        return collIter.fetchCEs(s, start, ces, cesLength);
1172    }
1173
1174    protected static int jamoCpFromIndex(int i) {
1175        // 0 <= i < CollationData.JAMO_CE32S_LENGTH = 19 + 21 + 27
1176        if(i < Hangul.JAMO_L_COUNT) { return Hangul.JAMO_L_BASE + i; }
1177        i -= Hangul.JAMO_L_COUNT;
1178        if(i < Hangul.JAMO_V_COUNT) { return Hangul.JAMO_V_BASE + i; }
1179        i -= Hangul.JAMO_V_COUNT;
1180        // i < 27
1181        return Hangul.JAMO_T_BASE + 1 + i;
1182    }
1183
1184    /**
1185     * Build-time collation element and character iterator.
1186     * Uses the runtime CollationIterator for fetching CEs for a string
1187     * but reads from the builder's unfinished data structures.
1188     * In particular, this class reads from the unfinished trie
1189     * and has to avoid CollationIterator.nextCE() and redirect other
1190     * calls to data.getCE32() and data.getCE32FromSupplementary().
1191     *
1192     * We do this so that we need not implement the collation algorithm
1193     * again for the builder and make it behave exactly like the runtime code.
1194     * That would be more difficult to test and maintain than this indirection.
1195     *
1196     * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data,
1197     * so the data accesses from those code paths need not be modified.
1198     *
1199     * This class iterates directly over whole code points
1200     * so that the CollationIterator does not need the finished trie
1201     * for handling the LEAD_SURROGATE_TAG.
1202     */
1203    private static final class DataBuilderCollationIterator extends CollationIterator {
1204        DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData) {
1205            super(newData, /*numeric=*/ false);
1206            builder = b;
1207            builderData = newData;
1208            builderData.base = builder.base;
1209            // Set all of the jamoCE32s[] to indirection CE32s.
1210            for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) {  // Count across Jamo types.
1211                int jamo = CollationDataBuilder.jamoCpFromIndex(j);
1212                jamoCE32s[j] = Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, jamo) |
1213                        CollationDataBuilder.IS_BUILDER_JAMO_CE32;
1214            }
1215            builderData.jamoCE32s = jamoCE32s;
1216        }
1217
1218        int fetchCEs(CharSequence str, int start, long ces[], int cesLength) {
1219            // Set the pointers each time, in case they changed due to reallocation.
1220            builderData.ce32s = builder.ce32s.getBuffer();
1221            builderData.ces = builder.ce64s.getBuffer();
1222            builderData.contexts = builder.contexts.toString();
1223            // Modified copy of CollationIterator.nextCE() and CollationIterator.nextCEFromCE32().
1224            reset();
1225            s = str;
1226            pos = start;
1227            while(pos < s.length()) {
1228                // No need to keep all CEs in the iterator buffer.
1229                clearCEs();
1230                int c = Character.codePointAt(s, pos);
1231                pos += Character.charCount(c);
1232                int ce32 = builder.trie.get(c);
1233                CollationData d;
1234                if(ce32 == Collation.FALLBACK_CE32) {
1235                    d = builder.base;
1236                    ce32 = builder.base.getCE32(c);
1237                } else {
1238                    d = builderData;
1239                }
1240                appendCEsFromCE32(d, c, ce32, /*forward=*/ true);
1241                for(int i = 0; i < getCEsLength(); ++i) {
1242                    long ce = getCE(i);
1243                    if(ce != 0) {
1244                        if(cesLength < Collation.MAX_EXPANSION_LENGTH) {
1245                            ces[cesLength] = ce;
1246                        }
1247                        ++cesLength;
1248                    }
1249                }
1250            }
1251            return cesLength;
1252        }
1253
1254        @Override
1255        public void resetToOffset(int newOffset) {
1256            reset();
1257            pos = newOffset;
1258        }
1259
1260        @Override
1261        public int getOffset() {
1262            return pos;
1263        }
1264
1265        @Override
1266        public int nextCodePoint() {
1267            if(pos == s.length()) {
1268                return Collation.SENTINEL_CP;
1269            }
1270            int c = Character.codePointAt(s, pos);
1271            pos += Character.charCount(c);
1272            return c;
1273        }
1274
1275        @Override
1276        public int previousCodePoint() {
1277            if(pos == 0) {
1278                return Collation.SENTINEL_CP;
1279            }
1280            int c = Character.codePointBefore(s, pos);
1281            pos -= Character.charCount(c);
1282            return c;
1283        }
1284
1285        @Override
1286        protected void forwardNumCodePoints(int num) {
1287            pos = Character.offsetByCodePoints(s, pos, num);
1288        }
1289
1290        @Override
1291        protected void backwardNumCodePoints(int num) {
1292            pos = Character.offsetByCodePoints(s, pos, -num);
1293        }
1294
1295        @Override
1296        protected int getDataCE32(int c) {
1297            return builder.trie.get(c);
1298        }
1299
1300        @Override
1301        protected int getCE32FromBuilderData(int ce32) {
1302            assert(Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG));
1303            if((ce32 & CollationDataBuilder.IS_BUILDER_JAMO_CE32) != 0) {
1304                int jamo = Collation.indexFromCE32(ce32);
1305                return builder.trie.get(jamo);
1306            } else {
1307                ConditionalCE32 cond = builder.getConditionalCE32ForCE32(ce32);
1308                if(cond.builtCE32 == Collation.NO_CE32) {
1309                    // Build the context-sensitive mappings into their runtime form and cache the result.
1310                    try {
1311                        cond.builtCE32 = builder.buildContext(cond);
1312                    } catch(IndexOutOfBoundsException e) {
1313                        builder.clearContexts();
1314                        cond.builtCE32 = builder.buildContext(cond);
1315                    }
1316                    builderData.contexts = builder.contexts.toString();
1317                }
1318                return cond.builtCE32;
1319            }
1320        }
1321
1322        protected final CollationDataBuilder builder;
1323        protected final CollationData builderData;
1324        protected final int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH];
1325        protected CharSequence s;
1326        protected int pos;
1327    }
1328
1329    protected final boolean isMutable() {
1330        // C++ tests !(trie == NULL || utrie2_isFrozen(trie))
1331        // but Java Trie2Writable does not have an observable isFrozen() state.
1332        return trie != null && unsafeBackwardSet != null && !unsafeBackwardSet.isFrozen();
1333    }
1334
1335    /** @see Collation.BUILDER_DATA_TAG */
1336    private static final int IS_BUILDER_JAMO_CE32 = 0x100;
1337
1338    protected Normalizer2Impl nfcImpl;
1339    protected CollationData base;
1340    protected CollationSettings baseSettings;
1341    protected Trie2Writable trie;
1342    protected UVector32 ce32s;
1343    protected UVector64 ce64s;
1344    protected ArrayList<ConditionalCE32> conditionalCE32s;  // vector of ConditionalCE32
1345    // Characters that have context (prefixes or contraction suffixes).
1346    protected UnicodeSet contextChars = new UnicodeSet();
1347    // Serialized UCharsTrie structures for finalized contexts.
1348    protected StringBuilder contexts = new StringBuilder();
1349    protected UnicodeSet unsafeBackwardSet = new UnicodeSet();
1350    protected boolean modified;
1351
1352    protected boolean fastLatinEnabled;
1353    protected CollationFastLatinBuilder fastLatinBuilder;
1354
1355    protected DataBuilderCollationIterator collIter;
1356}
1357