1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5*******************************************************************************
6* Copyright (C) 2013-2015, International Business Machines
7* Corporation and others.  All Rights Reserved.
8*******************************************************************************
9* CollationBuilder.java, ported from collationbuilder.h/.cpp
10*
11* C++ version created on: 2013may06
12* created by: Markus W. Scherer
13*/
14
15package android.icu.impl.coll;
16
17import java.text.ParseException;
18
19import android.icu.impl.Norm2AllModes;
20import android.icu.impl.Normalizer2Impl;
21import android.icu.impl.Normalizer2Impl.Hangul;
22import android.icu.lang.UScript;
23import android.icu.text.CanonicalIterator;
24import android.icu.text.Collator;
25import android.icu.text.Normalizer2;
26import android.icu.text.UnicodeSet;
27import android.icu.text.UnicodeSetIterator;
28import android.icu.util.ULocale;
29
30/**
31 * @hide Only a subset of ICU is exposed in Android
32 */
33public final class CollationBuilder extends CollationRuleParser.Sink {
34    private static final boolean DEBUG = false;
35    private static final class BundleImporter implements CollationRuleParser.Importer {
36        BundleImporter() {}
37        @Override
38        public String getRules(String localeID, String collationType) {
39            return CollationLoader.loadRules(new ULocale(localeID), collationType);
40        }
41    }
42
43    public CollationBuilder(CollationTailoring b) {
44        nfd = Normalizer2.getNFDInstance();
45        fcd = Norm2AllModes.getFCDNormalizer2();
46        nfcImpl = Norm2AllModes.getNFCInstance().impl;
47        base = b;
48        baseData = b.data;
49        rootElements = new CollationRootElements(b.data.rootElements);
50        variableTop = 0;
51        dataBuilder = new CollationDataBuilder();
52        fastLatinEnabled = true;
53        cesLength = 0;
54        rootPrimaryIndexes = new UVector32();
55        nodes = new UVector64();
56        nfcImpl.ensureCanonIterData();
57        dataBuilder.initForTailoring(baseData);
58    }
59
60    public CollationTailoring parseAndBuild(String ruleString) throws ParseException {
61        if(baseData.rootElements == null) {
62            // C++ U_MISSING_RESOURCE_ERROR
63            throw new UnsupportedOperationException(
64                    "missing root elements data, tailoring not supported");
65        }
66        CollationTailoring tailoring = new CollationTailoring(base.settings);
67        CollationRuleParser parser = new CollationRuleParser(baseData);
68        // Note: This always bases &[last variable] and &[first regular]
69        // on the root collator's maxVariable/variableTop.
70        // If we wanted this to change after [maxVariable x], then we would keep
71        // the tailoring.settings pointer here and read its variableTop when we need it.
72        // See http://unicode.org/cldr/trac/ticket/6070
73        variableTop = base.settings.readOnly().variableTop;
74        parser.setSink(this);
75        // In Java, there is only one Importer implementation.
76        // In C++, the importer is a parameter for this method.
77        parser.setImporter(new BundleImporter());
78        CollationSettings ownedSettings = tailoring.settings.copyOnWrite();
79        parser.parse(ruleString, ownedSettings);
80        if(dataBuilder.hasMappings()) {
81            makeTailoredCEs();
82            closeOverComposites();
83            finalizeCEs();
84            // Copy all of ASCII, and Latin-1 letters, into each tailoring.
85            optimizeSet.add(0, 0x7f);
86            optimizeSet.add(0xc0, 0xff);
87            // Hangul is decomposed on the fly during collation,
88            // and the tailoring data is always built with HANGUL_TAG specials.
89            optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
90            dataBuilder.optimize(optimizeSet);
91            tailoring.ensureOwnedData();
92            if(fastLatinEnabled) { dataBuilder.enableFastLatin(); }
93            dataBuilder.build(tailoring.ownedData);
94            // C++ tailoring.builder = dataBuilder;
95            dataBuilder = null;
96        } else {
97            tailoring.data = baseData;
98        }
99        ownedSettings.fastLatinOptions = CollationFastLatin.getOptions(
100                tailoring.data, ownedSettings,
101                ownedSettings.fastLatinPrimaries);
102        tailoring.setRules(ruleString);
103        // In Java, we do not have a rules version.
104        // In C++, the genrb build tool reads and supplies one,
105        // and the rulesVersion is a parameter for this method.
106        tailoring.setVersion(base.version, 0 /* rulesVersion */);
107        return tailoring;
108    }
109
110    /** Implements CollationRuleParser.Sink. */
111    @Override
112    void addReset(int strength, CharSequence str) {
113        assert(str.length() != 0);
114        if(str.charAt(0) == CollationRuleParser.POS_LEAD) {
115            ces[0] = getSpecialResetPosition(str);
116            cesLength = 1;
117            assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0);
118        } else {
119            // normal reset to a character or string
120            String nfdString = nfd.normalize(str);
121            cesLength = dataBuilder.getCEs(nfdString, ces, 0);
122            if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
123                throw new IllegalArgumentException(
124                        "reset position maps to too many collation elements (more than 31)");
125            }
126        }
127        if(strength == Collator.IDENTICAL) { return; }  // simple reset-at-position
128
129        // &[before strength]position
130        assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY);
131        int index = findOrInsertNodeForCEs(strength);
132
133        long node = nodes.elementAti(index);
134        // If the index is for a "weaker" node,
135        // then skip backwards over this and further "weaker" nodes.
136        while(strengthFromNode(node) > strength) {
137            index = previousIndexFromNode(node);
138            node = nodes.elementAti(index);
139        }
140
141        // Find or insert a node whose index we will put into a temporary CE.
142        if(strengthFromNode(node) == strength && isTailoredNode(node)) {
143            // Reset to just before this same-strength tailored node.
144            index = previousIndexFromNode(node);
145        } else if(strength == Collator.PRIMARY) {
146            // root primary node (has no previous index)
147            long p = weight32FromNode(node);
148            if(p == 0) {
149                throw new UnsupportedOperationException(
150                        "reset primary-before ignorable not possible");
151            }
152            if(p <= rootElements.getFirstPrimary()) {
153                // There is no primary gap between ignorables and the space-first-primary.
154                throw new UnsupportedOperationException(
155                        "reset primary-before first non-ignorable not supported");
156            }
157            if(p == Collation.FIRST_TRAILING_PRIMARY) {
158                // We do not support tailoring to an unassigned-implicit CE.
159                throw new UnsupportedOperationException(
160                        "reset primary-before [first trailing] not supported");
161            }
162            p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p));
163            index = findOrInsertNodeForPrimary(p);
164            // Go to the last node in this list:
165            // Tailor after the last node between adjacent root nodes.
166            for(;;) {
167                node = nodes.elementAti(index);
168                int nextIndex = nextIndexFromNode(node);
169                if(nextIndex == 0) { break; }
170                index = nextIndex;
171            }
172        } else {
173            // &[before 2] or &[before 3]
174            index = findCommonNode(index, Collator.SECONDARY);
175            if(strength >= Collator.TERTIARY) {
176                index = findCommonNode(index, Collator.TERTIARY);
177            }
178            // findCommonNode() stayed on the stronger node or moved to
179            // an explicit common-weight node of the reset-before strength.
180            node = nodes.elementAti(index);
181            if(strengthFromNode(node) == strength) {
182                // Found a same-strength node with an explicit weight.
183                int weight16 = weight16FromNode(node);
184                if(weight16 == 0) {
185                    throw new UnsupportedOperationException(
186                            (strength == Collator.SECONDARY) ?
187                                    "reset secondary-before secondary ignorable not possible" :
188                                    "reset tertiary-before completely ignorable not possible");
189                }
190                assert(weight16 > Collation.BEFORE_WEIGHT16);
191                // Reset to just before this node.
192                // Insert the preceding same-level explicit weight if it is not there already.
193                // Which explicit weight immediately precedes this one?
194                weight16 = getWeight16Before(index, node, strength);
195                // Does this preceding weight have a node?
196                int previousWeight16;
197                int previousIndex = previousIndexFromNode(node);
198                for(int i = previousIndex;; i = previousIndexFromNode(node)) {
199                    node = nodes.elementAti(i);
200                    int previousStrength = strengthFromNode(node);
201                    if(previousStrength < strength) {
202                        assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex);
203                        // Either the reset element has an above-common weight and
204                        // the parent node provides the implied common weight,
205                        // or the reset element has a weight<=common in the node
206                        // right after the parent, and we need to insert the preceding weight.
207                        previousWeight16 = Collation.COMMON_WEIGHT16;
208                        break;
209                    } else if(previousStrength == strength && !isTailoredNode(node)) {
210                        previousWeight16 = weight16FromNode(node);
211                        break;
212                    }
213                    // Skip weaker nodes and same-level tailored nodes.
214                }
215                if(previousWeight16 == weight16) {
216                    // The preceding weight has a node,
217                    // maybe with following weaker or tailored nodes.
218                    // Reset to the last of them.
219                    index = previousIndex;
220                } else {
221                    // Insert a node with the preceding weight, reset to that.
222                    node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
223                    index = insertNodeBetween(previousIndex, index, node);
224                }
225            } else {
226                // Found a stronger node with implied strength-common weight.
227                int weight16 = getWeight16Before(index, node, strength);
228                index = findOrInsertWeakNode(index, weight16, strength);
229            }
230            // Strength of the temporary CE = strength of its reset position.
231            // Code above raises an error if the before-strength is stronger.
232            strength = ceStrength(ces[cesLength - 1]);
233        }
234        ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
235    }
236
237    /**
238     * Returns the secondary or tertiary weight preceding the current node's weight.
239     * node=nodes[index].
240     */
241    private int getWeight16Before(int index, long node, int level) {
242        assert(strengthFromNode(node) < level || !isTailoredNode(node));
243        // Collect the root CE weights if this node is for a root CE.
244        // If it is not, then return the low non-primary boundary for a tailored CE.
245        int t;
246        if(strengthFromNode(node) == Collator.TERTIARY) {
247            t = weight16FromNode(node);
248        } else {
249            t = Collation.COMMON_WEIGHT16;  // Stronger node with implied common weight.
250        }
251        while(strengthFromNode(node) > Collator.SECONDARY) {
252            index = previousIndexFromNode(node);
253            node = nodes.elementAti(index);
254        }
255        if(isTailoredNode(node)) {
256            return Collation.BEFORE_WEIGHT16;
257        }
258        int s;
259        if(strengthFromNode(node) == Collator.SECONDARY) {
260            s = weight16FromNode(node);
261        } else {
262            s = Collation.COMMON_WEIGHT16;  // Stronger node with implied common weight.
263        }
264        while(strengthFromNode(node) > Collator.PRIMARY) {
265            index = previousIndexFromNode(node);
266            node = nodes.elementAti(index);
267        }
268        if(isTailoredNode(node)) {
269            return Collation.BEFORE_WEIGHT16;
270        }
271        // [p, s, t] is a root CE. Return the preceding weight for the requested level.
272        long p = weight32FromNode(node);
273        int weight16;
274        if(level == Collator.SECONDARY) {
275            weight16 = rootElements.getSecondaryBefore(p, s);
276        } else {
277            weight16 = rootElements.getTertiaryBefore(p, s, t);
278            assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0);
279        }
280        return weight16;
281    }
282
283    private long getSpecialResetPosition(CharSequence str) {
284        assert(str.length() == 2);
285        long ce;
286        int strength = Collator.PRIMARY;
287        boolean isBoundary = false;
288        CollationRuleParser.Position pos =
289                CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE];
290        switch(pos) {
291        case FIRST_TERTIARY_IGNORABLE:
292            // Quaternary CEs are not supported.
293            // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
294            return 0;
295        case LAST_TERTIARY_IGNORABLE:
296            return 0;
297        case FIRST_SECONDARY_IGNORABLE: {
298            // Look for a tailored tertiary node after [0, 0, 0].
299            int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY);
300            long node = nodes.elementAti(index);
301            if((index = nextIndexFromNode(node)) != 0) {
302                node = nodes.elementAti(index);
303                assert(strengthFromNode(node) <= Collator.TERTIARY);
304                if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) {
305                    return tempCEFromIndexAndStrength(index, Collator.TERTIARY);
306                }
307            }
308            return rootElements.getFirstTertiaryCE();
309            // No need to look for nodeHasAnyBefore() on a tertiary node.
310        }
311        case LAST_SECONDARY_IGNORABLE:
312            ce = rootElements.getLastTertiaryCE();
313            strength = Collator.TERTIARY;
314            break;
315        case FIRST_PRIMARY_IGNORABLE: {
316            // Look for a tailored secondary node after [0, 0, *].
317            int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY);
318            long node = nodes.elementAti(index);
319            while((index = nextIndexFromNode(node)) != 0) {
320                node = nodes.elementAti(index);
321                strength = strengthFromNode(node);
322                if(strength < Collator.SECONDARY) { break; }
323                if(strength == Collator.SECONDARY) {
324                    if(isTailoredNode(node)) {
325                        if(nodeHasBefore3(node)) {
326                            index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
327                            assert(isTailoredNode(nodes.elementAti(index)));
328                        }
329                        return tempCEFromIndexAndStrength(index, Collator.SECONDARY);
330                    } else {
331                        break;
332                    }
333                }
334            }
335            ce = rootElements.getFirstSecondaryCE();
336            strength = Collator.SECONDARY;
337            break;
338        }
339        case LAST_PRIMARY_IGNORABLE:
340            ce = rootElements.getLastSecondaryCE();
341            strength = Collator.SECONDARY;
342            break;
343        case FIRST_VARIABLE:
344            ce = rootElements.getFirstPrimaryCE();
345            isBoundary = true;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
346            break;
347        case LAST_VARIABLE:
348            ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
349            break;
350        case FIRST_REGULAR:
351            ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
352            isBoundary = true;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
353            break;
354        case LAST_REGULAR:
355            // Use the Hani-first-primary rather than the actual last "regular" CE before it,
356            // for backward compatibility with behavior before the introduction of
357            // script-first-primary CEs in the root collator.
358            ce = rootElements.firstCEWithPrimaryAtLeast(
359                baseData.getFirstPrimaryForGroup(UScript.HAN));
360            break;
361        case FIRST_IMPLICIT:
362            ce = baseData.getSingleCE(0x4e00);
363            break;
364        case LAST_IMPLICIT:
365            // We do not support tailoring to an unassigned-implicit CE.
366            throw new UnsupportedOperationException(
367                    "reset to [last implicit] not supported");
368        case FIRST_TRAILING:
369            ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY);
370            isBoundary = true;  // trailing first primary (there is no mapping for it)
371            break;
372        case LAST_TRAILING:
373            throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF");
374        default:
375            assert(false);
376            return 0;
377        }
378
379        int index = findOrInsertNodeForRootCE(ce, strength);
380        long node = nodes.elementAti(index);
381        if((pos.ordinal() & 1) == 0) {
382            // even pos = [first xyz]
383            if(!nodeHasAnyBefore(node) && isBoundary) {
384                // A <group> first primary boundary is artificially added to FractionalUCA.txt.
385                // It is reachable via its special contraction, but is not normally used.
386                // Find the first character tailored after the boundary CE,
387                // or the first real root CE after it.
388                if((index = nextIndexFromNode(node)) != 0) {
389                    // If there is a following node, then it must be tailored
390                    // because there are no root CEs with a boundary primary
391                    // and non-common secondary/tertiary weights.
392                    node = nodes.elementAti(index);
393                    assert(isTailoredNode(node));
394                    ce = tempCEFromIndexAndStrength(index, strength);
395                } else {
396                    assert(strength == Collator.PRIMARY);
397                    long p = ce >>> 32;
398                    int pIndex = rootElements.findPrimary(p);
399                    boolean isCompressible = baseData.isCompressiblePrimary(p);
400                    p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
401                    ce = Collation.makeCE(p);
402                    index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY);
403                    node = nodes.elementAti(index);
404                }
405            }
406            if(nodeHasAnyBefore(node)) {
407                // Get the first node that was tailored before this one at a weaker strength.
408                if(nodeHasBefore2(node)) {
409                    index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
410                    node = nodes.elementAti(index);
411                }
412                if(nodeHasBefore3(node)) {
413                    index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
414                }
415                assert(isTailoredNode(nodes.elementAti(index)));
416                ce = tempCEFromIndexAndStrength(index, strength);
417            }
418        } else {
419            // odd pos = [last xyz]
420            // Find the last node that was tailored after the [last xyz]
421            // at a strength no greater than the position's strength.
422            for(;;) {
423                int nextIndex = nextIndexFromNode(node);
424                if(nextIndex == 0) { break; }
425                long nextNode = nodes.elementAti(nextIndex);
426                if(strengthFromNode(nextNode) < strength) { break; }
427                index = nextIndex;
428                node = nextNode;
429            }
430            // Do not make a temporary CE for a root node.
431            // This last node might be the node for the root CE itself,
432            // or a node with a common secondary or tertiary weight.
433            if(isTailoredNode(node)) {
434                ce = tempCEFromIndexAndStrength(index, strength);
435            }
436        }
437        return ce;
438    }
439
440    /** Implements CollationRuleParser.Sink. */
441    @Override
442    void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) {
443        String nfdPrefix;
444        if(prefix.length() == 0) {
445            nfdPrefix = "";
446        } else {
447            nfdPrefix = nfd.normalize(prefix);
448        }
449        String nfdString = nfd.normalize(str);
450
451        // The runtime code decomposes Hangul syllables on the fly,
452        // with recursive processing but without making the Jamo pieces visible for matching.
453        // It does not work with certain types of contextual mappings.
454        int nfdLength = nfdString.length();
455        if(nfdLength >= 2) {
456            char c = nfdString.charAt(0);
457            if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) {
458                // While handling a Hangul syllable, contractions starting with Jamo L or V
459                // would not see the following Jamo of that syllable.
460                throw new UnsupportedOperationException(
461                        "contractions starting with conjoining Jamo L or V not supported");
462            }
463            c = nfdString.charAt(nfdLength - 1);
464            if(Hangul.isJamoL(c) ||
465                    (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) {
466                // A contraction ending with Jamo L or L+V would require
467                // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
468                // or decomposing a following Hangul syllable on the fly, during contraction matching.
469                throw new UnsupportedOperationException(
470                        "contractions ending with conjoining Jamo L or L+V not supported");
471            }
472            // A Hangul syllable completely inside a contraction is ok.
473        }
474        // Note: If there is a prefix, then the parser checked that
475        // both the prefix and the string beging with NFC boundaries (not Jamo V or T).
476        // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
477        // (While handling a Hangul syllable, prefixes on Jamo V or T
478        // would not see the previous Jamo of that syllable.)
479
480        if(strength != Collator.IDENTICAL) {
481            // Find the node index after which we insert the new tailored node.
482            int index = findOrInsertNodeForCEs(strength);
483            assert(cesLength > 0);
484            long ce = ces[cesLength - 1];
485            if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) {
486                // There is no primary gap between ignorables and the space-first-primary.
487                throw new UnsupportedOperationException(
488                        "tailoring primary after ignorables not supported");
489            }
490            if(strength == Collator.QUATERNARY && ce == 0) {
491                // The CE data structure does not support non-zero quaternary weights
492                // on tertiary ignorables.
493                throw new UnsupportedOperationException(
494                        "tailoring quaternary after tertiary ignorables not supported");
495            }
496            // Insert the new tailored node.
497            index = insertTailoredNodeAfter(index, strength);
498            // Strength of the temporary CE:
499            // The new relation may yield a stronger CE but not a weaker one.
500            int tempStrength = ceStrength(ce);
501            if(strength < tempStrength) { tempStrength = strength; }
502            ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
503        }
504
505        setCaseBits(nfdString);
506
507        int cesLengthBeforeExtension = cesLength;
508        if(extension.length() != 0) {
509            String nfdExtension = nfd.normalize(extension);
510            cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength);
511            if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
512                throw new IllegalArgumentException(
513                        "extension string adds too many collation elements (more than 31 total)");
514            }
515        }
516        int ce32 = Collation.UNASSIGNED_CE32;
517        if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) &&
518                !ignorePrefix(prefix) && !ignoreString(str)) {
519            // Map from the original input to the CEs.
520            // We do this in case the canonical closure is incomplete,
521            // so that it is possible to explicitly provide the missing mappings.
522            ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32);
523        }
524        addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32);
525        cesLength = cesLengthBeforeExtension;
526    }
527
528    /**
529     * Picks one of the current CEs and finds or inserts a node in the graph
530     * for the CE + strength.
531     */
532    private int findOrInsertNodeForCEs(int strength) {
533        assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY);
534
535        // Find the last CE that is at least as "strong" as the requested difference.
536        // Note: Stronger is smaller (Collator.PRIMARY=0).
537        long ce;
538        for(;; --cesLength) {
539            if(cesLength == 0) {
540                ce = ces[0] = 0;
541                cesLength = 1;
542                break;
543            } else {
544                ce = ces[cesLength - 1];
545            }
546            if(ceStrength(ce) <= strength) { break; }
547        }
548
549        if(isTempCE(ce)) {
550            // No need to findCommonNode() here for lower levels
551            // because insertTailoredNodeAfter() will do that anyway.
552            return indexFromTempCE(ce);
553        }
554
555        // root CE
556        if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) {
557            throw new UnsupportedOperationException(
558                    "tailoring relative to an unassigned code point not supported");
559        }
560        return findOrInsertNodeForRootCE(ce, strength);
561    }
562
563    private int findOrInsertNodeForRootCE(long ce, int strength) {
564        assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE);
565
566        // Find or insert the node for each of the root CE's weights,
567        // down to the requested level/strength.
568        // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
569        assert((ce & 0xc0) == 0);
570        int index = findOrInsertNodeForPrimary(ce >>> 32);
571        if(strength >= Collator.SECONDARY) {
572            int lower32 = (int)ce;
573            index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY);
574            if(strength >= Collator.TERTIARY) {
575                index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK,
576                                            Collator.TERTIARY);
577            }
578        }
579        return index;
580    }
581
582    /**
583     * Like Java Collections.binarySearch(List, key, Comparator).
584     *
585     * @return the index>=0 where the item was found,
586     *         or the index<0 for inserting the string at ~index in sorted order
587     *         (index into rootPrimaryIndexes)
588     */
589    private static final int binarySearchForRootPrimaryNode(
590            int[] rootPrimaryIndexes, int length, long[] nodes, long p) {
591        if(length == 0) { return ~0; }
592        int start = 0;
593        int limit = length;
594        for (;;) {
595            int i = (int)(((long)start + (long)limit) / 2);
596            long node = nodes[rootPrimaryIndexes[i]];
597            long nodePrimary = node >>> 32;  // weight32FromNode(node)
598            if (p == nodePrimary) {
599                return i;
600            } else if (p < nodePrimary) {
601                if (i == start) {
602                    return ~start;  // insert s before i
603                }
604                limit = i;
605            } else {
606                if (i == start) {
607                    return ~(start + 1);  // insert s after i
608                }
609                start = i;
610            }
611        }
612    }
613
614    /** Finds or inserts the node for a root CE's primary weight. */
615    private int findOrInsertNodeForPrimary(long p) {
616        int rootIndex = binarySearchForRootPrimaryNode(
617            rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
618        if(rootIndex >= 0) {
619            return rootPrimaryIndexes.elementAti(rootIndex);
620        } else {
621            // Start a new list of nodes with this primary.
622            int index = nodes.size();
623            nodes.addElement(nodeFromWeight32(p));
624            rootPrimaryIndexes.insertElementAt(index, ~rootIndex);
625            return index;
626        }
627    }
628
629    /** Finds or inserts the node for a secondary or tertiary weight. */
630    private int findOrInsertWeakNode(int index, int weight16, int level) {
631        assert(0 <= index && index < nodes.size());
632        assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY);
633
634        if(weight16 == Collation.COMMON_WEIGHT16) {
635            return findCommonNode(index, level);
636        }
637
638        // If this will be the first below-common weight for the parent node,
639        // then we will also need to insert a common weight after it.
640        long node = nodes.elementAti(index);
641        assert(strengthFromNode(node) < level);  // parent node is stronger
642        if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) {
643            int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
644            if((node & hasThisLevelBefore) == 0) {
645                // The parent node has an implied level-common weight.
646                long commonNode =
647                    nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level);
648                if(level == Collator.SECONDARY) {
649                    // Move the HAS_BEFORE3 flag from the parent node
650                    // to the new secondary common node.
651                    commonNode |= node & HAS_BEFORE3;
652                    node &= ~(long)HAS_BEFORE3;
653                }
654                nodes.setElementAt(node | hasThisLevelBefore, index);
655                // Insert below-common-weight node.
656                int nextIndex = nextIndexFromNode(node);
657                node = nodeFromWeight16(weight16) | nodeFromStrength(level);
658                index = insertNodeBetween(index, nextIndex, node);
659                // Insert common-weight node.
660                insertNodeBetween(index, nextIndex, commonNode);
661                // Return index of below-common-weight node.
662                return index;
663            }
664        }
665
666        // Find the root CE's weight for this level.
667        // Postpone insertion if not found:
668        // Insert the new root node before the next stronger node,
669        // or before the next root node with the same strength and a larger weight.
670        int nextIndex;
671        while((nextIndex = nextIndexFromNode(node)) != 0) {
672            node = nodes.elementAti(nextIndex);
673            int nextStrength = strengthFromNode(node);
674            if(nextStrength <= level) {
675                // Insert before a stronger node.
676                if(nextStrength < level) { break; }
677                // nextStrength == level
678                if(!isTailoredNode(node)) {
679                    int nextWeight16 = weight16FromNode(node);
680                    if(nextWeight16 == weight16) {
681                        // Found the node for the root CE up to this level.
682                        return nextIndex;
683                    }
684                    // Insert before a node with a larger same-strength weight.
685                    if(nextWeight16 > weight16) { break; }
686                }
687            }
688            // Skip the next node.
689            index = nextIndex;
690        }
691        node = nodeFromWeight16(weight16) | nodeFromStrength(level);
692        return insertNodeBetween(index, nextIndex, node);
693    }
694
695    /**
696     * Makes and inserts a new tailored node into the list, after the one at index.
697     * Skips over nodes of weaker strength to maintain collation order
698     * ("postpone insertion").
699     * @return the new node's index
700     */
701    private int insertTailoredNodeAfter(int index, int strength) {
702        assert(0 <= index && index < nodes.size());
703        if(strength >= Collator.SECONDARY) {
704            index = findCommonNode(index, Collator.SECONDARY);
705            if(strength >= Collator.TERTIARY) {
706                index = findCommonNode(index, Collator.TERTIARY);
707            }
708        }
709        // Postpone insertion:
710        // Insert the new node before the next one with a strength at least as strong.
711        long node = nodes.elementAti(index);
712        int nextIndex;
713        while((nextIndex = nextIndexFromNode(node)) != 0) {
714            node = nodes.elementAti(nextIndex);
715            if(strengthFromNode(node) <= strength) { break; }
716            // Skip the next node which has a weaker (larger) strength than the new one.
717            index = nextIndex;
718        }
719        node = IS_TAILORED | nodeFromStrength(strength);
720        return insertNodeBetween(index, nextIndex, node);
721    }
722
723    /**
724     * Inserts a new node into the list, between list-adjacent items.
725     * The node's previous and next indexes must not be set yet.
726     * @return the new node's index
727     */
728    private int insertNodeBetween(int index, int nextIndex, long node) {
729        assert(previousIndexFromNode(node) == 0);
730        assert(nextIndexFromNode(node) == 0);
731        assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
732        // Append the new node and link it to the existing nodes.
733        int newIndex = nodes.size();
734        node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
735        nodes.addElement(node);
736        // nodes[index].nextIndex = newIndex
737        node = nodes.elementAti(index);
738        nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
739        // nodes[nextIndex].previousIndex = newIndex
740        if(nextIndex != 0) {
741            node = nodes.elementAti(nextIndex);
742            nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
743        }
744        return newIndex;
745    }
746
747    /**
748     * Finds the node which implies or contains a common=05 weight of the given strength
749     * (secondary or tertiary), if the current node is stronger.
750     * Skips weaker nodes and tailored nodes if the current node is stronger
751     * and is followed by an explicit-common-weight node.
752     * Always returns the input index if that node is no stronger than the given strength.
753     */
754    private int findCommonNode(int index, int strength) {
755        assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY);
756        long node = nodes.elementAti(index);
757        if(strengthFromNode(node) >= strength) {
758            // The current node is no stronger.
759            return index;
760        }
761        if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
762            // The current node implies the strength-common weight.
763            return index;
764        }
765        index = nextIndexFromNode(node);
766        node = nodes.elementAti(index);
767        assert(!isTailoredNode(node) && strengthFromNode(node) == strength &&
768                weight16FromNode(node) < Collation.COMMON_WEIGHT16);
769        // Skip to the explicit common node.
770        do {
771            index = nextIndexFromNode(node);
772            node = nodes.elementAti(index);
773            assert(strengthFromNode(node) >= strength);
774        } while(isTailoredNode(node) || strengthFromNode(node) > strength ||
775                weight16FromNode(node) < Collation.COMMON_WEIGHT16);
776        assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16);
777        return index;
778    }
779
780    private void setCaseBits(CharSequence nfdString) {
781        int numTailoredPrimaries = 0;
782        for(int i = 0; i < cesLength; ++i) {
783            if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; }
784        }
785        // We should not be able to get too many case bits because
786        // cesLength<=31==MAX_EXPANSION_LENGTH.
787        // 31 pairs of case bits fit into an long without setting its sign bit.
788        assert(numTailoredPrimaries <= 31);
789
790        long cases = 0;
791        if(numTailoredPrimaries > 0) {
792            CharSequence s = nfdString;
793            UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0);
794            int baseCEsLength = baseCEs.fetchCEs() - 1;
795            assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE);
796
797            int lastCase = 0;
798            int numBasePrimaries = 0;
799            for(int i = 0; i < baseCEsLength; ++i) {
800                long ce = baseCEs.getCE(i);
801                if((ce >>> 32) != 0) {
802                    ++numBasePrimaries;
803                    int c = ((int)ce >> 14) & 3;
804                    assert(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
805                    if(numBasePrimaries < numTailoredPrimaries) {
806                        cases |= (long)c << ((numBasePrimaries - 1) * 2);
807                    } else if(numBasePrimaries == numTailoredPrimaries) {
808                        lastCase = c;
809                    } else if(c != lastCase) {
810                        // There are more base primary CEs than tailored primaries.
811                        // Set mixed case if the case bits of the remainder differ.
812                        lastCase = 1;
813                        // Nothing more can change.
814                        break;
815                    }
816                }
817            }
818            if(numBasePrimaries >= numTailoredPrimaries) {
819                cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2);
820            }
821        }
822
823        for(int i = 0; i < cesLength; ++i) {
824            long ce = ces[i] & 0xffffffffffff3fffL;  // clear old case bits
825            int strength = ceStrength(ce);
826            if(strength == Collator.PRIMARY) {
827                ce |= (cases & 3) << 14;
828                cases >>>= 2;
829            } else if(strength == Collator.TERTIARY) {
830                // Tertiary CEs must have uppercase bits.
831                // See the LDML spec, and comments in class CollationCompare.
832                ce |= 0x8000;
833            }
834            // Tertiary ignorable CEs must have 0 case bits.
835            // We set 0 case bits for secondary CEs too
836            // since currently only U+0345 is cased and maps to a secondary CE,
837            // and it is lowercase. Other secondaries are uncased.
838            // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
839            ces[i] = ce;
840        }
841    }
842
843    /** Implements CollationRuleParser.Sink. */
844    @Override
845    void suppressContractions(UnicodeSet set) {
846        dataBuilder.suppressContractions(set);
847    }
848
849    /** Implements CollationRuleParser.Sink. */
850    @Override
851    void optimize(UnicodeSet set) {
852        optimizeSet.addAll(set);
853    }
854
855    /**
856     * Adds the mapping and its canonical closure.
857     * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder
858     * need not re-encode the CEs multiple times.
859     */
860    private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString,
861                long[] newCEs, int newCEsLength, int ce32) {
862        // Map from the NFD input to the CEs.
863        ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
864        ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
865        addTailComposites(nfdPrefix, nfdString);
866        return ce32;
867    }
868
869    private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString,
870                long[] newCEs, int newCEsLength, int ce32) {
871        // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
872        // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String
873        if(nfdPrefix.length() == 0) {
874            CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
875            String prefix = "";
876            for(;;) {
877                String str = stringIter.next();
878                if(str == null) { break; }
879                if(ignoreString(str) || str.contentEquals(nfdString)) { continue; }
880                ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
881            }
882        } else {
883            CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString());
884            CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
885            for(;;) {
886                String prefix = prefixIter.next();
887                if(prefix == null) { break; }
888                if(ignorePrefix(prefix)) { continue; }
889                boolean samePrefix = prefix.contentEquals(nfdPrefix);
890                for(;;) {
891                    String str = stringIter.next();
892                    if(str == null) { break; }
893                    if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; }
894                    ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
895                }
896                stringIter.reset();
897            }
898        }
899        return ce32;
900    }
901
902    private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) {
903        // Look for the last starter in the NFD string.
904        int lastStarter;
905        int indexAfterLastStarter = nfdString.length();
906        for(;;) {
907            if(indexAfterLastStarter == 0) { return; }  // no starter at all
908            lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter);
909            if(nfd.getCombiningClass(lastStarter) == 0) { break; }
910            indexAfterLastStarter -= Character.charCount(lastStarter);
911        }
912        // No closure to Hangul syllables since we decompose them on the fly.
913        if(Hangul.isJamoL(lastStarter)) { return; }
914
915        // Are there any composites whose decomposition starts with the lastStarter?
916        // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
917        // We might find some more equivalent mappings here if it did.
918        UnicodeSet composites = new UnicodeSet();
919        if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
920
921        StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder();
922        long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH];
923        UnicodeSetIterator iter = new UnicodeSetIterator(composites);
924        while(iter.next()) {
925            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
926            int composite = iter.codepoint;
927            String decomp = nfd.getDecomposition(composite);
928            if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
929                    newNFDString, newString)) {
930                continue;
931            }
932            int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0);
933            if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) {
934                // Ignore mappings that we cannot store.
935                continue;
936            }
937            // Note: It is possible that the newCEs do not make use of the mapping
938            // for which we are adding the tail composites, in which case we might be adding
939            // unnecessary mappings.
940            // For example, when we add tail composites for ae^ (^=combining circumflex),
941            // UCA discontiguous-contraction matching does not find any matches
942            // for ae_^ (_=any combining diacritic below) *unless* there is also
943            // a contraction mapping for ae.
944            // Thus, if there is no ae contraction, then the ae^ mapping is ignored
945            // while fetching the newCEs for ae_^.
946            // TODO: Try to detect this effectively.
947            // (Alternatively, print a warning when prefix contractions are missing.)
948
949            // We do not need an explicit mapping for the NFD strings.
950            // It is fine if the NFD input collates like this via a sequence of mappings.
951            // It also saves a little bit of space, and may reduce the set of characters with contractions.
952            int ce32 = addIfDifferent(nfdPrefix, newString,
953                                          newCEs, newCEsLength, Collation.UNASSIGNED_CE32);
954            if(ce32 != Collation.UNASSIGNED_CE32) {
955                // was different, was added
956                addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32);
957            }
958        }
959    }
960
961    private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter,
962                int composite, CharSequence decomp,
963                StringBuilder newNFDString, StringBuilder newString) {
964        assert(Character.codePointBefore(nfdString, indexAfterLastStarter) ==
965                Character.codePointAt(decomp, 0));
966        int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1);
967        if(lastStarterLength == decomp.length()) {
968            // Singleton decompositions should be found by addWithClosure()
969            // and the CanonicalIterator, so we can ignore them here.
970            return false;
971        }
972        if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) {
973            // same strings, nothing new to be found here
974            return false;
975        }
976
977        // Make new FCD strings that combine a composite, or its decomposition,
978        // into the nfdString's last starter and the combining marks following it.
979        // Make an NFD version, and a version with the composite.
980        newNFDString.setLength(0);
981        newNFDString.append(nfdString, 0, indexAfterLastStarter);
982        newString.setLength(0);
983        newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength)
984            .appendCodePoint(composite);
985
986        // The following is related to discontiguous contraction matching,
987        // but builds only FCD strings (or else returns false).
988        int sourceIndex = indexAfterLastStarter;
989        int decompIndex = lastStarterLength;
990        // Small optimization: We keep the source character across loop iterations
991        // because we do not always consume it,
992        // and then need not fetch it again nor look up its combining class again.
993        int sourceChar = Collation.SENTINEL_CP;
994        // The cc variables need to be declared before the loop so that at the end
995        // they are set to the last combining classes seen.
996        int sourceCC = 0;
997        int decompCC = 0;
998        for(;;) {
999            if(sourceChar < 0) {
1000                if(sourceIndex >= nfdString.length()) { break; }
1001                sourceChar = Character.codePointAt(nfdString, sourceIndex);
1002                sourceCC = nfd.getCombiningClass(sourceChar);
1003                assert(sourceCC != 0);
1004            }
1005            // We consume a decomposition character in each iteration.
1006            if(decompIndex >= decomp.length()) { break; }
1007            int decompChar = Character.codePointAt(decomp, decompIndex);
1008            decompCC = nfd.getCombiningClass(decompChar);
1009            // Compare the two characters and their combining classes.
1010            if(decompCC == 0) {
1011                // Unable to merge because the source contains a non-zero combining mark
1012                // but the composite's decomposition contains another starter.
1013                // The strings would not be equivalent.
1014                return false;
1015            } else if(sourceCC < decompCC) {
1016                // Composite + sourceChar would not be FCD.
1017                return false;
1018            } else if(decompCC < sourceCC) {
1019                newNFDString.appendCodePoint(decompChar);
1020                decompIndex += Character.charCount(decompChar);
1021            } else if(decompChar != sourceChar) {
1022                // Blocked because same combining class.
1023                return false;
1024            } else {  // match: decompChar == sourceChar
1025                newNFDString.appendCodePoint(decompChar);
1026                decompIndex += Character.charCount(decompChar);
1027                sourceIndex += Character.charCount(decompChar);
1028                sourceChar = Collation.SENTINEL_CP;
1029            }
1030        }
1031        // We are at the end of at least one of the two inputs.
1032        if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
1033            if(sourceCC < decompCC) {
1034                // Appending the next source character to the composite would not be FCD.
1035                return false;
1036            }
1037            newNFDString.append(nfdString, sourceIndex, nfdString.length());
1038            newString.append(nfdString, sourceIndex, nfdString.length());
1039        } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
1040            newNFDString.append(decomp, decompIndex, decomp.length());
1041        }
1042        assert(nfd.isNormalized(newNFDString));
1043        assert(fcd.isNormalized(newString));
1044        assert(nfd.normalize(newString).equals(newNFDString.toString()));  // canonically equivalent
1045        return true;
1046    }
1047
1048    private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) {
1049        // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0
1050        int leftLength = left.length();
1051        if((leftLength - leftStart) != (right.length() - rightStart)) { return false; }
1052        while(leftStart < leftLength) {
1053            if(left.charAt(leftStart++) != right.charAt(rightStart++)) {
1054                return false;
1055            }
1056        }
1057        return true;
1058    }
1059    private boolean ignorePrefix(CharSequence s) {
1060        // Do not map non-FCD prefixes.
1061        return !isFCD(s);
1062    }
1063    private boolean ignoreString(CharSequence s) {
1064        // Do not map non-FCD strings.
1065        // Do not map strings that start with Hangul syllables: We decompose those on the fly.
1066        return !isFCD(s) || Hangul.isHangul(s.charAt(0));
1067    }
1068    private boolean isFCD(CharSequence s) {
1069        return fcd.isNormalized(s);
1070    }
1071
1072    private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]");
1073    static {
1074        // Hangul is decomposed on the fly during collation.
1075        COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
1076    }
1077
1078    private void closeOverComposites() {
1079        String prefix = "";  // empty
1080        UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES);
1081        while(iter.next()) {
1082            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
1083            String nfdString = nfd.getDecomposition(iter.codepoint);
1084            cesLength = dataBuilder.getCEs(nfdString, ces, 0);
1085            if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
1086                // Too many CEs from the decomposition (unusual), ignore this composite.
1087                // We could add a capacity parameter to getCEs() and reallocate if necessary.
1088                // However, this can only really happen in contrived cases.
1089                continue;
1090            }
1091            String composite = iter.getString();
1092            addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32);
1093        }
1094    }
1095
1096    private int addIfDifferent(CharSequence prefix, CharSequence str,
1097                long[] newCEs, int newCEsLength, int ce32) {
1098        long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH];
1099        int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0);
1100        if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
1101            if(ce32 == Collation.UNASSIGNED_CE32) {
1102                ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength);
1103            }
1104            dataBuilder.addCE32(prefix, str, ce32);
1105        }
1106        return ce32;
1107    }
1108
1109    private static boolean sameCEs(long[] ces1, int ces1Length,
1110                long[] ces2, int ces2Length) {
1111        if(ces1Length != ces2Length) {
1112            return false;
1113        }
1114        assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH);
1115        for(int i = 0; i < ces1Length; ++i) {
1116            if(ces1[i] != ces2[i]) { return false; }
1117        }
1118        return true;
1119    }
1120
1121    private static final int alignWeightRight(int w) {
1122        if(w != 0) {
1123            while((w & 0xff) == 0) { w >>>= 8; }
1124        }
1125        return w;
1126    }
1127
1128    /**
1129     * Walks the tailoring graph and overwrites tailored nodes with new CEs.
1130     * After this, the graph is destroyed.
1131     * The nodes array can then be used only as a source of tailored CEs.
1132     */
1133    private void makeTailoredCEs() {
1134        CollationWeights primaries = new CollationWeights();
1135        CollationWeights secondaries = new CollationWeights();
1136        CollationWeights tertiaries = new CollationWeights();
1137        long[] nodesArray = nodes.getBuffer();
1138        if(DEBUG) {
1139            System.out.println("\nCollationBuilder.makeTailoredCEs()");
1140        }
1141
1142        for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
1143            int i = rootPrimaryIndexes.elementAti(rpi);
1144            long node = nodesArray[i];
1145            long p = weight32FromNode(node);
1146            int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16;
1147            int t = s;
1148            int q = 0;
1149            boolean pIsTailored = false;
1150            boolean sIsTailored = false;
1151            boolean tIsTailored = false;
1152            if(DEBUG) {
1153                System.out.printf("\nprimary     %x\n", alignWeightRight((int)p));
1154            }
1155            int pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
1156            int nextIndex = nextIndexFromNode(node);
1157            while(nextIndex != 0) {
1158                i = nextIndex;
1159                node = nodesArray[i];
1160                nextIndex = nextIndexFromNode(node);
1161                int strength = strengthFromNode(node);
1162                if(strength == Collator.QUATERNARY) {
1163                    assert(isTailoredNode(node));
1164                    if(DEBUG) {
1165                        System.out.print("      quat+     ");
1166                    }
1167                    if(q == 3) {
1168                        // C++ U_BUFFER_OVERFLOW_ERROR
1169                        throw new UnsupportedOperationException("quaternary tailoring gap too small");
1170                    }
1171                    ++q;
1172                } else {
1173                    if(strength == Collator.TERTIARY) {
1174                        if(isTailoredNode(node)) {
1175                            if(DEBUG) {
1176                                System.out.print("    ter+        ");
1177                            }
1178                            if(!tIsTailored) {
1179                                // First tailored tertiary node for [p, s].
1180                                int tCount = countTailoredNodes(nodesArray, nextIndex,
1181                                                                    Collator.TERTIARY) + 1;
1182                                int tLimit;
1183                                if(t == 0) {
1184                                    // Gap at the beginning of the tertiary CE range.
1185                                    t = rootElements.getTertiaryBoundary() - 0x100;
1186                                    tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK;
1187                                } else if(!pIsTailored && !sIsTailored) {
1188                                    // p and s are root weights.
1189                                    tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
1190                                } else if(t == Collation.BEFORE_WEIGHT16) {
1191                                    tLimit = Collation.COMMON_WEIGHT16;
1192                                } else {
1193                                    // [p, s] is tailored.
1194                                    assert(t == Collation.COMMON_WEIGHT16);
1195                                    tLimit = rootElements.getTertiaryBoundary();
1196                                }
1197                                assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0);
1198                                tertiaries.initForTertiary();
1199                                if(!tertiaries.allocWeights(t, tLimit, tCount)) {
1200                                    // C++ U_BUFFER_OVERFLOW_ERROR
1201                                    throw new UnsupportedOperationException("tertiary tailoring gap too small");
1202                                }
1203                                tIsTailored = true;
1204                            }
1205                            t = (int)tertiaries.nextWeight();
1206                            assert(t != 0xffffffff);
1207                        } else {
1208                            t = weight16FromNode(node);
1209                            tIsTailored = false;
1210                            if(DEBUG) {
1211                                System.out.printf("    ter     %x\n", alignWeightRight(t));
1212                            }
1213                        }
1214                    } else {
1215                        if(strength == Collator.SECONDARY) {
1216                            if(isTailoredNode(node)) {
1217                                if(DEBUG) {
1218                                    System.out.print("  sec+          ");
1219                                }
1220                                if(!sIsTailored) {
1221                                    // First tailored secondary node for p.
1222                                    int sCount = countTailoredNodes(nodesArray, nextIndex,
1223                                                                        Collator.SECONDARY) + 1;
1224                                    int sLimit;
1225                                    if(s == 0) {
1226                                        // Gap at the beginning of the secondary CE range.
1227                                        s = rootElements.getSecondaryBoundary() - 0x100;
1228                                        sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16);
1229                                    } else if(!pIsTailored) {
1230                                        // p is a root primary.
1231                                        sLimit = rootElements.getSecondaryAfter(pIndex, s);
1232                                    } else if(s == Collation.BEFORE_WEIGHT16) {
1233                                        sLimit = Collation.COMMON_WEIGHT16;
1234                                    } else {
1235                                        // p is a tailored primary.
1236                                        assert(s == Collation.COMMON_WEIGHT16);
1237                                        sLimit = rootElements.getSecondaryBoundary();
1238                                    }
1239                                    if(s == Collation.COMMON_WEIGHT16) {
1240                                        // Do not tailor into the getSortKey() range of
1241                                        // compressed common secondaries.
1242                                        s = rootElements.getLastCommonSecondary();
1243                                    }
1244                                    secondaries.initForSecondary();
1245                                    if(!secondaries.allocWeights(s, sLimit, sCount)) {
1246                                        // C++ U_BUFFER_OVERFLOW_ERROR
1247                                        if(DEBUG) {
1248                                            System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n",
1249                                                    alignWeightRight(s), alignWeightRight(sLimit),
1250                                                    alignWeightRight(sCount));
1251                                        }
1252                                        throw new UnsupportedOperationException("secondary tailoring gap too small");
1253                                    }
1254                                    sIsTailored = true;
1255                                }
1256                                s = (int)secondaries.nextWeight();
1257                                assert(s != 0xffffffff);
1258                            } else {
1259                                s = weight16FromNode(node);
1260                                sIsTailored = false;
1261                                if(DEBUG) {
1262                                    System.out.printf("  sec       %x\n", alignWeightRight(s));
1263                                }
1264                            }
1265                        } else /* Collator.PRIMARY */ {
1266                            assert(isTailoredNode(node));
1267                            if(DEBUG) {
1268                                System.out.print("pri+            ");
1269                            }
1270                            if(!pIsTailored) {
1271                                // First tailored primary node in this list.
1272                                int pCount = countTailoredNodes(nodesArray, nextIndex,
1273                                                                    Collator.PRIMARY) + 1;
1274                                boolean isCompressible = baseData.isCompressiblePrimary(p);
1275                                long pLimit =
1276                                    rootElements.getPrimaryAfter(p, pIndex, isCompressible);
1277                                primaries.initForPrimary(isCompressible);
1278                                if(!primaries.allocWeights(p, pLimit, pCount)) {
1279                                    // C++ U_BUFFER_OVERFLOW_ERROR  // TODO: introduce a more specific UErrorCode?
1280                                    throw new UnsupportedOperationException("primary tailoring gap too small");
1281                                }
1282                                pIsTailored = true;
1283                            }
1284                            p = primaries.nextWeight();
1285                            assert(p != 0xffffffffL);
1286                            s = Collation.COMMON_WEIGHT16;
1287                            sIsTailored = false;
1288                        }
1289                        t = s == 0 ? 0 : Collation.COMMON_WEIGHT16;
1290                        tIsTailored = false;
1291                    }
1292                    q = 0;
1293                }
1294                if(isTailoredNode(node)) {
1295                    nodesArray[i] = Collation.makeCE(p, s, t, q);
1296                    if(DEBUG) {
1297                        System.out.printf("%016x\n", nodesArray[i]);
1298                    }
1299                }
1300            }
1301        }
1302    }
1303
1304    /**
1305     * Counts the tailored nodes of the given strength up to the next node
1306     * which is either stronger or has an explicit weight of this strength.
1307     */
1308    private static int countTailoredNodes(long[] nodesArray, int i, int strength) {
1309        int count = 0;
1310        for(;;) {
1311            if(i == 0) { break; }
1312            long node = nodesArray[i];
1313            if(strengthFromNode(node) < strength) { break; }
1314            if(strengthFromNode(node) == strength) {
1315                if(isTailoredNode(node)) {
1316                    ++count;
1317                } else {
1318                    break;
1319                }
1320            }
1321            i = nextIndexFromNode(node);
1322        }
1323        return count;
1324    }
1325
1326    private static final class CEFinalizer implements CollationDataBuilder.CEModifier {
1327        CEFinalizer(long[] ces) {
1328            finalCEs = ces;
1329        }
1330        @Override
1331        public long modifyCE32(int ce32) {
1332            assert(!Collation.isSpecialCE32(ce32));
1333            if(CollationBuilder.isTempCE32(ce32)) {
1334                // retain case bits
1335                return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
1336            } else {
1337                return Collation.NO_CE;
1338            }
1339        }
1340        @Override
1341        public long modifyCE(long ce) {
1342            if(CollationBuilder.isTempCE(ce)) {
1343                // retain case bits
1344                return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000);
1345            } else {
1346                return Collation.NO_CE;
1347            }
1348        }
1349
1350        private long[] finalCEs;
1351    };
1352
1353    /** Replaces temporary CEs with the final CEs they point to. */
1354    private void finalizeCEs() {
1355        CollationDataBuilder newBuilder = new CollationDataBuilder();
1356        newBuilder.initForTailoring(baseData);
1357        CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer());
1358        newBuilder.copyFrom(dataBuilder, finalizer);
1359        dataBuilder = newBuilder;
1360    }
1361
1362    /**
1363     * Encodes "temporary CE" data into a CE that fits into the CE32 data structure,
1364     * with 2-byte primary, 1-byte secondary and 6-bit tertiary,
1365     * with valid CE byte values.
1366     *
1367     * The index must not exceed 20 bits (0xfffff).
1368     * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY).
1369     *
1370     * Temporary CEs are distinguished from real CEs by their use of
1371     * secondary weights 06..45 which are otherwise reserved for compressed sort keys.
1372     *
1373     * The case bits are unused and available.
1374     */
1375    private static long tempCEFromIndexAndStrength(int index, int strength) {
1376        return
1377            // CE byte offsets, to ensure valid CE bytes, and case bits 11
1378            0x4040000006002000L +
1379            // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF)
1380            ((long)(index & 0xfe000) << 43) +
1381            // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF)
1382            ((long)(index & 0x1fc0) << 42) +
1383            // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45)
1384            ((index & 0x3f) << 24) +
1385            // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23)
1386            (strength << 8);
1387    }
1388    private static int indexFromTempCE(long tempCE) {
1389        tempCE -= 0x4040000006002000L;
1390        return
1391            ((int)(tempCE >> 43) & 0xfe000) |
1392            ((int)(tempCE >> 42) & 0x1fc0) |
1393            ((int)(tempCE >> 24) & 0x3f);
1394    }
1395    private static int strengthFromTempCE(long tempCE) {
1396        return ((int)tempCE >> 8) & 3;
1397    }
1398    private static boolean isTempCE(long ce) {
1399        int sec = (int)ce >>> 24;
1400        return 6 <= sec && sec <= 0x45;
1401    }
1402
1403    private static int indexFromTempCE32(int tempCE32) {
1404        tempCE32 -= 0x40400620;
1405        return
1406            ((tempCE32 >> 11) & 0xfe000) |
1407            ((tempCE32 >> 10) & 0x1fc0) |
1408            ((tempCE32 >> 8) & 0x3f);
1409    }
1410    private static boolean isTempCE32(int ce32) {
1411        return
1412            (ce32 & 0xff) >= 2 &&  // not a long-primary/long-secondary CE32
1413            6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45;
1414    }
1415
1416    private static int ceStrength(long ce) {
1417        return
1418            isTempCE(ce) ? strengthFromTempCE(ce) :
1419            (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY :
1420            ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY :
1421            ce != 0 ? Collator.TERTIARY :
1422            Collator.IDENTICAL;
1423    }
1424
1425    /** At most 1M nodes, limited by the 20 bits in node bit fields. */
1426    private static final int MAX_INDEX = 0xfffff;
1427    /**
1428     * Node bit 6 is set on a primary node if there are nodes
1429     * with secondary values below the common secondary weight (05).
1430     */
1431    private static final int HAS_BEFORE2 = 0x40;
1432    /**
1433     * Node bit 5 is set on a primary or secondary node if there are nodes
1434     * with tertiary values below the common tertiary weight (05).
1435     */
1436    private static final int HAS_BEFORE3 = 0x20;
1437    /**
1438     * Node bit 3 distinguishes a tailored node, which has no weight value,
1439     * from a node with an explicit (root or default) weight.
1440     */
1441    private static final int IS_TAILORED = 8;
1442
1443    private static long nodeFromWeight32(long weight32) {
1444        return weight32 << 32;
1445    }
1446    private static long nodeFromWeight16(int weight16) {
1447        return (long)weight16 << 48;
1448    }
1449    private static long nodeFromPreviousIndex(int previous) {
1450        return (long)previous << 28;
1451    }
1452    private static long nodeFromNextIndex(int next) {
1453        return next << 8;
1454    }
1455    private static long nodeFromStrength(int strength) {
1456        return strength;
1457    }
1458
1459    private static long weight32FromNode(long node) {
1460        return node >>> 32;
1461    }
1462    private static int weight16FromNode(long node) {
1463        return (int)(node >> 48) & 0xffff;
1464    }
1465    private static int previousIndexFromNode(long node) {
1466        return (int)(node >> 28) & MAX_INDEX;
1467    }
1468    private static int nextIndexFromNode(long node) {
1469        return ((int)node >> 8) & MAX_INDEX;
1470    }
1471    private static int strengthFromNode(long node) {
1472        return (int)node & 3;
1473    }
1474
1475    private static boolean nodeHasBefore2(long node) {
1476        return (node & HAS_BEFORE2) != 0;
1477    }
1478    private static boolean nodeHasBefore3(long node) {
1479        return (node & HAS_BEFORE3) != 0;
1480    }
1481    private static boolean nodeHasAnyBefore(long node) {
1482        return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0;
1483    }
1484    private static boolean isTailoredNode(long node) {
1485        return (node & IS_TAILORED) != 0;
1486    }
1487
1488    private static long changeNodePreviousIndex(long node, int previous) {
1489        return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous);
1490    }
1491    private static long changeNodeNextIndex(long node, int next) {
1492        return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next);
1493    }
1494
1495    private Normalizer2 nfd, fcd;
1496    private Normalizer2Impl nfcImpl;
1497
1498    private CollationTailoring base;
1499    private CollationData baseData;
1500    private CollationRootElements rootElements;
1501    private long variableTop;
1502
1503    private CollationDataBuilder dataBuilder;
1504    private boolean fastLatinEnabled;
1505    private UnicodeSet optimizeSet = new UnicodeSet();
1506
1507    private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
1508    private int cesLength;
1509
1510    /**
1511     * Indexes of nodes with root primary weights, sorted by primary.
1512     * Compact form of a TreeMap from root primary to node index.
1513     *
1514     * This is a performance optimization for finding reset positions.
1515     * Without this, we would have to search through the entire nodes list.
1516     * It also allows storing root primary weights in list head nodes,
1517     * without previous index, leaving room in root primary nodes for 32-bit primary weights.
1518     */
1519    private UVector32 rootPrimaryIndexes;
1520    /**
1521     * Data structure for assigning tailored weights and CEs.
1522     * Doubly-linked lists of nodes in mostly collation order.
1523     * Each list starts with a root primary node and ends with a nextIndex of 0.
1524     *
1525     * When there are any nodes in the list, then there is always a root primary node at index 0.
1526     * This allows some code not to have to check explicitly for nextIndex==0.
1527     *
1528     * Root primary nodes have 32-bit weights but do not have previous indexes.
1529     * All other nodes have at most 16-bit weights and do have previous indexes.
1530     *
1531     * Nodes with explicit weights store root collator weights,
1532     * or default weak weights (e.g., secondary 05) for stronger nodes.
1533     * "Tailored" nodes, with the IS_TAILORED bit set,
1534     * do not store explicit weights but rather
1535     * create a difference of a certain strength from the preceding node.
1536     *
1537     * A root node is followed by either
1538     * - a root/default node of the same strength, or
1539     * - a root/default node of the next-weaker strength, or
1540     * - a tailored node of the same strength.
1541     *
1542     * A node of a given strength normally implies "common" weights on weaker levels.
1543     *
1544     * A node with HAS_BEFORE2 must be immediately followed by
1545     * a secondary node with an explicit below-common weight, then a secondary tailored node,
1546     * and later an explicit common-secondary node.
1547     * The below-common weight can be a root weight,
1548     * or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight
1549     * or before the lowest root weight.
1550     * (&[before 2] resets to an explicit secondary node so that
1551     * the following addRelation(secondary) tailors right after that.
1552     * If we did not have this node and instead were to reset on the primary node,
1553     * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.)
1554     *
1555     * If the flag is not set, then there are no explicit secondary nodes
1556     * with the common or lower weights.
1557     *
1558     * Same for HAS_BEFORE3 for tertiary nodes and weights.
1559     * A node must not have both flags set.
1560     *
1561     * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs
1562     * which point to stable indexes in this list,
1563     * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes.
1564     *
1565     * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node,
1566     * until the next relation is added.
1567     *
1568     * At the end, the tailored weights are allocated as necessary,
1569     * then the tailored nodes are replaced with final CEs,
1570     * and the CollationData is rewritten by replacing temporary CEs with final ones.
1571     *
1572     * We cannot simply insert new nodes in the middle of the array
1573     * because that would invalidate the indexes stored in existing temporary CEs.
1574     * We need to use a linked graph with stable indexes to existing nodes.
1575     * A doubly-linked list seems easiest to maintain.
1576     *
1577     * Each node is stored as an long, with its fields stored as bit fields.
1578     *
1579     * Root primary node:
1580     * - primary weight: 32 bits 63..32
1581     * - reserved/unused/zero: 4 bits 31..28
1582     *
1583     * Weaker root nodes & tailored nodes:
1584     * - a weight: 16 bits 63..48
1585     *   + a root or default weight for a non-tailored node
1586     *   + unused/zero for a tailored node
1587     * - index to the previous node: 20 bits 47..28
1588     *
1589     * All types of nodes:
1590     * - index to the next node: 20 bits 27..8
1591     *   + nextIndex=0 in last node per root-primary list
1592     * - reserved/unused/zero bits: bits 7, 4, 2
1593     * - HAS_BEFORE2: bit 6
1594     * - HAS_BEFORE3: bit 5
1595     * - IS_TAILORED: bit 3
1596     * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0
1597     *
1598     * We could allocate structs with pointers, but we would have to store them
1599     * in a pointer list so that they can be indexed from temporary CEs,
1600     * and they would require more memory allocations.
1601     */
1602    private UVector64 nodes;
1603}
1604