1/* GENERATED SOURCE. DO NOT MODIFY. */
2/*
3*******************************************************************************
4* Copyright (C) 2013-2015, International Business Machines
5* Corporation and others.  All Rights Reserved.
6*******************************************************************************
7* CollationBuilder.java, ported from collationbuilder.h/.cpp
8*
9* C++ version created on: 2013may06
10* created by: Markus W. Scherer
11*/
12
13package android.icu.impl.coll;
14
15import java.text.ParseException;
16
17import android.icu.impl.Norm2AllModes;
18import android.icu.impl.Normalizer2Impl;
19import android.icu.impl.Normalizer2Impl.Hangul;
20import android.icu.lang.UScript;
21import android.icu.text.CanonicalIterator;
22import android.icu.text.Collator;
23import android.icu.text.Normalizer2;
24import android.icu.text.UnicodeSet;
25import android.icu.text.UnicodeSetIterator;
26import android.icu.util.ULocale;
27
28/**
29 * @hide Only a subset of ICU is exposed in Android
30 */
31public final class CollationBuilder extends CollationRuleParser.Sink {
32    private static final boolean DEBUG = false;
33    private static final class BundleImporter implements CollationRuleParser.Importer {
34        BundleImporter() {}
35        public String getRules(String localeID, String collationType) {
36            return CollationLoader.loadRules(new ULocale(localeID), collationType);
37        }
38    }
39
40    public CollationBuilder(CollationTailoring b) {
41        nfd = Normalizer2.getNFDInstance();
42        fcd = Norm2AllModes.getFCDNormalizer2();
43        nfcImpl = Norm2AllModes.getNFCInstance().impl;
44        base = b;
45        baseData = b.data;
46        rootElements = new CollationRootElements(b.data.rootElements);
47        variableTop = 0;
48        dataBuilder = new CollationDataBuilder();
49        fastLatinEnabled = true;
50        cesLength = 0;
51        rootPrimaryIndexes = new UVector32();
52        nodes = new UVector64();
53        nfcImpl.ensureCanonIterData();
54        dataBuilder.initForTailoring(baseData);
55    }
56
57    public CollationTailoring parseAndBuild(String ruleString) throws ParseException {
58        if(baseData.rootElements == null) {
59            // C++ U_MISSING_RESOURCE_ERROR
60            throw new UnsupportedOperationException(
61                    "missing root elements data, tailoring not supported");
62        }
63        CollationTailoring tailoring = new CollationTailoring(base.settings);
64        CollationRuleParser parser = new CollationRuleParser(baseData);
65        // Note: This always bases &[last variable] and &[first regular]
66        // on the root collator's maxVariable/variableTop.
67        // If we wanted this to change after [maxVariable x], then we would keep
68        // the tailoring.settings pointer here and read its variableTop when we need it.
69        // See http://unicode.org/cldr/trac/ticket/6070
70        variableTop = base.settings.readOnly().variableTop;
71        parser.setSink(this);
72        // In Java, there is only one Importer implementation.
73        // In C++, the importer is a parameter for this method.
74        parser.setImporter(new BundleImporter());
75        CollationSettings ownedSettings = tailoring.settings.copyOnWrite();
76        parser.parse(ruleString, ownedSettings);
77        if(dataBuilder.hasMappings()) {
78            makeTailoredCEs();
79            closeOverComposites();
80            finalizeCEs();
81            // Copy all of ASCII, and Latin-1 letters, into each tailoring.
82            optimizeSet.add(0, 0x7f);
83            optimizeSet.add(0xc0, 0xff);
84            // Hangul is decomposed on the fly during collation,
85            // and the tailoring data is always built with HANGUL_TAG specials.
86            optimizeSet.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
87            dataBuilder.optimize(optimizeSet);
88            tailoring.ensureOwnedData();
89            if(fastLatinEnabled) { dataBuilder.enableFastLatin(); }
90            dataBuilder.build(tailoring.ownedData);
91            // C++ tailoring.builder = dataBuilder;
92            dataBuilder = null;
93        } else {
94            tailoring.data = baseData;
95        }
96        ownedSettings.fastLatinOptions = CollationFastLatin.getOptions(
97                tailoring.data, ownedSettings,
98                ownedSettings.fastLatinPrimaries);
99        tailoring.setRules(ruleString);
100        // In Java, we do not have a rules version.
101        // In C++, the genrb build tool reads and supplies one,
102        // and the rulesVersion is a parameter for this method.
103        tailoring.setVersion(base.version, 0 /* rulesVersion */);
104        return tailoring;
105    }
106
107    /** Implements CollationRuleParser.Sink. */
108    @Override
109    void addReset(int strength, CharSequence str) {
110        assert(str.length() != 0);
111        if(str.charAt(0) == CollationRuleParser.POS_LEAD) {
112            ces[0] = getSpecialResetPosition(str);
113            cesLength = 1;
114            assert((ces[0] & Collation.CASE_AND_QUATERNARY_MASK) == 0);
115        } else {
116            // normal reset to a character or string
117            String nfdString = nfd.normalize(str);
118            cesLength = dataBuilder.getCEs(nfdString, ces, 0);
119            if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
120                throw new IllegalArgumentException(
121                        "reset position maps to too many collation elements (more than 31)");
122            }
123        }
124        if(strength == Collator.IDENTICAL) { return; }  // simple reset-at-position
125
126        // &[before strength]position
127        assert(Collator.PRIMARY <= strength && strength <= Collator.TERTIARY);
128        int index = findOrInsertNodeForCEs(strength);
129
130        long node = nodes.elementAti(index);
131        // If the index is for a "weaker" node,
132        // then skip backwards over this and further "weaker" nodes.
133        while(strengthFromNode(node) > strength) {
134            index = previousIndexFromNode(node);
135            node = nodes.elementAti(index);
136        }
137
138        // Find or insert a node whose index we will put into a temporary CE.
139        if(strengthFromNode(node) == strength && isTailoredNode(node)) {
140            // Reset to just before this same-strength tailored node.
141            index = previousIndexFromNode(node);
142        } else if(strength == Collator.PRIMARY) {
143            // root primary node (has no previous index)
144            long p = weight32FromNode(node);
145            if(p == 0) {
146                throw new UnsupportedOperationException(
147                        "reset primary-before ignorable not possible");
148            }
149            if(p <= rootElements.getFirstPrimary()) {
150                // There is no primary gap between ignorables and the space-first-primary.
151                throw new UnsupportedOperationException(
152                        "reset primary-before first non-ignorable not supported");
153            }
154            if(p == Collation.FIRST_TRAILING_PRIMARY) {
155                // We do not support tailoring to an unassigned-implicit CE.
156                throw new UnsupportedOperationException(
157                        "reset primary-before [first trailing] not supported");
158            }
159            p = rootElements.getPrimaryBefore(p, baseData.isCompressiblePrimary(p));
160            index = findOrInsertNodeForPrimary(p);
161            // Go to the last node in this list:
162            // Tailor after the last node between adjacent root nodes.
163            for(;;) {
164                node = nodes.elementAti(index);
165                int nextIndex = nextIndexFromNode(node);
166                if(nextIndex == 0) { break; }
167                index = nextIndex;
168            }
169        } else {
170            // &[before 2] or &[before 3]
171            index = findCommonNode(index, Collator.SECONDARY);
172            if(strength >= Collator.TERTIARY) {
173                index = findCommonNode(index, Collator.TERTIARY);
174            }
175            // findCommonNode() stayed on the stronger node or moved to
176            // an explicit common-weight node of the reset-before strength.
177            node = nodes.elementAti(index);
178            if(strengthFromNode(node) == strength) {
179                // Found a same-strength node with an explicit weight.
180                int weight16 = weight16FromNode(node);
181                if(weight16 == 0) {
182                    throw new UnsupportedOperationException(
183                            (strength == Collator.SECONDARY) ?
184                                    "reset secondary-before secondary ignorable not possible" :
185                                    "reset tertiary-before completely ignorable not possible");
186                }
187                assert(weight16 > Collation.BEFORE_WEIGHT16);
188                // Reset to just before this node.
189                // Insert the preceding same-level explicit weight if it is not there already.
190                // Which explicit weight immediately precedes this one?
191                weight16 = getWeight16Before(index, node, strength);
192                // Does this preceding weight have a node?
193                int previousWeight16;
194                int previousIndex = previousIndexFromNode(node);
195                for(int i = previousIndex;; i = previousIndexFromNode(node)) {
196                    node = nodes.elementAti(i);
197                    int previousStrength = strengthFromNode(node);
198                    if(previousStrength < strength) {
199                        assert(weight16 >= Collation.COMMON_WEIGHT16 || i == previousIndex);
200                        // Either the reset element has an above-common weight and
201                        // the parent node provides the implied common weight,
202                        // or the reset element has a weight<=common in the node
203                        // right after the parent, and we need to insert the preceding weight.
204                        previousWeight16 = Collation.COMMON_WEIGHT16;
205                        break;
206                    } else if(previousStrength == strength && !isTailoredNode(node)) {
207                        previousWeight16 = weight16FromNode(node);
208                        break;
209                    }
210                    // Skip weaker nodes and same-level tailored nodes.
211                }
212                if(previousWeight16 == weight16) {
213                    // The preceding weight has a node,
214                    // maybe with following weaker or tailored nodes.
215                    // Reset to the last of them.
216                    index = previousIndex;
217                } else {
218                    // Insert a node with the preceding weight, reset to that.
219                    node = nodeFromWeight16(weight16) | nodeFromStrength(strength);
220                    index = insertNodeBetween(previousIndex, index, node);
221                }
222            } else {
223                // Found a stronger node with implied strength-common weight.
224                int weight16 = getWeight16Before(index, node, strength);
225                index = findOrInsertWeakNode(index, weight16, strength);
226            }
227            // Strength of the temporary CE = strength of its reset position.
228            // Code above raises an error if the before-strength is stronger.
229            strength = ceStrength(ces[cesLength - 1]);
230        }
231        ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength);
232    }
233
234    /**
235     * Returns the secondary or tertiary weight preceding the current node's weight.
236     * node=nodes[index].
237     */
238    private int getWeight16Before(int index, long node, int level) {
239        assert(strengthFromNode(node) < level || !isTailoredNode(node));
240        // Collect the root CE weights if this node is for a root CE.
241        // If it is not, then return the low non-primary boundary for a tailored CE.
242        int t;
243        if(strengthFromNode(node) == Collator.TERTIARY) {
244            t = weight16FromNode(node);
245        } else {
246            t = Collation.COMMON_WEIGHT16;  // Stronger node with implied common weight.
247        }
248        while(strengthFromNode(node) > Collator.SECONDARY) {
249            index = previousIndexFromNode(node);
250            node = nodes.elementAti(index);
251        }
252        if(isTailoredNode(node)) {
253            return Collation.BEFORE_WEIGHT16;
254        }
255        int s;
256        if(strengthFromNode(node) == Collator.SECONDARY) {
257            s = weight16FromNode(node);
258        } else {
259            s = Collation.COMMON_WEIGHT16;  // Stronger node with implied common weight.
260        }
261        while(strengthFromNode(node) > Collator.PRIMARY) {
262            index = previousIndexFromNode(node);
263            node = nodes.elementAti(index);
264        }
265        if(isTailoredNode(node)) {
266            return Collation.BEFORE_WEIGHT16;
267        }
268        // [p, s, t] is a root CE. Return the preceding weight for the requested level.
269        long p = weight32FromNode(node);
270        int weight16;
271        if(level == Collator.SECONDARY) {
272            weight16 = rootElements.getSecondaryBefore(p, s);
273        } else {
274            weight16 = rootElements.getTertiaryBefore(p, s, t);
275            assert((weight16 & ~Collation.ONLY_TERTIARY_MASK) == 0);
276        }
277        return weight16;
278    }
279
280    private long getSpecialResetPosition(CharSequence str) {
281        assert(str.length() == 2);
282        long ce;
283        int strength = Collator.PRIMARY;
284        boolean isBoundary = false;
285        CollationRuleParser.Position pos =
286                CollationRuleParser.POSITION_VALUES[str.charAt(1) - CollationRuleParser.POS_BASE];
287        switch(pos) {
288        case FIRST_TERTIARY_IGNORABLE:
289            // Quaternary CEs are not supported.
290            // Non-zero quaternary weights are possible only on tertiary or stronger CEs.
291            return 0;
292        case LAST_TERTIARY_IGNORABLE:
293            return 0;
294        case FIRST_SECONDARY_IGNORABLE: {
295            // Look for a tailored tertiary node after [0, 0, 0].
296            int index = findOrInsertNodeForRootCE(0, Collator.TERTIARY);
297            long node = nodes.elementAti(index);
298            if((index = nextIndexFromNode(node)) != 0) {
299                node = nodes.elementAti(index);
300                assert(strengthFromNode(node) <= Collator.TERTIARY);
301                if(isTailoredNode(node) && strengthFromNode(node) == Collator.TERTIARY) {
302                    return tempCEFromIndexAndStrength(index, Collator.TERTIARY);
303                }
304            }
305            return rootElements.getFirstTertiaryCE();
306            // No need to look for nodeHasAnyBefore() on a tertiary node.
307        }
308        case LAST_SECONDARY_IGNORABLE:
309            ce = rootElements.getLastTertiaryCE();
310            strength = Collator.TERTIARY;
311            break;
312        case FIRST_PRIMARY_IGNORABLE: {
313            // Look for a tailored secondary node after [0, 0, *].
314            int index = findOrInsertNodeForRootCE(0, Collator.SECONDARY);
315            long node = nodes.elementAti(index);
316            while((index = nextIndexFromNode(node)) != 0) {
317                node = nodes.elementAti(index);
318                strength = strengthFromNode(node);
319                if(strength < Collator.SECONDARY) { break; }
320                if(strength == Collator.SECONDARY) {
321                    if(isTailoredNode(node)) {
322                        if(nodeHasBefore3(node)) {
323                            index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
324                            assert(isTailoredNode(nodes.elementAti(index)));
325                        }
326                        return tempCEFromIndexAndStrength(index, Collator.SECONDARY);
327                    } else {
328                        break;
329                    }
330                }
331            }
332            ce = rootElements.getFirstSecondaryCE();
333            strength = Collator.SECONDARY;
334            break;
335        }
336        case LAST_PRIMARY_IGNORABLE:
337            ce = rootElements.getLastSecondaryCE();
338            strength = Collator.SECONDARY;
339            break;
340        case FIRST_VARIABLE:
341            ce = rootElements.getFirstPrimaryCE();
342            isBoundary = true;  // FractionalUCA.txt: FDD1 00A0, SPACE first primary
343            break;
344        case LAST_VARIABLE:
345            ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1);
346            break;
347        case FIRST_REGULAR:
348            ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1);
349            isBoundary = true;  // FractionalUCA.txt: FDD1 263A, SYMBOL first primary
350            break;
351        case LAST_REGULAR:
352            // Use the Hani-first-primary rather than the actual last "regular" CE before it,
353            // for backward compatibility with behavior before the introduction of
354            // script-first-primary CEs in the root collator.
355            ce = rootElements.firstCEWithPrimaryAtLeast(
356                baseData.getFirstPrimaryForGroup(UScript.HAN));
357            break;
358        case FIRST_IMPLICIT:
359            ce = baseData.getSingleCE(0x4e00);
360            break;
361        case LAST_IMPLICIT:
362            // We do not support tailoring to an unassigned-implicit CE.
363            throw new UnsupportedOperationException(
364                    "reset to [last implicit] not supported");
365        case FIRST_TRAILING:
366            ce = Collation.makeCE(Collation.FIRST_TRAILING_PRIMARY);
367            isBoundary = true;  // trailing first primary (there is no mapping for it)
368            break;
369        case LAST_TRAILING:
370            throw new IllegalArgumentException("LDML forbids tailoring to U+FFFF");
371        default:
372            assert(false);
373            return 0;
374        }
375
376        int index = findOrInsertNodeForRootCE(ce, strength);
377        long node = nodes.elementAti(index);
378        if((pos.ordinal() & 1) == 0) {
379            // even pos = [first xyz]
380            if(!nodeHasAnyBefore(node) && isBoundary) {
381                // A <group> first primary boundary is artificially added to FractionalUCA.txt.
382                // It is reachable via its special contraction, but is not normally used.
383                // Find the first character tailored after the boundary CE,
384                // or the first real root CE after it.
385                if((index = nextIndexFromNode(node)) != 0) {
386                    // If there is a following node, then it must be tailored
387                    // because there are no root CEs with a boundary primary
388                    // and non-common secondary/tertiary weights.
389                    node = nodes.elementAti(index);
390                    assert(isTailoredNode(node));
391                    ce = tempCEFromIndexAndStrength(index, strength);
392                } else {
393                    assert(strength == Collator.PRIMARY);
394                    long p = ce >>> 32;
395                    int pIndex = rootElements.findPrimary(p);
396                    boolean isCompressible = baseData.isCompressiblePrimary(p);
397                    p = rootElements.getPrimaryAfter(p, pIndex, isCompressible);
398                    ce = Collation.makeCE(p);
399                    index = findOrInsertNodeForRootCE(ce, Collator.PRIMARY);
400                    node = nodes.elementAti(index);
401                }
402            }
403            if(nodeHasAnyBefore(node)) {
404                // Get the first node that was tailored before this one at a weaker strength.
405                if(nodeHasBefore2(node)) {
406                    index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
407                    node = nodes.elementAti(index);
408                }
409                if(nodeHasBefore3(node)) {
410                    index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node)));
411                }
412                assert(isTailoredNode(nodes.elementAti(index)));
413                ce = tempCEFromIndexAndStrength(index, strength);
414            }
415        } else {
416            // odd pos = [last xyz]
417            // Find the last node that was tailored after the [last xyz]
418            // at a strength no greater than the position's strength.
419            for(;;) {
420                int nextIndex = nextIndexFromNode(node);
421                if(nextIndex == 0) { break; }
422                long nextNode = nodes.elementAti(nextIndex);
423                if(strengthFromNode(nextNode) < strength) { break; }
424                index = nextIndex;
425                node = nextNode;
426            }
427            // Do not make a temporary CE for a root node.
428            // This last node might be the node for the root CE itself,
429            // or a node with a common secondary or tertiary weight.
430            if(isTailoredNode(node)) {
431                ce = tempCEFromIndexAndStrength(index, strength);
432            }
433        }
434        return ce;
435    }
436
437    /** Implements CollationRuleParser.Sink. */
438    // Java 6: @Override
439    void addRelation(int strength, CharSequence prefix, CharSequence str, CharSequence extension) {
440        String nfdPrefix;
441        if(prefix.length() == 0) {
442            nfdPrefix = "";
443        } else {
444            nfdPrefix = nfd.normalize(prefix);
445        }
446        String nfdString = nfd.normalize(str);
447
448        // The runtime code decomposes Hangul syllables on the fly,
449        // with recursive processing but without making the Jamo pieces visible for matching.
450        // It does not work with certain types of contextual mappings.
451        int nfdLength = nfdString.length();
452        if(nfdLength >= 2) {
453            char c = nfdString.charAt(0);
454            if(Hangul.isJamoL(c) || Hangul.isJamoV(c)) {
455                // While handling a Hangul syllable, contractions starting with Jamo L or V
456                // would not see the following Jamo of that syllable.
457                throw new UnsupportedOperationException(
458                        "contractions starting with conjoining Jamo L or V not supported");
459            }
460            c = nfdString.charAt(nfdLength - 1);
461            if(Hangul.isJamoL(c) ||
462                    (Hangul.isJamoV(c) && Hangul.isJamoL(nfdString.charAt(nfdLength - 2)))) {
463                // A contraction ending with Jamo L or L+V would require
464                // generating Hangul syllables in addTailComposites() (588 for a Jamo L),
465                // or decomposing a following Hangul syllable on the fly, during contraction matching.
466                throw new UnsupportedOperationException(
467                        "contractions ending with conjoining Jamo L or L+V not supported");
468            }
469            // A Hangul syllable completely inside a contraction is ok.
470        }
471        // Note: If there is a prefix, then the parser checked that
472        // both the prefix and the string beging with NFC boundaries (not Jamo V or T).
473        // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0))
474        // (While handling a Hangul syllable, prefixes on Jamo V or T
475        // would not see the previous Jamo of that syllable.)
476
477        if(strength != Collator.IDENTICAL) {
478            // Find the node index after which we insert the new tailored node.
479            int index = findOrInsertNodeForCEs(strength);
480            assert(cesLength > 0);
481            long ce = ces[cesLength - 1];
482            if(strength == Collator.PRIMARY && !isTempCE(ce) && (ce >>> 32) == 0) {
483                // There is no primary gap between ignorables and the space-first-primary.
484                throw new UnsupportedOperationException(
485                        "tailoring primary after ignorables not supported");
486            }
487            if(strength == Collator.QUATERNARY && ce == 0) {
488                // The CE data structure does not support non-zero quaternary weights
489                // on tertiary ignorables.
490                throw new UnsupportedOperationException(
491                        "tailoring quaternary after tertiary ignorables not supported");
492            }
493            // Insert the new tailored node.
494            index = insertTailoredNodeAfter(index, strength);
495            // Strength of the temporary CE:
496            // The new relation may yield a stronger CE but not a weaker one.
497            int tempStrength = ceStrength(ce);
498            if(strength < tempStrength) { tempStrength = strength; }
499            ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength);
500        }
501
502        setCaseBits(nfdString);
503
504        int cesLengthBeforeExtension = cesLength;
505        if(extension.length() != 0) {
506            String nfdExtension = nfd.normalize(extension);
507            cesLength = dataBuilder.getCEs(nfdExtension, ces, cesLength);
508            if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
509                throw new IllegalArgumentException(
510                        "extension string adds too many collation elements (more than 31 total)");
511            }
512        }
513        int ce32 = Collation.UNASSIGNED_CE32;
514        if((!nfdPrefix.contentEquals(prefix) || !nfdString.contentEquals(str)) &&
515                !ignorePrefix(prefix) && !ignoreString(str)) {
516            // Map from the original input to the CEs.
517            // We do this in case the canonical closure is incomplete,
518            // so that it is possible to explicitly provide the missing mappings.
519            ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32);
520        }
521        addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32);
522        cesLength = cesLengthBeforeExtension;
523    }
524
525    /**
526     * Picks one of the current CEs and finds or inserts a node in the graph
527     * for the CE + strength.
528     */
529    private int findOrInsertNodeForCEs(int strength) {
530        assert(Collator.PRIMARY <= strength && strength <= Collator.QUATERNARY);
531
532        // Find the last CE that is at least as "strong" as the requested difference.
533        // Note: Stronger is smaller (Collator.PRIMARY=0).
534        long ce;
535        for(;; --cesLength) {
536            if(cesLength == 0) {
537                ce = ces[0] = 0;
538                cesLength = 1;
539                break;
540            } else {
541                ce = ces[cesLength - 1];
542            }
543            if(ceStrength(ce) <= strength) { break; }
544        }
545
546        if(isTempCE(ce)) {
547            // No need to findCommonNode() here for lower levels
548            // because insertTailoredNodeAfter() will do that anyway.
549            return indexFromTempCE(ce);
550        }
551
552        // root CE
553        if((int)(ce >>> 56) == Collation.UNASSIGNED_IMPLICIT_BYTE) {
554            throw new UnsupportedOperationException(
555                    "tailoring relative to an unassigned code point not supported");
556        }
557        return findOrInsertNodeForRootCE(ce, strength);
558    }
559
560    private int findOrInsertNodeForRootCE(long ce, int strength) {
561        assert((int)(ce >>> 56) != Collation.UNASSIGNED_IMPLICIT_BYTE);
562
563        // Find or insert the node for each of the root CE's weights,
564        // down to the requested level/strength.
565        // Root CEs must have common=zero quaternary weights (for which we never insert any nodes).
566        assert((ce & 0xc0) == 0);
567        int index = findOrInsertNodeForPrimary(ce >>> 32);
568        if(strength >= Collator.SECONDARY) {
569            int lower32 = (int)ce;
570            index = findOrInsertWeakNode(index, lower32 >>> 16, Collator.SECONDARY);
571            if(strength >= Collator.TERTIARY) {
572                index = findOrInsertWeakNode(index, lower32 & Collation.ONLY_TERTIARY_MASK,
573                                            Collator.TERTIARY);
574            }
575        }
576        return index;
577    }
578
579    /**
580     * Like Java Collections.binarySearch(List, key, Comparator).
581     *
582     * @return the index>=0 where the item was found,
583     *         or the index<0 for inserting the string at ~index in sorted order
584     *         (index into rootPrimaryIndexes)
585     */
586    private static final int binarySearchForRootPrimaryNode(
587            int[] rootPrimaryIndexes, int length, long[] nodes, long p) {
588        if(length == 0) { return ~0; }
589        int start = 0;
590        int limit = length;
591        for (;;) {
592            int i = (start + limit) / 2;
593            long node = nodes[rootPrimaryIndexes[i]];
594            long nodePrimary = node >>> 32;  // weight32FromNode(node)
595            if (p == nodePrimary) {
596                return i;
597            } else if (p < nodePrimary) {
598                if (i == start) {
599                    return ~start;  // insert s before i
600                }
601                limit = i;
602            } else {
603                if (i == start) {
604                    return ~(start + 1);  // insert s after i
605                }
606                start = i;
607            }
608        }
609    }
610
611    /** Finds or inserts the node for a root CE's primary weight. */
612    private int findOrInsertNodeForPrimary(long p) {
613        int rootIndex = binarySearchForRootPrimaryNode(
614            rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p);
615        if(rootIndex >= 0) {
616            return rootPrimaryIndexes.elementAti(rootIndex);
617        } else {
618            // Start a new list of nodes with this primary.
619            int index = nodes.size();
620            nodes.addElement(nodeFromWeight32(p));
621            rootPrimaryIndexes.insertElementAt(index, ~rootIndex);
622            return index;
623        }
624    }
625
626    /** Finds or inserts the node for a secondary or tertiary weight. */
627    private int findOrInsertWeakNode(int index, int weight16, int level) {
628        assert(0 <= index && index < nodes.size());
629        assert(Collator.SECONDARY <= level && level <= Collator.TERTIARY);
630
631        if(weight16 == Collation.COMMON_WEIGHT16) {
632            return findCommonNode(index, level);
633        }
634
635        // If this will be the first below-common weight for the parent node,
636        // then we will also need to insert a common weight after it.
637        long node = nodes.elementAti(index);
638        assert(strengthFromNode(node) < level);  // parent node is stronger
639        if(weight16 != 0 && weight16 < Collation.COMMON_WEIGHT16) {
640            int hasThisLevelBefore = level == Collator.SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3;
641            if((node & hasThisLevelBefore) == 0) {
642                // The parent node has an implied level-common weight.
643                long commonNode =
644                    nodeFromWeight16(Collation.COMMON_WEIGHT16) | nodeFromStrength(level);
645                if(level == Collator.SECONDARY) {
646                    // Move the HAS_BEFORE3 flag from the parent node
647                    // to the new secondary common node.
648                    commonNode |= node & HAS_BEFORE3;
649                    node &= ~(long)HAS_BEFORE3;
650                }
651                nodes.setElementAt(node | hasThisLevelBefore, index);
652                // Insert below-common-weight node.
653                int nextIndex = nextIndexFromNode(node);
654                node = nodeFromWeight16(weight16) | nodeFromStrength(level);
655                index = insertNodeBetween(index, nextIndex, node);
656                // Insert common-weight node.
657                insertNodeBetween(index, nextIndex, commonNode);
658                // Return index of below-common-weight node.
659                return index;
660            }
661        }
662
663        // Find the root CE's weight for this level.
664        // Postpone insertion if not found:
665        // Insert the new root node before the next stronger node,
666        // or before the next root node with the same strength and a larger weight.
667        int nextIndex;
668        while((nextIndex = nextIndexFromNode(node)) != 0) {
669            node = nodes.elementAti(nextIndex);
670            int nextStrength = strengthFromNode(node);
671            if(nextStrength <= level) {
672                // Insert before a stronger node.
673                if(nextStrength < level) { break; }
674                // nextStrength == level
675                if(!isTailoredNode(node)) {
676                    int nextWeight16 = weight16FromNode(node);
677                    if(nextWeight16 == weight16) {
678                        // Found the node for the root CE up to this level.
679                        return nextIndex;
680                    }
681                    // Insert before a node with a larger same-strength weight.
682                    if(nextWeight16 > weight16) { break; }
683                }
684            }
685            // Skip the next node.
686            index = nextIndex;
687        }
688        node = nodeFromWeight16(weight16) | nodeFromStrength(level);
689        return insertNodeBetween(index, nextIndex, node);
690    }
691
692    /**
693     * Makes and inserts a new tailored node into the list, after the one at index.
694     * Skips over nodes of weaker strength to maintain collation order
695     * ("postpone insertion").
696     * @return the new node's index
697     */
698    private int insertTailoredNodeAfter(int index, int strength) {
699        assert(0 <= index && index < nodes.size());
700        if(strength >= Collator.SECONDARY) {
701            index = findCommonNode(index, Collator.SECONDARY);
702            if(strength >= Collator.TERTIARY) {
703                index = findCommonNode(index, Collator.TERTIARY);
704            }
705        }
706        // Postpone insertion:
707        // Insert the new node before the next one with a strength at least as strong.
708        long node = nodes.elementAti(index);
709        int nextIndex;
710        while((nextIndex = nextIndexFromNode(node)) != 0) {
711            node = nodes.elementAti(nextIndex);
712            if(strengthFromNode(node) <= strength) { break; }
713            // Skip the next node which has a weaker (larger) strength than the new one.
714            index = nextIndex;
715        }
716        node = IS_TAILORED | nodeFromStrength(strength);
717        return insertNodeBetween(index, nextIndex, node);
718    }
719
720    /**
721     * Inserts a new node into the list, between list-adjacent items.
722     * The node's previous and next indexes must not be set yet.
723     * @return the new node's index
724     */
725    private int insertNodeBetween(int index, int nextIndex, long node) {
726        assert(previousIndexFromNode(node) == 0);
727        assert(nextIndexFromNode(node) == 0);
728        assert(nextIndexFromNode(nodes.elementAti(index)) == nextIndex);
729        // Append the new node and link it to the existing nodes.
730        int newIndex = nodes.size();
731        node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex);
732        nodes.addElement(node);
733        // nodes[index].nextIndex = newIndex
734        node = nodes.elementAti(index);
735        nodes.setElementAt(changeNodeNextIndex(node, newIndex), index);
736        // nodes[nextIndex].previousIndex = newIndex
737        if(nextIndex != 0) {
738            node = nodes.elementAti(nextIndex);
739            nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex);
740        }
741        return newIndex;
742    }
743
744    /**
745     * Finds the node which implies or contains a common=05 weight of the given strength
746     * (secondary or tertiary), if the current node is stronger.
747     * Skips weaker nodes and tailored nodes if the current node is stronger
748     * and is followed by an explicit-common-weight node.
749     * Always returns the input index if that node is no stronger than the given strength.
750     */
751    private int findCommonNode(int index, int strength) {
752        assert(Collator.SECONDARY <= strength && strength <= Collator.TERTIARY);
753        long node = nodes.elementAti(index);
754        if(strengthFromNode(node) >= strength) {
755            // The current node is no stronger.
756            return index;
757        }
758        if(strength == Collator.SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) {
759            // The current node implies the strength-common weight.
760            return index;
761        }
762        index = nextIndexFromNode(node);
763        node = nodes.elementAti(index);
764        assert(!isTailoredNode(node) && strengthFromNode(node) == strength &&
765                weight16FromNode(node) < Collation.COMMON_WEIGHT16);
766        // Skip to the explicit common node.
767        do {
768            index = nextIndexFromNode(node);
769            node = nodes.elementAti(index);
770            assert(strengthFromNode(node) >= strength);
771        } while(isTailoredNode(node) || strengthFromNode(node) > strength ||
772                weight16FromNode(node) < Collation.COMMON_WEIGHT16);
773        assert(weight16FromNode(node) == Collation.COMMON_WEIGHT16);
774        return index;
775    }
776
777    private void setCaseBits(CharSequence nfdString) {
778        int numTailoredPrimaries = 0;
779        for(int i = 0; i < cesLength; ++i) {
780            if(ceStrength(ces[i]) == Collator.PRIMARY) { ++numTailoredPrimaries; }
781        }
782        // We should not be able to get too many case bits because
783        // cesLength<=31==MAX_EXPANSION_LENGTH.
784        // 31 pairs of case bits fit into an long without setting its sign bit.
785        assert(numTailoredPrimaries <= 31);
786
787        long cases = 0;
788        if(numTailoredPrimaries > 0) {
789            CharSequence s = nfdString;
790            UTF16CollationIterator baseCEs = new UTF16CollationIterator(baseData, false, s, 0);
791            int baseCEsLength = baseCEs.fetchCEs() - 1;
792            assert(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation.NO_CE);
793
794            int lastCase = 0;
795            int numBasePrimaries = 0;
796            for(int i = 0; i < baseCEsLength; ++i) {
797                long ce = baseCEs.getCE(i);
798                if((ce >>> 32) != 0) {
799                    ++numBasePrimaries;
800                    int c = ((int)ce >> 14) & 3;
801                    assert(c == 0 || c == 2);  // lowercase or uppercase, no mixed case in any base CE
802                    if(numBasePrimaries < numTailoredPrimaries) {
803                        cases |= (long)c << ((numBasePrimaries - 1) * 2);
804                    } else if(numBasePrimaries == numTailoredPrimaries) {
805                        lastCase = c;
806                    } else if(c != lastCase) {
807                        // There are more base primary CEs than tailored primaries.
808                        // Set mixed case if the case bits of the remainder differ.
809                        lastCase = 1;
810                        // Nothing more can change.
811                        break;
812                    }
813                }
814            }
815            if(numBasePrimaries >= numTailoredPrimaries) {
816                cases |= (long)lastCase << ((numTailoredPrimaries - 1) * 2);
817            }
818        }
819
820        for(int i = 0; i < cesLength; ++i) {
821            long ce = ces[i] & 0xffffffffffff3fffL;  // clear old case bits
822            int strength = ceStrength(ce);
823            if(strength == Collator.PRIMARY) {
824                ce |= (cases & 3) << 14;
825                cases >>>= 2;
826            } else if(strength == Collator.TERTIARY) {
827                // Tertiary CEs must have uppercase bits.
828                // See the LDML spec, and comments in class CollationCompare.
829                ce |= 0x8000;
830            }
831            // Tertiary ignorable CEs must have 0 case bits.
832            // We set 0 case bits for secondary CEs too
833            // since currently only U+0345 is cased and maps to a secondary CE,
834            // and it is lowercase. Other secondaries are uncased.
835            // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight.
836            ces[i] = ce;
837        }
838    }
839
840    /** Implements CollationRuleParser.Sink. */
841    @Override
842    void suppressContractions(UnicodeSet set) {
843        dataBuilder.suppressContractions(set);
844    }
845
846    /** Implements CollationRuleParser.Sink. */
847    @Override
848    void optimize(UnicodeSet set) {
849        optimizeSet.addAll(set);
850    }
851
852    /**
853     * Adds the mapping and its canonical closure.
854     * Takes ce32=dataBuilder.encodeCEs(...) so that the data builder
855     * need not re-encode the CEs multiple times.
856     */
857    private int addWithClosure(CharSequence nfdPrefix, CharSequence nfdString,
858                long[] newCEs, int newCEsLength, int ce32) {
859        // Map from the NFD input to the CEs.
860        ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
861        ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32);
862        addTailComposites(nfdPrefix, nfdString);
863        return ce32;
864    }
865
866    private int addOnlyClosure(CharSequence nfdPrefix, CharSequence nfdString,
867                long[] newCEs, int newCEsLength, int ce32) {
868        // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.)
869        // TODO: make CanonicalIterator work with CharSequence, or maybe change arguments here to String
870        if(nfdPrefix.length() == 0) {
871            CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
872            String prefix = "";
873            for(;;) {
874                String str = stringIter.next();
875                if(str == null) { break; }
876                if(ignoreString(str) || str.contentEquals(nfdString)) { continue; }
877                ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
878            }
879        } else {
880            CanonicalIterator prefixIter = new CanonicalIterator(nfdPrefix.toString());
881            CanonicalIterator stringIter = new CanonicalIterator(nfdString.toString());
882            for(;;) {
883                String prefix = prefixIter.next();
884                if(prefix == null) { break; }
885                if(ignorePrefix(prefix)) { continue; }
886                boolean samePrefix = prefix.contentEquals(nfdPrefix);
887                for(;;) {
888                    String str = stringIter.next();
889                    if(str == null) { break; }
890                    if(ignoreString(str) || (samePrefix && str.contentEquals(nfdString))) { continue; }
891                    ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32);
892                }
893                stringIter.reset();
894            }
895        }
896        return ce32;
897    }
898
899    private void addTailComposites(CharSequence nfdPrefix, CharSequence nfdString) {
900        // Look for the last starter in the NFD string.
901        int lastStarter;
902        int indexAfterLastStarter = nfdString.length();
903        for(;;) {
904            if(indexAfterLastStarter == 0) { return; }  // no starter at all
905            lastStarter = Character.codePointBefore(nfdString, indexAfterLastStarter);
906            if(nfd.getCombiningClass(lastStarter) == 0) { break; }
907            indexAfterLastStarter -= Character.charCount(lastStarter);
908        }
909        // No closure to Hangul syllables since we decompose them on the fly.
910        if(Hangul.isJamoL(lastStarter)) { return; }
911
912        // Are there any composites whose decomposition starts with the lastStarter?
913        // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters.
914        // We might find some more equivalent mappings here if it did.
915        UnicodeSet composites = new UnicodeSet();
916        if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; }
917
918        StringBuilder newNFDString = new StringBuilder(), newString = new StringBuilder();
919        long[] newCEs = new long[Collation.MAX_EXPANSION_LENGTH];
920        UnicodeSetIterator iter = new UnicodeSetIterator(composites);
921        while(iter.next()) {
922            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
923            int composite = iter.codepoint;
924            String decomp = nfd.getDecomposition(composite);
925            if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp,
926                    newNFDString, newString)) {
927                continue;
928            }
929            int newCEsLength = dataBuilder.getCEs(nfdPrefix, newNFDString, newCEs, 0);
930            if(newCEsLength > Collation.MAX_EXPANSION_LENGTH) {
931                // Ignore mappings that we cannot store.
932                continue;
933            }
934            // Note: It is possible that the newCEs do not make use of the mapping
935            // for which we are adding the tail composites, in which case we might be adding
936            // unnecessary mappings.
937            // For example, when we add tail composites for ae^ (^=combining circumflex),
938            // UCA discontiguous-contraction matching does not find any matches
939            // for ae_^ (_=any combining diacritic below) *unless* there is also
940            // a contraction mapping for ae.
941            // Thus, if there is no ae contraction, then the ae^ mapping is ignored
942            // while fetching the newCEs for ae_^.
943            // TODO: Try to detect this effectively.
944            // (Alternatively, print a warning when prefix contractions are missing.)
945
946            // We do not need an explicit mapping for the NFD strings.
947            // It is fine if the NFD input collates like this via a sequence of mappings.
948            // It also saves a little bit of space, and may reduce the set of characters with contractions.
949            int ce32 = addIfDifferent(nfdPrefix, newString,
950                                          newCEs, newCEsLength, Collation.UNASSIGNED_CE32);
951            if(ce32 != Collation.UNASSIGNED_CE32) {
952                // was different, was added
953                addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32);
954            }
955        }
956    }
957
958    private boolean mergeCompositeIntoString(CharSequence nfdString, int indexAfterLastStarter,
959                int composite, CharSequence decomp,
960                StringBuilder newNFDString, StringBuilder newString) {
961        assert(Character.codePointBefore(nfdString, indexAfterLastStarter) ==
962                Character.codePointAt(decomp, 0));
963        int lastStarterLength = Character.offsetByCodePoints(decomp, 0, 1);
964        if(lastStarterLength == decomp.length()) {
965            // Singleton decompositions should be found by addWithClosure()
966            // and the CanonicalIterator, so we can ignore them here.
967            return false;
968        }
969        if(equalSubSequences(nfdString, indexAfterLastStarter, decomp, lastStarterLength)) {
970            // same strings, nothing new to be found here
971            return false;
972        }
973
974        // Make new FCD strings that combine a composite, or its decomposition,
975        // into the nfdString's last starter and the combining marks following it.
976        // Make an NFD version, and a version with the composite.
977        newNFDString.setLength(0);
978        newNFDString.append(nfdString, 0, indexAfterLastStarter);
979        newString.setLength(0);
980        newString.append(nfdString, 0, indexAfterLastStarter - lastStarterLength)
981            .appendCodePoint(composite);
982
983        // The following is related to discontiguous contraction matching,
984        // but builds only FCD strings (or else returns false).
985        int sourceIndex = indexAfterLastStarter;
986        int decompIndex = lastStarterLength;
987        // Small optimization: We keep the source character across loop iterations
988        // because we do not always consume it,
989        // and then need not fetch it again nor look up its combining class again.
990        int sourceChar = Collation.SENTINEL_CP;
991        // The cc variables need to be declared before the loop so that at the end
992        // they are set to the last combining classes seen.
993        int sourceCC = 0;
994        int decompCC = 0;
995        for(;;) {
996            if(sourceChar < 0) {
997                if(sourceIndex >= nfdString.length()) { break; }
998                sourceChar = Character.codePointAt(nfdString, sourceIndex);
999                sourceCC = nfd.getCombiningClass(sourceChar);
1000                assert(sourceCC != 0);
1001            }
1002            // We consume a decomposition character in each iteration.
1003            if(decompIndex >= decomp.length()) { break; }
1004            int decompChar = Character.codePointAt(decomp, decompIndex);
1005            decompCC = nfd.getCombiningClass(decompChar);
1006            // Compare the two characters and their combining classes.
1007            if(decompCC == 0) {
1008                // Unable to merge because the source contains a non-zero combining mark
1009                // but the composite's decomposition contains another starter.
1010                // The strings would not be equivalent.
1011                return false;
1012            } else if(sourceCC < decompCC) {
1013                // Composite + sourceChar would not be FCD.
1014                return false;
1015            } else if(decompCC < sourceCC) {
1016                newNFDString.appendCodePoint(decompChar);
1017                decompIndex += Character.charCount(decompChar);
1018            } else if(decompChar != sourceChar) {
1019                // Blocked because same combining class.
1020                return false;
1021            } else {  // match: decompChar == sourceChar
1022                newNFDString.appendCodePoint(decompChar);
1023                decompIndex += Character.charCount(decompChar);
1024                sourceIndex += Character.charCount(decompChar);
1025                sourceChar = Collation.SENTINEL_CP;
1026            }
1027        }
1028        // We are at the end of at least one of the two inputs.
1029        if(sourceChar >= 0) {  // more characters from nfdString but not from decomp
1030            if(sourceCC < decompCC) {
1031                // Appending the next source character to the composite would not be FCD.
1032                return false;
1033            }
1034            newNFDString.append(nfdString, sourceIndex, nfdString.length());
1035            newString.append(nfdString, sourceIndex, nfdString.length());
1036        } else if(decompIndex < decomp.length()) {  // more characters from decomp, not from nfdString
1037            newNFDString.append(decomp, decompIndex, decomp.length());
1038        }
1039        assert(nfd.isNormalized(newNFDString));
1040        assert(fcd.isNormalized(newString));
1041        assert(nfd.normalize(newString).equals(newNFDString.toString()));  // canonically equivalent
1042        return true;
1043    }
1044
1045    private boolean equalSubSequences(CharSequence left, int leftStart, CharSequence right, int rightStart) {
1046        // C++ UnicodeString::compare(leftStart, 0x7fffffff, right, rightStart, 0x7fffffff) == 0
1047        int leftLength = left.length();
1048        if((leftLength - leftStart) != (right.length() - rightStart)) { return false; }
1049        while(leftStart < leftLength) {
1050            if(left.charAt(leftStart++) != right.charAt(rightStart++)) {
1051                return false;
1052            }
1053        }
1054        return true;
1055    }
1056    private boolean ignorePrefix(CharSequence s) {
1057        // Do not map non-FCD prefixes.
1058        return !isFCD(s);
1059    }
1060    private boolean ignoreString(CharSequence s) {
1061        // Do not map non-FCD strings.
1062        // Do not map strings that start with Hangul syllables: We decompose those on the fly.
1063        return !isFCD(s) || Hangul.isHangul(s.charAt(0));
1064    }
1065    private boolean isFCD(CharSequence s) {
1066        return fcd.isNormalized(s);
1067    }
1068
1069    private static final UnicodeSet COMPOSITES = new UnicodeSet("[:NFD_QC=N:]");
1070    static {
1071        // Hangul is decomposed on the fly during collation.
1072        COMPOSITES.remove(Hangul.HANGUL_BASE, Hangul.HANGUL_END);
1073    }
1074
1075    private void closeOverComposites() {
1076        String prefix = "";  // empty
1077        UnicodeSetIterator iter = new UnicodeSetIterator(COMPOSITES);
1078        while(iter.next()) {
1079            assert(iter.codepoint != UnicodeSetIterator.IS_STRING);
1080            String nfdString = nfd.getDecomposition(iter.codepoint);
1081            cesLength = dataBuilder.getCEs(nfdString, ces, 0);
1082            if(cesLength > Collation.MAX_EXPANSION_LENGTH) {
1083                // Too many CEs from the decomposition (unusual), ignore this composite.
1084                // We could add a capacity parameter to getCEs() and reallocate if necessary.
1085                // However, this can only really happen in contrived cases.
1086                continue;
1087            }
1088            String composite = iter.getString();
1089            addIfDifferent(prefix, composite, ces, cesLength, Collation.UNASSIGNED_CE32);
1090        }
1091    }
1092
1093    private int addIfDifferent(CharSequence prefix, CharSequence str,
1094                long[] newCEs, int newCEsLength, int ce32) {
1095        long[] oldCEs = new long[Collation.MAX_EXPANSION_LENGTH];
1096        int oldCEsLength = dataBuilder.getCEs(prefix, str, oldCEs, 0);
1097        if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) {
1098            if(ce32 == Collation.UNASSIGNED_CE32) {
1099                ce32 = dataBuilder.encodeCEs(newCEs, newCEsLength);
1100            }
1101            dataBuilder.addCE32(prefix, str, ce32);
1102        }
1103        return ce32;
1104    }
1105
1106    private static boolean sameCEs(long[] ces1, int ces1Length,
1107                long[] ces2, int ces2Length) {
1108        if(ces1Length != ces2Length) {
1109            return false;
1110        }
1111        assert(ces1Length <= Collation.MAX_EXPANSION_LENGTH);
1112        for(int i = 0; i < ces1Length; ++i) {
1113            if(ces1[i] != ces2[i]) { return false; }
1114        }
1115        return true;
1116    }
1117
1118    private static final int alignWeightRight(int w) {
1119        if(w != 0) {
1120            while((w & 0xff) == 0) { w >>>= 8; }
1121        }
1122        return w;
1123    }
1124
1125    /**
1126     * Walks the tailoring graph and overwrites tailored nodes with new CEs.
1127     * After this, the graph is destroyed.
1128     * The nodes array can then be used only as a source of tailored CEs.
1129     */
1130    private void makeTailoredCEs() {
1131        CollationWeights primaries = new CollationWeights();
1132        CollationWeights secondaries = new CollationWeights();
1133        CollationWeights tertiaries = new CollationWeights();
1134        long[] nodesArray = nodes.getBuffer();
1135        if(DEBUG) {
1136            System.out.println("\nCollationBuilder.makeTailoredCEs()");
1137        }
1138
1139        for(int rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) {
1140            int i = rootPrimaryIndexes.elementAti(rpi);
1141            long node = nodesArray[i];
1142            long p = weight32FromNode(node);
1143            int s = p == 0 ? 0 : Collation.COMMON_WEIGHT16;
1144            int t = s;
1145            int q = 0;
1146            boolean pIsTailored = false;
1147            boolean sIsTailored = false;
1148            boolean tIsTailored = false;
1149            if(DEBUG) {
1150                System.out.printf("\nprimary     %x\n", alignWeightRight((int)p));
1151            }
1152            int pIndex = p == 0 ? 0 : rootElements.findPrimary(p);
1153            int nextIndex = nextIndexFromNode(node);
1154            while(nextIndex != 0) {
1155                i = nextIndex;
1156                node = nodesArray[i];
1157                nextIndex = nextIndexFromNode(node);
1158                int strength = strengthFromNode(node);
1159                if(strength == Collator.QUATERNARY) {
1160                    assert(isTailoredNode(node));
1161                    if(DEBUG) {
1162                        System.out.print("      quat+     ");
1163                    }
1164                    if(q == 3) {
1165                        // C++ U_BUFFER_OVERFLOW_ERROR
1166                        throw new UnsupportedOperationException("quaternary tailoring gap too small");
1167                    }
1168                    ++q;
1169                } else {
1170                    if(strength == Collator.TERTIARY) {
1171                        if(isTailoredNode(node)) {
1172                            if(DEBUG) {
1173                                System.out.print("    ter+        ");
1174                            }
1175                            if(!tIsTailored) {
1176                                // First tailored tertiary node for [p, s].
1177                                int tCount = countTailoredNodes(nodesArray, nextIndex,
1178                                                                    Collator.TERTIARY) + 1;
1179                                int tLimit;
1180                                if(t == 0) {
1181                                    // Gap at the beginning of the tertiary CE range.
1182                                    t = rootElements.getTertiaryBoundary() - 0x100;
1183                                    tLimit = (int)rootElements.getFirstTertiaryCE() & Collation.ONLY_TERTIARY_MASK;
1184                                } else if(!pIsTailored && !sIsTailored) {
1185                                    // p and s are root weights.
1186                                    tLimit = rootElements.getTertiaryAfter(pIndex, s, t);
1187                                } else if(t == Collation.BEFORE_WEIGHT16) {
1188                                    tLimit = Collation.COMMON_WEIGHT16;
1189                                } else {
1190                                    // [p, s] is tailored.
1191                                    assert(t == Collation.COMMON_WEIGHT16);
1192                                    tLimit = rootElements.getTertiaryBoundary();
1193                                }
1194                                assert(tLimit == 0x4000 || (tLimit & ~Collation.ONLY_TERTIARY_MASK) == 0);
1195                                tertiaries.initForTertiary();
1196                                if(!tertiaries.allocWeights(t, tLimit, tCount)) {
1197                                    // C++ U_BUFFER_OVERFLOW_ERROR
1198                                    throw new UnsupportedOperationException("tertiary tailoring gap too small");
1199                                }
1200                                tIsTailored = true;
1201                            }
1202                            t = (int)tertiaries.nextWeight();
1203                            assert(t != 0xffffffff);
1204                        } else {
1205                            t = weight16FromNode(node);
1206                            tIsTailored = false;
1207                            if(DEBUG) {
1208                                System.out.printf("    ter     %x\n", alignWeightRight(t));
1209                            }
1210                        }
1211                    } else {
1212                        if(strength == Collator.SECONDARY) {
1213                            if(isTailoredNode(node)) {
1214                                if(DEBUG) {
1215                                    System.out.print("  sec+          ");
1216                                }
1217                                if(!sIsTailored) {
1218                                    // First tailored secondary node for p.
1219                                    int sCount = countTailoredNodes(nodesArray, nextIndex,
1220                                                                        Collator.SECONDARY) + 1;
1221                                    int sLimit;
1222                                    if(s == 0) {
1223                                        // Gap at the beginning of the secondary CE range.
1224                                        s = rootElements.getSecondaryBoundary() - 0x100;
1225                                        sLimit = (int)(rootElements.getFirstSecondaryCE() >> 16);
1226                                    } else if(!pIsTailored) {
1227                                        // p is a root primary.
1228                                        sLimit = rootElements.getSecondaryAfter(pIndex, s);
1229                                    } else if(s == Collation.BEFORE_WEIGHT16) {
1230                                        sLimit = Collation.COMMON_WEIGHT16;
1231                                    } else {
1232                                        // p is a tailored primary.
1233                                        assert(s == Collation.COMMON_WEIGHT16);
1234                                        sLimit = rootElements.getSecondaryBoundary();
1235                                    }
1236                                    if(s == Collation.COMMON_WEIGHT16) {
1237                                        // Do not tailor into the getSortKey() range of
1238                                        // compressed common secondaries.
1239                                        s = rootElements.getLastCommonSecondary();
1240                                    }
1241                                    secondaries.initForSecondary();
1242                                    if(!secondaries.allocWeights(s, sLimit, sCount)) {
1243                                        // C++ U_BUFFER_OVERFLOW_ERROR
1244                                        if(DEBUG) {
1245                                            System.out.printf("!secondaries.allocWeights(%x, %x, sCount=%d)\n",
1246                                                    alignWeightRight(s), alignWeightRight(sLimit),
1247                                                    alignWeightRight(sCount));
1248                                        }
1249                                        throw new UnsupportedOperationException("secondary tailoring gap too small");
1250                                    }
1251                                    sIsTailored = true;
1252                                }
1253                                s = (int)secondaries.nextWeight();
1254                                assert(s != 0xffffffff);
1255                            } else {
1256                                s = weight16FromNode(node);
1257                                sIsTailored = false;
1258                                if(DEBUG) {
1259                                    System.out.printf("  sec       %x\n", alignWeightRight(s));
1260                                }
1261                            }
1262                        } else /* Collator.PRIMARY */ {
1263                            assert(isTailoredNode(node));
1264                            if(DEBUG) {
1265                                System.out.print("pri+            ");
1266                            }
1267                            if(!pIsTailored) {
1268                                // First tailored primary node in this list.
1269                                int pCount = countTailoredNodes(nodesArray, nextIndex,
1270                                                                    Collator.PRIMARY) + 1;
1271                                boolean isCompressible = baseData.isCompressiblePrimary(p);
1272                                long pLimit =
1273                                    rootElements.getPrimaryAfter(p, pIndex, isCompressible);
1274                                primaries.initForPrimary(isCompressible);
1275                                if(!primaries.allocWeights(p, pLimit, pCount)) {
1276                                    // C++ U_BUFFER_OVERFLOW_ERROR  // TODO: introduce a more specific UErrorCode?
1277                                    throw new UnsupportedOperationException("primary tailoring gap too small");
1278                                }
1279                                pIsTailored = true;
1280                            }
1281                            p = primaries.nextWeight();
1282                            assert(p != 0xffffffffL);
1283                            s = Collation.COMMON_WEIGHT16;
1284                            sIsTailored = false;
1285                        }
1286                        t = s == 0 ? 0 : Collation.COMMON_WEIGHT16;
1287                        tIsTailored = false;
1288                    }
1289                    q = 0;
1290                }
1291                if(isTailoredNode(node)) {
1292                    nodesArray[i] = Collation.makeCE(p, s, t, q);
1293                    if(DEBUG) {
1294                        System.out.printf("%016x\n", nodesArray[i]);
1295                    }
1296                }
1297            }
1298        }
1299    }
1300
1301    /**
1302     * Counts the tailored nodes of the given strength up to the next node
1303     * which is either stronger or has an explicit weight of this strength.
1304     */
1305    private static int countTailoredNodes(long[] nodesArray, int i, int strength) {
1306        int count = 0;
1307        for(;;) {
1308            if(i == 0) { break; }
1309            long node = nodesArray[i];
1310            if(strengthFromNode(node) < strength) { break; }
1311            if(strengthFromNode(node) == strength) {
1312                if(isTailoredNode(node)) {
1313                    ++count;
1314                } else {
1315                    break;
1316                }
1317            }
1318            i = nextIndexFromNode(node);
1319        }
1320        return count;
1321    }
1322
1323    private static final class CEFinalizer implements CollationDataBuilder.CEModifier {
1324        CEFinalizer(long[] ces) {
1325            finalCEs = ces;
1326        }
1327        public long modifyCE32(int ce32) {
1328            assert(!Collation.isSpecialCE32(ce32));
1329            if(CollationBuilder.isTempCE32(ce32)) {
1330                // retain case bits
1331                return finalCEs[CollationBuilder.indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8);
1332            } else {
1333                return Collation.NO_CE;
1334            }
1335        }
1336        public long modifyCE(long ce) {
1337            if(CollationBuilder.isTempCE(ce)) {
1338                // retain case bits
1339                return finalCEs[CollationBuilder.indexFromTempCE(ce)] | (ce & 0xc000);
1340            } else {
1341                return Collation.NO_CE;
1342            }
1343        }
1344
1345        private long[] finalCEs;
1346    };
1347
1348    /** Replaces temporary CEs with the final CEs they point to. */
1349    private void finalizeCEs() {
1350        CollationDataBuilder newBuilder = new CollationDataBuilder();
1351        newBuilder.initForTailoring(baseData);
1352        CEFinalizer finalizer = new CEFinalizer(nodes.getBuffer());
1353        newBuilder.copyFrom(dataBuilder, finalizer);
1354        dataBuilder = newBuilder;
1355    }
1356
1357    /**
1358     * Encodes "temporary CE" data into a CE that fits into the CE32 data structure,
1359     * with 2-byte primary, 1-byte secondary and 6-bit tertiary,
1360     * with valid CE byte values.
1361     *
1362     * The index must not exceed 20 bits (0xfffff).
1363     * The strength must fit into 2 bits (Collator.PRIMARY..Collator.QUATERNARY).
1364     *
1365     * Temporary CEs are distinguished from real CEs by their use of
1366     * secondary weights 06..45 which are otherwise reserved for compressed sort keys.
1367     *
1368     * The case bits are unused and available.
1369     */
1370    private static long tempCEFromIndexAndStrength(int index, int strength) {
1371        return
1372            // CE byte offsets, to ensure valid CE bytes, and case bits 11
1373            0x4040000006002000L +
1374            // index bits 19..13 -> primary byte 1 = CE bits 63..56 (byte values 40..BF)
1375            ((long)(index & 0xfe000) << 43) +
1376            // index bits 12..6 -> primary byte 2 = CE bits 55..48 (byte values 40..BF)
1377            ((long)(index & 0x1fc0) << 42) +
1378            // index bits 5..0 -> secondary byte 1 = CE bits 31..24 (byte values 06..45)
1379            ((index & 0x3f) << 24) +
1380            // strength bits 1..0 -> tertiary byte 1 = CE bits 13..8 (byte values 20..23)
1381            (strength << 8);
1382    }
1383    private static int indexFromTempCE(long tempCE) {
1384        tempCE -= 0x4040000006002000L;
1385        return
1386            ((int)(tempCE >> 43) & 0xfe000) |
1387            ((int)(tempCE >> 42) & 0x1fc0) |
1388            ((int)(tempCE >> 24) & 0x3f);
1389    }
1390    private static int strengthFromTempCE(long tempCE) {
1391        return ((int)tempCE >> 8) & 3;
1392    }
1393    private static boolean isTempCE(long ce) {
1394        int sec = (int)ce >>> 24;
1395        return 6 <= sec && sec <= 0x45;
1396    }
1397
1398    private static int indexFromTempCE32(int tempCE32) {
1399        tempCE32 -= 0x40400620;
1400        return
1401            ((tempCE32 >> 11) & 0xfe000) |
1402            ((tempCE32 >> 10) & 0x1fc0) |
1403            ((tempCE32 >> 8) & 0x3f);
1404    }
1405    private static boolean isTempCE32(int ce32) {
1406        return
1407            (ce32 & 0xff) >= 2 &&  // not a long-primary/long-secondary CE32
1408            6 <= ((ce32 >> 8) & 0xff) && ((ce32 >> 8) & 0xff) <= 0x45;
1409    }
1410
1411    private static int ceStrength(long ce) {
1412        return
1413            isTempCE(ce) ? strengthFromTempCE(ce) :
1414            (ce & 0xff00000000000000L) != 0 ? Collator.PRIMARY :
1415            ((int)ce & 0xff000000) != 0 ? Collator.SECONDARY :
1416            ce != 0 ? Collator.TERTIARY :
1417            Collator.IDENTICAL;
1418    }
1419
1420    /** At most 1M nodes, limited by the 20 bits in node bit fields. */
1421    private static final int MAX_INDEX = 0xfffff;
1422    /**
1423     * Node bit 6 is set on a primary node if there are nodes
1424     * with secondary values below the common secondary weight (05).
1425     */
1426    private static final int HAS_BEFORE2 = 0x40;
1427    /**
1428     * Node bit 5 is set on a primary or secondary node if there are nodes
1429     * with tertiary values below the common tertiary weight (05).
1430     */
1431    private static final int HAS_BEFORE3 = 0x20;
1432    /**
1433     * Node bit 3 distinguishes a tailored node, which has no weight value,
1434     * from a node with an explicit (root or default) weight.
1435     */
1436    private static final int IS_TAILORED = 8;
1437
1438    private static long nodeFromWeight32(long weight32) {
1439        return weight32 << 32;
1440    }
1441    private static long nodeFromWeight16(int weight16) {
1442        return (long)weight16 << 48;
1443    }
1444    private static long nodeFromPreviousIndex(int previous) {
1445        return (long)previous << 28;
1446    }
1447    private static long nodeFromNextIndex(int next) {
1448        return next << 8;
1449    }
1450    private static long nodeFromStrength(int strength) {
1451        return strength;
1452    }
1453
1454    private static long weight32FromNode(long node) {
1455        return node >>> 32;
1456    }
1457    private static int weight16FromNode(long node) {
1458        return (int)(node >> 48) & 0xffff;
1459    }
1460    private static int previousIndexFromNode(long node) {
1461        return (int)(node >> 28) & MAX_INDEX;
1462    }
1463    private static int nextIndexFromNode(long node) {
1464        return ((int)node >> 8) & MAX_INDEX;
1465    }
1466    private static int strengthFromNode(long node) {
1467        return (int)node & 3;
1468    }
1469
1470    private static boolean nodeHasBefore2(long node) {
1471        return (node & HAS_BEFORE2) != 0;
1472    }
1473    private static boolean nodeHasBefore3(long node) {
1474        return (node & HAS_BEFORE3) != 0;
1475    }
1476    private static boolean nodeHasAnyBefore(long node) {
1477        return (node & (HAS_BEFORE2 | HAS_BEFORE3)) != 0;
1478    }
1479    private static boolean isTailoredNode(long node) {
1480        return (node & IS_TAILORED) != 0;
1481    }
1482
1483    private static long changeNodePreviousIndex(long node, int previous) {
1484        return (node & 0xffff00000fffffffL) | nodeFromPreviousIndex(previous);
1485    }
1486    private static long changeNodeNextIndex(long node, int next) {
1487        return (node & 0xfffffffff00000ffL) | nodeFromNextIndex(next);
1488    }
1489
1490    private Normalizer2 nfd, fcd;
1491    private Normalizer2Impl nfcImpl;
1492
1493    private CollationTailoring base;
1494    private CollationData baseData;
1495    private CollationRootElements rootElements;
1496    private long variableTop;
1497
1498    private CollationDataBuilder dataBuilder;
1499    private boolean fastLatinEnabled;
1500    private UnicodeSet optimizeSet = new UnicodeSet();
1501
1502    private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
1503    private int cesLength;
1504
1505    /**
1506     * Indexes of nodes with root primary weights, sorted by primary.
1507     * Compact form of a TreeMap from root primary to node index.
1508     *
1509     * This is a performance optimization for finding reset positions.
1510     * Without this, we would have to search through the entire nodes list.
1511     * It also allows storing root primary weights in list head nodes,
1512     * without previous index, leaving room in root primary nodes for 32-bit primary weights.
1513     */
1514    private UVector32 rootPrimaryIndexes;
1515    /**
1516     * Data structure for assigning tailored weights and CEs.
1517     * Doubly-linked lists of nodes in mostly collation order.
1518     * Each list starts with a root primary node and ends with a nextIndex of 0.
1519     *
1520     * When there are any nodes in the list, then there is always a root primary node at index 0.
1521     * This allows some code not to have to check explicitly for nextIndex==0.
1522     *
1523     * Root primary nodes have 32-bit weights but do not have previous indexes.
1524     * All other nodes have at most 16-bit weights and do have previous indexes.
1525     *
1526     * Nodes with explicit weights store root collator weights,
1527     * or default weak weights (e.g., secondary 05) for stronger nodes.
1528     * "Tailored" nodes, with the IS_TAILORED bit set,
1529     * do not store explicit weights but rather
1530     * create a difference of a certain strength from the preceding node.
1531     *
1532     * A root node is followed by either
1533     * - a root/default node of the same strength, or
1534     * - a root/default node of the next-weaker strength, or
1535     * - a tailored node of the same strength.
1536     *
1537     * A node of a given strength normally implies "common" weights on weaker levels.
1538     *
1539     * A node with HAS_BEFORE2 must be immediately followed by
1540     * a secondary node with an explicit below-common weight, then a secondary tailored node,
1541     * and later an explicit common-secondary node.
1542     * The below-common weight can be a root weight,
1543     * or it can be BEFORE_WEIGHT16 for tailoring before an implied common weight
1544     * or before the lowest root weight.
1545     * (&[before 2] resets to an explicit secondary node so that
1546     * the following addRelation(secondary) tailors right after that.
1547     * If we did not have this node and instead were to reset on the primary node,
1548     * then addRelation(secondary) would skip forward to the the COMMON_WEIGHT16 node.)
1549     *
1550     * If the flag is not set, then there are no explicit secondary nodes
1551     * with the common or lower weights.
1552     *
1553     * Same for HAS_BEFORE3 for tertiary nodes and weights.
1554     * A node must not have both flags set.
1555     *
1556     * Tailored CEs are initially represented in a CollationDataBuilder as temporary CEs
1557     * which point to stable indexes in this list,
1558     * and temporary CEs stored in a CollationDataBuilder only point to tailored nodes.
1559     *
1560     * A temporary CE in the ces[] array may point to a non-tailored reset-before-position node,
1561     * until the next relation is added.
1562     *
1563     * At the end, the tailored weights are allocated as necessary,
1564     * then the tailored nodes are replaced with final CEs,
1565     * and the CollationData is rewritten by replacing temporary CEs with final ones.
1566     *
1567     * We cannot simply insert new nodes in the middle of the array
1568     * because that would invalidate the indexes stored in existing temporary CEs.
1569     * We need to use a linked graph with stable indexes to existing nodes.
1570     * A doubly-linked list seems easiest to maintain.
1571     *
1572     * Each node is stored as an long, with its fields stored as bit fields.
1573     *
1574     * Root primary node:
1575     * - primary weight: 32 bits 63..32
1576     * - reserved/unused/zero: 4 bits 31..28
1577     *
1578     * Weaker root nodes & tailored nodes:
1579     * - a weight: 16 bits 63..48
1580     *   + a root or default weight for a non-tailored node
1581     *   + unused/zero for a tailored node
1582     * - index to the previous node: 20 bits 47..28
1583     *
1584     * All types of nodes:
1585     * - index to the next node: 20 bits 27..8
1586     *   + nextIndex=0 in last node per root-primary list
1587     * - reserved/unused/zero bits: bits 7, 4, 2
1588     * - HAS_BEFORE2: bit 6
1589     * - HAS_BEFORE3: bit 5
1590     * - IS_TAILORED: bit 3
1591     * - the difference strength (primary/secondary/tertiary/quaternary): 2 bits 1..0
1592     *
1593     * We could allocate structs with pointers, but we would have to store them
1594     * in a pointer list so that they can be indexed from temporary CEs,
1595     * and they would require more memory allocations.
1596     */
1597    private UVector64 nodes;
1598}
1599