1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4*******************************************************************************
5* Copyright (C) 2013-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* CollationFastLatinBuilder.java, ported from collationfastlatinbuilder.h/.cpp
9*
10* C++ version created on: 2013aug09
11* created by: Markus W. Scherer
12*/
13
14package com.ibm.icu.impl.coll;
15
16import com.ibm.icu.lang.UScript;
17import com.ibm.icu.text.Collator;
18import com.ibm.icu.util.CharsTrie;
19
20final class CollationFastLatinBuilder {
21    // #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0  // 0 or 1 or 2
22
23    /**
24     * Compare two signed long values as if they were unsigned.
25     */
26    private static final int compareInt64AsUnsigned(long a, long b) {
27        a += 0x8000000000000000L;
28        b += 0x8000000000000000L;
29        if(a < b) {
30            return -1;
31        } else if(a > b) {
32            return 1;
33        } else {
34            return 0;
35        }
36    }
37
38    /**
39     * Like Java Collections.binarySearch(List, String, Comparator).
40     *
41     * @return the index>=0 where the item was found,
42     *         or the index<0 for inserting the string at ~index in sorted order
43     */
44    private static final int binarySearch(long[] list, int limit, long ce) {
45        if (limit == 0) { return ~0; }
46        int start = 0;
47        for (;;) {
48            int i = (int)(((long)start + (long)limit) / 2);
49            int cmp = compareInt64AsUnsigned(ce, list[i]);
50            if (cmp == 0) {
51                return i;
52            } else if (cmp < 0) {
53                if (i == start) {
54                    return ~start;  // insert ce before i
55                }
56                limit = i;
57            } else {
58                if (i == start) {
59                    return ~(start + 1);  // insert ce after i
60                }
61                start = i;
62            }
63        }
64    }
65
66    CollationFastLatinBuilder() {
67        ce0 = 0;
68        ce1 = 0;
69        contractionCEs = new UVector64();
70        uniqueCEs = new UVector64();
71        miniCEs = null;
72        firstDigitPrimary = 0;
73        firstLatinPrimary = 0;
74        lastLatinPrimary = 0;
75        firstShortPrimary = 0;
76        shortPrimaryOverflow = false;
77        headerLength = 0;
78    }
79
80    boolean forData(CollationData data) {
81        if(result.length() != 0) {  // This builder is not reusable.
82            throw new IllegalStateException("attempt to reuse a CollationFastLatinBuilder");
83        }
84        if(!loadGroups(data)) { return false; }
85
86        // Fast handling of digits.
87        firstShortPrimary = firstDigitPrimary;
88        getCEs(data);
89        encodeUniqueCEs();
90        if(shortPrimaryOverflow) {
91            // Give digits long mini primaries,
92            // so that there are more short primaries for letters.
93            firstShortPrimary = firstLatinPrimary;
94            resetCEs();
95            getCEs(data);
96            encodeUniqueCEs();
97        }
98        // Note: If we still have a short-primary overflow but not a long-primary overflow,
99        // then we could calculate how many more long primaries would fit,
100        // and set the firstShortPrimary to that many after the current firstShortPrimary,
101        // and try again.
102        // However, this might only benefit the en_US_POSIX tailoring,
103        // and it is simpler to suppress building fast Latin data for it in genrb,
104        // or by returning false here if shortPrimaryOverflow.
105
106        boolean ok = !shortPrimaryOverflow;
107        if(ok) {
108            encodeCharCEs();
109            encodeContractions();
110        }
111        contractionCEs.removeAllElements();  // might reduce heap memory usage
112        uniqueCEs.removeAllElements();
113        return ok;
114    }
115
116    // C++ returns one combined array with the contents of the result buffer.
117    // Java returns two arrays (header & table) because we cannot use pointer arithmetic,
118    // and we do not want to index into the table with an offset.
119    char[] getHeader() {
120        char[] resultArray = new char[headerLength];
121        result.getChars(0, headerLength, resultArray, 0);
122        return resultArray;
123    }
124
125    char[] getTable() {
126        char[] resultArray = new char[result.length() - headerLength];
127        result.getChars(headerLength, result.length(), resultArray, 0);
128        return resultArray;
129    }
130
131    private boolean loadGroups(CollationData data) {
132        headerLength = 1 + NUM_SPECIAL_GROUPS;
133        int r0 = (CollationFastLatin.VERSION << 8) | headerLength;
134        result.append((char)r0);
135        // The first few reordering groups should be special groups
136        // (space, punct, ..., digit) followed by Latn, then Grek and other scripts.
137        for(int i = 0; i < NUM_SPECIAL_GROUPS; ++i) {
138            lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + i);
139            if(lastSpecialPrimaries[i] == 0) {
140                // missing data
141                return false;
142            }
143            result.append(0);  // reserve a slot for this group
144        }
145
146        firstDigitPrimary = data.getFirstPrimaryForGroup(Collator.ReorderCodes.DIGIT);
147        firstLatinPrimary = data.getFirstPrimaryForGroup(UScript.LATIN);
148        lastLatinPrimary = data.getLastPrimaryForGroup(UScript.LATIN);
149        if(firstDigitPrimary == 0 || firstLatinPrimary == 0) {
150            // missing data
151            return false;
152        }
153        return true;
154    }
155
156    private boolean inSameGroup(long p, long q) {
157        // Both or neither need to be encoded as short primaries,
158        // so that we can test only one and use the same bit mask.
159        if(p >= firstShortPrimary) {
160            return q >= firstShortPrimary;
161        } else if(q >= firstShortPrimary) {
162            return false;
163        }
164        // Both or neither must be potentially-variable,
165        // so that we can test only one and determine if both are variable.
166        long lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1];
167        if(p > lastVariablePrimary) {
168            return q > lastVariablePrimary;
169        } else if(q > lastVariablePrimary) {
170            return false;
171        }
172        // Both will be encoded with long mini primaries.
173        // They must be in the same special reordering group,
174        // so that we can test only one and determine if both are variable.
175        assert(p != 0 && q != 0);
176        for(int i = 0;; ++i) {  // will terminate
177            long lastPrimary = lastSpecialPrimaries[i];
178            if(p <= lastPrimary) {
179                return q <= lastPrimary;
180            } else if(q <= lastPrimary) {
181                return false;
182            }
183        }
184    }
185
186    private void resetCEs() {
187        contractionCEs.removeAllElements();
188        uniqueCEs.removeAllElements();
189        shortPrimaryOverflow = false;
190        result.setLength(headerLength);
191    }
192
193    private void getCEs(CollationData data) {
194        int i = 0;
195        for(char c = 0;; ++i, ++c) {
196            if(c == CollationFastLatin.LATIN_LIMIT) {
197                c = CollationFastLatin.PUNCT_START;
198            } else if(c == CollationFastLatin.PUNCT_LIMIT) {
199                break;
200            }
201            CollationData d;
202            int ce32 = data.getCE32(c);
203            if(ce32 == Collation.FALLBACK_CE32) {
204                d = data.base;
205                ce32 = d.getCE32(c);
206            } else {
207                d = data;
208            }
209            if(getCEsFromCE32(d, c, ce32)) {
210                charCEs[i][0] = ce0;
211                charCEs[i][1] = ce1;
212                addUniqueCE(ce0);
213                addUniqueCE(ce1);
214            } else {
215                // bail out for c
216                charCEs[i][0] = ce0 = Collation.NO_CE;
217                charCEs[i][1] = ce1 = 0;
218            }
219            if(c == 0 && !isContractionCharCE(ce0)) {
220                // Always map U+0000 to a contraction.
221                // Write a contraction list with only a default value if there is no real contraction.
222                assert(contractionCEs.isEmpty());
223                addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
224                charCEs[0][0] = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG;
225                charCEs[0][1] = 0;
226            }
227        }
228        // Terminate the last contraction list.
229        contractionCEs.addElement(CollationFastLatin.CONTR_CHAR_MASK);
230    }
231
232    private boolean getCEsFromCE32(CollationData data, int c, int ce32) {
233        ce32 = data.getFinalCE32(ce32);
234        ce1 = 0;
235        if(Collation.isSimpleOrLongCE32(ce32)) {
236            ce0 = Collation.ceFromCE32(ce32);
237        } else {
238            switch(Collation.tagFromCE32(ce32)) {
239            case Collation.LATIN_EXPANSION_TAG:
240                ce0 = Collation.latinCE0FromCE32(ce32);
241                ce1 = Collation.latinCE1FromCE32(ce32);
242                break;
243            case Collation.EXPANSION32_TAG: {
244                int index = Collation.indexFromCE32(ce32);
245                int length = Collation.lengthFromCE32(ce32);
246                if(length <= 2) {
247                    ce0 = Collation.ceFromCE32(data.ce32s[index]);
248                    if(length == 2) {
249                        ce1 = Collation.ceFromCE32(data.ce32s[index + 1]);
250                    }
251                    break;
252                } else {
253                    return false;
254                }
255            }
256            case Collation.EXPANSION_TAG: {
257                int index = Collation.indexFromCE32(ce32);
258                int length = Collation.lengthFromCE32(ce32);
259                if(length <= 2) {
260                    ce0 = data.ces[index];
261                    if(length == 2) {
262                        ce1 = data.ces[index + 1];
263                    }
264                    break;
265                } else {
266                    return false;
267                }
268            }
269            // Note: We could support PREFIX_TAG (assert c>=0)
270            // by recursing on its default CE32 and checking that none of the prefixes starts
271            // with a fast Latin character.
272            // However, currently (2013) there are only the L-before-middle-dot
273            // prefix mappings in the Latin range, and those would be rejected anyway.
274            case Collation.CONTRACTION_TAG:
275                assert(c >= 0);
276                return getCEsFromContractionCE32(data, ce32);
277            case Collation.OFFSET_TAG:
278                assert(c >= 0);
279                ce0 = data.getCEFromOffsetCE32(c, ce32);
280                break;
281            default:
282                return false;
283            }
284        }
285        // A mapping can be completely ignorable.
286        if(ce0 == 0) { return ce1 == 0; }
287        // We do not support an ignorable ce0 unless it is completely ignorable.
288        long p0 = ce0 >>> 32;
289        if(p0 == 0) { return false; }
290        // We only support primaries up to the Latin script.
291        if(p0 > lastLatinPrimary) { return false; }
292        // We support non-common secondary and case weights only together with short primaries.
293        int lower32_0 = (int)ce0;
294        if(p0 < firstShortPrimary) {
295            int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK;
296            if(sc0 != Collation.COMMON_SECONDARY_CE) { return false; }
297        }
298        // No below-common tertiary weights.
299        if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; }
300        if(ce1 != 0) {
301            // Both primaries must be in the same group,
302            // or both must get short mini primaries,
303            // or a short-primary CE is followed by a secondary CE.
304            // This is so that we can test the first primary and use the same mask for both,
305            // and determine for both whether they are variable.
306            long p1 = ce1 >>> 32;
307            if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return false; }
308            int lower32_1 = (int)ce1;
309            // No tertiary CEs.
310            if((lower32_1 >>> 16) == 0) { return false; }
311            // We support non-common secondary and case weights
312            // only for secondary CEs or together with short primaries.
313            if(p1 != 0 && p1 < firstShortPrimary) {
314                int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK;
315                if(sc1 != Collation.COMMON_SECONDARY_CE) { return false; }
316            }
317            // No below-common tertiary weights.
318            if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; }
319        }
320        // No quaternary weights.
321        if(((ce0 | ce1) & Collation.QUATERNARY_MASK) != 0) { return false; }
322        return true;
323    }
324
325    private boolean getCEsFromContractionCE32(CollationData data, int ce32) {
326        int trieIndex = Collation.indexFromCE32(ce32);
327        ce32 = data.getCE32FromContexts(trieIndex);  // Default if no suffix match.
328        // Since the original ce32 is not a prefix mapping,
329        // the default ce32 must not be another contraction.
330        assert(!Collation.isContractionCE32(ce32));
331        int contractionIndex = contractionCEs.size();
332        if(getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) {
333            addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1);
334        } else {
335            // Bail out for c-without-contraction.
336            addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NO_CE, 0);
337        }
338        // Handle an encodable contraction unless the next contraction is too long
339        // and starts with the same character.
340        int prevX = -1;
341        boolean addContraction = false;
342        CharsTrie.Iterator suffixes = CharsTrie.iterator(data.contexts, trieIndex + 2, 0);
343        while(suffixes.hasNext()) {
344            CharsTrie.Entry entry = suffixes.next();
345            CharSequence suffix = entry.chars;
346            int x = CollationFastLatin.getCharIndex(suffix.charAt(0));
347            if(x < 0) { continue; }  // ignore anything but fast Latin text
348            if(x == prevX) {
349                if(addContraction) {
350                    // Bail out for all contractions starting with this character.
351                    addContractionEntry(x, Collation.NO_CE, 0);
352                    addContraction = false;
353                }
354                continue;
355            }
356            if(addContraction) {
357                addContractionEntry(prevX, ce0, ce1);
358            }
359            ce32 = entry.value;
360            if(suffix.length() == 1 && getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) {
361                addContraction = true;
362            } else {
363                addContractionEntry(x, Collation.NO_CE, 0);
364                addContraction = false;
365            }
366            prevX = x;
367        }
368        if(addContraction) {
369            addContractionEntry(prevX, ce0, ce1);
370        }
371        // Note: There might not be any fast Latin contractions, but
372        // we need to enter contraction handling anyway so that we can bail out
373        // when there is a non-fast-Latin character following.
374        // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the
375        // following umlaut and bail out, rather than return the difference of Y vs. u.
376        ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex;
377        ce1 = 0;
378        return true;
379    }
380
381    private void addContractionEntry(int x, long cce0, long cce1) {
382        contractionCEs.addElement(x);
383        contractionCEs.addElement(cce0);
384        contractionCEs.addElement(cce1);
385        addUniqueCE(cce0);
386        addUniqueCE(cce1);
387    }
388
389    private void addUniqueCE(long ce) {
390        if(ce == 0 || (ce >>> 32) == Collation.NO_CE_PRIMARY) { return; }
391        ce &= ~(long)Collation.CASE_MASK;  // blank out case bits
392        int i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
393        if(i < 0) {
394            uniqueCEs.insertElementAt(ce, ~i);
395        }
396    }
397
398    private int getMiniCE(long ce) {
399        ce &= ~(long)Collation.CASE_MASK;  // blank out case bits
400        int index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce);
401        assert(index >= 0);
402        return miniCEs[index];
403    }
404
405    private void encodeUniqueCEs() {
406        miniCEs = new char[uniqueCEs.size()];
407        int group = 0;
408        long lastGroupPrimary = lastSpecialPrimaries[group];
409        // The lowest unique CE must be at least a secondary CE.
410        assert(((int)uniqueCEs.elementAti(0) >>> 16) != 0);
411        long prevPrimary = 0;
412        int prevSecondary = 0;
413        int pri = 0;
414        int sec = 0;
415        int ter = CollationFastLatin.COMMON_TER;
416        for(int i = 0; i < uniqueCEs.size(); ++i) {
417            long ce = uniqueCEs.elementAti(i);
418            // Note: At least one of the p/s/t weights changes from one unique CE to the next.
419            // (uniqueCEs does not store case bits.)
420            long p = ce >>> 32;
421            if(p != prevPrimary) {
422                while(p > lastGroupPrimary) {
423                    assert(pri <= CollationFastLatin.MAX_LONG);
424                    // Set the group's header entry to the
425                    // last "long primary" in or before the group.
426                    result.setCharAt(1 + group, (char)pri);
427                    if(++group < NUM_SPECIAL_GROUPS) {
428                        lastGroupPrimary = lastSpecialPrimaries[group];
429                    } else {
430                        lastGroupPrimary = 0xffffffffL;
431                        break;
432                    }
433                }
434                if(p < firstShortPrimary) {
435                    if(pri == 0) {
436                        pri = CollationFastLatin.MIN_LONG;
437                    } else if(pri < CollationFastLatin.MAX_LONG) {
438                        pri += CollationFastLatin.LONG_INC;
439                    } else {
440    /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
441                        printf("long-primary overflow for %08x\n", p);
442    #endif */
443                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
444                        continue;
445                    }
446                } else {
447                    if(pri < CollationFastLatin.MIN_SHORT) {
448                        pri = CollationFastLatin.MIN_SHORT;
449                    } else if(pri < (CollationFastLatin.MAX_SHORT - CollationFastLatin.SHORT_INC)) {
450                        // Reserve the highest primary weight for U+FFFF.
451                        pri += CollationFastLatin.SHORT_INC;
452                    } else {
453    /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
454                        printf("short-primary overflow for %08x\n", p);
455    #endif */
456                        shortPrimaryOverflow = true;
457                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
458                        continue;
459                    }
460                }
461                prevPrimary = p;
462                prevSecondary = Collation.COMMON_WEIGHT16;
463                sec = CollationFastLatin.COMMON_SEC;
464                ter = CollationFastLatin.COMMON_TER;
465            }
466            int lower32 = (int)ce;
467            int s = lower32 >>> 16;
468            if(s != prevSecondary) {
469                if(pri == 0) {
470                    if(sec == 0) {
471                        sec = CollationFastLatin.MIN_SEC_HIGH;
472                    } else if(sec < CollationFastLatin.MAX_SEC_HIGH) {
473                        sec += CollationFastLatin.SEC_INC;
474                    } else {
475                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
476                        continue;
477                    }
478                    prevSecondary = s;
479                    ter = CollationFastLatin.COMMON_TER;
480                } else if(s < Collation.COMMON_WEIGHT16) {
481                    if(sec == CollationFastLatin.COMMON_SEC) {
482                        sec = CollationFastLatin.MIN_SEC_BEFORE;
483                    } else if(sec < CollationFastLatin.MAX_SEC_BEFORE) {
484                        sec += CollationFastLatin.SEC_INC;
485                    } else {
486                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
487                        continue;
488                    }
489                } else if(s == Collation.COMMON_WEIGHT16) {
490                    sec = CollationFastLatin.COMMON_SEC;
491                } else {
492                    if(sec < CollationFastLatin.MIN_SEC_AFTER) {
493                        sec = CollationFastLatin.MIN_SEC_AFTER;
494                    } else if(sec < CollationFastLatin.MAX_SEC_AFTER) {
495                        sec += CollationFastLatin.SEC_INC;
496                    } else {
497                        miniCEs[i] = CollationFastLatin.BAIL_OUT;
498                        continue;
499                    }
500                }
501                prevSecondary = s;
502                ter = CollationFastLatin.COMMON_TER;
503            }
504            assert((lower32 & Collation.CASE_MASK) == 0);  // blanked out in uniqueCEs
505            int t = lower32 & Collation.ONLY_TERTIARY_MASK;
506            if(t > Collation.COMMON_WEIGHT16) {
507                if(ter < CollationFastLatin.MAX_TER_AFTER) {
508                    ++ter;
509                } else {
510                    miniCEs[i] = CollationFastLatin.BAIL_OUT;
511                    continue;
512                }
513            }
514            if(CollationFastLatin.MIN_LONG <= pri && pri <= CollationFastLatin.MAX_LONG) {
515                assert(sec == CollationFastLatin.COMMON_SEC);
516                miniCEs[i] = (char)(pri | ter);
517            } else {
518                miniCEs[i] = (char)(pri | sec | ter);
519            }
520        }
521    /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
522        printf("last mini primary: %04x\n", pri);
523    #endif */
524    /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2
525        for(int i = 0; i < uniqueCEs.size(); ++i) {
526            long ce = uniqueCEs.elementAti(i);
527            printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]);
528        }
529    #endif */
530    }
531
532    private void encodeCharCEs() {
533        int miniCEsStart = result.length();
534        for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
535            result.append(0);  // initialize to completely ignorable
536        }
537        int indexBase = result.length();
538        for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
539            long ce = charCEs[i][0];
540            if(isContractionCharCE(ce)) { continue; }  // defer contraction
541            int miniCE = encodeTwoCEs(ce, charCEs[i][1]);
542            if((miniCE >>> 16) > 0) {   // if ((unsigned)miniCE > 0xffff)
543                // Note: There is a chance that this new expansion is the same as a previous one,
544                // and if so, then we could reuse the other expansion.
545                // However, that seems unlikely.
546                int expansionIndex = result.length() - indexBase;
547                if(expansionIndex > CollationFastLatin.INDEX_MASK) {
548                    miniCE = CollationFastLatin.BAIL_OUT;
549                } else {
550                    result.append((char)(miniCE >> 16)).append((char)miniCE);
551                    miniCE = CollationFastLatin.EXPANSION | expansionIndex;
552                }
553            }
554            result.setCharAt(miniCEsStart + i, (char)miniCE);
555        }
556    }
557
558    private void encodeContractions() {
559        // We encode all contraction lists so that the first word of a list
560        // terminates the previous list, and we only need one additional terminator at the end.
561        int indexBase = headerLength + CollationFastLatin.NUM_FAST_CHARS;
562        int firstContractionIndex = result.length();
563        for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) {
564            long ce = charCEs[i][0];
565            if(!isContractionCharCE(ce)) { continue; }
566            int contractionIndex = result.length() - indexBase;
567            if(contractionIndex > CollationFastLatin.INDEX_MASK) {
568                result.setCharAt(headerLength + i, (char) CollationFastLatin.BAIL_OUT);
569                continue;
570            }
571            boolean firstTriple = true;
572            for(int index = (int)ce & 0x7fffffff;; index += 3) {
573                long x = contractionCEs.elementAti(index);
574                if(x == CollationFastLatin.CONTR_CHAR_MASK && !firstTriple) { break; }
575                long cce0 = contractionCEs.elementAti(index + 1);
576                long cce1 = contractionCEs.elementAti(index + 2);
577                int miniCE = encodeTwoCEs(cce0, cce1);
578                if(miniCE == CollationFastLatin.BAIL_OUT) {
579                    result.append((char)(x | (1 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
580                } else if((miniCE >>> 16) == 0) {  // if ((unsigned)miniCE <= 0xffff)
581                    result.append((char)(x | (2 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
582                    result.append((char)miniCE);
583                } else {
584                    result.append((char)(x | (3 << CollationFastLatin.CONTR_LENGTH_SHIFT)));
585                    result.append((char)(miniCE >> 16)).append((char)miniCE);
586                }
587                firstTriple = false;
588            }
589            // Note: There is a chance that this new contraction list is the same as a previous one,
590            // and if so, then we could truncate the result and reuse the other list.
591            // However, that seems unlikely.
592            result.setCharAt(headerLength + i,
593                            (char)(CollationFastLatin.CONTRACTION | contractionIndex));
594        }
595        if(result.length() > firstContractionIndex) {
596            // Terminate the last contraction list.
597            result.append((char)CollationFastLatin.CONTR_CHAR_MASK);
598        }
599    /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER
600        printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2);
601        puts("   header & below-digit groups map");
602        int i = 0;
603        for(; i < headerLength; ++i) {
604            printf(" %04x", result[i]);
605        }
606        printf("\n   char mini CEs");
607        assert(CollationFastLatin.NUM_FAST_CHARS % 16 == 0);
608        for(; i < indexBase; i += 16) {
609            int c = i - headerLength;
610            if(c >= CollationFastLatin.LATIN_LIMIT) {
611                c = CollationFastLatin.PUNCT_START + c - CollationFastLatin.LATIN_LIMIT;
612            }
613            printf("\n %04x:", c);
614            for(int j = 0; j < 16; ++j) {
615                printf(" %04x", result[i + j]);
616            }
617        }
618        printf("\n   expansions & contractions");
619        for(; i < result.length(); ++i) {
620            if((i - indexBase) % 16 == 0) { puts(""); }
621            printf(" %04x", result[i]);
622        }
623        puts("");
624    #endif */
625    }
626
627    private int encodeTwoCEs(long first, long second) {
628        if(first == 0) {
629            return 0;  // completely ignorable
630        }
631        if(first == Collation.NO_CE) {
632            return CollationFastLatin.BAIL_OUT;
633        }
634        assert((first >>> 32) != Collation.NO_CE_PRIMARY);
635
636        int miniCE = getMiniCE(first);
637        if(miniCE == CollationFastLatin.BAIL_OUT) { return miniCE; }
638        if(miniCE >= CollationFastLatin.MIN_SHORT) {
639            // Extract & copy the case bits.
640            // Shift them from normal CE bits 15..14 to mini CE bits 4..3.
641            int c = (((int)first & Collation.CASE_MASK) >> (14 - 3));
642            // Only in mini CEs: Ignorable case bits = 0, lowercase = 1.
643            c += CollationFastLatin.LOWER_CASE;
644            miniCE |= c;
645        }
646        if(second == 0) { return miniCE; }
647
648        int miniCE1 = getMiniCE(second);
649        if(miniCE1 == CollationFastLatin.BAIL_OUT) { return miniCE1; }
650
651        int case1 = (int)second & Collation.CASE_MASK;
652        if(miniCE >= CollationFastLatin.MIN_SHORT &&
653                (miniCE & CollationFastLatin.SECONDARY_MASK) == CollationFastLatin.COMMON_SEC) {
654            // Try to combine the two mini CEs into one.
655            int sec1 = miniCE1 & CollationFastLatin.SECONDARY_MASK;
656            int ter1 = miniCE1 & CollationFastLatin.TERTIARY_MASK;
657            if(sec1 >= CollationFastLatin.MIN_SEC_HIGH && case1 == 0 &&
658                    ter1 == CollationFastLatin.COMMON_TER) {
659                // sec1>=sec_high implies pri1==0.
660                return (miniCE & ~CollationFastLatin.SECONDARY_MASK) | sec1;
661            }
662        }
663
664        if(miniCE1 <= CollationFastLatin.SECONDARY_MASK || CollationFastLatin.MIN_SHORT <= miniCE1) {
665            // Secondary CE, or a CE with a short primary, copy the case bits.
666            case1 = (case1 >> (14 - 3)) + CollationFastLatin.LOWER_CASE;
667            miniCE1 |= case1;
668        }
669        return (miniCE << 16) | miniCE1;
670    }
671
672    private static boolean isContractionCharCE(long ce) {
673        return (ce >>> 32) == Collation.NO_CE_PRIMARY && ce != Collation.NO_CE;
674    }
675
676    // space, punct, symbol, currency (not digit)
677    private static final int NUM_SPECIAL_GROUPS =
678            Collator.ReorderCodes.CURRENCY - Collator.ReorderCodes.FIRST + 1;
679
680    private static final long CONTRACTION_FLAG = 0x80000000L;
681
682    // temporary "buffer"
683    private long ce0, ce1;
684
685    private long[][] charCEs = new long[CollationFastLatin.NUM_FAST_CHARS][2];
686
687    private UVector64 contractionCEs;
688    private UVector64 uniqueCEs;
689
690    /** One 16-bit mini CE per unique CE. */
691    private char[] miniCEs;
692
693    // These are constant for a given root collator.
694    long[] lastSpecialPrimaries = new long[NUM_SPECIAL_GROUPS];
695    private long firstDigitPrimary;
696    private long firstLatinPrimary;
697    private long lastLatinPrimary;
698    // This determines the first normal primary weight which is mapped to
699    // a short mini primary. It must be >=firstDigitPrimary.
700    private long firstShortPrimary;
701
702    private boolean shortPrimaryOverflow;
703
704    private StringBuilder result = new StringBuilder();
705    private int headerLength;
706}
707