1/*
2*******************************************************************************
3* Copyright (C) 2013-2015, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* CollationFastLatin.java, ported from collationfastlatin.h/.cpp
7*
8* C++ version created on: 2013aug09
9* created by: Markus W. Scherer
10*/
11
12package com.ibm.icu.impl.coll;
13
14import com.ibm.icu.lang.UScript;
15import com.ibm.icu.text.Collator;
16
17public final class CollationFastLatin /* all static */ {
18    /**
19     * Fast Latin format version (one byte 1..FF).
20     * Must be incremented for any runtime-incompatible changes,
21     * in particular, for changes to any of the following constants.
22     *
23     * When the major version number of the main data format changes,
24     * we can reset this fast Latin version to 1.
25     */
26    public static final int VERSION = 2;
27
28    public static final int LATIN_MAX = 0x17f;
29    public static final int LATIN_LIMIT = LATIN_MAX + 1;
30
31    static final int LATIN_MAX_UTF8_LEAD = 0xc5;  // UTF-8 lead byte of LATIN_MAX
32
33    static final int PUNCT_START = 0x2000;
34    static final int PUNCT_LIMIT = 0x2040;
35
36    // excludes U+FFFE & U+FFFF
37    static final int NUM_FAST_CHARS = LATIN_LIMIT + (PUNCT_LIMIT - PUNCT_START);
38
39    // Note on the supported weight ranges:
40    // Analysis of UCA 6.3 and CLDR 23 non-search tailorings shows that
41    // the CEs for characters in the above ranges, excluding expansions with length >2,
42    // excluding contractions of >2 characters, and other restrictions
43    // (see the builder's getCEsFromCE32()),
44    // use at most about 150 primary weights,
45    // where about 94 primary weights are possibly-variable (space/punct/symbol/currency),
46    // at most 4 secondary before-common weights,
47    // at most 4 secondary after-common weights,
48    // at most 16 secondary high weights (in secondary CEs), and
49    // at most 4 tertiary after-common weights.
50    // The following ranges are designed to support slightly more weights than that.
51    // (en_US_POSIX is unusual: It creates about 64 variable + 116 Latin primaries.)
52
53    // Digits may use long primaries (preserving more short ones)
54    // or short primaries (faster) without changing this data structure.
55    // (If we supported numeric collation, then digits would have to have long primaries
56    // so that special handling does not affect the fast path.)
57
58    static final int SHORT_PRIMARY_MASK = 0xfc00;  // bits 15..10
59    static final int INDEX_MASK = 0x3ff;  // bits 9..0 for expansions & contractions
60    static final int SECONDARY_MASK = 0x3e0;  // bits 9..5
61    static final int CASE_MASK = 0x18;  // bits 4..3
62    static final int LONG_PRIMARY_MASK = 0xfff8;  // bits 15..3
63    static final int TERTIARY_MASK = 7;  // bits 2..0
64    static final int CASE_AND_TERTIARY_MASK = CASE_MASK | TERTIARY_MASK;
65
66    static final int TWO_SHORT_PRIMARIES_MASK =
67            (SHORT_PRIMARY_MASK << 16) | SHORT_PRIMARY_MASK;  // 0xfc00fc00
68    static final int TWO_LONG_PRIMARIES_MASK =
69            (LONG_PRIMARY_MASK << 16) | LONG_PRIMARY_MASK;  // 0xfff8fff8
70    static final int TWO_SECONDARIES_MASK =
71            (SECONDARY_MASK << 16) | SECONDARY_MASK;  // 0x3e003e0
72    static final int TWO_CASES_MASK =
73            (CASE_MASK << 16) | CASE_MASK;  // 0x180018
74    static final int TWO_TERTIARIES_MASK =
75            (TERTIARY_MASK << 16) | TERTIARY_MASK;  // 0x70007
76
77    /**
78     * Contraction with one fast Latin character.
79     * Use INDEX_MASK to find the start of the contraction list after the fixed table.
80     * The first entry contains the default mapping.
81     * Otherwise use CONTR_CHAR_MASK for the contraction character index
82     * (in ascending order).
83     * Use CONTR_LENGTH_SHIFT for the length of the entry
84     * (1=BAIL_OUT, 2=one CE, 3=two CEs).
85     *
86     * Also, U+0000 maps to a contraction entry, so that the fast path need not
87     * check for NUL termination.
88     * It usually maps to a contraction list with only the completely ignorable default value.
89     */
90    static final int CONTRACTION = 0x400;
91    /**
92     * An expansion encodes two CEs.
93     * Use INDEX_MASK to find the pair of CEs after the fixed table.
94     *
95     * The higher a mini CE value, the easier it is to process.
96     * For expansions and higher, no context needs to be considered.
97     */
98    static final int EXPANSION = 0x800;
99    /**
100     * Encodes one CE with a long/low mini primary (there are 128).
101     * All potentially-variable primaries must be in this range,
102     * to make the short-primary path as fast as possible.
103     */
104    static final int MIN_LONG = 0xc00;
105    static final int LONG_INC = 8;
106    static final int MAX_LONG = 0xff8;
107    /**
108     * Encodes one CE with a short/high primary (there are 60),
109     * plus a secondary CE if the secondary weight is high.
110     * Fast handling: At least all letter primaries should be in this range.
111     */
112    static final int MIN_SHORT = 0x1000;
113    static final int SHORT_INC = 0x400;
114    /** The highest primary weight is reserved for U+FFFF. */
115    static final int MAX_SHORT = SHORT_PRIMARY_MASK;
116
117    static final int MIN_SEC_BEFORE = 0;  // must add SEC_OFFSET
118    static final int SEC_INC = 0x20;
119    static final int MAX_SEC_BEFORE = MIN_SEC_BEFORE + 4 * SEC_INC;  // 5 before common
120    static final int COMMON_SEC = MAX_SEC_BEFORE + SEC_INC;
121    static final int MIN_SEC_AFTER = COMMON_SEC + SEC_INC;
122    static final int MAX_SEC_AFTER = MIN_SEC_AFTER + 5 * SEC_INC;  // 6 after common
123    static final int MIN_SEC_HIGH = MAX_SEC_AFTER + SEC_INC;  // 20 high secondaries
124    static final int MAX_SEC_HIGH = SECONDARY_MASK;
125
126    /**
127     * Lookup: Add this offset to secondary weights, except for completely ignorable CEs.
128     * Must be greater than any special value, e.g., MERGE_WEIGHT.
129     * The exact value is not relevant for the format version.
130     */
131    static final int SEC_OFFSET = SEC_INC;
132    static final int COMMON_SEC_PLUS_OFFSET = COMMON_SEC + SEC_OFFSET;
133
134    static final int TWO_SEC_OFFSETS =
135            (SEC_OFFSET << 16) | SEC_OFFSET;  // 0x200020
136    static final int TWO_COMMON_SEC_PLUS_OFFSET =
137            (COMMON_SEC_PLUS_OFFSET << 16) | COMMON_SEC_PLUS_OFFSET;
138
139    static final int LOWER_CASE = 8;  // case bits include this offset
140    static final int TWO_LOWER_CASES = (LOWER_CASE << 16) | LOWER_CASE;  // 0x80008
141
142    static final int COMMON_TER = 0;  // must add TER_OFFSET
143    static final int MAX_TER_AFTER = 7;  // 7 after common
144
145    /**
146     * Lookup: Add this offset to tertiary weights, except for completely ignorable CEs.
147     * Must be greater than any special value, e.g., MERGE_WEIGHT.
148     * Must be greater than case bits as well, so that with combined case+tertiary weights
149     * plus the offset the tertiary bits does not spill over into the case bits.
150     * The exact value is not relevant for the format version.
151     */
152    static final int TER_OFFSET = SEC_OFFSET;
153    static final int COMMON_TER_PLUS_OFFSET = COMMON_TER + TER_OFFSET;
154
155    static final int TWO_TER_OFFSETS = (TER_OFFSET << 16) | TER_OFFSET;
156    static final int TWO_COMMON_TER_PLUS_OFFSET =
157            (COMMON_TER_PLUS_OFFSET << 16) | COMMON_TER_PLUS_OFFSET;
158
159    static final int MERGE_WEIGHT = 3;
160    static final int EOS = 2;  // end of string
161    static final int BAIL_OUT = 1;
162
163    /**
164     * Contraction result first word bits 8..0 contain the
165     * second contraction character, as a char index 0..NUM_FAST_CHARS-1.
166     * Each contraction list is terminated with a word containing CONTR_CHAR_MASK.
167     */
168    static final int CONTR_CHAR_MASK = 0x1ff;
169    /**
170     * Contraction result first word bits 10..9 contain the result length:
171     * 1=bail out, 2=one mini CE, 3=two mini CEs
172     */
173    static final int CONTR_LENGTH_SHIFT = 9;
174
175    /**
176     * Comparison return value when the regular comparison must be used.
177     * The exact value is not relevant for the format version.
178     */
179    public static final int BAIL_OUT_RESULT = -2;
180
181    static int getCharIndex(char c) {
182        if(c <= LATIN_MAX) {
183            return c;
184        } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
185            return c - (PUNCT_START - LATIN_LIMIT);
186        } else {
187            // Not a fast Latin character.
188            // Note: U+FFFE & U+FFFF are forbidden in tailorings
189            // and thus do not occur in any contractions.
190            return -1;
191        }
192    }
193
194    /**
195     * Computes the options value for the compare functions
196     * and writes the precomputed primary weights.
197     * Returns -1 if the Latin fastpath is not supported for the data and settings.
198     * The capacity must be LATIN_LIMIT.
199     */
200    public static int getOptions(CollationData data, CollationSettings settings,
201            char[] primaries) {
202        char[] header = data.fastLatinTableHeader;
203        if(header == null) { return -1; }
204        assert((header[0] >> 8) == VERSION);
205        assert(primaries.length == LATIN_LIMIT);
206        if(primaries.length != LATIN_LIMIT) { return -1; }
207
208        int miniVarTop;
209        if((settings.options & CollationSettings.ALTERNATE_MASK) == 0) {
210            // No mini primaries are variable, set a variableTop just below the
211            // lowest long mini primary.
212            miniVarTop = MIN_LONG - 1;
213        } else {
214            int headerLength = header[0] & 0xff;
215            int i = 1 + settings.getMaxVariable();
216            if(i >= headerLength) {
217                return -1;  // variableTop >= digits, should not occur
218            }
219            miniVarTop = header[i];
220        }
221
222        boolean digitsAreReordered = false;
223        if(settings.hasReordering()) {
224            long prevStart = 0;
225            long beforeDigitStart = 0;
226            long digitStart = 0;
227            long afterDigitStart = 0;
228            for(int group = Collator.ReorderCodes.FIRST;
229                    group < Collator.ReorderCodes.FIRST + CollationData.MAX_NUM_SPECIAL_REORDER_CODES;
230                    ++group) {
231                long start = data.getFirstPrimaryForGroup(group);
232                start = settings.reorder(start);
233                if(group == Collator.ReorderCodes.DIGIT) {
234                    beforeDigitStart = prevStart;
235                    digitStart = start;
236                } else if(start != 0) {
237                    if(start < prevStart) {
238                        // The permutation affects the groups up to Latin.
239                        return -1;
240                    }
241                    // In the future, there might be a special group between digits & Latin.
242                    if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
243                        afterDigitStart = start;
244                    }
245                    prevStart = start;
246                }
247            }
248            long latinStart = data.getFirstPrimaryForGroup(UScript.LATIN);
249            latinStart = settings.reorder(latinStart);
250            if(latinStart < prevStart) {
251                return -1;
252            }
253            if(afterDigitStart == 0) {
254                afterDigitStart = latinStart;
255            }
256            if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
257                digitsAreReordered = true;
258            }
259        }
260
261        char[] table = data.fastLatinTable;  // skip the header
262        for(int c = 0; c < LATIN_LIMIT; ++c) {
263            int p = table[c];
264            if(p >= MIN_SHORT) {
265                p &= SHORT_PRIMARY_MASK;
266            } else if(p > miniVarTop) {
267                p &= LONG_PRIMARY_MASK;
268            } else {
269                p = 0;
270            }
271            primaries[c] = (char)p;
272        }
273        if(digitsAreReordered || (settings.options & CollationSettings.NUMERIC) != 0) {
274            // Bail out for digits.
275            for(int c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
276        }
277
278        // Shift the miniVarTop above other options.
279        return (miniVarTop << 16) | settings.options;
280    }
281
282    public static int compareUTF16(char[] table, char[] primaries, int options,
283            CharSequence left, CharSequence right, int startIndex) {
284        // This is a modified copy of CollationCompare.compareUpToQuaternary(),
285        // optimized for common Latin text.
286        // Keep them in sync!
287
288        int variableTop = options >> 16;  // see getOptions()
289        options &= 0xffff;  // needed for CollationSettings.getStrength() to work
290
291        // Check for supported characters, fetch mini CEs, and compare primaries.
292        int leftIndex = startIndex, rightIndex = startIndex;
293        /**
294         * Single mini CE or a pair.
295         * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
296         * If there is only one, then it is in the lower bits, and the upper bits are 0.
297         */
298        int leftPair = 0, rightPair = 0;
299        for(;;) {
300            // We fetch CEs until we get a non-ignorable primary or reach the end.
301            while(leftPair == 0) {
302                if(leftIndex == left.length()) {
303                    leftPair = EOS;
304                    break;
305                }
306                int c = left.charAt(leftIndex++);
307                if(c <= LATIN_MAX) {
308                    leftPair = primaries[c];
309                    if(leftPair != 0) { break; }
310                    if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) {
311                        return BAIL_OUT_RESULT;
312                    }
313                    leftPair = table[c];
314                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
315                    leftPair = table[c - PUNCT_START + LATIN_LIMIT];
316                } else {
317                    leftPair = lookup(table, c);
318                }
319                if(leftPair >= MIN_SHORT) {
320                    leftPair &= SHORT_PRIMARY_MASK;
321                    break;
322                } else if(leftPair > variableTop) {
323                    leftPair &= LONG_PRIMARY_MASK;
324                    break;
325                } else {
326                    long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
327                    if(pairAndInc < 0) {
328                        ++leftIndex;
329                        pairAndInc = ~pairAndInc;
330                    }
331                    leftPair = (int)pairAndInc;
332                    if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
333                    leftPair = getPrimaries(variableTop, leftPair);
334                }
335            }
336
337            while(rightPair == 0) {
338                if(rightIndex == right.length()) {
339                    rightPair = EOS;
340                    break;
341                }
342                int c = right.charAt(rightIndex++);
343                if(c <= LATIN_MAX) {
344                    rightPair = primaries[c];
345                    if(rightPair != 0) { break; }
346                    if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) {
347                        return BAIL_OUT_RESULT;
348                    }
349                    rightPair = table[c];
350                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
351                    rightPair = table[c - PUNCT_START + LATIN_LIMIT];
352                } else {
353                    rightPair = lookup(table, c);
354                }
355                if(rightPair >= MIN_SHORT) {
356                    rightPair &= SHORT_PRIMARY_MASK;
357                    break;
358                } else if(rightPair > variableTop) {
359                    rightPair &= LONG_PRIMARY_MASK;
360                    break;
361                } else {
362                    long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
363                    if(pairAndInc < 0) {
364                        ++rightIndex;
365                        pairAndInc = ~pairAndInc;
366                    }
367                    rightPair = (int)pairAndInc;
368                    if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
369                    rightPair = getPrimaries(variableTop, rightPair);
370                }
371            }
372
373            if(leftPair == rightPair) {
374                if(leftPair == EOS) { break; }
375                leftPair = rightPair = 0;
376                continue;
377            }
378            int leftPrimary = leftPair & 0xffff;
379            int rightPrimary = rightPair & 0xffff;
380            if(leftPrimary != rightPrimary) {
381                // Return the primary difference.
382                return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
383            }
384            if(leftPair == EOS) { break; }
385            leftPair >>>= 16;
386            rightPair >>>= 16;
387        }
388        // In the following, we need to re-fetch each character because we did not buffer the CEs,
389        // but we know that the string is well-formed and
390        // only contains supported characters and mappings.
391
392        // We might skip the secondary level but continue with the case level
393        // which is turned on separately.
394        if(CollationSettings.getStrength(options) >= Collator.SECONDARY) {
395            leftIndex = rightIndex = startIndex;
396            leftPair = rightPair = 0;
397            for(;;) {
398                while(leftPair == 0) {
399                    if(leftIndex == left.length()) {
400                        leftPair = EOS;
401                        break;
402                    }
403                    int c = left.charAt(leftIndex++);
404                    if(c <= LATIN_MAX) {
405                        leftPair = table[c];
406                    } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
407                        leftPair = table[c - PUNCT_START + LATIN_LIMIT];
408                    } else {
409                        leftPair = lookup(table, c);
410                    }
411                    if(leftPair >= MIN_SHORT) {
412                        leftPair = getSecondariesFromOneShortCE(leftPair);
413                        break;
414                    } else if(leftPair > variableTop) {
415                        leftPair = COMMON_SEC_PLUS_OFFSET;
416                        break;
417                    } else {
418                        long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
419                        if(pairAndInc < 0) {
420                            ++leftIndex;
421                            pairAndInc = ~pairAndInc;
422                        }
423                        leftPair = getSecondaries(variableTop, (int)pairAndInc);
424                    }
425                }
426
427                while(rightPair == 0) {
428                    if(rightIndex == right.length()) {
429                        rightPair = EOS;
430                        break;
431                    }
432                    int c = right.charAt(rightIndex++);
433                    if(c <= LATIN_MAX) {
434                        rightPair = table[c];
435                    } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
436                        rightPair = table[c - PUNCT_START + LATIN_LIMIT];
437                    } else {
438                        rightPair = lookup(table, c);
439                    }
440                    if(rightPair >= MIN_SHORT) {
441                        rightPair = getSecondariesFromOneShortCE(rightPair);
442                        break;
443                    } else if(rightPair > variableTop) {
444                        rightPair = COMMON_SEC_PLUS_OFFSET;
445                        break;
446                    } else {
447                        long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
448                        if(pairAndInc < 0) {
449                            ++rightIndex;
450                            pairAndInc = ~pairAndInc;
451                        }
452                        rightPair = getSecondaries(variableTop, (int)pairAndInc);
453                    }
454                }
455
456                if(leftPair == rightPair) {
457                    if(leftPair == EOS) { break; }
458                    leftPair = rightPair = 0;
459                    continue;
460                }
461                int leftSecondary = leftPair & 0xffff;
462                int rightSecondary = rightPair & 0xffff;
463                if(leftSecondary != rightSecondary) {
464                    if((options & CollationSettings.BACKWARD_SECONDARY) != 0) {
465                        // Full support for backwards secondary requires backwards contraction matching
466                        // and moving backwards between merge separators.
467                        return BAIL_OUT_RESULT;
468                    }
469                    return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
470                }
471                if(leftPair == EOS) { break; }
472                leftPair >>>= 16;
473                rightPair >>>= 16;
474            }
475        }
476
477        if((options & CollationSettings.CASE_LEVEL) != 0) {
478            boolean strengthIsPrimary = CollationSettings.getStrength(options) == Collator.PRIMARY;
479            leftIndex = rightIndex = startIndex;
480            leftPair = rightPair = 0;
481            for(;;) {
482                while(leftPair == 0) {
483                    if(leftIndex == left.length()) {
484                        leftPair = EOS;
485                        break;
486                    }
487                    int c = left.charAt(leftIndex++);
488                    leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
489                    if(leftPair < MIN_LONG) {
490                        long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
491                        if(pairAndInc < 0) {
492                            ++leftIndex;
493                            pairAndInc = ~pairAndInc;
494                        }
495                        leftPair = (int)pairAndInc;
496                    }
497                    leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
498                }
499
500                while(rightPair == 0) {
501                    if(rightIndex == right.length()) {
502                        rightPair = EOS;
503                        break;
504                    }
505                    int c = right.charAt(rightIndex++);
506                    rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
507                    if(rightPair < MIN_LONG) {
508                        long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
509                        if(pairAndInc < 0) {
510                            ++rightIndex;
511                            pairAndInc = ~pairAndInc;
512                        }
513                        rightPair = (int)pairAndInc;
514                    }
515                    rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
516                }
517
518                if(leftPair == rightPair) {
519                    if(leftPair == EOS) { break; }
520                    leftPair = rightPair = 0;
521                    continue;
522                }
523                int leftCase = leftPair & 0xffff;
524                int rightCase = rightPair & 0xffff;
525                if(leftCase != rightCase) {
526                    if((options & CollationSettings.UPPER_FIRST) == 0) {
527                        return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER;
528                    } else {
529                        return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS;
530                    }
531                }
532                if(leftPair == EOS) { break; }
533                leftPair >>>= 16;
534                rightPair >>>= 16;
535            }
536        }
537        if(CollationSettings.getStrength(options) <= Collator.SECONDARY) { return Collation.EQUAL; }
538
539        // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
540        boolean withCaseBits = CollationSettings.isTertiaryWithCaseBits(options);
541
542        leftIndex = rightIndex = startIndex;
543        leftPair = rightPair = 0;
544        for(;;) {
545            while(leftPair == 0) {
546                if(leftIndex == left.length()) {
547                    leftPair = EOS;
548                    break;
549                }
550                int c = left.charAt(leftIndex++);
551                leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
552                if(leftPair < MIN_LONG) {
553                    long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
554                    if(pairAndInc < 0) {
555                        ++leftIndex;
556                        pairAndInc = ~pairAndInc;
557                    }
558                    leftPair = (int)pairAndInc;
559                }
560                leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
561            }
562
563            while(rightPair == 0) {
564                if(rightIndex == right.length()) {
565                    rightPair = EOS;
566                    break;
567                }
568                int c = right.charAt(rightIndex++);
569                rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
570                if(rightPair < MIN_LONG) {
571                    long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
572                    if(pairAndInc < 0) {
573                        ++rightIndex;
574                        pairAndInc = ~pairAndInc;
575                    }
576                    rightPair = (int)pairAndInc;
577                }
578                rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
579            }
580
581            if(leftPair == rightPair) {
582                if(leftPair == EOS) { break; }
583                leftPair = rightPair = 0;
584                continue;
585            }
586            int leftTertiary = leftPair & 0xffff;
587            int rightTertiary = rightPair & 0xffff;
588            if(leftTertiary != rightTertiary) {
589                if(CollationSettings.sortsTertiaryUpperCaseFirst(options)) {
590                    // Pass through EOS and MERGE_WEIGHT
591                    // and keep real tertiary weights larger than the MERGE_WEIGHT.
592                    // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
593                    if(leftTertiary > MERGE_WEIGHT) {
594                        leftTertiary ^= CASE_MASK;
595                    }
596                    if(rightTertiary > MERGE_WEIGHT) {
597                        rightTertiary ^= CASE_MASK;
598                    }
599                }
600                return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER;
601            }
602            if(leftPair == EOS) { break; }
603            leftPair >>>= 16;
604            rightPair >>>= 16;
605        }
606        if(CollationSettings.getStrength(options) <= Collator.TERTIARY) { return Collation.EQUAL; }
607
608        leftIndex = rightIndex = startIndex;
609        leftPair = rightPair = 0;
610        for(;;) {
611            while(leftPair == 0) {
612                if(leftIndex == left.length()) {
613                    leftPair = EOS;
614                    break;
615                }
616                int c = left.charAt(leftIndex++);
617                leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
618                if(leftPair < MIN_LONG) {
619                    long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
620                    if(pairAndInc < 0) {
621                        ++leftIndex;
622                        pairAndInc = ~pairAndInc;
623                    }
624                    leftPair = (int)pairAndInc;
625                }
626                leftPair = getQuaternaries(variableTop, leftPair);
627            }
628
629            while(rightPair == 0) {
630                if(rightIndex == right.length()) {
631                    rightPair = EOS;
632                    break;
633                }
634                int c = right.charAt(rightIndex++);
635                rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
636                if(rightPair < MIN_LONG) {
637                    long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
638                    if(pairAndInc < 0) {
639                        ++rightIndex;
640                        pairAndInc = ~pairAndInc;
641                    }
642                    rightPair = (int)pairAndInc;
643                }
644                rightPair = getQuaternaries(variableTop, rightPair);
645            }
646
647            if(leftPair == rightPair) {
648                if(leftPair == EOS) { break; }
649                leftPair = rightPair = 0;
650                continue;
651            }
652            int leftQuaternary = leftPair & 0xffff;
653            int rightQuaternary = rightPair & 0xffff;
654            if(leftQuaternary != rightQuaternary) {
655                return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
656            }
657            if(leftPair == EOS) { break; }
658            leftPair >>>= 16;
659            rightPair >>>= 16;
660        }
661        return Collation.EQUAL;
662    }
663
664    private static int lookup(char[] table, int c) {
665        assert(c > LATIN_MAX);
666        if(PUNCT_START <= c && c < PUNCT_LIMIT) {
667            return table[c - PUNCT_START + LATIN_LIMIT];
668        } else if(c == 0xfffe) {
669            return MERGE_WEIGHT;
670        } else if(c == 0xffff) {
671            return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
672        } else {
673            return BAIL_OUT;
674        }
675    }
676
677    /**
678     * Java returns a negative result (use the '~' operator) if sIndex is to be incremented.
679     * C++ modifies sIndex.
680     */
681    private static long nextPair(char[] table, int c, int ce, CharSequence s16, int sIndex) {
682        if(ce >= MIN_LONG || ce < CONTRACTION) {
683            return ce;  // simple or special mini CE
684        } else if(ce >= EXPANSION) {
685            int index = NUM_FAST_CHARS + (ce & INDEX_MASK);
686            return ((long)table[index + 1] << 16) | table[index];
687        } else /* ce >= CONTRACTION */ {
688            // Contraction list: Default mapping followed by
689            // 0 or more single-character contraction suffix mappings.
690            int index = NUM_FAST_CHARS + (ce & INDEX_MASK);
691            boolean inc = false;  // true if the next char is consumed.
692            if(sIndex != s16.length()) {
693                // Read the next character.
694                int c2;
695                int nextIndex = sIndex;
696                c2 = s16.charAt(nextIndex++);
697                if(c2 > LATIN_MAX) {
698                    if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
699                        c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
700                    } else if(c2 == 0xfffe || c2 == 0xffff) {
701                        c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
702                    } else {
703                        return BAIL_OUT;
704                    }
705                }
706                // Look for the next character in the contraction suffix list,
707                // which is in ascending order of single suffix characters.
708                int i = index;
709                int head = table[i];  // first skip the default mapping
710                int x;
711                do {
712                    i += head >> CONTR_LENGTH_SHIFT;
713                    head = table[i];
714                    x = head & CONTR_CHAR_MASK;
715                } while(x < c2);
716                if(x == c2) {
717                    index = i;
718                    inc = true;
719                }
720            }
721            // Return the CE or CEs for the default or contraction mapping.
722            int length = table[index] >> CONTR_LENGTH_SHIFT;
723            if(length == 1) {
724                return BAIL_OUT;
725            }
726            ce = table[index + 1];
727            long result;
728            if(length == 2) {
729                result = ce;
730            } else {
731                result = ((long)table[index + 2] << 16) | ce;
732            }
733            return inc ? ~result : result;
734        }
735    }
736
737    private static int getPrimaries(int variableTop, int pair) {
738        int ce = pair & 0xffff;
739        if(ce >= MIN_SHORT) { return pair & TWO_SHORT_PRIMARIES_MASK; }
740        if(ce > variableTop) { return pair & TWO_LONG_PRIMARIES_MASK; }
741        if(ce >= MIN_LONG) { return 0; }  // variable
742        return pair;  // special mini CE
743    }
744
745    private static int getSecondariesFromOneShortCE(int ce) {
746        ce &= SECONDARY_MASK;
747        if(ce < MIN_SEC_HIGH) {
748            return ce + SEC_OFFSET;
749        } else {
750            return ((ce + SEC_OFFSET) << 16) | COMMON_SEC_PLUS_OFFSET;
751        }
752    }
753
754    private static int getSecondaries(int variableTop, int pair) {
755        if(pair <= 0xffff) {
756            // one mini CE
757            if(pair >= MIN_SHORT) {
758                pair = getSecondariesFromOneShortCE(pair);
759            } else if(pair > variableTop) {
760                pair = COMMON_SEC_PLUS_OFFSET;
761            } else if(pair >= MIN_LONG) {
762                pair = 0;  // variable
763            }
764            // else special mini CE
765        } else {
766            int ce = pair & 0xffff;
767            if(ce >= MIN_SHORT) {
768                pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
769            } else if(ce > variableTop) {
770                pair = TWO_COMMON_SEC_PLUS_OFFSET;
771            } else {
772                assert(ce >= MIN_LONG);
773                pair = 0;  // variable
774            }
775        }
776        return pair;
777    }
778
779    private static int getCases(int variableTop, boolean strengthIsPrimary, int pair) {
780        // Primary+caseLevel: Ignore case level weights of primary ignorables.
781        // Otherwise: Ignore case level weights of secondary ignorables.
782        // For details see the comments in the CollationCompare class.
783        // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
784        if(pair <= 0xffff) {
785            // one mini CE
786            if(pair >= MIN_SHORT) {
787                // A high secondary weight means we really have two CEs,
788                // a primary CE and a secondary CE.
789                int ce = pair;
790                pair &= CASE_MASK;  // explicit weight of primary CE
791                if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
792                    pair |= LOWER_CASE << 16;  // implied weight of secondary CE
793                }
794            } else if(pair > variableTop) {
795                pair = LOWER_CASE;
796            } else if(pair >= MIN_LONG) {
797                pair = 0;  // variable
798            }
799            // else special mini CE
800        } else {
801            // two mini CEs, same primary groups, neither expands like above
802            int ce = pair & 0xffff;
803            if(ce >= MIN_SHORT) {
804                if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
805                    pair &= CASE_MASK;
806                } else {
807                    pair &= TWO_CASES_MASK;
808                }
809            } else if(ce > variableTop) {
810                pair = TWO_LOWER_CASES;
811            } else {
812                assert(ce >= MIN_LONG);
813                pair = 0;  // variable
814            }
815        }
816        return pair;
817    }
818
819    private static int getTertiaries(int variableTop, boolean withCaseBits, int pair) {
820        if(pair <= 0xffff) {
821            // one mini CE
822            if(pair >= MIN_SHORT) {
823                // A high secondary weight means we really have two CEs,
824                // a primary CE and a secondary CE.
825                int ce = pair;
826                if(withCaseBits) {
827                    pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
828                    if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
829                        pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
830                    }
831                } else {
832                    pair = (pair & TERTIARY_MASK) + TER_OFFSET;
833                    if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
834                        pair |= COMMON_TER_PLUS_OFFSET << 16;
835                    }
836                }
837            } else if(pair > variableTop) {
838                pair = (pair & TERTIARY_MASK) + TER_OFFSET;
839                if(withCaseBits) {
840                    pair |= LOWER_CASE;
841                }
842            } else if(pair >= MIN_LONG) {
843                pair = 0;  // variable
844            }
845            // else special mini CE
846        } else {
847            // two mini CEs, same primary groups, neither expands like above
848            int ce = pair & 0xffff;
849            if(ce >= MIN_SHORT) {
850                if(withCaseBits) {
851                    pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
852                } else {
853                    pair &= TWO_TERTIARIES_MASK;
854                }
855                pair += TWO_TER_OFFSETS;
856            } else if(ce > variableTop) {
857                pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
858                if(withCaseBits) {
859                    pair |= TWO_LOWER_CASES;
860                }
861            } else {
862                assert(ce >= MIN_LONG);
863                pair = 0;  // variable
864            }
865        }
866        return pair;
867    }
868
869    private static int getQuaternaries(int variableTop, int pair) {
870        // Return the primary weight of a variable CE,
871        // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
872        if(pair <= 0xffff) {
873            // one mini CE
874            if(pair >= MIN_SHORT) {
875                // A high secondary weight means we really have two CEs,
876                // a primary CE and a secondary CE.
877                if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
878                    pair = TWO_SHORT_PRIMARIES_MASK;
879                } else {
880                    pair = SHORT_PRIMARY_MASK;
881                }
882            } else if(pair > variableTop) {
883                pair = SHORT_PRIMARY_MASK;
884            } else if(pair >= MIN_LONG) {
885                pair &= LONG_PRIMARY_MASK;  // variable
886            }
887            // else special mini CE
888        } else {
889            // two mini CEs, same primary groups, neither expands like above
890            int ce = pair & 0xffff;
891            if(ce > variableTop) {
892                pair = TWO_SHORT_PRIMARIES_MASK;
893            } else {
894                assert(ce >= MIN_LONG);
895                pair &= TWO_LONG_PRIMARIES_MASK;  // variable
896            }
897        }
898        return pair;
899    }
900
901    private CollationFastLatin() {}  // no constructor
902}
903