1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2013-2015, International Business Machines
6* Corporation and others.  All Rights Reserved.
7*******************************************************************************
8* collationfastlatin.cpp
9*
10* created on: 2013aug18
11* created by: Markus W. Scherer
12*/
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_COLLATION
17
18#include "unicode/ucol.h"
19#include "collationdata.h"
20#include "collationfastlatin.h"
21#include "collationsettings.h"
22#include "uassert.h"
23
24U_NAMESPACE_BEGIN
25
26int32_t
27CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
28                               uint16_t *primaries, int32_t capacity) {
29    const uint16_t *table = data->fastLatinTable;
30    if(table == NULL) { return -1; }
31    U_ASSERT(capacity == LATIN_LIMIT);
32    if(capacity != LATIN_LIMIT) { return -1; }
33
34    uint32_t miniVarTop;
35    if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
36        // No mini primaries are variable, set a variableTop just below the
37        // lowest long mini primary.
38        miniVarTop = MIN_LONG - 1;
39    } else {
40        int32_t headerLength = *table & 0xff;
41        int32_t i = 1 + settings.getMaxVariable();
42        if(i >= headerLength) {
43            return -1;  // variableTop >= digits, should not occur
44        }
45        miniVarTop = table[i];
46    }
47
48    UBool digitsAreReordered = FALSE;
49    if(settings.hasReordering()) {
50        uint32_t prevStart = 0;
51        uint32_t beforeDigitStart = 0;
52        uint32_t digitStart = 0;
53        uint32_t afterDigitStart = 0;
54        for(int32_t group = UCOL_REORDER_CODE_FIRST;
55                group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
56                ++group) {
57            uint32_t start = data->getFirstPrimaryForGroup(group);
58            start = settings.reorder(start);
59            if(group == UCOL_REORDER_CODE_DIGIT) {
60                beforeDigitStart = prevStart;
61                digitStart = start;
62            } else if(start != 0) {
63                if(start < prevStart) {
64                    // The permutation affects the groups up to Latin.
65                    return -1;
66                }
67                // In the future, there might be a special group between digits & Latin.
68                if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
69                    afterDigitStart = start;
70                }
71                prevStart = start;
72            }
73        }
74        uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
75        latinStart = settings.reorder(latinStart);
76        if(latinStart < prevStart) {
77            return -1;
78        }
79        if(afterDigitStart == 0) {
80            afterDigitStart = latinStart;
81        }
82        if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
83            digitsAreReordered = TRUE;
84        }
85    }
86
87    table += (table[0] & 0xff);  // skip the header
88    for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
89        uint32_t p = table[c];
90        if(p >= MIN_SHORT) {
91            p &= SHORT_PRIMARY_MASK;
92        } else if(p > miniVarTop) {
93            p &= LONG_PRIMARY_MASK;
94        } else {
95            p = 0;
96        }
97        primaries[c] = (uint16_t)p;
98    }
99    if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
100        // Bail out for digits.
101        for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
102    }
103
104    // Shift the miniVarTop above other options.
105    return ((int32_t)miniVarTop << 16) | settings.options;
106}
107
108int32_t
109CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
110                                 const UChar *left, int32_t leftLength,
111                                 const UChar *right, int32_t rightLength) {
112    // This is a modified copy of CollationCompare::compareUpToQuaternary(),
113    // optimized for common Latin text.
114    // Keep them in sync!
115    // Keep compareUTF16() and compareUTF8() in sync very closely!
116
117    U_ASSERT((table[0] >> 8) == VERSION);
118    table += (table[0] & 0xff);  // skip the header
119    uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions()
120    options &= 0xffff;  // needed for CollationSettings::getStrength() to work
121
122    // Check for supported characters, fetch mini CEs, and compare primaries.
123    int32_t leftIndex = 0, rightIndex = 0;
124    /**
125     * Single mini CE or a pair.
126     * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
127     * If there is only one, then it is in the lower bits, and the upper bits are 0.
128     */
129    uint32_t leftPair = 0, rightPair = 0;
130    for(;;) {
131        // We fetch CEs until we get a non-ignorable primary or reach the end.
132        while(leftPair == 0) {
133            if(leftIndex == leftLength) {
134                leftPair = EOS;
135                break;
136            }
137            UChar32 c = left[leftIndex++];
138            if(c <= LATIN_MAX) {
139                leftPair = primaries[c];
140                if(leftPair != 0) { break; }
141                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
142                    return BAIL_OUT_RESULT;
143                }
144                leftPair = table[c];
145            } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
146                leftPair = table[c - PUNCT_START + LATIN_LIMIT];
147            } else {
148                leftPair = lookup(table, c);
149            }
150            if(leftPair >= MIN_SHORT) {
151                leftPair &= SHORT_PRIMARY_MASK;
152                break;
153            } else if(leftPair > variableTop) {
154                leftPair &= LONG_PRIMARY_MASK;
155                break;
156            } else {
157                leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
158                if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
159                leftPair = getPrimaries(variableTop, leftPair);
160            }
161        }
162
163        while(rightPair == 0) {
164            if(rightIndex == rightLength) {
165                rightPair = EOS;
166                break;
167            }
168            UChar32 c = right[rightIndex++];
169            if(c <= LATIN_MAX) {
170                rightPair = primaries[c];
171                if(rightPair != 0) { break; }
172                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
173                    return BAIL_OUT_RESULT;
174                }
175                rightPair = table[c];
176            } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
177                rightPair = table[c - PUNCT_START + LATIN_LIMIT];
178            } else {
179                rightPair = lookup(table, c);
180            }
181            if(rightPair >= MIN_SHORT) {
182                rightPair &= SHORT_PRIMARY_MASK;
183                break;
184            } else if(rightPair > variableTop) {
185                rightPair &= LONG_PRIMARY_MASK;
186                break;
187            } else {
188                rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
189                if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
190                rightPair = getPrimaries(variableTop, rightPair);
191            }
192        }
193
194        if(leftPair == rightPair) {
195            if(leftPair == EOS) { break; }
196            leftPair = rightPair = 0;
197            continue;
198        }
199        uint32_t leftPrimary = leftPair & 0xffff;
200        uint32_t rightPrimary = rightPair & 0xffff;
201        if(leftPrimary != rightPrimary) {
202            // Return the primary difference.
203            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
204        }
205        if(leftPair == EOS) { break; }
206        leftPair >>= 16;
207        rightPair >>= 16;
208    }
209    // In the following, we need to re-fetch each character because we did not buffer the CEs,
210    // but we know that the string is well-formed and
211    // only contains supported characters and mappings.
212
213    // We might skip the secondary level but continue with the case level
214    // which is turned on separately.
215    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
216        leftIndex = rightIndex = 0;
217        leftPair = rightPair = 0;
218        for(;;) {
219            while(leftPair == 0) {
220                if(leftIndex == leftLength) {
221                    leftPair = EOS;
222                    break;
223                }
224                UChar32 c = left[leftIndex++];
225                if(c <= LATIN_MAX) {
226                    leftPair = table[c];
227                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
228                    leftPair = table[c - PUNCT_START + LATIN_LIMIT];
229                } else {
230                    leftPair = lookup(table, c);
231                }
232                if(leftPair >= MIN_SHORT) {
233                    leftPair = getSecondariesFromOneShortCE(leftPair);
234                    break;
235                } else if(leftPair > variableTop) {
236                    leftPair = COMMON_SEC_PLUS_OFFSET;
237                    break;
238                } else {
239                    leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
240                    leftPair = getSecondaries(variableTop, leftPair);
241                }
242            }
243
244            while(rightPair == 0) {
245                if(rightIndex == rightLength) {
246                    rightPair = EOS;
247                    break;
248                }
249                UChar32 c = right[rightIndex++];
250                if(c <= LATIN_MAX) {
251                    rightPair = table[c];
252                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
253                    rightPair = table[c - PUNCT_START + LATIN_LIMIT];
254                } else {
255                    rightPair = lookup(table, c);
256                }
257                if(rightPair >= MIN_SHORT) {
258                    rightPair = getSecondariesFromOneShortCE(rightPair);
259                    break;
260                } else if(rightPair > variableTop) {
261                    rightPair = COMMON_SEC_PLUS_OFFSET;
262                    break;
263                } else {
264                    rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
265                    rightPair = getSecondaries(variableTop, rightPair);
266                }
267            }
268
269            if(leftPair == rightPair) {
270                if(leftPair == EOS) { break; }
271                leftPair = rightPair = 0;
272                continue;
273            }
274            uint32_t leftSecondary = leftPair & 0xffff;
275            uint32_t rightSecondary = rightPair & 0xffff;
276            if(leftSecondary != rightSecondary) {
277                if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
278                    // Full support for backwards secondary requires backwards contraction matching
279                    // and moving backwards between merge separators.
280                    return BAIL_OUT_RESULT;
281                }
282                return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
283            }
284            if(leftPair == EOS) { break; }
285            leftPair >>= 16;
286            rightPair >>= 16;
287        }
288    }
289
290    if((options & CollationSettings::CASE_LEVEL) != 0) {
291        UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
292        leftIndex = rightIndex = 0;
293        leftPair = rightPair = 0;
294        for(;;) {
295            while(leftPair == 0) {
296                if(leftIndex == leftLength) {
297                    leftPair = EOS;
298                    break;
299                }
300                UChar32 c = left[leftIndex++];
301                leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
302                if(leftPair < MIN_LONG) {
303                    leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
304                }
305                leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
306            }
307
308            while(rightPair == 0) {
309                if(rightIndex == rightLength) {
310                    rightPair = EOS;
311                    break;
312                }
313                UChar32 c = right[rightIndex++];
314                rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
315                if(rightPair < MIN_LONG) {
316                    rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
317                }
318                rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
319            }
320
321            if(leftPair == rightPair) {
322                if(leftPair == EOS) { break; }
323                leftPair = rightPair = 0;
324                continue;
325            }
326            uint32_t leftCase = leftPair & 0xffff;
327            uint32_t rightCase = rightPair & 0xffff;
328            if(leftCase != rightCase) {
329                if((options & CollationSettings::UPPER_FIRST) == 0) {
330                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
331                } else {
332                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
333                }
334            }
335            if(leftPair == EOS) { break; }
336            leftPair >>= 16;
337            rightPair >>= 16;
338        }
339    }
340    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
341
342    // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
343    UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
344
345    leftIndex = rightIndex = 0;
346    leftPair = rightPair = 0;
347    for(;;) {
348        while(leftPair == 0) {
349            if(leftIndex == leftLength) {
350                leftPair = EOS;
351                break;
352            }
353            UChar32 c = left[leftIndex++];
354            leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
355            if(leftPair < MIN_LONG) {
356                leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
357            }
358            leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
359        }
360
361        while(rightPair == 0) {
362            if(rightIndex == rightLength) {
363                rightPair = EOS;
364                break;
365            }
366            UChar32 c = right[rightIndex++];
367            rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
368            if(rightPair < MIN_LONG) {
369                rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
370            }
371            rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
372        }
373
374        if(leftPair == rightPair) {
375            if(leftPair == EOS) { break; }
376            leftPair = rightPair = 0;
377            continue;
378        }
379        uint32_t leftTertiary = leftPair & 0xffff;
380        uint32_t rightTertiary = rightPair & 0xffff;
381        if(leftTertiary != rightTertiary) {
382            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
383                // Pass through EOS and MERGE_WEIGHT
384                // and keep real tertiary weights larger than the MERGE_WEIGHT.
385                // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
386                if(leftTertiary > MERGE_WEIGHT) {
387                    leftTertiary ^= CASE_MASK;
388                }
389                if(rightTertiary > MERGE_WEIGHT) {
390                    rightTertiary ^= CASE_MASK;
391                }
392            }
393            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
394        }
395        if(leftPair == EOS) { break; }
396        leftPair >>= 16;
397        rightPair >>= 16;
398    }
399    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
400
401    leftIndex = rightIndex = 0;
402    leftPair = rightPair = 0;
403    for(;;) {
404        while(leftPair == 0) {
405            if(leftIndex == leftLength) {
406                leftPair = EOS;
407                break;
408            }
409            UChar32 c = left[leftIndex++];
410            leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
411            if(leftPair < MIN_LONG) {
412                leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
413            }
414            leftPair = getQuaternaries(variableTop, leftPair);
415        }
416
417        while(rightPair == 0) {
418            if(rightIndex == rightLength) {
419                rightPair = EOS;
420                break;
421            }
422            UChar32 c = right[rightIndex++];
423            rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
424            if(rightPair < MIN_LONG) {
425                rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
426            }
427            rightPair = getQuaternaries(variableTop, rightPair);
428        }
429
430        if(leftPair == rightPair) {
431            if(leftPair == EOS) { break; }
432            leftPair = rightPair = 0;
433            continue;
434        }
435        uint32_t leftQuaternary = leftPair & 0xffff;
436        uint32_t rightQuaternary = rightPair & 0xffff;
437        if(leftQuaternary != rightQuaternary) {
438            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
439        }
440        if(leftPair == EOS) { break; }
441        leftPair >>= 16;
442        rightPair >>= 16;
443    }
444    return UCOL_EQUAL;
445}
446
447int32_t
448CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
449                                 const uint8_t *left, int32_t leftLength,
450                                 const uint8_t *right, int32_t rightLength) {
451    // Keep compareUTF16() and compareUTF8() in sync very closely!
452
453    U_ASSERT((table[0] >> 8) == VERSION);
454    table += (table[0] & 0xff);  // skip the header
455    uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions()
456    options &= 0xffff;  // needed for CollationSettings::getStrength() to work
457
458    // Check for supported characters, fetch mini CEs, and compare primaries.
459    int32_t leftIndex = 0, rightIndex = 0;
460    /**
461     * Single mini CE or a pair.
462     * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
463     * If there is only one, then it is in the lower bits, and the upper bits are 0.
464     */
465    uint32_t leftPair = 0, rightPair = 0;
466    // Note: There is no need to assemble the code point.
467    // We only need to look up the table entry for the character,
468    // and nextPair() looks for whether c==0.
469    for(;;) {
470        // We fetch CEs until we get a non-ignorable primary or reach the end.
471        while(leftPair == 0) {
472            if(leftIndex == leftLength) {
473                leftPair = EOS;
474                break;
475            }
476            UChar32 c = left[leftIndex++];
477            uint8_t t;
478            if(c <= 0x7f) {
479                leftPair = primaries[c];
480                if(leftPair != 0) { break; }
481                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
482                    return BAIL_OUT_RESULT;
483                }
484                leftPair = table[c];
485            } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
486                    0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
487                ++leftIndex;
488                c = ((c - 0xc2) << 6) + t;
489                leftPair = primaries[c];
490                if(leftPair != 0) { break; }
491                leftPair = table[c];
492            } else {
493                leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
494            }
495            if(leftPair >= MIN_SHORT) {
496                leftPair &= SHORT_PRIMARY_MASK;
497                break;
498            } else if(leftPair > variableTop) {
499                leftPair &= LONG_PRIMARY_MASK;
500                break;
501            } else {
502                leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
503                if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
504                leftPair = getPrimaries(variableTop, leftPair);
505            }
506        }
507
508        while(rightPair == 0) {
509            if(rightIndex == rightLength) {
510                rightPair = EOS;
511                break;
512            }
513            UChar32 c = right[rightIndex++];
514            uint8_t t;
515            if(c <= 0x7f) {
516                rightPair = primaries[c];
517                if(rightPair != 0) { break; }
518                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
519                    return BAIL_OUT_RESULT;
520                }
521                rightPair = table[c];
522            } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
523                    0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
524                ++rightIndex;
525                c = ((c - 0xc2) << 6) + t;
526                rightPair = primaries[c];
527                if(rightPair != 0) { break; }
528                rightPair = table[c];
529            } else {
530                rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
531            }
532            if(rightPair >= MIN_SHORT) {
533                rightPair &= SHORT_PRIMARY_MASK;
534                break;
535            } else if(rightPair > variableTop) {
536                rightPair &= LONG_PRIMARY_MASK;
537                break;
538            } else {
539                rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
540                if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
541                rightPair = getPrimaries(variableTop, rightPair);
542            }
543        }
544
545        if(leftPair == rightPair) {
546            if(leftPair == EOS) { break; }
547            leftPair = rightPair = 0;
548            continue;
549        }
550        uint32_t leftPrimary = leftPair & 0xffff;
551        uint32_t rightPrimary = rightPair & 0xffff;
552        if(leftPrimary != rightPrimary) {
553            // Return the primary difference.
554            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
555        }
556        if(leftPair == EOS) { break; }
557        leftPair >>= 16;
558        rightPair >>= 16;
559    }
560    // In the following, we need to re-fetch each character because we did not buffer the CEs,
561    // but we know that the string is well-formed and
562    // only contains supported characters and mappings.
563
564    // We might skip the secondary level but continue with the case level
565    // which is turned on separately.
566    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
567        leftIndex = rightIndex = 0;
568        leftPair = rightPair = 0;
569        for(;;) {
570            while(leftPair == 0) {
571                if(leftIndex == leftLength) {
572                    leftPair = EOS;
573                    break;
574                }
575                UChar32 c = left[leftIndex++];
576                if(c <= 0x7f) {
577                    leftPair = table[c];
578                } else if(c <= LATIN_MAX_UTF8_LEAD) {
579                    leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
580                } else {
581                    leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
582                }
583                if(leftPair >= MIN_SHORT) {
584                    leftPair = getSecondariesFromOneShortCE(leftPair);
585                    break;
586                } else if(leftPair > variableTop) {
587                    leftPair = COMMON_SEC_PLUS_OFFSET;
588                    break;
589                } else {
590                    leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
591                    leftPair = getSecondaries(variableTop, leftPair);
592                }
593            }
594
595            while(rightPair == 0) {
596                if(rightIndex == rightLength) {
597                    rightPair = EOS;
598                    break;
599                }
600                UChar32 c = right[rightIndex++];
601                if(c <= 0x7f) {
602                    rightPair = table[c];
603                } else if(c <= LATIN_MAX_UTF8_LEAD) {
604                    rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
605                } else {
606                    rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
607                }
608                if(rightPair >= MIN_SHORT) {
609                    rightPair = getSecondariesFromOneShortCE(rightPair);
610                    break;
611                } else if(rightPair > variableTop) {
612                    rightPair = COMMON_SEC_PLUS_OFFSET;
613                    break;
614                } else {
615                    rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
616                    rightPair = getSecondaries(variableTop, rightPair);
617                }
618            }
619
620            if(leftPair == rightPair) {
621                if(leftPair == EOS) { break; }
622                leftPair = rightPair = 0;
623                continue;
624            }
625            uint32_t leftSecondary = leftPair & 0xffff;
626            uint32_t rightSecondary = rightPair & 0xffff;
627            if(leftSecondary != rightSecondary) {
628                if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
629                    // Full support for backwards secondary requires backwards contraction matching
630                    // and moving backwards between merge separators.
631                    return BAIL_OUT_RESULT;
632                }
633                return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
634            }
635            if(leftPair == EOS) { break; }
636            leftPair >>= 16;
637            rightPair >>= 16;
638        }
639    }
640
641    if((options & CollationSettings::CASE_LEVEL) != 0) {
642        UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
643        leftIndex = rightIndex = 0;
644        leftPair = rightPair = 0;
645        for(;;) {
646            while(leftPair == 0) {
647                if(leftIndex == leftLength) {
648                    leftPair = EOS;
649                    break;
650                }
651                UChar32 c = left[leftIndex++];
652                leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
653                if(leftPair < MIN_LONG) {
654                    leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
655                }
656                leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
657            }
658
659            while(rightPair == 0) {
660                if(rightIndex == rightLength) {
661                    rightPair = EOS;
662                    break;
663                }
664                UChar32 c = right[rightIndex++];
665                rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
666                if(rightPair < MIN_LONG) {
667                    rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
668                }
669                rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
670            }
671
672            if(leftPair == rightPair) {
673                if(leftPair == EOS) { break; }
674                leftPair = rightPair = 0;
675                continue;
676            }
677            uint32_t leftCase = leftPair & 0xffff;
678            uint32_t rightCase = rightPair & 0xffff;
679            if(leftCase != rightCase) {
680                if((options & CollationSettings::UPPER_FIRST) == 0) {
681                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
682                } else {
683                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
684                }
685            }
686            if(leftPair == EOS) { break; }
687            leftPair >>= 16;
688            rightPair >>= 16;
689        }
690    }
691    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
692
693    // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
694    UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
695
696    leftIndex = rightIndex = 0;
697    leftPair = rightPair = 0;
698    for(;;) {
699        while(leftPair == 0) {
700            if(leftIndex == leftLength) {
701                leftPair = EOS;
702                break;
703            }
704            UChar32 c = left[leftIndex++];
705            leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
706            if(leftPair < MIN_LONG) {
707                leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
708            }
709            leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
710        }
711
712        while(rightPair == 0) {
713            if(rightIndex == rightLength) {
714                rightPair = EOS;
715                break;
716            }
717            UChar32 c = right[rightIndex++];
718            rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
719            if(rightPair < MIN_LONG) {
720                rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
721            }
722            rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
723        }
724
725        if(leftPair == rightPair) {
726            if(leftPair == EOS) { break; }
727            leftPair = rightPair = 0;
728            continue;
729        }
730        uint32_t leftTertiary = leftPair & 0xffff;
731        uint32_t rightTertiary = rightPair & 0xffff;
732        if(leftTertiary != rightTertiary) {
733            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
734                // Pass through EOS and MERGE_WEIGHT
735                // and keep real tertiary weights larger than the MERGE_WEIGHT.
736                // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
737                if(leftTertiary > MERGE_WEIGHT) {
738                    leftTertiary ^= CASE_MASK;
739                }
740                if(rightTertiary > MERGE_WEIGHT) {
741                    rightTertiary ^= CASE_MASK;
742                }
743            }
744            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
745        }
746        if(leftPair == EOS) { break; }
747        leftPair >>= 16;
748        rightPair >>= 16;
749    }
750    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
751
752    leftIndex = rightIndex = 0;
753    leftPair = rightPair = 0;
754    for(;;) {
755        while(leftPair == 0) {
756            if(leftIndex == leftLength) {
757                leftPair = EOS;
758                break;
759            }
760            UChar32 c = left[leftIndex++];
761            leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
762            if(leftPair < MIN_LONG) {
763                leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
764            }
765            leftPair = getQuaternaries(variableTop, leftPair);
766        }
767
768        while(rightPair == 0) {
769            if(rightIndex == rightLength) {
770                rightPair = EOS;
771                break;
772            }
773            UChar32 c = right[rightIndex++];
774            rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
775            if(rightPair < MIN_LONG) {
776                rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
777            }
778            rightPair = getQuaternaries(variableTop, rightPair);
779        }
780
781        if(leftPair == rightPair) {
782            if(leftPair == EOS) { break; }
783            leftPair = rightPair = 0;
784            continue;
785        }
786        uint32_t leftQuaternary = leftPair & 0xffff;
787        uint32_t rightQuaternary = rightPair & 0xffff;
788        if(leftQuaternary != rightQuaternary) {
789            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
790        }
791        if(leftPair == EOS) { break; }
792        leftPair >>= 16;
793        rightPair >>= 16;
794    }
795    return UCOL_EQUAL;
796}
797
798uint32_t
799CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
800    U_ASSERT(c > LATIN_MAX);
801    if(PUNCT_START <= c && c < PUNCT_LIMIT) {
802        return table[c - PUNCT_START + LATIN_LIMIT];
803    } else if(c == 0xfffe) {
804        return MERGE_WEIGHT;
805    } else if(c == 0xffff) {
806        return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
807    } else {
808        return BAIL_OUT;
809    }
810}
811
812uint32_t
813CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
814                               const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
815    // The caller handled ASCII and valid/supported Latin.
816    U_ASSERT(c > 0x7f);
817    int32_t i2 = sIndex + 1;
818    if(i2 < sLength || sLength < 0) {
819        uint8_t t1 = s8[sIndex];
820        uint8_t t2 = s8[i2];
821        sIndex += 2;
822        if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
823            return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
824        } else if(c == 0xef && t1 == 0xbf) {
825            if(t2 == 0xbe) {
826                return MERGE_WEIGHT;  // U+FFFE
827            } else if(t2 == 0xbf) {
828                return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
829            }
830        }
831    }
832    return BAIL_OUT;
833}
834
835uint32_t
836CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
837                                     const uint8_t *s8, int32_t &sIndex) {
838    // The caller handled ASCII.
839    // The string is well-formed and contains only supported characters.
840    U_ASSERT(c > 0x7f);
841    if(c <= LATIN_MAX_UTF8_LEAD) {
842        return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
843    }
844    uint8_t t2 = s8[sIndex + 1];
845    sIndex += 2;
846    if(c == 0xe2) {
847        return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
848    } else if(t2 == 0xbe) {
849        return MERGE_WEIGHT;  // U+FFFE
850    } else {
851        return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
852    }
853}
854
855uint32_t
856CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
857                             const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
858    if(ce >= MIN_LONG || ce < CONTRACTION) {
859        return ce;  // simple or special mini CE
860    } else if(ce >= EXPANSION) {
861        int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
862        return ((uint32_t)table[index + 1] << 16) | table[index];
863    } else /* ce >= CONTRACTION */ {
864        if(c == 0 && sLength < 0) {
865            sLength = sIndex - 1;
866            return EOS;
867        }
868        // Contraction list: Default mapping followed by
869        // 0 or more single-character contraction suffix mappings.
870        int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
871        if(sIndex != sLength) {
872            // Read the next character.
873            int32_t c2;
874            int32_t nextIndex = sIndex;
875            if(s16 != NULL) {
876                c2 = s16[nextIndex++];
877                if(c2 > LATIN_MAX) {
878                    if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
879                        c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
880                    } else if(c2 == 0xfffe || c2 == 0xffff) {
881                        c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
882                    } else {
883                        return BAIL_OUT;
884                    }
885                }
886            } else {
887                c2 = s8[nextIndex++];
888                if(c2 > 0x7f) {
889                    uint8_t t;
890                    if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
891                            0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
892                        c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
893                        ++nextIndex;
894                    } else {
895                        int32_t i2 = nextIndex + 1;
896                        if(i2 < sLength || sLength < 0) {
897                            if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
898                                    0x80 <= (t = s8[i2]) && t <= 0xbf) {
899                                c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
900                            } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
901                                    ((t = s8[i2]) == 0xbe || t == 0xbf)) {
902                                c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
903                            } else {
904                                return BAIL_OUT;
905                            }
906                        } else {
907                            return BAIL_OUT;
908                        }
909                        nextIndex += 2;
910                    }
911                }
912            }
913            if(c2 == 0 && sLength < 0) {
914                sLength = sIndex;
915                c2 = -1;
916            }
917            // Look for the next character in the contraction suffix list,
918            // which is in ascending order of single suffix characters.
919            int32_t i = index;
920            int32_t head = table[i];  // first skip the default mapping
921            int32_t x;
922            do {
923                i += head >> CONTR_LENGTH_SHIFT;
924                head = table[i];
925                x = head & CONTR_CHAR_MASK;
926            } while(x < c2);
927            if(x == c2) {
928                index = i;
929                sIndex = nextIndex;
930            }
931        }
932        // Return the CE or CEs for the default or contraction mapping.
933        int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
934        if(length == 1) {
935            return BAIL_OUT;
936        }
937        ce = table[index + 1];
938        if(length == 2) {
939            return ce;
940        } else {
941            return ((uint32_t)table[index + 2] << 16) | ce;
942        }
943    }
944}
945
946uint32_t
947CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
948    if(pair <= 0xffff) {
949        // one mini CE
950        if(pair >= MIN_SHORT) {
951            pair = getSecondariesFromOneShortCE(pair);
952        } else if(pair > variableTop) {
953            pair = COMMON_SEC_PLUS_OFFSET;
954        } else if(pair >= MIN_LONG) {
955            pair = 0;  // variable
956        }
957        // else special mini CE
958    } else {
959        uint32_t ce = pair & 0xffff;
960        if(ce >= MIN_SHORT) {
961            pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
962        } else if(ce > variableTop) {
963            pair = TWO_COMMON_SEC_PLUS_OFFSET;
964        } else {
965            U_ASSERT(ce >= MIN_LONG);
966            pair = 0;  // variable
967        }
968    }
969    return pair;
970}
971
972uint32_t
973CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
974    // Primary+caseLevel: Ignore case level weights of primary ignorables.
975    // Otherwise: Ignore case level weights of secondary ignorables.
976    // For details see the comments in the CollationCompare class.
977    // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
978    if(pair <= 0xffff) {
979        // one mini CE
980        if(pair >= MIN_SHORT) {
981            // A high secondary weight means we really have two CEs,
982            // a primary CE and a secondary CE.
983            uint32_t ce = pair;
984            pair &= CASE_MASK;  // explicit weight of primary CE
985            if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
986                pair |= LOWER_CASE << 16;  // implied weight of secondary CE
987            }
988        } else if(pair > variableTop) {
989            pair = LOWER_CASE;
990        } else if(pair >= MIN_LONG) {
991            pair = 0;  // variable
992        }
993        // else special mini CE
994    } else {
995        // two mini CEs, same primary groups, neither expands like above
996        uint32_t ce = pair & 0xffff;
997        if(ce >= MIN_SHORT) {
998            if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
999                pair &= CASE_MASK;
1000            } else {
1001                pair &= TWO_CASES_MASK;
1002            }
1003        } else if(ce > variableTop) {
1004            pair = TWO_LOWER_CASES;
1005        } else {
1006            U_ASSERT(ce >= MIN_LONG);
1007            pair = 0;  // variable
1008        }
1009    }
1010    return pair;
1011}
1012
1013uint32_t
1014CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
1015    if(pair <= 0xffff) {
1016        // one mini CE
1017        if(pair >= MIN_SHORT) {
1018            // A high secondary weight means we really have two CEs,
1019            // a primary CE and a secondary CE.
1020            uint32_t ce = pair;
1021            if(withCaseBits) {
1022                pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
1023                if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1024                    pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
1025                }
1026            } else {
1027                pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1028                if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1029                    pair |= COMMON_TER_PLUS_OFFSET << 16;
1030                }
1031            }
1032        } else if(pair > variableTop) {
1033            pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1034            if(withCaseBits) {
1035                pair |= LOWER_CASE;
1036            }
1037        } else if(pair >= MIN_LONG) {
1038            pair = 0;  // variable
1039        }
1040        // else special mini CE
1041    } else {
1042        // two mini CEs, same primary groups, neither expands like above
1043        uint32_t ce = pair & 0xffff;
1044        if(ce >= MIN_SHORT) {
1045            if(withCaseBits) {
1046                pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
1047            } else {
1048                pair &= TWO_TERTIARIES_MASK;
1049            }
1050            pair += TWO_TER_OFFSETS;
1051        } else if(ce > variableTop) {
1052            pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
1053            if(withCaseBits) {
1054                pair |= TWO_LOWER_CASES;
1055            }
1056        } else {
1057            U_ASSERT(ce >= MIN_LONG);
1058            pair = 0;  // variable
1059        }
1060    }
1061    return pair;
1062}
1063
1064uint32_t
1065CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
1066    // Return the primary weight of a variable CE,
1067    // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1068    if(pair <= 0xffff) {
1069        // one mini CE
1070        if(pair >= MIN_SHORT) {
1071            // A high secondary weight means we really have two CEs,
1072            // a primary CE and a secondary CE.
1073            if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1074                pair = TWO_SHORT_PRIMARIES_MASK;
1075            } else {
1076                pair = SHORT_PRIMARY_MASK;
1077            }
1078        } else if(pair > variableTop) {
1079            pair = SHORT_PRIMARY_MASK;
1080        } else if(pair >= MIN_LONG) {
1081            pair &= LONG_PRIMARY_MASK;  // variable
1082        }
1083        // else special mini CE
1084    } else {
1085        // two mini CEs, same primary groups, neither expands like above
1086        uint32_t ce = pair & 0xffff;
1087        if(ce > variableTop) {
1088            pair = TWO_SHORT_PRIMARIES_MASK;
1089        } else {
1090            U_ASSERT(ce >= MIN_LONG);
1091            pair &= TWO_LONG_PRIMARIES_MASK;  // variable
1092        }
1093    }
1094    return pair;
1095}
1096
1097U_NAMESPACE_END
1098
1099#endif  // !UCONFIG_NO_COLLATION
1100