1/*
2*******************************************************************************
3* Copyright (C) 2013-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationfastlatin.cpp
7*
8* created on: 2013aug18
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/ucol.h"
17#include "collationdata.h"
18#include "collationfastlatin.h"
19#include "collationsettings.h"
20#include "putilimp.h"  // U_ALIGN_CODE
21#include "uassert.h"
22
23U_NAMESPACE_BEGIN
24
25int32_t
26CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
27                               uint16_t *primaries, int32_t capacity) {
28    const uint16_t *table = data->fastLatinTable;
29    if(table == NULL) { return -1; }
30    U_ASSERT(capacity == LATIN_LIMIT);
31    if(capacity != LATIN_LIMIT) { return -1; }
32
33    uint32_t miniVarTop;
34    if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
35        // No mini primaries are variable, set a variableTop just below the
36        // lowest long mini primary.
37        miniVarTop = MIN_LONG - 1;
38    } else {
39        uint32_t v1 = settings.variableTop >> 24;
40        int32_t headerLength = *table & 0xff;
41        int32_t i = headerLength - 1;
42        if(i <= 0 || v1 > (table[i] & 0x7f)) {
43            return -1;  // variableTop >= digits, should not occur
44        }
45        while(i > 1 && v1 <= (table[i - 1] & 0x7f)) { --i; }
46        // In the table header, the miniVarTop is in bits 15..7, with 4 zero bits 19..16 implied.
47        // Shift right to make it comparable with long mini primaries in bits 15..3.
48        miniVarTop = (table[i] & 0xff80) >> 4;
49    }
50
51    const uint8_t *reorderTable = settings.reorderTable;
52    if(reorderTable != NULL) {
53        const uint16_t *scripts = data->scripts;
54        int32_t length = data->scriptsLength;
55        uint32_t prevLastByte = 0;
56        for(int32_t i = 0; i < length;) {
57            // reordered last byte of the group
58            uint32_t lastByte = reorderTable[scripts[i] & 0xff];
59            if(lastByte < prevLastByte) {
60                // The permutation affects the groups up to Latin.
61                return -1;
62            }
63            if(scripts[i + 2] == USCRIPT_LATIN) { break; }
64            i = i + 2 + scripts[i + 1];
65            prevLastByte = lastByte;
66        }
67    }
68
69    table += (table[0] & 0xff);  // skip the header
70    for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
71        uint32_t p = table[c];
72        if(p >= MIN_SHORT) {
73            p &= SHORT_PRIMARY_MASK;
74        } else if(p > miniVarTop) {
75            p &= LONG_PRIMARY_MASK;
76        } else {
77            p = 0;
78        }
79        primaries[c] = (uint16_t)p;
80    }
81    if((settings.options & CollationSettings::NUMERIC) != 0) {
82        // Bail out for digits.
83        for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
84    }
85
86    // Shift the miniVarTop above other options.
87    return ((int32_t)miniVarTop << 16) | settings.options;
88}
89
90int32_t
91CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
92                                 const UChar *left, int32_t leftLength,
93                                 const UChar *right, int32_t rightLength) {
94    // This is a modified copy of CollationCompare::compareUpToQuaternary(),
95    // optimized for common Latin text.
96    // Keep them in sync!
97    // Keep compareUTF16() and compareUTF8() in sync very closely!
98
99    U_ASSERT((table[0] >> 8) == VERSION);
100    table += (table[0] & 0xff);  // skip the header
101    uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions()
102    options &= 0xffff;  // needed for CollationSettings::getStrength() to work
103
104    // Check for supported characters, fetch mini CEs, and compare primaries.
105    U_ALIGN_CODE(16);
106    int32_t leftIndex = 0, rightIndex = 0;
107    /**
108     * Single mini CE or a pair.
109     * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
110     * If there is only one, then it is in the lower bits, and the upper bits are 0.
111     */
112    uint32_t leftPair = 0, rightPair = 0;
113    for(;;) {
114        // We fetch CEs until we get a non-ignorable primary or reach the end.
115        while(leftPair == 0) {
116            if(leftIndex == leftLength) {
117                leftPair = EOS;
118                break;
119            }
120            UChar32 c = left[leftIndex++];
121            if(c <= LATIN_MAX) {
122                leftPair = primaries[c];
123                if(leftPair != 0) { break; }
124                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
125                    return BAIL_OUT_RESULT;
126                }
127                leftPair = table[c];
128            } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
129                leftPair = table[c - PUNCT_START + LATIN_LIMIT];
130            } else {
131                leftPair = lookup(table, c);
132            }
133            if(leftPair >= MIN_SHORT) {
134                leftPair &= SHORT_PRIMARY_MASK;
135                break;
136            } else if(leftPair > variableTop) {
137                leftPair &= LONG_PRIMARY_MASK;
138                break;
139            } else {
140                leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
141                if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
142                leftPair = getPrimaries(variableTop, leftPair);
143            }
144        }
145
146        while(rightPair == 0) {
147            if(rightIndex == rightLength) {
148                rightPair = EOS;
149                break;
150            }
151            UChar32 c = right[rightIndex++];
152            if(c <= LATIN_MAX) {
153                rightPair = primaries[c];
154                if(rightPair != 0) { break; }
155                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
156                    return BAIL_OUT_RESULT;
157                }
158                rightPair = table[c];
159            } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
160                rightPair = table[c - PUNCT_START + LATIN_LIMIT];
161            } else {
162                rightPair = lookup(table, c);
163            }
164            if(rightPair >= MIN_SHORT) {
165                rightPair &= SHORT_PRIMARY_MASK;
166                break;
167            } else if(rightPair > variableTop) {
168                rightPair &= LONG_PRIMARY_MASK;
169                break;
170            } else {
171                rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
172                if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
173                rightPair = getPrimaries(variableTop, rightPair);
174            }
175        }
176
177        if(leftPair == rightPair) {
178            if(leftPair == EOS) { break; }
179            leftPair = rightPair = 0;
180            continue;
181        }
182        uint32_t leftPrimary = leftPair & 0xffff;
183        uint32_t rightPrimary = rightPair & 0xffff;
184        if(leftPrimary != rightPrimary) {
185            // Return the primary difference.
186            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
187        }
188        if(leftPair == EOS) { break; }
189        leftPair >>= 16;
190        rightPair >>= 16;
191    }
192    // In the following, we need to re-fetch each character because we did not buffer the CEs,
193    // but we know that the string is well-formed and
194    // only contains supported characters and mappings.
195
196    // We might skip the secondary level but continue with the case level
197    // which is turned on separately.
198    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
199        leftIndex = rightIndex = 0;
200        leftPair = rightPair = 0;
201        for(;;) {
202            while(leftPair == 0) {
203                if(leftIndex == leftLength) {
204                    leftPair = EOS;
205                    break;
206                }
207                UChar32 c = left[leftIndex++];
208                if(c <= LATIN_MAX) {
209                    leftPair = table[c];
210                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
211                    leftPair = table[c - PUNCT_START + LATIN_LIMIT];
212                } else {
213                    leftPair = lookup(table, c);
214                }
215                if(leftPair >= MIN_SHORT) {
216                    leftPair = getSecondariesFromOneShortCE(leftPair);
217                    break;
218                } else if(leftPair > variableTop) {
219                    leftPair = COMMON_SEC_PLUS_OFFSET;
220                    break;
221                } else {
222                    leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
223                    leftPair = getSecondaries(variableTop, leftPair);
224                }
225            }
226
227            while(rightPair == 0) {
228                if(rightIndex == rightLength) {
229                    rightPair = EOS;
230                    break;
231                }
232                UChar32 c = right[rightIndex++];
233                if(c <= LATIN_MAX) {
234                    rightPair = table[c];
235                } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
236                    rightPair = table[c - PUNCT_START + LATIN_LIMIT];
237                } else {
238                    rightPair = lookup(table, c);
239                }
240                if(rightPair >= MIN_SHORT) {
241                    rightPair = getSecondariesFromOneShortCE(rightPair);
242                    break;
243                } else if(rightPair > variableTop) {
244                    rightPair = COMMON_SEC_PLUS_OFFSET;
245                    break;
246                } else {
247                    rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
248                    rightPair = getSecondaries(variableTop, rightPair);
249                }
250            }
251
252            if(leftPair == rightPair) {
253                if(leftPair == EOS) { break; }
254                leftPair = rightPair = 0;
255                continue;
256            }
257            uint32_t leftSecondary = leftPair & 0xffff;
258            uint32_t rightSecondary = rightPair & 0xffff;
259            if(leftSecondary != rightSecondary) {
260                if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
261                    // Full support for backwards secondary requires backwards contraction matching
262                    // and moving backwards between merge separators.
263                    return BAIL_OUT_RESULT;
264                }
265                return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
266            }
267            if(leftPair == EOS) { break; }
268            leftPair >>= 16;
269            rightPair >>= 16;
270        }
271    }
272
273    if((options & CollationSettings::CASE_LEVEL) != 0) {
274        UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
275        leftIndex = rightIndex = 0;
276        leftPair = rightPair = 0;
277        for(;;) {
278            while(leftPair == 0) {
279                if(leftIndex == leftLength) {
280                    leftPair = EOS;
281                    break;
282                }
283                UChar32 c = left[leftIndex++];
284                leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
285                if(leftPair < MIN_LONG) {
286                    leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
287                }
288                leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
289            }
290
291            while(rightPair == 0) {
292                if(rightIndex == rightLength) {
293                    rightPair = EOS;
294                    break;
295                }
296                UChar32 c = right[rightIndex++];
297                rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
298                if(rightPair < MIN_LONG) {
299                    rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
300                }
301                rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
302            }
303
304            if(leftPair == rightPair) {
305                if(leftPair == EOS) { break; }
306                leftPair = rightPair = 0;
307                continue;
308            }
309            uint32_t leftCase = leftPair & 0xffff;
310            uint32_t rightCase = rightPair & 0xffff;
311            if(leftCase != rightCase) {
312                if((options & CollationSettings::UPPER_FIRST) == 0) {
313                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
314                } else {
315                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
316                }
317            }
318            if(leftPair == EOS) { break; }
319            leftPair >>= 16;
320            rightPair >>= 16;
321        }
322    }
323    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
324
325    // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
326    UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
327
328    leftIndex = rightIndex = 0;
329    leftPair = rightPair = 0;
330    for(;;) {
331        while(leftPair == 0) {
332            if(leftIndex == leftLength) {
333                leftPair = EOS;
334                break;
335            }
336            UChar32 c = left[leftIndex++];
337            leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
338            if(leftPair < MIN_LONG) {
339                leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
340            }
341            leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
342        }
343
344        while(rightPair == 0) {
345            if(rightIndex == rightLength) {
346                rightPair = EOS;
347                break;
348            }
349            UChar32 c = right[rightIndex++];
350            rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
351            if(rightPair < MIN_LONG) {
352                rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
353            }
354            rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
355        }
356
357        if(leftPair == rightPair) {
358            if(leftPair == EOS) { break; }
359            leftPair = rightPair = 0;
360            continue;
361        }
362        uint32_t leftTertiary = leftPair & 0xffff;
363        uint32_t rightTertiary = rightPair & 0xffff;
364        if(leftTertiary != rightTertiary) {
365            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
366                // Pass through EOS and MERGE_WEIGHT
367                // and keep real tertiary weights larger than the MERGE_WEIGHT.
368                // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
369                if(leftTertiary > MERGE_WEIGHT) {
370                    leftTertiary ^= CASE_MASK;
371                }
372                if(rightTertiary > MERGE_WEIGHT) {
373                    rightTertiary ^= CASE_MASK;
374                }
375            }
376            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
377        }
378        if(leftPair == EOS) { break; }
379        leftPair >>= 16;
380        rightPair >>= 16;
381    }
382    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
383
384    leftIndex = rightIndex = 0;
385    leftPair = rightPair = 0;
386    for(;;) {
387        while(leftPair == 0) {
388            if(leftIndex == leftLength) {
389                leftPair = EOS;
390                break;
391            }
392            UChar32 c = left[leftIndex++];
393            leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
394            if(leftPair < MIN_LONG) {
395                leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
396            }
397            leftPair = getQuaternaries(variableTop, leftPair);
398        }
399
400        while(rightPair == 0) {
401            if(rightIndex == rightLength) {
402                rightPair = EOS;
403                break;
404            }
405            UChar32 c = right[rightIndex++];
406            rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
407            if(rightPair < MIN_LONG) {
408                rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
409            }
410            rightPair = getQuaternaries(variableTop, rightPair);
411        }
412
413        if(leftPair == rightPair) {
414            if(leftPair == EOS) { break; }
415            leftPair = rightPair = 0;
416            continue;
417        }
418        uint32_t leftQuaternary = leftPair & 0xffff;
419        uint32_t rightQuaternary = rightPair & 0xffff;
420        if(leftQuaternary != rightQuaternary) {
421            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
422        }
423        if(leftPair == EOS) { break; }
424        leftPair >>= 16;
425        rightPair >>= 16;
426    }
427    return UCOL_EQUAL;
428}
429
430int32_t
431CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
432                                 const uint8_t *left, int32_t leftLength,
433                                 const uint8_t *right, int32_t rightLength) {
434    // Keep compareUTF16() and compareUTF8() in sync very closely!
435
436    U_ASSERT((table[0] >> 8) == VERSION);
437    table += (table[0] & 0xff);  // skip the header
438    uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions()
439    options &= 0xffff;  // needed for CollationSettings::getStrength() to work
440
441    // Check for supported characters, fetch mini CEs, and compare primaries.
442    U_ALIGN_CODE(16);
443    int32_t leftIndex = 0, rightIndex = 0;
444    /**
445     * Single mini CE or a pair.
446     * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
447     * If there is only one, then it is in the lower bits, and the upper bits are 0.
448     */
449    uint32_t leftPair = 0, rightPair = 0;
450    // Note: There is no need to assemble the code point.
451    // We only need to look up the table entry for the character,
452    // and nextPair() looks for whether c==0.
453    for(;;) {
454        // We fetch CEs until we get a non-ignorable primary or reach the end.
455        while(leftPair == 0) {
456            if(leftIndex == leftLength) {
457                leftPair = EOS;
458                break;
459            }
460            UChar32 c = left[leftIndex++];
461            uint8_t t;
462            if(c <= 0x7f) {
463                leftPair = primaries[c];
464                if(leftPair != 0) { break; }
465                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
466                    return BAIL_OUT_RESULT;
467                }
468                leftPair = table[c];
469            } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
470                    0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
471                ++leftIndex;
472                c = ((c - 0xc2) << 6) + t;
473                leftPair = primaries[c];
474                if(leftPair != 0) { break; }
475                leftPair = table[c];
476            } else {
477                leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
478            }
479            if(leftPair >= MIN_SHORT) {
480                leftPair &= SHORT_PRIMARY_MASK;
481                break;
482            } else if(leftPair > variableTop) {
483                leftPair &= LONG_PRIMARY_MASK;
484                break;
485            } else {
486                leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
487                if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
488                leftPair = getPrimaries(variableTop, leftPair);
489            }
490        }
491
492        while(rightPair == 0) {
493            if(rightIndex == rightLength) {
494                rightPair = EOS;
495                break;
496            }
497            UChar32 c = right[rightIndex++];
498            uint8_t t;
499            if(c <= 0x7f) {
500                rightPair = primaries[c];
501                if(rightPair != 0) { break; }
502                if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
503                    return BAIL_OUT_RESULT;
504                }
505                rightPair = table[c];
506            } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
507                    0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
508                ++rightIndex;
509                c = ((c - 0xc2) << 6) + t;
510                rightPair = primaries[c];
511                if(rightPair != 0) { break; }
512                rightPair = table[c];
513            } else {
514                rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
515            }
516            if(rightPair >= MIN_SHORT) {
517                rightPair &= SHORT_PRIMARY_MASK;
518                break;
519            } else if(rightPair > variableTop) {
520                rightPair &= LONG_PRIMARY_MASK;
521                break;
522            } else {
523                rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
524                if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
525                rightPair = getPrimaries(variableTop, rightPair);
526            }
527        }
528
529        if(leftPair == rightPair) {
530            if(leftPair == EOS) { break; }
531            leftPair = rightPair = 0;
532            continue;
533        }
534        uint32_t leftPrimary = leftPair & 0xffff;
535        uint32_t rightPrimary = rightPair & 0xffff;
536        if(leftPrimary != rightPrimary) {
537            // Return the primary difference.
538            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
539        }
540        if(leftPair == EOS) { break; }
541        leftPair >>= 16;
542        rightPair >>= 16;
543    }
544    // In the following, we need to re-fetch each character because we did not buffer the CEs,
545    // but we know that the string is well-formed and
546    // only contains supported characters and mappings.
547
548    // We might skip the secondary level but continue with the case level
549    // which is turned on separately.
550    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
551        leftIndex = rightIndex = 0;
552        leftPair = rightPair = 0;
553        for(;;) {
554            while(leftPair == 0) {
555                if(leftIndex == leftLength) {
556                    leftPair = EOS;
557                    break;
558                }
559                UChar32 c = left[leftIndex++];
560                if(c <= 0x7f) {
561                    leftPair = table[c];
562                } else if(c <= LATIN_MAX_UTF8_LEAD) {
563                    leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
564                } else {
565                    leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
566                }
567                if(leftPair >= MIN_SHORT) {
568                    leftPair = getSecondariesFromOneShortCE(leftPair);
569                    break;
570                } else if(leftPair > variableTop) {
571                    leftPair = COMMON_SEC_PLUS_OFFSET;
572                    break;
573                } else {
574                    leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
575                    leftPair = getSecondaries(variableTop, leftPair);
576                }
577            }
578
579            while(rightPair == 0) {
580                if(rightIndex == rightLength) {
581                    rightPair = EOS;
582                    break;
583                }
584                UChar32 c = right[rightIndex++];
585                if(c <= 0x7f) {
586                    rightPair = table[c];
587                } else if(c <= LATIN_MAX_UTF8_LEAD) {
588                    rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
589                } else {
590                    rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
591                }
592                if(rightPair >= MIN_SHORT) {
593                    rightPair = getSecondariesFromOneShortCE(rightPair);
594                    break;
595                } else if(rightPair > variableTop) {
596                    rightPair = COMMON_SEC_PLUS_OFFSET;
597                    break;
598                } else {
599                    rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
600                    rightPair = getSecondaries(variableTop, rightPair);
601                }
602            }
603
604            if(leftPair == rightPair) {
605                if(leftPair == EOS) { break; }
606                leftPair = rightPair = 0;
607                continue;
608            }
609            uint32_t leftSecondary = leftPair & 0xffff;
610            uint32_t rightSecondary = rightPair & 0xffff;
611            if(leftSecondary != rightSecondary) {
612                if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
613                    // Full support for backwards secondary requires backwards contraction matching
614                    // and moving backwards between merge separators.
615                    return BAIL_OUT_RESULT;
616                }
617                return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
618            }
619            if(leftPair == EOS) { break; }
620            leftPair >>= 16;
621            rightPair >>= 16;
622        }
623    }
624
625    if((options & CollationSettings::CASE_LEVEL) != 0) {
626        UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
627        leftIndex = rightIndex = 0;
628        leftPair = rightPair = 0;
629        for(;;) {
630            while(leftPair == 0) {
631                if(leftIndex == leftLength) {
632                    leftPair = EOS;
633                    break;
634                }
635                UChar32 c = left[leftIndex++];
636                leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
637                if(leftPair < MIN_LONG) {
638                    leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
639                }
640                leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
641            }
642
643            while(rightPair == 0) {
644                if(rightIndex == rightLength) {
645                    rightPair = EOS;
646                    break;
647                }
648                UChar32 c = right[rightIndex++];
649                rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
650                if(rightPair < MIN_LONG) {
651                    rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
652                }
653                rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
654            }
655
656            if(leftPair == rightPair) {
657                if(leftPair == EOS) { break; }
658                leftPair = rightPair = 0;
659                continue;
660            }
661            uint32_t leftCase = leftPair & 0xffff;
662            uint32_t rightCase = rightPair & 0xffff;
663            if(leftCase != rightCase) {
664                if((options & CollationSettings::UPPER_FIRST) == 0) {
665                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
666                } else {
667                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
668                }
669            }
670            if(leftPair == EOS) { break; }
671            leftPair >>= 16;
672            rightPair >>= 16;
673        }
674    }
675    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
676
677    // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
678    UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
679
680    leftIndex = rightIndex = 0;
681    leftPair = rightPair = 0;
682    for(;;) {
683        while(leftPair == 0) {
684            if(leftIndex == leftLength) {
685                leftPair = EOS;
686                break;
687            }
688            UChar32 c = left[leftIndex++];
689            leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
690            if(leftPair < MIN_LONG) {
691                leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
692            }
693            leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
694        }
695
696        while(rightPair == 0) {
697            if(rightIndex == rightLength) {
698                rightPair = EOS;
699                break;
700            }
701            UChar32 c = right[rightIndex++];
702            rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
703            if(rightPair < MIN_LONG) {
704                rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
705            }
706            rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
707        }
708
709        if(leftPair == rightPair) {
710            if(leftPair == EOS) { break; }
711            leftPair = rightPair = 0;
712            continue;
713        }
714        uint32_t leftTertiary = leftPair & 0xffff;
715        uint32_t rightTertiary = rightPair & 0xffff;
716        if(leftTertiary != rightTertiary) {
717            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
718                // Pass through EOS and MERGE_WEIGHT
719                // and keep real tertiary weights larger than the MERGE_WEIGHT.
720                // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
721                if(leftTertiary > MERGE_WEIGHT) {
722                    leftTertiary ^= CASE_MASK;
723                }
724                if(rightTertiary > MERGE_WEIGHT) {
725                    rightTertiary ^= CASE_MASK;
726                }
727            }
728            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
729        }
730        if(leftPair == EOS) { break; }
731        leftPair >>= 16;
732        rightPair >>= 16;
733    }
734    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
735
736    leftIndex = rightIndex = 0;
737    leftPair = rightPair = 0;
738    for(;;) {
739        while(leftPair == 0) {
740            if(leftIndex == leftLength) {
741                leftPair = EOS;
742                break;
743            }
744            UChar32 c = left[leftIndex++];
745            leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
746            if(leftPair < MIN_LONG) {
747                leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
748            }
749            leftPair = getQuaternaries(variableTop, leftPair);
750        }
751
752        while(rightPair == 0) {
753            if(rightIndex == rightLength) {
754                rightPair = EOS;
755                break;
756            }
757            UChar32 c = right[rightIndex++];
758            rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
759            if(rightPair < MIN_LONG) {
760                rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
761            }
762            rightPair = getQuaternaries(variableTop, rightPair);
763        }
764
765        if(leftPair == rightPair) {
766            if(leftPair == EOS) { break; }
767            leftPair = rightPair = 0;
768            continue;
769        }
770        uint32_t leftQuaternary = leftPair & 0xffff;
771        uint32_t rightQuaternary = rightPair & 0xffff;
772        if(leftQuaternary != rightQuaternary) {
773            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
774        }
775        if(leftPair == EOS) { break; }
776        leftPair >>= 16;
777        rightPair >>= 16;
778    }
779    return UCOL_EQUAL;
780}
781
782uint32_t
783CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
784    U_ASSERT(c > LATIN_MAX);
785    if(PUNCT_START <= c && c < PUNCT_LIMIT) {
786        return table[c - PUNCT_START + LATIN_LIMIT];
787    } else if(c == 0xfffe) {
788        return MERGE_WEIGHT;
789    } else if(c == 0xffff) {
790        return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
791    } else {
792        return BAIL_OUT;
793    }
794}
795
796uint32_t
797CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
798                               const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
799    // The caller handled ASCII and valid/supported Latin.
800    U_ASSERT(c > 0x7f);
801    int32_t i2 = sIndex + 1;
802    if(i2 < sLength || sLength < 0) {
803        uint8_t t1 = s8[sIndex];
804        uint8_t t2 = s8[i2];
805        sIndex += 2;
806        if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
807            return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
808        } else if(c == 0xef && t1 == 0xbf) {
809            if(t2 == 0xbe) {
810                return MERGE_WEIGHT;  // U+FFFE
811            } else if(t2 == 0xbf) {
812                return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
813            }
814        }
815    }
816    return BAIL_OUT;
817}
818
819uint32_t
820CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
821                                     const uint8_t *s8, int32_t &sIndex) {
822    // The caller handled ASCII.
823    // The string is well-formed and contains only supported characters.
824    U_ASSERT(c > 0x7f);
825    if(c <= LATIN_MAX_UTF8_LEAD) {
826        return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
827    }
828    uint8_t t2 = s8[sIndex + 1];
829    sIndex += 2;
830    if(c == 0xe2) {
831        return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
832    } else if(t2 == 0xbe) {
833        return MERGE_WEIGHT;  // U+FFFE
834    } else {
835        return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
836    }
837}
838
839uint32_t
840CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
841                             const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
842    if(ce >= MIN_LONG || ce < CONTRACTION) {
843        return ce;  // simple or special mini CE
844    } else if(ce >= EXPANSION) {
845        int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
846        return ((uint32_t)table[index + 1] << 16) | table[index];
847    } else /* ce >= CONTRACTION */ {
848        if(c == 0 && sLength < 0) {
849            sLength = sIndex - 1;
850            return EOS;
851        }
852        // Contraction list: Default mapping followed by
853        // 0 or more single-character contraction suffix mappings.
854        int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
855        if(sIndex != sLength) {
856            // Read the next character.
857            int32_t c2;
858            int32_t nextIndex = sIndex;
859            if(s16 != NULL) {
860                c2 = s16[nextIndex++];
861                if(c2 > LATIN_MAX) {
862                    if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
863                        c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
864                    } else if(c2 == 0xfffe || c2 == 0xffff) {
865                        c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
866                    } else {
867                        return BAIL_OUT;
868                    }
869                }
870            } else {
871                c2 = s8[nextIndex++];
872                if(c2 > 0x7f) {
873                    uint8_t t;
874                    if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
875                            0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
876                        c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
877                        ++nextIndex;
878                    } else {
879                        int32_t i2 = nextIndex + 1;
880                        if(i2 < sLength || sLength < 0) {
881                            if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
882                                    0x80 <= (t = s8[i2]) && t <= 0xbf) {
883                                c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
884                            } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
885                                    ((t = s8[i2]) == 0xbe || t == 0xbf)) {
886                                c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
887                            } else {
888                                return BAIL_OUT;
889                            }
890                        } else {
891                            return BAIL_OUT;
892                        }
893                        nextIndex += 2;
894                    }
895                }
896            }
897            if(c2 == 0 && sLength < 0) {
898                sLength = sIndex;
899                c2 = -1;
900            }
901            // Look for the next character in the contraction suffix list,
902            // which is in ascending order of single suffix characters.
903            int32_t i = index;
904            int32_t head = table[i];  // first skip the default mapping
905            int32_t x;
906            do {
907                i += head >> CONTR_LENGTH_SHIFT;
908                head = table[i];
909                x = head & CONTR_CHAR_MASK;
910            } while(x < c2);
911            if(x == c2) {
912                index = i;
913                sIndex = nextIndex;
914            }
915        }
916        // Return the CE or CEs for the default or contraction mapping.
917        int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
918        if(length == 1) {
919            return BAIL_OUT;
920        }
921        ce = table[index + 1];
922        if(length == 2) {
923            return ce;
924        } else {
925            return ((uint32_t)table[index + 2] << 16) | ce;
926        }
927    }
928}
929
930uint32_t
931CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
932    if(pair <= 0xffff) {
933        // one mini CE
934        if(pair >= MIN_SHORT) {
935            pair = getSecondariesFromOneShortCE(pair);
936        } else if(pair > variableTop) {
937            pair = COMMON_SEC_PLUS_OFFSET;
938        } else if(pair >= MIN_LONG) {
939            pair = 0;  // variable
940        }
941        // else special mini CE
942    } else {
943        uint32_t ce = pair & 0xffff;
944        if(ce >= MIN_SHORT) {
945            pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
946        } else if(ce > variableTop) {
947            pair = TWO_COMMON_SEC_PLUS_OFFSET;
948        } else {
949            U_ASSERT(ce >= MIN_LONG);
950            pair = 0;  // variable
951        }
952    }
953    return pair;
954}
955
956uint32_t
957CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
958    // Primary+caseLevel: Ignore case level weights of primary ignorables.
959    // Otherwise: Ignore case level weights of secondary ignorables.
960    // For details see the comments in the CollationCompare class.
961    // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
962    if(pair <= 0xffff) {
963        // one mini CE
964        if(pair >= MIN_SHORT) {
965            // A high secondary weight means we really have two CEs,
966            // a primary CE and a secondary CE.
967            uint32_t ce = pair;
968            pair &= CASE_MASK;  // explicit weight of primary CE
969            if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
970                pair |= LOWER_CASE << 16;  // implied weight of secondary CE
971            }
972        } else if(pair > variableTop) {
973            pair = LOWER_CASE;
974        } else if(pair >= MIN_LONG) {
975            pair = 0;  // variable
976        }
977        // else special mini CE
978    } else {
979        // two mini CEs, same primary groups, neither expands like above
980        uint32_t ce = pair & 0xffff;
981        if(ce >= MIN_SHORT) {
982            if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
983                pair &= CASE_MASK;
984            } else {
985                pair &= TWO_CASES_MASK;
986            }
987        } else if(ce > variableTop) {
988            pair = TWO_LOWER_CASES;
989        } else {
990            U_ASSERT(ce >= MIN_LONG);
991            pair = 0;  // variable
992        }
993    }
994    return pair;
995}
996
997uint32_t
998CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
999    if(pair <= 0xffff) {
1000        // one mini CE
1001        if(pair >= MIN_SHORT) {
1002            // A high secondary weight means we really have two CEs,
1003            // a primary CE and a secondary CE.
1004            uint32_t ce = pair;
1005            if(withCaseBits) {
1006                pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
1007                if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1008                    pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
1009                }
1010            } else {
1011                pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1012                if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1013                    pair |= COMMON_TER_PLUS_OFFSET << 16;
1014                }
1015            }
1016        } else if(pair > variableTop) {
1017            pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1018            if(withCaseBits) {
1019                pair |= LOWER_CASE;
1020            }
1021        } else if(pair >= MIN_LONG) {
1022            pair = 0;  // variable
1023        }
1024        // else special mini CE
1025    } else {
1026        // two mini CEs, same primary groups, neither expands like above
1027        uint32_t ce = pair & 0xffff;
1028        if(ce >= MIN_SHORT) {
1029            if(withCaseBits) {
1030                pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
1031            } else {
1032                pair &= TWO_TERTIARIES_MASK;
1033            }
1034            pair += TWO_TER_OFFSETS;
1035        } else if(ce > variableTop) {
1036            pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
1037            if(withCaseBits) {
1038                pair |= TWO_LOWER_CASES;
1039            }
1040        } else {
1041            U_ASSERT(ce >= MIN_LONG);
1042            pair = 0;  // variable
1043        }
1044    }
1045    return pair;
1046}
1047
1048uint32_t
1049CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
1050    // Return the primary weight of a variable CE,
1051    // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1052    if(pair <= 0xffff) {
1053        // one mini CE
1054        if(pair >= MIN_SHORT) {
1055            // A high secondary weight means we really have two CEs,
1056            // a primary CE and a secondary CE.
1057            if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1058                pair = TWO_SHORT_PRIMARIES_MASK;
1059            } else {
1060                pair = SHORT_PRIMARY_MASK;
1061            }
1062        } else if(pair > variableTop) {
1063            pair = SHORT_PRIMARY_MASK;
1064        } else if(pair >= MIN_LONG) {
1065            pair &= LONG_PRIMARY_MASK;  // variable
1066        }
1067        // else special mini CE
1068    } else {
1069        // two mini CEs, same primary groups, neither expands like above
1070        uint32_t ce = pair & 0xffff;
1071        if(ce > variableTop) {
1072            pair = TWO_SHORT_PRIMARIES_MASK;
1073        } else {
1074            U_ASSERT(ce >= MIN_LONG);
1075            pair &= TWO_LONG_PRIMARIES_MASK;  // variable
1076        }
1077    }
1078    return pair;
1079}
1080
1081U_NAMESPACE_END
1082
1083#endif  // !UCONFIG_NO_COLLATION
1084