1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* Copyright (C) 2013-2015, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* collationfastlatin.cpp 9* 10* created on: 2013aug18 11* created by: Markus W. Scherer 12*/ 13 14#include "unicode/utypes.h" 15 16#if !UCONFIG_NO_COLLATION 17 18#include "unicode/ucol.h" 19#include "collationdata.h" 20#include "collationfastlatin.h" 21#include "collationsettings.h" 22#include "uassert.h" 23 24U_NAMESPACE_BEGIN 25 26int32_t 27CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, 28 uint16_t *primaries, int32_t capacity) { 29 const uint16_t *table = data->fastLatinTable; 30 if(table == NULL) { return -1; } 31 U_ASSERT(capacity == LATIN_LIMIT); 32 if(capacity != LATIN_LIMIT) { return -1; } 33 34 uint32_t miniVarTop; 35 if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { 36 // No mini primaries are variable, set a variableTop just below the 37 // lowest long mini primary. 38 miniVarTop = MIN_LONG - 1; 39 } else { 40 int32_t headerLength = *table & 0xff; 41 int32_t i = 1 + settings.getMaxVariable(); 42 if(i >= headerLength) { 43 return -1; // variableTop >= digits, should not occur 44 } 45 miniVarTop = table[i]; 46 } 47 48 UBool digitsAreReordered = FALSE; 49 if(settings.hasReordering()) { 50 uint32_t prevStart = 0; 51 uint32_t beforeDigitStart = 0; 52 uint32_t digitStart = 0; 53 uint32_t afterDigitStart = 0; 54 for(int32_t group = UCOL_REORDER_CODE_FIRST; 55 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; 56 ++group) { 57 uint32_t start = data->getFirstPrimaryForGroup(group); 58 start = settings.reorder(start); 59 if(group == UCOL_REORDER_CODE_DIGIT) { 60 beforeDigitStart = prevStart; 61 digitStart = start; 62 } else if(start != 0) { 63 if(start < prevStart) { 64 // The permutation affects the groups up to Latin. 65 return -1; 66 } 67 // In the future, there might be a special group between digits & Latin. 68 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { 69 afterDigitStart = start; 70 } 71 prevStart = start; 72 } 73 } 74 uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); 75 latinStart = settings.reorder(latinStart); 76 if(latinStart < prevStart) { 77 return -1; 78 } 79 if(afterDigitStart == 0) { 80 afterDigitStart = latinStart; 81 } 82 if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { 83 digitsAreReordered = TRUE; 84 } 85 } 86 87 table += (table[0] & 0xff); // skip the header 88 for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { 89 uint32_t p = table[c]; 90 if(p >= MIN_SHORT) { 91 p &= SHORT_PRIMARY_MASK; 92 } else if(p > miniVarTop) { 93 p &= LONG_PRIMARY_MASK; 94 } else { 95 p = 0; 96 } 97 primaries[c] = (uint16_t)p; 98 } 99 if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { 100 // Bail out for digits. 101 for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } 102 } 103 104 // Shift the miniVarTop above other options. 105 return ((int32_t)miniVarTop << 16) | settings.options; 106} 107 108int32_t 109CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, 110 const UChar *left, int32_t leftLength, 111 const UChar *right, int32_t rightLength) { 112 // This is a modified copy of CollationCompare::compareUpToQuaternary(), 113 // optimized for common Latin text. 114 // Keep them in sync! 115 // Keep compareUTF16() and compareUTF8() in sync very closely! 116 117 U_ASSERT((table[0] >> 8) == VERSION); 118 table += (table[0] & 0xff); // skip the header 119 uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() 120 options &= 0xffff; // needed for CollationSettings::getStrength() to work 121 122 // Check for supported characters, fetch mini CEs, and compare primaries. 123 int32_t leftIndex = 0, rightIndex = 0; 124 /** 125 * Single mini CE or a pair. 126 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 127 * If there is only one, then it is in the lower bits, and the upper bits are 0. 128 */ 129 uint32_t leftPair = 0, rightPair = 0; 130 for(;;) { 131 // We fetch CEs until we get a non-ignorable primary or reach the end. 132 while(leftPair == 0) { 133 if(leftIndex == leftLength) { 134 leftPair = EOS; 135 break; 136 } 137 UChar32 c = left[leftIndex++]; 138 if(c <= LATIN_MAX) { 139 leftPair = primaries[c]; 140 if(leftPair != 0) { break; } 141 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 142 return BAIL_OUT_RESULT; 143 } 144 leftPair = table[c]; 145 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 146 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 147 } else { 148 leftPair = lookup(table, c); 149 } 150 if(leftPair >= MIN_SHORT) { 151 leftPair &= SHORT_PRIMARY_MASK; 152 break; 153 } else if(leftPair > variableTop) { 154 leftPair &= LONG_PRIMARY_MASK; 155 break; 156 } else { 157 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 158 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 159 leftPair = getPrimaries(variableTop, leftPair); 160 } 161 } 162 163 while(rightPair == 0) { 164 if(rightIndex == rightLength) { 165 rightPair = EOS; 166 break; 167 } 168 UChar32 c = right[rightIndex++]; 169 if(c <= LATIN_MAX) { 170 rightPair = primaries[c]; 171 if(rightPair != 0) { break; } 172 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 173 return BAIL_OUT_RESULT; 174 } 175 rightPair = table[c]; 176 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 177 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 178 } else { 179 rightPair = lookup(table, c); 180 } 181 if(rightPair >= MIN_SHORT) { 182 rightPair &= SHORT_PRIMARY_MASK; 183 break; 184 } else if(rightPair > variableTop) { 185 rightPair &= LONG_PRIMARY_MASK; 186 break; 187 } else { 188 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 189 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 190 rightPair = getPrimaries(variableTop, rightPair); 191 } 192 } 193 194 if(leftPair == rightPair) { 195 if(leftPair == EOS) { break; } 196 leftPair = rightPair = 0; 197 continue; 198 } 199 uint32_t leftPrimary = leftPair & 0xffff; 200 uint32_t rightPrimary = rightPair & 0xffff; 201 if(leftPrimary != rightPrimary) { 202 // Return the primary difference. 203 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 204 } 205 if(leftPair == EOS) { break; } 206 leftPair >>= 16; 207 rightPair >>= 16; 208 } 209 // In the following, we need to re-fetch each character because we did not buffer the CEs, 210 // but we know that the string is well-formed and 211 // only contains supported characters and mappings. 212 213 // We might skip the secondary level but continue with the case level 214 // which is turned on separately. 215 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 216 leftIndex = rightIndex = 0; 217 leftPair = rightPair = 0; 218 for(;;) { 219 while(leftPair == 0) { 220 if(leftIndex == leftLength) { 221 leftPair = EOS; 222 break; 223 } 224 UChar32 c = left[leftIndex++]; 225 if(c <= LATIN_MAX) { 226 leftPair = table[c]; 227 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 228 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 229 } else { 230 leftPair = lookup(table, c); 231 } 232 if(leftPair >= MIN_SHORT) { 233 leftPair = getSecondariesFromOneShortCE(leftPair); 234 break; 235 } else if(leftPair > variableTop) { 236 leftPair = COMMON_SEC_PLUS_OFFSET; 237 break; 238 } else { 239 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 240 leftPair = getSecondaries(variableTop, leftPair); 241 } 242 } 243 244 while(rightPair == 0) { 245 if(rightIndex == rightLength) { 246 rightPair = EOS; 247 break; 248 } 249 UChar32 c = right[rightIndex++]; 250 if(c <= LATIN_MAX) { 251 rightPair = table[c]; 252 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 253 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 254 } else { 255 rightPair = lookup(table, c); 256 } 257 if(rightPair >= MIN_SHORT) { 258 rightPair = getSecondariesFromOneShortCE(rightPair); 259 break; 260 } else if(rightPair > variableTop) { 261 rightPair = COMMON_SEC_PLUS_OFFSET; 262 break; 263 } else { 264 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 265 rightPair = getSecondaries(variableTop, rightPair); 266 } 267 } 268 269 if(leftPair == rightPair) { 270 if(leftPair == EOS) { break; } 271 leftPair = rightPair = 0; 272 continue; 273 } 274 uint32_t leftSecondary = leftPair & 0xffff; 275 uint32_t rightSecondary = rightPair & 0xffff; 276 if(leftSecondary != rightSecondary) { 277 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { 278 // Full support for backwards secondary requires backwards contraction matching 279 // and moving backwards between merge separators. 280 return BAIL_OUT_RESULT; 281 } 282 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 283 } 284 if(leftPair == EOS) { break; } 285 leftPair >>= 16; 286 rightPair >>= 16; 287 } 288 } 289 290 if((options & CollationSettings::CASE_LEVEL) != 0) { 291 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; 292 leftIndex = rightIndex = 0; 293 leftPair = rightPair = 0; 294 for(;;) { 295 while(leftPair == 0) { 296 if(leftIndex == leftLength) { 297 leftPair = EOS; 298 break; 299 } 300 UChar32 c = left[leftIndex++]; 301 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 302 if(leftPair < MIN_LONG) { 303 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 304 } 305 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 306 } 307 308 while(rightPair == 0) { 309 if(rightIndex == rightLength) { 310 rightPair = EOS; 311 break; 312 } 313 UChar32 c = right[rightIndex++]; 314 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 315 if(rightPair < MIN_LONG) { 316 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 317 } 318 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 319 } 320 321 if(leftPair == rightPair) { 322 if(leftPair == EOS) { break; } 323 leftPair = rightPair = 0; 324 continue; 325 } 326 uint32_t leftCase = leftPair & 0xffff; 327 uint32_t rightCase = rightPair & 0xffff; 328 if(leftCase != rightCase) { 329 if((options & CollationSettings::UPPER_FIRST) == 0) { 330 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 331 } else { 332 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 333 } 334 } 335 if(leftPair == EOS) { break; } 336 leftPair >>= 16; 337 rightPair >>= 16; 338 } 339 } 340 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 341 342 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 343 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); 344 345 leftIndex = rightIndex = 0; 346 leftPair = rightPair = 0; 347 for(;;) { 348 while(leftPair == 0) { 349 if(leftIndex == leftLength) { 350 leftPair = EOS; 351 break; 352 } 353 UChar32 c = left[leftIndex++]; 354 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 355 if(leftPair < MIN_LONG) { 356 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 357 } 358 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 359 } 360 361 while(rightPair == 0) { 362 if(rightIndex == rightLength) { 363 rightPair = EOS; 364 break; 365 } 366 UChar32 c = right[rightIndex++]; 367 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 368 if(rightPair < MIN_LONG) { 369 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 370 } 371 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 372 } 373 374 if(leftPair == rightPair) { 375 if(leftPair == EOS) { break; } 376 leftPair = rightPair = 0; 377 continue; 378 } 379 uint32_t leftTertiary = leftPair & 0xffff; 380 uint32_t rightTertiary = rightPair & 0xffff; 381 if(leftTertiary != rightTertiary) { 382 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 383 // Pass through EOS and MERGE_WEIGHT 384 // and keep real tertiary weights larger than the MERGE_WEIGHT. 385 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 386 if(leftTertiary > MERGE_WEIGHT) { 387 leftTertiary ^= CASE_MASK; 388 } 389 if(rightTertiary > MERGE_WEIGHT) { 390 rightTertiary ^= CASE_MASK; 391 } 392 } 393 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 394 } 395 if(leftPair == EOS) { break; } 396 leftPair >>= 16; 397 rightPair >>= 16; 398 } 399 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 400 401 leftIndex = rightIndex = 0; 402 leftPair = rightPair = 0; 403 for(;;) { 404 while(leftPair == 0) { 405 if(leftIndex == leftLength) { 406 leftPair = EOS; 407 break; 408 } 409 UChar32 c = left[leftIndex++]; 410 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 411 if(leftPair < MIN_LONG) { 412 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); 413 } 414 leftPair = getQuaternaries(variableTop, leftPair); 415 } 416 417 while(rightPair == 0) { 418 if(rightIndex == rightLength) { 419 rightPair = EOS; 420 break; 421 } 422 UChar32 c = right[rightIndex++]; 423 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 424 if(rightPair < MIN_LONG) { 425 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); 426 } 427 rightPair = getQuaternaries(variableTop, rightPair); 428 } 429 430 if(leftPair == rightPair) { 431 if(leftPair == EOS) { break; } 432 leftPair = rightPair = 0; 433 continue; 434 } 435 uint32_t leftQuaternary = leftPair & 0xffff; 436 uint32_t rightQuaternary = rightPair & 0xffff; 437 if(leftQuaternary != rightQuaternary) { 438 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 439 } 440 if(leftPair == EOS) { break; } 441 leftPair >>= 16; 442 rightPair >>= 16; 443 } 444 return UCOL_EQUAL; 445} 446 447int32_t 448CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, 449 const uint8_t *left, int32_t leftLength, 450 const uint8_t *right, int32_t rightLength) { 451 // Keep compareUTF16() and compareUTF8() in sync very closely! 452 453 U_ASSERT((table[0] >> 8) == VERSION); 454 table += (table[0] & 0xff); // skip the header 455 uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions() 456 options &= 0xffff; // needed for CollationSettings::getStrength() to work 457 458 // Check for supported characters, fetch mini CEs, and compare primaries. 459 int32_t leftIndex = 0, rightIndex = 0; 460 /** 461 * Single mini CE or a pair. 462 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 463 * If there is only one, then it is in the lower bits, and the upper bits are 0. 464 */ 465 uint32_t leftPair = 0, rightPair = 0; 466 // Note: There is no need to assemble the code point. 467 // We only need to look up the table entry for the character, 468 // and nextPair() looks for whether c==0. 469 for(;;) { 470 // We fetch CEs until we get a non-ignorable primary or reach the end. 471 while(leftPair == 0) { 472 if(leftIndex == leftLength) { 473 leftPair = EOS; 474 break; 475 } 476 UChar32 c = left[leftIndex++]; 477 uint8_t t; 478 if(c <= 0x7f) { 479 leftPair = primaries[c]; 480 if(leftPair != 0) { break; } 481 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 482 return BAIL_OUT_RESULT; 483 } 484 leftPair = table[c]; 485 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && 486 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { 487 ++leftIndex; 488 c = ((c - 0xc2) << 6) + t; 489 leftPair = primaries[c]; 490 if(leftPair != 0) { break; } 491 leftPair = table[c]; 492 } else { 493 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); 494 } 495 if(leftPair >= MIN_SHORT) { 496 leftPair &= SHORT_PRIMARY_MASK; 497 break; 498 } else if(leftPair > variableTop) { 499 leftPair &= LONG_PRIMARY_MASK; 500 break; 501 } else { 502 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 503 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 504 leftPair = getPrimaries(variableTop, leftPair); 505 } 506 } 507 508 while(rightPair == 0) { 509 if(rightIndex == rightLength) { 510 rightPair = EOS; 511 break; 512 } 513 UChar32 c = right[rightIndex++]; 514 uint8_t t; 515 if(c <= 0x7f) { 516 rightPair = primaries[c]; 517 if(rightPair != 0) { break; } 518 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { 519 return BAIL_OUT_RESULT; 520 } 521 rightPair = table[c]; 522 } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && 523 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { 524 ++rightIndex; 525 c = ((c - 0xc2) << 6) + t; 526 rightPair = primaries[c]; 527 if(rightPair != 0) { break; } 528 rightPair = table[c]; 529 } else { 530 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); 531 } 532 if(rightPair >= MIN_SHORT) { 533 rightPair &= SHORT_PRIMARY_MASK; 534 break; 535 } else if(rightPair > variableTop) { 536 rightPair &= LONG_PRIMARY_MASK; 537 break; 538 } else { 539 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 540 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 541 rightPair = getPrimaries(variableTop, rightPair); 542 } 543 } 544 545 if(leftPair == rightPair) { 546 if(leftPair == EOS) { break; } 547 leftPair = rightPair = 0; 548 continue; 549 } 550 uint32_t leftPrimary = leftPair & 0xffff; 551 uint32_t rightPrimary = rightPair & 0xffff; 552 if(leftPrimary != rightPrimary) { 553 // Return the primary difference. 554 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 555 } 556 if(leftPair == EOS) { break; } 557 leftPair >>= 16; 558 rightPair >>= 16; 559 } 560 // In the following, we need to re-fetch each character because we did not buffer the CEs, 561 // but we know that the string is well-formed and 562 // only contains supported characters and mappings. 563 564 // We might skip the secondary level but continue with the case level 565 // which is turned on separately. 566 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 567 leftIndex = rightIndex = 0; 568 leftPair = rightPair = 0; 569 for(;;) { 570 while(leftPair == 0) { 571 if(leftIndex == leftLength) { 572 leftPair = EOS; 573 break; 574 } 575 UChar32 c = left[leftIndex++]; 576 if(c <= 0x7f) { 577 leftPair = table[c]; 578 } else if(c <= LATIN_MAX_UTF8_LEAD) { 579 leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; 580 } else { 581 leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); 582 } 583 if(leftPair >= MIN_SHORT) { 584 leftPair = getSecondariesFromOneShortCE(leftPair); 585 break; 586 } else if(leftPair > variableTop) { 587 leftPair = COMMON_SEC_PLUS_OFFSET; 588 break; 589 } else { 590 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 591 leftPair = getSecondaries(variableTop, leftPair); 592 } 593 } 594 595 while(rightPair == 0) { 596 if(rightIndex == rightLength) { 597 rightPair = EOS; 598 break; 599 } 600 UChar32 c = right[rightIndex++]; 601 if(c <= 0x7f) { 602 rightPair = table[c]; 603 } else if(c <= LATIN_MAX_UTF8_LEAD) { 604 rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; 605 } else { 606 rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); 607 } 608 if(rightPair >= MIN_SHORT) { 609 rightPair = getSecondariesFromOneShortCE(rightPair); 610 break; 611 } else if(rightPair > variableTop) { 612 rightPair = COMMON_SEC_PLUS_OFFSET; 613 break; 614 } else { 615 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 616 rightPair = getSecondaries(variableTop, rightPair); 617 } 618 } 619 620 if(leftPair == rightPair) { 621 if(leftPair == EOS) { break; } 622 leftPair = rightPair = 0; 623 continue; 624 } 625 uint32_t leftSecondary = leftPair & 0xffff; 626 uint32_t rightSecondary = rightPair & 0xffff; 627 if(leftSecondary != rightSecondary) { 628 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { 629 // Full support for backwards secondary requires backwards contraction matching 630 // and moving backwards between merge separators. 631 return BAIL_OUT_RESULT; 632 } 633 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 634 } 635 if(leftPair == EOS) { break; } 636 leftPair >>= 16; 637 rightPair >>= 16; 638 } 639 } 640 641 if((options & CollationSettings::CASE_LEVEL) != 0) { 642 UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; 643 leftIndex = rightIndex = 0; 644 leftPair = rightPair = 0; 645 for(;;) { 646 while(leftPair == 0) { 647 if(leftIndex == leftLength) { 648 leftPair = EOS; 649 break; 650 } 651 UChar32 c = left[leftIndex++]; 652 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 653 if(leftPair < MIN_LONG) { 654 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 655 } 656 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 657 } 658 659 while(rightPair == 0) { 660 if(rightIndex == rightLength) { 661 rightPair = EOS; 662 break; 663 } 664 UChar32 c = right[rightIndex++]; 665 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 666 if(rightPair < MIN_LONG) { 667 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 668 } 669 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 670 } 671 672 if(leftPair == rightPair) { 673 if(leftPair == EOS) { break; } 674 leftPair = rightPair = 0; 675 continue; 676 } 677 uint32_t leftCase = leftPair & 0xffff; 678 uint32_t rightCase = rightPair & 0xffff; 679 if(leftCase != rightCase) { 680 if((options & CollationSettings::UPPER_FIRST) == 0) { 681 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 682 } else { 683 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 684 } 685 } 686 if(leftPair == EOS) { break; } 687 leftPair >>= 16; 688 rightPair >>= 16; 689 } 690 } 691 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 692 693 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 694 UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); 695 696 leftIndex = rightIndex = 0; 697 leftPair = rightPair = 0; 698 for(;;) { 699 while(leftPair == 0) { 700 if(leftIndex == leftLength) { 701 leftPair = EOS; 702 break; 703 } 704 UChar32 c = left[leftIndex++]; 705 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 706 if(leftPair < MIN_LONG) { 707 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 708 } 709 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 710 } 711 712 while(rightPair == 0) { 713 if(rightIndex == rightLength) { 714 rightPair = EOS; 715 break; 716 } 717 UChar32 c = right[rightIndex++]; 718 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 719 if(rightPair < MIN_LONG) { 720 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 721 } 722 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 723 } 724 725 if(leftPair == rightPair) { 726 if(leftPair == EOS) { break; } 727 leftPair = rightPair = 0; 728 continue; 729 } 730 uint32_t leftTertiary = leftPair & 0xffff; 731 uint32_t rightTertiary = rightPair & 0xffff; 732 if(leftTertiary != rightTertiary) { 733 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 734 // Pass through EOS and MERGE_WEIGHT 735 // and keep real tertiary weights larger than the MERGE_WEIGHT. 736 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 737 if(leftTertiary > MERGE_WEIGHT) { 738 leftTertiary ^= CASE_MASK; 739 } 740 if(rightTertiary > MERGE_WEIGHT) { 741 rightTertiary ^= CASE_MASK; 742 } 743 } 744 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 745 } 746 if(leftPair == EOS) { break; } 747 leftPair >>= 16; 748 rightPair >>= 16; 749 } 750 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 751 752 leftIndex = rightIndex = 0; 753 leftPair = rightPair = 0; 754 for(;;) { 755 while(leftPair == 0) { 756 if(leftIndex == leftLength) { 757 leftPair = EOS; 758 break; 759 } 760 UChar32 c = left[leftIndex++]; 761 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); 762 if(leftPair < MIN_LONG) { 763 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); 764 } 765 leftPair = getQuaternaries(variableTop, leftPair); 766 } 767 768 while(rightPair == 0) { 769 if(rightIndex == rightLength) { 770 rightPair = EOS; 771 break; 772 } 773 UChar32 c = right[rightIndex++]; 774 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); 775 if(rightPair < MIN_LONG) { 776 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); 777 } 778 rightPair = getQuaternaries(variableTop, rightPair); 779 } 780 781 if(leftPair == rightPair) { 782 if(leftPair == EOS) { break; } 783 leftPair = rightPair = 0; 784 continue; 785 } 786 uint32_t leftQuaternary = leftPair & 0xffff; 787 uint32_t rightQuaternary = rightPair & 0xffff; 788 if(leftQuaternary != rightQuaternary) { 789 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 790 } 791 if(leftPair == EOS) { break; } 792 leftPair >>= 16; 793 rightPair >>= 16; 794 } 795 return UCOL_EQUAL; 796} 797 798uint32_t 799CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { 800 U_ASSERT(c > LATIN_MAX); 801 if(PUNCT_START <= c && c < PUNCT_LIMIT) { 802 return table[c - PUNCT_START + LATIN_LIMIT]; 803 } else if(c == 0xfffe) { 804 return MERGE_WEIGHT; 805 } else if(c == 0xffff) { 806 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; 807 } else { 808 return BAIL_OUT; 809 } 810} 811 812uint32_t 813CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, 814 const uint8_t *s8, int32_t &sIndex, int32_t sLength) { 815 // The caller handled ASCII and valid/supported Latin. 816 U_ASSERT(c > 0x7f); 817 int32_t i2 = sIndex + 1; 818 if(i2 < sLength || sLength < 0) { 819 uint8_t t1 = s8[sIndex]; 820 uint8_t t2 = s8[i2]; 821 sIndex += 2; 822 if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { 823 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF 824 } else if(c == 0xef && t1 == 0xbf) { 825 if(t2 == 0xbe) { 826 return MERGE_WEIGHT; // U+FFFE 827 } else if(t2 == 0xbf) { 828 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF 829 } 830 } 831 } 832 return BAIL_OUT; 833} 834 835uint32_t 836CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, 837 const uint8_t *s8, int32_t &sIndex) { 838 // The caller handled ASCII. 839 // The string is well-formed and contains only supported characters. 840 U_ASSERT(c > 0x7f); 841 if(c <= LATIN_MAX_UTF8_LEAD) { 842 return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F 843 } 844 uint8_t t2 = s8[sIndex + 1]; 845 sIndex += 2; 846 if(c == 0xe2) { 847 return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF 848 } else if(t2 == 0xbe) { 849 return MERGE_WEIGHT; // U+FFFE 850 } else { 851 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF 852 } 853} 854 855uint32_t 856CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, 857 const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { 858 if(ce >= MIN_LONG || ce < CONTRACTION) { 859 return ce; // simple or special mini CE 860 } else if(ce >= EXPANSION) { 861 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); 862 return ((uint32_t)table[index + 1] << 16) | table[index]; 863 } else /* ce >= CONTRACTION */ { 864 if(c == 0 && sLength < 0) { 865 sLength = sIndex - 1; 866 return EOS; 867 } 868 // Contraction list: Default mapping followed by 869 // 0 or more single-character contraction suffix mappings. 870 int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); 871 if(sIndex != sLength) { 872 // Read the next character. 873 int32_t c2; 874 int32_t nextIndex = sIndex; 875 if(s16 != NULL) { 876 c2 = s16[nextIndex++]; 877 if(c2 > LATIN_MAX) { 878 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { 879 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF 880 } else if(c2 == 0xfffe || c2 == 0xffff) { 881 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 882 } else { 883 return BAIL_OUT; 884 } 885 } 886 } else { 887 c2 = s8[nextIndex++]; 888 if(c2 > 0x7f) { 889 uint8_t t; 890 if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && 891 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { 892 c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F 893 ++nextIndex; 894 } else { 895 int32_t i2 = nextIndex + 1; 896 if(i2 < sLength || sLength < 0) { 897 if(c2 == 0xe2 && s8[nextIndex] == 0x80 && 898 0x80 <= (t = s8[i2]) && t <= 0xbf) { 899 c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF 900 } else if(c2 == 0xef && s8[nextIndex] == 0xbf && 901 ((t = s8[i2]) == 0xbe || t == 0xbf)) { 902 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 903 } else { 904 return BAIL_OUT; 905 } 906 } else { 907 return BAIL_OUT; 908 } 909 nextIndex += 2; 910 } 911 } 912 } 913 if(c2 == 0 && sLength < 0) { 914 sLength = sIndex; 915 c2 = -1; 916 } 917 // Look for the next character in the contraction suffix list, 918 // which is in ascending order of single suffix characters. 919 int32_t i = index; 920 int32_t head = table[i]; // first skip the default mapping 921 int32_t x; 922 do { 923 i += head >> CONTR_LENGTH_SHIFT; 924 head = table[i]; 925 x = head & CONTR_CHAR_MASK; 926 } while(x < c2); 927 if(x == c2) { 928 index = i; 929 sIndex = nextIndex; 930 } 931 } 932 // Return the CE or CEs for the default or contraction mapping. 933 int32_t length = table[index] >> CONTR_LENGTH_SHIFT; 934 if(length == 1) { 935 return BAIL_OUT; 936 } 937 ce = table[index + 1]; 938 if(length == 2) { 939 return ce; 940 } else { 941 return ((uint32_t)table[index + 2] << 16) | ce; 942 } 943 } 944} 945 946uint32_t 947CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { 948 if(pair <= 0xffff) { 949 // one mini CE 950 if(pair >= MIN_SHORT) { 951 pair = getSecondariesFromOneShortCE(pair); 952 } else if(pair > variableTop) { 953 pair = COMMON_SEC_PLUS_OFFSET; 954 } else if(pair >= MIN_LONG) { 955 pair = 0; // variable 956 } 957 // else special mini CE 958 } else { 959 uint32_t ce = pair & 0xffff; 960 if(ce >= MIN_SHORT) { 961 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; 962 } else if(ce > variableTop) { 963 pair = TWO_COMMON_SEC_PLUS_OFFSET; 964 } else { 965 U_ASSERT(ce >= MIN_LONG); 966 pair = 0; // variable 967 } 968 } 969 return pair; 970} 971 972uint32_t 973CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { 974 // Primary+caseLevel: Ignore case level weights of primary ignorables. 975 // Otherwise: Ignore case level weights of secondary ignorables. 976 // For details see the comments in the CollationCompare class. 977 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 978 if(pair <= 0xffff) { 979 // one mini CE 980 if(pair >= MIN_SHORT) { 981 // A high secondary weight means we really have two CEs, 982 // a primary CE and a secondary CE. 983 uint32_t ce = pair; 984 pair &= CASE_MASK; // explicit weight of primary CE 985 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 986 pair |= LOWER_CASE << 16; // implied weight of secondary CE 987 } 988 } else if(pair > variableTop) { 989 pair = LOWER_CASE; 990 } else if(pair >= MIN_LONG) { 991 pair = 0; // variable 992 } 993 // else special mini CE 994 } else { 995 // two mini CEs, same primary groups, neither expands like above 996 uint32_t ce = pair & 0xffff; 997 if(ce >= MIN_SHORT) { 998 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { 999 pair &= CASE_MASK; 1000 } else { 1001 pair &= TWO_CASES_MASK; 1002 } 1003 } else if(ce > variableTop) { 1004 pair = TWO_LOWER_CASES; 1005 } else { 1006 U_ASSERT(ce >= MIN_LONG); 1007 pair = 0; // variable 1008 } 1009 } 1010 return pair; 1011} 1012 1013uint32_t 1014CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { 1015 if(pair <= 0xffff) { 1016 // one mini CE 1017 if(pair >= MIN_SHORT) { 1018 // A high secondary weight means we really have two CEs, 1019 // a primary CE and a secondary CE. 1020 uint32_t ce = pair; 1021 if(withCaseBits) { 1022 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; 1023 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1024 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; 1025 } 1026 } else { 1027 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 1028 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1029 pair |= COMMON_TER_PLUS_OFFSET << 16; 1030 } 1031 } 1032 } else if(pair > variableTop) { 1033 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 1034 if(withCaseBits) { 1035 pair |= LOWER_CASE; 1036 } 1037 } else if(pair >= MIN_LONG) { 1038 pair = 0; // variable 1039 } 1040 // else special mini CE 1041 } else { 1042 // two mini CEs, same primary groups, neither expands like above 1043 uint32_t ce = pair & 0xffff; 1044 if(ce >= MIN_SHORT) { 1045 if(withCaseBits) { 1046 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; 1047 } else { 1048 pair &= TWO_TERTIARIES_MASK; 1049 } 1050 pair += TWO_TER_OFFSETS; 1051 } else if(ce > variableTop) { 1052 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; 1053 if(withCaseBits) { 1054 pair |= TWO_LOWER_CASES; 1055 } 1056 } else { 1057 U_ASSERT(ce >= MIN_LONG); 1058 pair = 0; // variable 1059 } 1060 } 1061 return pair; 1062} 1063 1064uint32_t 1065CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { 1066 // Return the primary weight of a variable CE, 1067 // or the maximum primary weight for a non-variable, not-completely-ignorable CE. 1068 if(pair <= 0xffff) { 1069 // one mini CE 1070 if(pair >= MIN_SHORT) { 1071 // A high secondary weight means we really have two CEs, 1072 // a primary CE and a secondary CE. 1073 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { 1074 pair = TWO_SHORT_PRIMARIES_MASK; 1075 } else { 1076 pair = SHORT_PRIMARY_MASK; 1077 } 1078 } else if(pair > variableTop) { 1079 pair = SHORT_PRIMARY_MASK; 1080 } else if(pair >= MIN_LONG) { 1081 pair &= LONG_PRIMARY_MASK; // variable 1082 } 1083 // else special mini CE 1084 } else { 1085 // two mini CEs, same primary groups, neither expands like above 1086 uint32_t ce = pair & 0xffff; 1087 if(ce > variableTop) { 1088 pair = TWO_SHORT_PRIMARIES_MASK; 1089 } else { 1090 U_ASSERT(ce >= MIN_LONG); 1091 pair &= TWO_LONG_PRIMARIES_MASK; // variable 1092 } 1093 } 1094 return pair; 1095} 1096 1097U_NAMESPACE_END 1098 1099#endif // !UCONFIG_NO_COLLATION 1100