1/* 2******************************************************************************* 3* Copyright (C) 2013-2015, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* CollationFastLatin.java, ported from collationfastlatin.h/.cpp 7* 8* C++ version created on: 2013aug09 9* created by: Markus W. Scherer 10*/ 11 12package com.ibm.icu.impl.coll; 13 14import com.ibm.icu.lang.UScript; 15import com.ibm.icu.text.Collator; 16 17public final class CollationFastLatin /* all static */ { 18 /** 19 * Fast Latin format version (one byte 1..FF). 20 * Must be incremented for any runtime-incompatible changes, 21 * in particular, for changes to any of the following constants. 22 * 23 * When the major version number of the main data format changes, 24 * we can reset this fast Latin version to 1. 25 */ 26 public static final int VERSION = 2; 27 28 public static final int LATIN_MAX = 0x17f; 29 public static final int LATIN_LIMIT = LATIN_MAX + 1; 30 31 static final int LATIN_MAX_UTF8_LEAD = 0xc5; // UTF-8 lead byte of LATIN_MAX 32 33 static final int PUNCT_START = 0x2000; 34 static final int PUNCT_LIMIT = 0x2040; 35 36 // excludes U+FFFE & U+FFFF 37 static final int NUM_FAST_CHARS = LATIN_LIMIT + (PUNCT_LIMIT - PUNCT_START); 38 39 // Note on the supported weight ranges: 40 // Analysis of UCA 6.3 and CLDR 23 non-search tailorings shows that 41 // the CEs for characters in the above ranges, excluding expansions with length >2, 42 // excluding contractions of >2 characters, and other restrictions 43 // (see the builder's getCEsFromCE32()), 44 // use at most about 150 primary weights, 45 // where about 94 primary weights are possibly-variable (space/punct/symbol/currency), 46 // at most 4 secondary before-common weights, 47 // at most 4 secondary after-common weights, 48 // at most 16 secondary high weights (in secondary CEs), and 49 // at most 4 tertiary after-common weights. 50 // The following ranges are designed to support slightly more weights than that. 51 // (en_US_POSIX is unusual: It creates about 64 variable + 116 Latin primaries.) 52 53 // Digits may use long primaries (preserving more short ones) 54 // or short primaries (faster) without changing this data structure. 55 // (If we supported numeric collation, then digits would have to have long primaries 56 // so that special handling does not affect the fast path.) 57 58 static final int SHORT_PRIMARY_MASK = 0xfc00; // bits 15..10 59 static final int INDEX_MASK = 0x3ff; // bits 9..0 for expansions & contractions 60 static final int SECONDARY_MASK = 0x3e0; // bits 9..5 61 static final int CASE_MASK = 0x18; // bits 4..3 62 static final int LONG_PRIMARY_MASK = 0xfff8; // bits 15..3 63 static final int TERTIARY_MASK = 7; // bits 2..0 64 static final int CASE_AND_TERTIARY_MASK = CASE_MASK | TERTIARY_MASK; 65 66 static final int TWO_SHORT_PRIMARIES_MASK = 67 (SHORT_PRIMARY_MASK << 16) | SHORT_PRIMARY_MASK; // 0xfc00fc00 68 static final int TWO_LONG_PRIMARIES_MASK = 69 (LONG_PRIMARY_MASK << 16) | LONG_PRIMARY_MASK; // 0xfff8fff8 70 static final int TWO_SECONDARIES_MASK = 71 (SECONDARY_MASK << 16) | SECONDARY_MASK; // 0x3e003e0 72 static final int TWO_CASES_MASK = 73 (CASE_MASK << 16) | CASE_MASK; // 0x180018 74 static final int TWO_TERTIARIES_MASK = 75 (TERTIARY_MASK << 16) | TERTIARY_MASK; // 0x70007 76 77 /** 78 * Contraction with one fast Latin character. 79 * Use INDEX_MASK to find the start of the contraction list after the fixed table. 80 * The first entry contains the default mapping. 81 * Otherwise use CONTR_CHAR_MASK for the contraction character index 82 * (in ascending order). 83 * Use CONTR_LENGTH_SHIFT for the length of the entry 84 * (1=BAIL_OUT, 2=one CE, 3=two CEs). 85 * 86 * Also, U+0000 maps to a contraction entry, so that the fast path need not 87 * check for NUL termination. 88 * It usually maps to a contraction list with only the completely ignorable default value. 89 */ 90 static final int CONTRACTION = 0x400; 91 /** 92 * An expansion encodes two CEs. 93 * Use INDEX_MASK to find the pair of CEs after the fixed table. 94 * 95 * The higher a mini CE value, the easier it is to process. 96 * For expansions and higher, no context needs to be considered. 97 */ 98 static final int EXPANSION = 0x800; 99 /** 100 * Encodes one CE with a long/low mini primary (there are 128). 101 * All potentially-variable primaries must be in this range, 102 * to make the short-primary path as fast as possible. 103 */ 104 static final int MIN_LONG = 0xc00; 105 static final int LONG_INC = 8; 106 static final int MAX_LONG = 0xff8; 107 /** 108 * Encodes one CE with a short/high primary (there are 60), 109 * plus a secondary CE if the secondary weight is high. 110 * Fast handling: At least all letter primaries should be in this range. 111 */ 112 static final int MIN_SHORT = 0x1000; 113 static final int SHORT_INC = 0x400; 114 /** The highest primary weight is reserved for U+FFFF. */ 115 static final int MAX_SHORT = SHORT_PRIMARY_MASK; 116 117 static final int MIN_SEC_BEFORE = 0; // must add SEC_OFFSET 118 static final int SEC_INC = 0x20; 119 static final int MAX_SEC_BEFORE = MIN_SEC_BEFORE + 4 * SEC_INC; // 5 before common 120 static final int COMMON_SEC = MAX_SEC_BEFORE + SEC_INC; 121 static final int MIN_SEC_AFTER = COMMON_SEC + SEC_INC; 122 static final int MAX_SEC_AFTER = MIN_SEC_AFTER + 5 * SEC_INC; // 6 after common 123 static final int MIN_SEC_HIGH = MAX_SEC_AFTER + SEC_INC; // 20 high secondaries 124 static final int MAX_SEC_HIGH = SECONDARY_MASK; 125 126 /** 127 * Lookup: Add this offset to secondary weights, except for completely ignorable CEs. 128 * Must be greater than any special value, e.g., MERGE_WEIGHT. 129 * The exact value is not relevant for the format version. 130 */ 131 static final int SEC_OFFSET = SEC_INC; 132 static final int COMMON_SEC_PLUS_OFFSET = COMMON_SEC + SEC_OFFSET; 133 134 static final int TWO_SEC_OFFSETS = 135 (SEC_OFFSET << 16) | SEC_OFFSET; // 0x200020 136 static final int TWO_COMMON_SEC_PLUS_OFFSET = 137 (COMMON_SEC_PLUS_OFFSET << 16) | COMMON_SEC_PLUS_OFFSET; 138 139 static final int LOWER_CASE = 8; // case bits include this offset 140 static final int TWO_LOWER_CASES = (LOWER_CASE << 16) | LOWER_CASE; // 0x80008 141 142 static final int COMMON_TER = 0; // must add TER_OFFSET 143 static final int MAX_TER_AFTER = 7; // 7 after common 144 145 /** 146 * Lookup: Add this offset to tertiary weights, except for completely ignorable CEs. 147 * Must be greater than any special value, e.g., MERGE_WEIGHT. 148 * Must be greater than case bits as well, so that with combined case+tertiary weights 149 * plus the offset the tertiary bits does not spill over into the case bits. 150 * The exact value is not relevant for the format version. 151 */ 152 static final int TER_OFFSET = SEC_OFFSET; 153 static final int COMMON_TER_PLUS_OFFSET = COMMON_TER + TER_OFFSET; 154 155 static final int TWO_TER_OFFSETS = (TER_OFFSET << 16) | TER_OFFSET; 156 static final int TWO_COMMON_TER_PLUS_OFFSET = 157 (COMMON_TER_PLUS_OFFSET << 16) | COMMON_TER_PLUS_OFFSET; 158 159 static final int MERGE_WEIGHT = 3; 160 static final int EOS = 2; // end of string 161 static final int BAIL_OUT = 1; 162 163 /** 164 * Contraction result first word bits 8..0 contain the 165 * second contraction character, as a char index 0..NUM_FAST_CHARS-1. 166 * Each contraction list is terminated with a word containing CONTR_CHAR_MASK. 167 */ 168 static final int CONTR_CHAR_MASK = 0x1ff; 169 /** 170 * Contraction result first word bits 10..9 contain the result length: 171 * 1=bail out, 2=one mini CE, 3=two mini CEs 172 */ 173 static final int CONTR_LENGTH_SHIFT = 9; 174 175 /** 176 * Comparison return value when the regular comparison must be used. 177 * The exact value is not relevant for the format version. 178 */ 179 public static final int BAIL_OUT_RESULT = -2; 180 181 static int getCharIndex(char c) { 182 if(c <= LATIN_MAX) { 183 return c; 184 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 185 return c - (PUNCT_START - LATIN_LIMIT); 186 } else { 187 // Not a fast Latin character. 188 // Note: U+FFFE & U+FFFF are forbidden in tailorings 189 // and thus do not occur in any contractions. 190 return -1; 191 } 192 } 193 194 /** 195 * Computes the options value for the compare functions 196 * and writes the precomputed primary weights. 197 * Returns -1 if the Latin fastpath is not supported for the data and settings. 198 * The capacity must be LATIN_LIMIT. 199 */ 200 public static int getOptions(CollationData data, CollationSettings settings, 201 char[] primaries) { 202 char[] header = data.fastLatinTableHeader; 203 if(header == null) { return -1; } 204 assert((header[0] >> 8) == VERSION); 205 assert(primaries.length == LATIN_LIMIT); 206 if(primaries.length != LATIN_LIMIT) { return -1; } 207 208 int miniVarTop; 209 if((settings.options & CollationSettings.ALTERNATE_MASK) == 0) { 210 // No mini primaries are variable, set a variableTop just below the 211 // lowest long mini primary. 212 miniVarTop = MIN_LONG - 1; 213 } else { 214 int headerLength = header[0] & 0xff; 215 int i = 1 + settings.getMaxVariable(); 216 if(i >= headerLength) { 217 return -1; // variableTop >= digits, should not occur 218 } 219 miniVarTop = header[i]; 220 } 221 222 boolean digitsAreReordered = false; 223 if(settings.hasReordering()) { 224 long prevStart = 0; 225 long beforeDigitStart = 0; 226 long digitStart = 0; 227 long afterDigitStart = 0; 228 for(int group = Collator.ReorderCodes.FIRST; 229 group < Collator.ReorderCodes.FIRST + CollationData.MAX_NUM_SPECIAL_REORDER_CODES; 230 ++group) { 231 long start = data.getFirstPrimaryForGroup(group); 232 start = settings.reorder(start); 233 if(group == Collator.ReorderCodes.DIGIT) { 234 beforeDigitStart = prevStart; 235 digitStart = start; 236 } else if(start != 0) { 237 if(start < prevStart) { 238 // The permutation affects the groups up to Latin. 239 return -1; 240 } 241 // In the future, there might be a special group between digits & Latin. 242 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { 243 afterDigitStart = start; 244 } 245 prevStart = start; 246 } 247 } 248 long latinStart = data.getFirstPrimaryForGroup(UScript.LATIN); 249 latinStart = settings.reorder(latinStart); 250 if(latinStart < prevStart) { 251 return -1; 252 } 253 if(afterDigitStart == 0) { 254 afterDigitStart = latinStart; 255 } 256 if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { 257 digitsAreReordered = true; 258 } 259 } 260 261 char[] table = data.fastLatinTable; // skip the header 262 for(int c = 0; c < LATIN_LIMIT; ++c) { 263 int p = table[c]; 264 if(p >= MIN_SHORT) { 265 p &= SHORT_PRIMARY_MASK; 266 } else if(p > miniVarTop) { 267 p &= LONG_PRIMARY_MASK; 268 } else { 269 p = 0; 270 } 271 primaries[c] = (char)p; 272 } 273 if(digitsAreReordered || (settings.options & CollationSettings.NUMERIC) != 0) { 274 // Bail out for digits. 275 for(int c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } 276 } 277 278 // Shift the miniVarTop above other options. 279 return (miniVarTop << 16) | settings.options; 280 } 281 282 public static int compareUTF16(char[] table, char[] primaries, int options, 283 CharSequence left, CharSequence right, int startIndex) { 284 // This is a modified copy of CollationCompare.compareUpToQuaternary(), 285 // optimized for common Latin text. 286 // Keep them in sync! 287 288 int variableTop = options >> 16; // see getOptions() 289 options &= 0xffff; // needed for CollationSettings.getStrength() to work 290 291 // Check for supported characters, fetch mini CEs, and compare primaries. 292 int leftIndex = startIndex, rightIndex = startIndex; 293 /** 294 * Single mini CE or a pair. 295 * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. 296 * If there is only one, then it is in the lower bits, and the upper bits are 0. 297 */ 298 int leftPair = 0, rightPair = 0; 299 for(;;) { 300 // We fetch CEs until we get a non-ignorable primary or reach the end. 301 while(leftPair == 0) { 302 if(leftIndex == left.length()) { 303 leftPair = EOS; 304 break; 305 } 306 int c = left.charAt(leftIndex++); 307 if(c <= LATIN_MAX) { 308 leftPair = primaries[c]; 309 if(leftPair != 0) { break; } 310 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) { 311 return BAIL_OUT_RESULT; 312 } 313 leftPair = table[c]; 314 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 315 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 316 } else { 317 leftPair = lookup(table, c); 318 } 319 if(leftPair >= MIN_SHORT) { 320 leftPair &= SHORT_PRIMARY_MASK; 321 break; 322 } else if(leftPair > variableTop) { 323 leftPair &= LONG_PRIMARY_MASK; 324 break; 325 } else { 326 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 327 if(pairAndInc < 0) { 328 ++leftIndex; 329 pairAndInc = ~pairAndInc; 330 } 331 leftPair = (int)pairAndInc; 332 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 333 leftPair = getPrimaries(variableTop, leftPair); 334 } 335 } 336 337 while(rightPair == 0) { 338 if(rightIndex == right.length()) { 339 rightPair = EOS; 340 break; 341 } 342 int c = right.charAt(rightIndex++); 343 if(c <= LATIN_MAX) { 344 rightPair = primaries[c]; 345 if(rightPair != 0) { break; } 346 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) { 347 return BAIL_OUT_RESULT; 348 } 349 rightPair = table[c]; 350 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 351 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 352 } else { 353 rightPair = lookup(table, c); 354 } 355 if(rightPair >= MIN_SHORT) { 356 rightPair &= SHORT_PRIMARY_MASK; 357 break; 358 } else if(rightPair > variableTop) { 359 rightPair &= LONG_PRIMARY_MASK; 360 break; 361 } else { 362 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 363 if(pairAndInc < 0) { 364 ++rightIndex; 365 pairAndInc = ~pairAndInc; 366 } 367 rightPair = (int)pairAndInc; 368 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } 369 rightPair = getPrimaries(variableTop, rightPair); 370 } 371 } 372 373 if(leftPair == rightPair) { 374 if(leftPair == EOS) { break; } 375 leftPair = rightPair = 0; 376 continue; 377 } 378 int leftPrimary = leftPair & 0xffff; 379 int rightPrimary = rightPair & 0xffff; 380 if(leftPrimary != rightPrimary) { 381 // Return the primary difference. 382 return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER; 383 } 384 if(leftPair == EOS) { break; } 385 leftPair >>>= 16; 386 rightPair >>>= 16; 387 } 388 // In the following, we need to re-fetch each character because we did not buffer the CEs, 389 // but we know that the string is well-formed and 390 // only contains supported characters and mappings. 391 392 // We might skip the secondary level but continue with the case level 393 // which is turned on separately. 394 if(CollationSettings.getStrength(options) >= Collator.SECONDARY) { 395 leftIndex = rightIndex = startIndex; 396 leftPair = rightPair = 0; 397 for(;;) { 398 while(leftPair == 0) { 399 if(leftIndex == left.length()) { 400 leftPair = EOS; 401 break; 402 } 403 int c = left.charAt(leftIndex++); 404 if(c <= LATIN_MAX) { 405 leftPair = table[c]; 406 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 407 leftPair = table[c - PUNCT_START + LATIN_LIMIT]; 408 } else { 409 leftPair = lookup(table, c); 410 } 411 if(leftPair >= MIN_SHORT) { 412 leftPair = getSecondariesFromOneShortCE(leftPair); 413 break; 414 } else if(leftPair > variableTop) { 415 leftPair = COMMON_SEC_PLUS_OFFSET; 416 break; 417 } else { 418 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 419 if(pairAndInc < 0) { 420 ++leftIndex; 421 pairAndInc = ~pairAndInc; 422 } 423 leftPair = getSecondaries(variableTop, (int)pairAndInc); 424 } 425 } 426 427 while(rightPair == 0) { 428 if(rightIndex == right.length()) { 429 rightPair = EOS; 430 break; 431 } 432 int c = right.charAt(rightIndex++); 433 if(c <= LATIN_MAX) { 434 rightPair = table[c]; 435 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { 436 rightPair = table[c - PUNCT_START + LATIN_LIMIT]; 437 } else { 438 rightPair = lookup(table, c); 439 } 440 if(rightPair >= MIN_SHORT) { 441 rightPair = getSecondariesFromOneShortCE(rightPair); 442 break; 443 } else if(rightPair > variableTop) { 444 rightPair = COMMON_SEC_PLUS_OFFSET; 445 break; 446 } else { 447 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 448 if(pairAndInc < 0) { 449 ++rightIndex; 450 pairAndInc = ~pairAndInc; 451 } 452 rightPair = getSecondaries(variableTop, (int)pairAndInc); 453 } 454 } 455 456 if(leftPair == rightPair) { 457 if(leftPair == EOS) { break; } 458 leftPair = rightPair = 0; 459 continue; 460 } 461 int leftSecondary = leftPair & 0xffff; 462 int rightSecondary = rightPair & 0xffff; 463 if(leftSecondary != rightSecondary) { 464 if((options & CollationSettings.BACKWARD_SECONDARY) != 0) { 465 // Full support for backwards secondary requires backwards contraction matching 466 // and moving backwards between merge separators. 467 return BAIL_OUT_RESULT; 468 } 469 return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER; 470 } 471 if(leftPair == EOS) { break; } 472 leftPair >>>= 16; 473 rightPair >>>= 16; 474 } 475 } 476 477 if((options & CollationSettings.CASE_LEVEL) != 0) { 478 boolean strengthIsPrimary = CollationSettings.getStrength(options) == Collator.PRIMARY; 479 leftIndex = rightIndex = startIndex; 480 leftPair = rightPair = 0; 481 for(;;) { 482 while(leftPair == 0) { 483 if(leftIndex == left.length()) { 484 leftPair = EOS; 485 break; 486 } 487 int c = left.charAt(leftIndex++); 488 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 489 if(leftPair < MIN_LONG) { 490 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 491 if(pairAndInc < 0) { 492 ++leftIndex; 493 pairAndInc = ~pairAndInc; 494 } 495 leftPair = (int)pairAndInc; 496 } 497 leftPair = getCases(variableTop, strengthIsPrimary, leftPair); 498 } 499 500 while(rightPair == 0) { 501 if(rightIndex == right.length()) { 502 rightPair = EOS; 503 break; 504 } 505 int c = right.charAt(rightIndex++); 506 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 507 if(rightPair < MIN_LONG) { 508 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 509 if(pairAndInc < 0) { 510 ++rightIndex; 511 pairAndInc = ~pairAndInc; 512 } 513 rightPair = (int)pairAndInc; 514 } 515 rightPair = getCases(variableTop, strengthIsPrimary, rightPair); 516 } 517 518 if(leftPair == rightPair) { 519 if(leftPair == EOS) { break; } 520 leftPair = rightPair = 0; 521 continue; 522 } 523 int leftCase = leftPair & 0xffff; 524 int rightCase = rightPair & 0xffff; 525 if(leftCase != rightCase) { 526 if((options & CollationSettings.UPPER_FIRST) == 0) { 527 return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER; 528 } else { 529 return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS; 530 } 531 } 532 if(leftPair == EOS) { break; } 533 leftPair >>>= 16; 534 rightPair >>>= 16; 535 } 536 } 537 if(CollationSettings.getStrength(options) <= Collator.SECONDARY) { return Collation.EQUAL; } 538 539 // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. 540 boolean withCaseBits = CollationSettings.isTertiaryWithCaseBits(options); 541 542 leftIndex = rightIndex = startIndex; 543 leftPair = rightPair = 0; 544 for(;;) { 545 while(leftPair == 0) { 546 if(leftIndex == left.length()) { 547 leftPair = EOS; 548 break; 549 } 550 int c = left.charAt(leftIndex++); 551 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 552 if(leftPair < MIN_LONG) { 553 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 554 if(pairAndInc < 0) { 555 ++leftIndex; 556 pairAndInc = ~pairAndInc; 557 } 558 leftPair = (int)pairAndInc; 559 } 560 leftPair = getTertiaries(variableTop, withCaseBits, leftPair); 561 } 562 563 while(rightPair == 0) { 564 if(rightIndex == right.length()) { 565 rightPair = EOS; 566 break; 567 } 568 int c = right.charAt(rightIndex++); 569 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 570 if(rightPair < MIN_LONG) { 571 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 572 if(pairAndInc < 0) { 573 ++rightIndex; 574 pairAndInc = ~pairAndInc; 575 } 576 rightPair = (int)pairAndInc; 577 } 578 rightPair = getTertiaries(variableTop, withCaseBits, rightPair); 579 } 580 581 if(leftPair == rightPair) { 582 if(leftPair == EOS) { break; } 583 leftPair = rightPair = 0; 584 continue; 585 } 586 int leftTertiary = leftPair & 0xffff; 587 int rightTertiary = rightPair & 0xffff; 588 if(leftTertiary != rightTertiary) { 589 if(CollationSettings.sortsTertiaryUpperCaseFirst(options)) { 590 // Pass through EOS and MERGE_WEIGHT 591 // and keep real tertiary weights larger than the MERGE_WEIGHT. 592 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 593 if(leftTertiary > MERGE_WEIGHT) { 594 leftTertiary ^= CASE_MASK; 595 } 596 if(rightTertiary > MERGE_WEIGHT) { 597 rightTertiary ^= CASE_MASK; 598 } 599 } 600 return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER; 601 } 602 if(leftPair == EOS) { break; } 603 leftPair >>>= 16; 604 rightPair >>>= 16; 605 } 606 if(CollationSettings.getStrength(options) <= Collator.TERTIARY) { return Collation.EQUAL; } 607 608 leftIndex = rightIndex = startIndex; 609 leftPair = rightPair = 0; 610 for(;;) { 611 while(leftPair == 0) { 612 if(leftIndex == left.length()) { 613 leftPair = EOS; 614 break; 615 } 616 int c = left.charAt(leftIndex++); 617 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 618 if(leftPair < MIN_LONG) { 619 long pairAndInc = nextPair(table, c, leftPair, left, leftIndex); 620 if(pairAndInc < 0) { 621 ++leftIndex; 622 pairAndInc = ~pairAndInc; 623 } 624 leftPair = (int)pairAndInc; 625 } 626 leftPair = getQuaternaries(variableTop, leftPair); 627 } 628 629 while(rightPair == 0) { 630 if(rightIndex == right.length()) { 631 rightPair = EOS; 632 break; 633 } 634 int c = right.charAt(rightIndex++); 635 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); 636 if(rightPair < MIN_LONG) { 637 long pairAndInc = nextPair(table, c, rightPair, right, rightIndex); 638 if(pairAndInc < 0) { 639 ++rightIndex; 640 pairAndInc = ~pairAndInc; 641 } 642 rightPair = (int)pairAndInc; 643 } 644 rightPair = getQuaternaries(variableTop, rightPair); 645 } 646 647 if(leftPair == rightPair) { 648 if(leftPair == EOS) { break; } 649 leftPair = rightPair = 0; 650 continue; 651 } 652 int leftQuaternary = leftPair & 0xffff; 653 int rightQuaternary = rightPair & 0xffff; 654 if(leftQuaternary != rightQuaternary) { 655 return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER; 656 } 657 if(leftPair == EOS) { break; } 658 leftPair >>>= 16; 659 rightPair >>>= 16; 660 } 661 return Collation.EQUAL; 662 } 663 664 private static int lookup(char[] table, int c) { 665 assert(c > LATIN_MAX); 666 if(PUNCT_START <= c && c < PUNCT_LIMIT) { 667 return table[c - PUNCT_START + LATIN_LIMIT]; 668 } else if(c == 0xfffe) { 669 return MERGE_WEIGHT; 670 } else if(c == 0xffff) { 671 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; 672 } else { 673 return BAIL_OUT; 674 } 675 } 676 677 /** 678 * Java returns a negative result (use the '~' operator) if sIndex is to be incremented. 679 * C++ modifies sIndex. 680 */ 681 private static long nextPair(char[] table, int c, int ce, CharSequence s16, int sIndex) { 682 if(ce >= MIN_LONG || ce < CONTRACTION) { 683 return ce; // simple or special mini CE 684 } else if(ce >= EXPANSION) { 685 int index = NUM_FAST_CHARS + (ce & INDEX_MASK); 686 return ((long)table[index + 1] << 16) | table[index]; 687 } else /* ce >= CONTRACTION */ { 688 // Contraction list: Default mapping followed by 689 // 0 or more single-character contraction suffix mappings. 690 int index = NUM_FAST_CHARS + (ce & INDEX_MASK); 691 boolean inc = false; // true if the next char is consumed. 692 if(sIndex != s16.length()) { 693 // Read the next character. 694 int c2; 695 int nextIndex = sIndex; 696 c2 = s16.charAt(nextIndex++); 697 if(c2 > LATIN_MAX) { 698 if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { 699 c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF 700 } else if(c2 == 0xfffe || c2 == 0xffff) { 701 c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. 702 } else { 703 return BAIL_OUT; 704 } 705 } 706 // Look for the next character in the contraction suffix list, 707 // which is in ascending order of single suffix characters. 708 int i = index; 709 int head = table[i]; // first skip the default mapping 710 int x; 711 do { 712 i += head >> CONTR_LENGTH_SHIFT; 713 head = table[i]; 714 x = head & CONTR_CHAR_MASK; 715 } while(x < c2); 716 if(x == c2) { 717 index = i; 718 inc = true; 719 } 720 } 721 // Return the CE or CEs for the default or contraction mapping. 722 int length = table[index] >> CONTR_LENGTH_SHIFT; 723 if(length == 1) { 724 return BAIL_OUT; 725 } 726 ce = table[index + 1]; 727 long result; 728 if(length == 2) { 729 result = ce; 730 } else { 731 result = ((long)table[index + 2] << 16) | ce; 732 } 733 return inc ? ~result : result; 734 } 735 } 736 737 private static int getPrimaries(int variableTop, int pair) { 738 int ce = pair & 0xffff; 739 if(ce >= MIN_SHORT) { return pair & TWO_SHORT_PRIMARIES_MASK; } 740 if(ce > variableTop) { return pair & TWO_LONG_PRIMARIES_MASK; } 741 if(ce >= MIN_LONG) { return 0; } // variable 742 return pair; // special mini CE 743 } 744 745 private static int getSecondariesFromOneShortCE(int ce) { 746 ce &= SECONDARY_MASK; 747 if(ce < MIN_SEC_HIGH) { 748 return ce + SEC_OFFSET; 749 } else { 750 return ((ce + SEC_OFFSET) << 16) | COMMON_SEC_PLUS_OFFSET; 751 } 752 } 753 754 private static int getSecondaries(int variableTop, int pair) { 755 if(pair <= 0xffff) { 756 // one mini CE 757 if(pair >= MIN_SHORT) { 758 pair = getSecondariesFromOneShortCE(pair); 759 } else if(pair > variableTop) { 760 pair = COMMON_SEC_PLUS_OFFSET; 761 } else if(pair >= MIN_LONG) { 762 pair = 0; // variable 763 } 764 // else special mini CE 765 } else { 766 int ce = pair & 0xffff; 767 if(ce >= MIN_SHORT) { 768 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; 769 } else if(ce > variableTop) { 770 pair = TWO_COMMON_SEC_PLUS_OFFSET; 771 } else { 772 assert(ce >= MIN_LONG); 773 pair = 0; // variable 774 } 775 } 776 return pair; 777 } 778 779 private static int getCases(int variableTop, boolean strengthIsPrimary, int pair) { 780 // Primary+caseLevel: Ignore case level weights of primary ignorables. 781 // Otherwise: Ignore case level weights of secondary ignorables. 782 // For details see the comments in the CollationCompare class. 783 // Tertiary CEs (secondary ignorables) are not supported in fast Latin. 784 if(pair <= 0xffff) { 785 // one mini CE 786 if(pair >= MIN_SHORT) { 787 // A high secondary weight means we really have two CEs, 788 // a primary CE and a secondary CE. 789 int ce = pair; 790 pair &= CASE_MASK; // explicit weight of primary CE 791 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 792 pair |= LOWER_CASE << 16; // implied weight of secondary CE 793 } 794 } else if(pair > variableTop) { 795 pair = LOWER_CASE; 796 } else if(pair >= MIN_LONG) { 797 pair = 0; // variable 798 } 799 // else special mini CE 800 } else { 801 // two mini CEs, same primary groups, neither expands like above 802 int ce = pair & 0xffff; 803 if(ce >= MIN_SHORT) { 804 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { 805 pair &= CASE_MASK; 806 } else { 807 pair &= TWO_CASES_MASK; 808 } 809 } else if(ce > variableTop) { 810 pair = TWO_LOWER_CASES; 811 } else { 812 assert(ce >= MIN_LONG); 813 pair = 0; // variable 814 } 815 } 816 return pair; 817 } 818 819 private static int getTertiaries(int variableTop, boolean withCaseBits, int pair) { 820 if(pair <= 0xffff) { 821 // one mini CE 822 if(pair >= MIN_SHORT) { 823 // A high secondary weight means we really have two CEs, 824 // a primary CE and a secondary CE. 825 int ce = pair; 826 if(withCaseBits) { 827 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; 828 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 829 pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; 830 } 831 } else { 832 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 833 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { 834 pair |= COMMON_TER_PLUS_OFFSET << 16; 835 } 836 } 837 } else if(pair > variableTop) { 838 pair = (pair & TERTIARY_MASK) + TER_OFFSET; 839 if(withCaseBits) { 840 pair |= LOWER_CASE; 841 } 842 } else if(pair >= MIN_LONG) { 843 pair = 0; // variable 844 } 845 // else special mini CE 846 } else { 847 // two mini CEs, same primary groups, neither expands like above 848 int ce = pair & 0xffff; 849 if(ce >= MIN_SHORT) { 850 if(withCaseBits) { 851 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; 852 } else { 853 pair &= TWO_TERTIARIES_MASK; 854 } 855 pair += TWO_TER_OFFSETS; 856 } else if(ce > variableTop) { 857 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; 858 if(withCaseBits) { 859 pair |= TWO_LOWER_CASES; 860 } 861 } else { 862 assert(ce >= MIN_LONG); 863 pair = 0; // variable 864 } 865 } 866 return pair; 867 } 868 869 private static int getQuaternaries(int variableTop, int pair) { 870 // Return the primary weight of a variable CE, 871 // or the maximum primary weight for a non-variable, not-completely-ignorable CE. 872 if(pair <= 0xffff) { 873 // one mini CE 874 if(pair >= MIN_SHORT) { 875 // A high secondary weight means we really have two CEs, 876 // a primary CE and a secondary CE. 877 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { 878 pair = TWO_SHORT_PRIMARIES_MASK; 879 } else { 880 pair = SHORT_PRIMARY_MASK; 881 } 882 } else if(pair > variableTop) { 883 pair = SHORT_PRIMARY_MASK; 884 } else if(pair >= MIN_LONG) { 885 pair &= LONG_PRIMARY_MASK; // variable 886 } 887 // else special mini CE 888 } else { 889 // two mini CEs, same primary groups, neither expands like above 890 int ce = pair & 0xffff; 891 if(ce > variableTop) { 892 pair = TWO_SHORT_PRIMARIES_MASK; 893 } else { 894 assert(ce >= MIN_LONG); 895 pair &= TWO_LONG_PRIMARIES_MASK; // variable 896 } 897 } 898 return pair; 899 } 900 901 private CollationFastLatin() {} // no constructor 902} 903