1/* 2******************************************************************************* 3* Copyright (C) 2013-2015, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* CollationFastLatinBuilder.java, ported from collationfastlatinbuilder.h/.cpp 7* 8* C++ version created on: 2013aug09 9* created by: Markus W. Scherer 10*/ 11 12package com.ibm.icu.impl.coll; 13 14import com.ibm.icu.lang.UScript; 15import com.ibm.icu.text.Collator; 16import com.ibm.icu.util.CharsTrie; 17 18final class CollationFastLatinBuilder { 19 // #define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2 20 21 /** 22 * Compare two signed long values as if they were unsigned. 23 */ 24 private static final int compareInt64AsUnsigned(long a, long b) { 25 a += 0x8000000000000000L; 26 b += 0x8000000000000000L; 27 if(a < b) { 28 return -1; 29 } else if(a > b) { 30 return 1; 31 } else { 32 return 0; 33 } 34 } 35 36 /** 37 * Like Java Collections.binarySearch(List, String, Comparator). 38 * 39 * @return the index>=0 where the item was found, 40 * or the index<0 for inserting the string at ~index in sorted order 41 */ 42 private static final int binarySearch(long[] list, int limit, long ce) { 43 if (limit == 0) { return ~0; } 44 int start = 0; 45 for (;;) { 46 int i = (start + limit) / 2; 47 int cmp = compareInt64AsUnsigned(ce, list[i]); 48 if (cmp == 0) { 49 return i; 50 } else if (cmp < 0) { 51 if (i == start) { 52 return ~start; // insert ce before i 53 } 54 limit = i; 55 } else { 56 if (i == start) { 57 return ~(start + 1); // insert ce after i 58 } 59 start = i; 60 } 61 } 62 } 63 64 CollationFastLatinBuilder() { 65 ce0 = 0; 66 ce1 = 0; 67 contractionCEs = new UVector64(); 68 uniqueCEs = new UVector64(); 69 miniCEs = null; 70 firstDigitPrimary = 0; 71 firstLatinPrimary = 0; 72 lastLatinPrimary = 0; 73 firstShortPrimary = 0; 74 shortPrimaryOverflow = false; 75 headerLength = 0; 76 } 77 78 boolean forData(CollationData data) { 79 if(result.length() != 0) { // This builder is not reusable. 80 throw new IllegalStateException("attempt to reuse a CollationFastLatinBuilder"); 81 } 82 if(!loadGroups(data)) { return false; } 83 84 // Fast handling of digits. 85 firstShortPrimary = firstDigitPrimary; 86 getCEs(data); 87 encodeUniqueCEs(); 88 if(shortPrimaryOverflow) { 89 // Give digits long mini primaries, 90 // so that there are more short primaries for letters. 91 firstShortPrimary = firstLatinPrimary; 92 resetCEs(); 93 getCEs(data); 94 encodeUniqueCEs(); 95 } 96 // Note: If we still have a short-primary overflow but not a long-primary overflow, 97 // then we could calculate how many more long primaries would fit, 98 // and set the firstShortPrimary to that many after the current firstShortPrimary, 99 // and try again. 100 // However, this might only benefit the en_US_POSIX tailoring, 101 // and it is simpler to suppress building fast Latin data for it in genrb, 102 // or by returning false here if shortPrimaryOverflow. 103 104 boolean ok = !shortPrimaryOverflow; 105 if(ok) { 106 encodeCharCEs(); 107 encodeContractions(); 108 } 109 contractionCEs.removeAllElements(); // might reduce heap memory usage 110 uniqueCEs.removeAllElements(); 111 return ok; 112 } 113 114 // C++ returns one combined array with the contents of the result buffer. 115 // Java returns two arrays (header & table) because we cannot use pointer arithmetic, 116 // and we do not want to index into the table with an offset. 117 char[] getHeader() { 118 char[] resultArray = new char[headerLength]; 119 result.getChars(0, headerLength, resultArray, 0); 120 return resultArray; 121 } 122 123 char[] getTable() { 124 char[] resultArray = new char[result.length() - headerLength]; 125 result.getChars(headerLength, result.length(), resultArray, 0); 126 return resultArray; 127 } 128 129 private boolean loadGroups(CollationData data) { 130 headerLength = 1 + NUM_SPECIAL_GROUPS; 131 int r0 = (CollationFastLatin.VERSION << 8) | headerLength; 132 result.append((char)r0); 133 // The first few reordering groups should be special groups 134 // (space, punct, ..., digit) followed by Latn, then Grek and other scripts. 135 for(int i = 0; i < NUM_SPECIAL_GROUPS; ++i) { 136 lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(Collator.ReorderCodes.FIRST + i); 137 if(lastSpecialPrimaries[i] == 0) { 138 // missing data 139 return false; 140 } 141 result.append(0); // reserve a slot for this group 142 } 143 144 firstDigitPrimary = data.getFirstPrimaryForGroup(Collator.ReorderCodes.DIGIT); 145 firstLatinPrimary = data.getFirstPrimaryForGroup(UScript.LATIN); 146 lastLatinPrimary = data.getLastPrimaryForGroup(UScript.LATIN); 147 if(firstDigitPrimary == 0 || firstLatinPrimary == 0) { 148 // missing data 149 return false; 150 } 151 return true; 152 } 153 154 private boolean inSameGroup(long p, long q) { 155 // Both or neither need to be encoded as short primaries, 156 // so that we can test only one and use the same bit mask. 157 if(p >= firstShortPrimary) { 158 return q >= firstShortPrimary; 159 } else if(q >= firstShortPrimary) { 160 return false; 161 } 162 // Both or neither must be potentially-variable, 163 // so that we can test only one and determine if both are variable. 164 long lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1]; 165 if(p > lastVariablePrimary) { 166 return q > lastVariablePrimary; 167 } else if(q > lastVariablePrimary) { 168 return false; 169 } 170 // Both will be encoded with long mini primaries. 171 // They must be in the same special reordering group, 172 // so that we can test only one and determine if both are variable. 173 assert(p != 0 && q != 0); 174 for(int i = 0;; ++i) { // will terminate 175 long lastPrimary = lastSpecialPrimaries[i]; 176 if(p <= lastPrimary) { 177 return q <= lastPrimary; 178 } else if(q <= lastPrimary) { 179 return false; 180 } 181 } 182 } 183 184 private void resetCEs() { 185 contractionCEs.removeAllElements(); 186 uniqueCEs.removeAllElements(); 187 shortPrimaryOverflow = false; 188 result.setLength(headerLength); 189 } 190 191 private void getCEs(CollationData data) { 192 int i = 0; 193 for(char c = 0;; ++i, ++c) { 194 if(c == CollationFastLatin.LATIN_LIMIT) { 195 c = CollationFastLatin.PUNCT_START; 196 } else if(c == CollationFastLatin.PUNCT_LIMIT) { 197 break; 198 } 199 CollationData d; 200 int ce32 = data.getCE32(c); 201 if(ce32 == Collation.FALLBACK_CE32) { 202 d = data.base; 203 ce32 = d.getCE32(c); 204 } else { 205 d = data; 206 } 207 if(getCEsFromCE32(d, c, ce32)) { 208 charCEs[i][0] = ce0; 209 charCEs[i][1] = ce1; 210 addUniqueCE(ce0); 211 addUniqueCE(ce1); 212 } else { 213 // bail out for c 214 charCEs[i][0] = ce0 = Collation.NO_CE; 215 charCEs[i][1] = ce1 = 0; 216 } 217 if(c == 0 && !isContractionCharCE(ce0)) { 218 // Always map U+0000 to a contraction. 219 // Write a contraction list with only a default value if there is no real contraction. 220 assert(contractionCEs.isEmpty()); 221 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1); 222 charCEs[0][0] = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG; 223 charCEs[0][1] = 0; 224 } 225 } 226 // Terminate the last contraction list. 227 contractionCEs.addElement(CollationFastLatin.CONTR_CHAR_MASK); 228 } 229 230 private boolean getCEsFromCE32(CollationData data, int c, int ce32) { 231 ce32 = data.getFinalCE32(ce32); 232 ce1 = 0; 233 if(Collation.isSimpleOrLongCE32(ce32)) { 234 ce0 = Collation.ceFromCE32(ce32); 235 } else { 236 switch(Collation.tagFromCE32(ce32)) { 237 case Collation.LATIN_EXPANSION_TAG: 238 ce0 = Collation.latinCE0FromCE32(ce32); 239 ce1 = Collation.latinCE1FromCE32(ce32); 240 break; 241 case Collation.EXPANSION32_TAG: { 242 int index = Collation.indexFromCE32(ce32); 243 int length = Collation.lengthFromCE32(ce32); 244 if(length <= 2) { 245 ce0 = Collation.ceFromCE32(data.ce32s[index]); 246 if(length == 2) { 247 ce1 = Collation.ceFromCE32(data.ce32s[index + 1]); 248 } 249 break; 250 } else { 251 return false; 252 } 253 } 254 case Collation.EXPANSION_TAG: { 255 int index = Collation.indexFromCE32(ce32); 256 int length = Collation.lengthFromCE32(ce32); 257 if(length <= 2) { 258 ce0 = data.ces[index]; 259 if(length == 2) { 260 ce1 = data.ces[index + 1]; 261 } 262 break; 263 } else { 264 return false; 265 } 266 } 267 // Note: We could support PREFIX_TAG (assert c>=0) 268 // by recursing on its default CE32 and checking that none of the prefixes starts 269 // with a fast Latin character. 270 // However, currently (2013) there are only the L-before-middle-dot 271 // prefix mappings in the Latin range, and those would be rejected anyway. 272 case Collation.CONTRACTION_TAG: 273 assert(c >= 0); 274 return getCEsFromContractionCE32(data, ce32); 275 case Collation.OFFSET_TAG: 276 assert(c >= 0); 277 ce0 = data.getCEFromOffsetCE32(c, ce32); 278 break; 279 default: 280 return false; 281 } 282 } 283 // A mapping can be completely ignorable. 284 if(ce0 == 0) { return ce1 == 0; } 285 // We do not support an ignorable ce0 unless it is completely ignorable. 286 long p0 = ce0 >>> 32; 287 if(p0 == 0) { return false; } 288 // We only support primaries up to the Latin script. 289 if(p0 > lastLatinPrimary) { return false; } 290 // We support non-common secondary and case weights only together with short primaries. 291 int lower32_0 = (int)ce0; 292 if(p0 < firstShortPrimary) { 293 int sc0 = lower32_0 & Collation.SECONDARY_AND_CASE_MASK; 294 if(sc0 != Collation.COMMON_SECONDARY_CE) { return false; } 295 } 296 // No below-common tertiary weights. 297 if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; } 298 if(ce1 != 0) { 299 // Both primaries must be in the same group, 300 // or both must get short mini primaries, 301 // or a short-primary CE is followed by a secondary CE. 302 // This is so that we can test the first primary and use the same mask for both, 303 // and determine for both whether they are variable. 304 long p1 = ce1 >>> 32; 305 if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return false; } 306 int lower32_1 = (int)ce1; 307 // No tertiary CEs. 308 if((lower32_1 >>> 16) == 0) { return false; } 309 // We support non-common secondary and case weights 310 // only for secondary CEs or together with short primaries. 311 if(p1 != 0 && p1 < firstShortPrimary) { 312 int sc1 = lower32_1 & Collation.SECONDARY_AND_CASE_MASK; 313 if(sc1 != Collation.COMMON_SECONDARY_CE) { return false; } 314 } 315 // No below-common tertiary weights. 316 if((lower32_0 & Collation.ONLY_TERTIARY_MASK) < Collation.COMMON_WEIGHT16) { return false; } 317 } 318 // No quaternary weights. 319 if(((ce0 | ce1) & Collation.QUATERNARY_MASK) != 0) { return false; } 320 return true; 321 } 322 323 private boolean getCEsFromContractionCE32(CollationData data, int ce32) { 324 int trieIndex = Collation.indexFromCE32(ce32); 325 ce32 = data.getCE32FromContexts(trieIndex); // Default if no suffix match. 326 // Since the original ce32 is not a prefix mapping, 327 // the default ce32 must not be another contraction. 328 assert(!Collation.isContractionCE32(ce32)); 329 int contractionIndex = contractionCEs.size(); 330 if(getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) { 331 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, ce0, ce1); 332 } else { 333 // Bail out for c-without-contraction. 334 addContractionEntry(CollationFastLatin.CONTR_CHAR_MASK, Collation.NO_CE, 0); 335 } 336 // Handle an encodable contraction unless the next contraction is too long 337 // and starts with the same character. 338 int prevX = -1; 339 boolean addContraction = false; 340 CharsTrie.Iterator suffixes = CharsTrie.iterator(data.contexts, trieIndex + 2, 0); 341 while(suffixes.hasNext()) { 342 CharsTrie.Entry entry = suffixes.next(); 343 CharSequence suffix = entry.chars; 344 int x = CollationFastLatin.getCharIndex(suffix.charAt(0)); 345 if(x < 0) { continue; } // ignore anything but fast Latin text 346 if(x == prevX) { 347 if(addContraction) { 348 // Bail out for all contractions starting with this character. 349 addContractionEntry(x, Collation.NO_CE, 0); 350 addContraction = false; 351 } 352 continue; 353 } 354 if(addContraction) { 355 addContractionEntry(prevX, ce0, ce1); 356 } 357 ce32 = entry.value; 358 if(suffix.length() == 1 && getCEsFromCE32(data, Collation.SENTINEL_CP, ce32)) { 359 addContraction = true; 360 } else { 361 addContractionEntry(x, Collation.NO_CE, 0); 362 addContraction = false; 363 } 364 prevX = x; 365 } 366 if(addContraction) { 367 addContractionEntry(prevX, ce0, ce1); 368 } 369 // Note: There might not be any fast Latin contractions, but 370 // we need to enter contraction handling anyway so that we can bail out 371 // when there is a non-fast-Latin character following. 372 // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the 373 // following umlaut and bail out, rather than return the difference of Y vs. u. 374 ce0 = (Collation.NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex; 375 ce1 = 0; 376 return true; 377 } 378 379 private void addContractionEntry(int x, long cce0, long cce1) { 380 contractionCEs.addElement(x); 381 contractionCEs.addElement(cce0); 382 contractionCEs.addElement(cce1); 383 addUniqueCE(cce0); 384 addUniqueCE(cce1); 385 } 386 387 private void addUniqueCE(long ce) { 388 if(ce == 0 || (ce >>> 32) == Collation.NO_CE_PRIMARY) { return; } 389 ce &= ~(long)Collation.CASE_MASK; // blank out case bits 390 int i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce); 391 if(i < 0) { 392 uniqueCEs.insertElementAt(ce, ~i); 393 } 394 } 395 396 private int getMiniCE(long ce) { 397 ce &= ~(long)Collation.CASE_MASK; // blank out case bits 398 int index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce); 399 assert(index >= 0); 400 return miniCEs[index]; 401 } 402 403 private void encodeUniqueCEs() { 404 miniCEs = new char[uniqueCEs.size()]; 405 int group = 0; 406 long lastGroupPrimary = lastSpecialPrimaries[group]; 407 // The lowest unique CE must be at least a secondary CE. 408 assert(((int)uniqueCEs.elementAti(0) >>> 16) != 0); 409 long prevPrimary = 0; 410 int prevSecondary = 0; 411 int pri = 0; 412 int sec = 0; 413 int ter = CollationFastLatin.COMMON_TER; 414 for(int i = 0; i < uniqueCEs.size(); ++i) { 415 long ce = uniqueCEs.elementAti(i); 416 // Note: At least one of the p/s/t weights changes from one unique CE to the next. 417 // (uniqueCEs does not store case bits.) 418 long p = ce >>> 32; 419 if(p != prevPrimary) { 420 while(p > lastGroupPrimary) { 421 assert(pri <= CollationFastLatin.MAX_LONG); 422 // Set the group's header entry to the 423 // last "long primary" in or before the group. 424 result.setCharAt(1 + group, (char)pri); 425 if(++group < NUM_SPECIAL_GROUPS) { 426 lastGroupPrimary = lastSpecialPrimaries[group]; 427 } else { 428 lastGroupPrimary = 0xffffffffL; 429 break; 430 } 431 } 432 if(p < firstShortPrimary) { 433 if(pri == 0) { 434 pri = CollationFastLatin.MIN_LONG; 435 } else if(pri < CollationFastLatin.MAX_LONG) { 436 pri += CollationFastLatin.LONG_INC; 437 } else { 438 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 439 printf("long-primary overflow for %08x\n", p); 440 #endif */ 441 miniCEs[i] = CollationFastLatin.BAIL_OUT; 442 continue; 443 } 444 } else { 445 if(pri < CollationFastLatin.MIN_SHORT) { 446 pri = CollationFastLatin.MIN_SHORT; 447 } else if(pri < (CollationFastLatin.MAX_SHORT - CollationFastLatin.SHORT_INC)) { 448 // Reserve the highest primary weight for U+FFFF. 449 pri += CollationFastLatin.SHORT_INC; 450 } else { 451 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 452 printf("short-primary overflow for %08x\n", p); 453 #endif */ 454 shortPrimaryOverflow = true; 455 miniCEs[i] = CollationFastLatin.BAIL_OUT; 456 continue; 457 } 458 } 459 prevPrimary = p; 460 prevSecondary = Collation.COMMON_WEIGHT16; 461 sec = CollationFastLatin.COMMON_SEC; 462 ter = CollationFastLatin.COMMON_TER; 463 } 464 int lower32 = (int)ce; 465 int s = lower32 >>> 16; 466 if(s != prevSecondary) { 467 if(pri == 0) { 468 if(sec == 0) { 469 sec = CollationFastLatin.MIN_SEC_HIGH; 470 } else if(sec < CollationFastLatin.MAX_SEC_HIGH) { 471 sec += CollationFastLatin.SEC_INC; 472 } else { 473 miniCEs[i] = CollationFastLatin.BAIL_OUT; 474 continue; 475 } 476 prevSecondary = s; 477 ter = CollationFastLatin.COMMON_TER; 478 } else if(s < Collation.COMMON_WEIGHT16) { 479 if(sec == CollationFastLatin.COMMON_SEC) { 480 sec = CollationFastLatin.MIN_SEC_BEFORE; 481 } else if(sec < CollationFastLatin.MAX_SEC_BEFORE) { 482 sec += CollationFastLatin.SEC_INC; 483 } else { 484 miniCEs[i] = CollationFastLatin.BAIL_OUT; 485 continue; 486 } 487 } else if(s == Collation.COMMON_WEIGHT16) { 488 sec = CollationFastLatin.COMMON_SEC; 489 } else { 490 if(sec < CollationFastLatin.MIN_SEC_AFTER) { 491 sec = CollationFastLatin.MIN_SEC_AFTER; 492 } else if(sec < CollationFastLatin.MAX_SEC_AFTER) { 493 sec += CollationFastLatin.SEC_INC; 494 } else { 495 miniCEs[i] = CollationFastLatin.BAIL_OUT; 496 continue; 497 } 498 } 499 prevSecondary = s; 500 ter = CollationFastLatin.COMMON_TER; 501 } 502 assert((lower32 & Collation.CASE_MASK) == 0); // blanked out in uniqueCEs 503 int t = lower32 & Collation.ONLY_TERTIARY_MASK; 504 if(t > Collation.COMMON_WEIGHT16) { 505 if(ter < CollationFastLatin.MAX_TER_AFTER) { 506 ++ter; 507 } else { 508 miniCEs[i] = CollationFastLatin.BAIL_OUT; 509 continue; 510 } 511 } 512 if(CollationFastLatin.MIN_LONG <= pri && pri <= CollationFastLatin.MAX_LONG) { 513 assert(sec == CollationFastLatin.COMMON_SEC); 514 miniCEs[i] = (char)(pri | ter); 515 } else { 516 miniCEs[i] = (char)(pri | sec | ter); 517 } 518 } 519 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 520 printf("last mini primary: %04x\n", pri); 521 #endif */ 522 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2 523 for(int i = 0; i < uniqueCEs.size(); ++i) { 524 long ce = uniqueCEs.elementAti(i); 525 printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]); 526 } 527 #endif */ 528 } 529 530 private void encodeCharCEs() { 531 int miniCEsStart = result.length(); 532 for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) { 533 result.append(0); // initialize to completely ignorable 534 } 535 int indexBase = result.length(); 536 for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) { 537 long ce = charCEs[i][0]; 538 if(isContractionCharCE(ce)) { continue; } // defer contraction 539 int miniCE = encodeTwoCEs(ce, charCEs[i][1]); 540 if((miniCE >>> 16) > 0) { // if ((unsigned)miniCE > 0xffff) 541 // Note: There is a chance that this new expansion is the same as a previous one, 542 // and if so, then we could reuse the other expansion. 543 // However, that seems unlikely. 544 int expansionIndex = result.length() - indexBase; 545 if(expansionIndex > CollationFastLatin.INDEX_MASK) { 546 miniCE = CollationFastLatin.BAIL_OUT; 547 } else { 548 result.append((char)(miniCE >> 16)).append((char)miniCE); 549 miniCE = CollationFastLatin.EXPANSION | expansionIndex; 550 } 551 } 552 result.setCharAt(miniCEsStart + i, (char)miniCE); 553 } 554 } 555 556 private void encodeContractions() { 557 // We encode all contraction lists so that the first word of a list 558 // terminates the previous list, and we only need one additional terminator at the end. 559 int indexBase = headerLength + CollationFastLatin.NUM_FAST_CHARS; 560 int firstContractionIndex = result.length(); 561 for(int i = 0; i < CollationFastLatin.NUM_FAST_CHARS; ++i) { 562 long ce = charCEs[i][0]; 563 if(!isContractionCharCE(ce)) { continue; } 564 int contractionIndex = result.length() - indexBase; 565 if(contractionIndex > CollationFastLatin.INDEX_MASK) { 566 result.setCharAt(headerLength + i, (char) CollationFastLatin.BAIL_OUT); 567 continue; 568 } 569 boolean firstTriple = true; 570 for(int index = (int)ce & 0x7fffffff;; index += 3) { 571 long x = contractionCEs.elementAti(index); 572 if(x == CollationFastLatin.CONTR_CHAR_MASK && !firstTriple) { break; } 573 long cce0 = contractionCEs.elementAti(index + 1); 574 long cce1 = contractionCEs.elementAti(index + 2); 575 int miniCE = encodeTwoCEs(cce0, cce1); 576 if(miniCE == CollationFastLatin.BAIL_OUT) { 577 result.append((char)(x | (1 << CollationFastLatin.CONTR_LENGTH_SHIFT))); 578 } else if((miniCE >>> 16) == 0) { // if ((unsigned)miniCE <= 0xffff) 579 result.append((char)(x | (2 << CollationFastLatin.CONTR_LENGTH_SHIFT))); 580 result.append((char)miniCE); 581 } else { 582 result.append((char)(x | (3 << CollationFastLatin.CONTR_LENGTH_SHIFT))); 583 result.append((char)(miniCE >> 16)).append((char)miniCE); 584 } 585 firstTriple = false; 586 } 587 // Note: There is a chance that this new contraction list is the same as a previous one, 588 // and if so, then we could truncate the result and reuse the other list. 589 // However, that seems unlikely. 590 result.setCharAt(headerLength + i, 591 (char)(CollationFastLatin.CONTRACTION | contractionIndex)); 592 } 593 if(result.length() > firstContractionIndex) { 594 // Terminate the last contraction list. 595 result.append((char)CollationFastLatin.CONTR_CHAR_MASK); 596 } 597 /* #if DEBUG_COLLATION_FAST_LATIN_BUILDER 598 printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2); 599 puts(" header & below-digit groups map"); 600 int i = 0; 601 for(; i < headerLength; ++i) { 602 printf(" %04x", result[i]); 603 } 604 printf("\n char mini CEs"); 605 assert(CollationFastLatin.NUM_FAST_CHARS % 16 == 0); 606 for(; i < indexBase; i += 16) { 607 int c = i - headerLength; 608 if(c >= CollationFastLatin.LATIN_LIMIT) { 609 c = CollationFastLatin.PUNCT_START + c - CollationFastLatin.LATIN_LIMIT; 610 } 611 printf("\n %04x:", c); 612 for(int j = 0; j < 16; ++j) { 613 printf(" %04x", result[i + j]); 614 } 615 } 616 printf("\n expansions & contractions"); 617 for(; i < result.length(); ++i) { 618 if((i - indexBase) % 16 == 0) { puts(""); } 619 printf(" %04x", result[i]); 620 } 621 puts(""); 622 #endif */ 623 } 624 625 private int encodeTwoCEs(long first, long second) { 626 if(first == 0) { 627 return 0; // completely ignorable 628 } 629 if(first == Collation.NO_CE) { 630 return CollationFastLatin.BAIL_OUT; 631 } 632 assert((first >>> 32) != Collation.NO_CE_PRIMARY); 633 634 int miniCE = getMiniCE(first); 635 if(miniCE == CollationFastLatin.BAIL_OUT) { return miniCE; } 636 if(miniCE >= CollationFastLatin.MIN_SHORT) { 637 // Extract & copy the case bits. 638 // Shift them from normal CE bits 15..14 to mini CE bits 4..3. 639 int c = (((int)first & Collation.CASE_MASK) >> (14 - 3)); 640 // Only in mini CEs: Ignorable case bits = 0, lowercase = 1. 641 c += CollationFastLatin.LOWER_CASE; 642 miniCE |= c; 643 } 644 if(second == 0) { return miniCE; } 645 646 int miniCE1 = getMiniCE(second); 647 if(miniCE1 == CollationFastLatin.BAIL_OUT) { return miniCE1; } 648 649 int case1 = (int)second & Collation.CASE_MASK; 650 if(miniCE >= CollationFastLatin.MIN_SHORT && 651 (miniCE & CollationFastLatin.SECONDARY_MASK) == CollationFastLatin.COMMON_SEC) { 652 // Try to combine the two mini CEs into one. 653 int sec1 = miniCE1 & CollationFastLatin.SECONDARY_MASK; 654 int ter1 = miniCE1 & CollationFastLatin.TERTIARY_MASK; 655 if(sec1 >= CollationFastLatin.MIN_SEC_HIGH && case1 == 0 && 656 ter1 == CollationFastLatin.COMMON_TER) { 657 // sec1>=sec_high implies pri1==0. 658 return (miniCE & ~CollationFastLatin.SECONDARY_MASK) | sec1; 659 } 660 } 661 662 if(miniCE1 <= CollationFastLatin.SECONDARY_MASK || CollationFastLatin.MIN_SHORT <= miniCE1) { 663 // Secondary CE, or a CE with a short primary, copy the case bits. 664 case1 = (case1 >> (14 - 3)) + CollationFastLatin.LOWER_CASE; 665 miniCE1 |= case1; 666 } 667 return (miniCE << 16) | miniCE1; 668 } 669 670 private static boolean isContractionCharCE(long ce) { 671 return (ce >>> 32) == Collation.NO_CE_PRIMARY && ce != Collation.NO_CE; 672 } 673 674 // space, punct, symbol, currency (not digit) 675 private static final int NUM_SPECIAL_GROUPS = 676 Collator.ReorderCodes.CURRENCY - Collator.ReorderCodes.FIRST + 1; 677 678 private static final long CONTRACTION_FLAG = 0x80000000L; 679 680 // temporary "buffer" 681 private long ce0, ce1; 682 683 private long[][] charCEs = new long[CollationFastLatin.NUM_FAST_CHARS][2]; 684 685 private UVector64 contractionCEs; 686 private UVector64 uniqueCEs; 687 688 /** One 16-bit mini CE per unique CE. */ 689 private char[] miniCEs; 690 691 // These are constant for a given root collator. 692 long[] lastSpecialPrimaries = new long[NUM_SPECIAL_GROUPS]; 693 private long firstDigitPrimary; 694 private long firstLatinPrimary; 695 private long lastLatinPrimary; 696 // This determines the first normal primary weight which is mapped to 697 // a short mini primary. It must be >=firstDigitPrimary. 698 private long firstShortPrimary; 699 700 private boolean shortPrimaryOverflow; 701 702 private StringBuilder result = new StringBuilder(); 703 private int headerLength; 704} 705