1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* Copyright (C) 2013-2015, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* collationfastlatinbuilder.cpp 9* 10* created on: 2013aug09 11* created by: Markus W. Scherer 12*/ 13 14#define DEBUG_COLLATION_FAST_LATIN_BUILDER 0 // 0 or 1 or 2 15#if DEBUG_COLLATION_FAST_LATIN_BUILDER 16#include <stdio.h> 17#include <string> 18#endif 19 20#include "unicode/utypes.h" 21 22#if !UCONFIG_NO_COLLATION 23 24#include "unicode/ucol.h" 25#include "unicode/ucharstrie.h" 26#include "unicode/unistr.h" 27#include "unicode/uobject.h" 28#include "unicode/uscript.h" 29#include "cmemory.h" 30#include "collation.h" 31#include "collationdata.h" 32#include "collationfastlatin.h" 33#include "collationfastlatinbuilder.h" 34#include "uassert.h" 35#include "uvectr64.h" 36 37U_NAMESPACE_BEGIN 38 39struct CollationData; 40 41namespace { 42 43/** 44 * Compare two signed int64_t values as if they were unsigned. 45 */ 46int32_t 47compareInt64AsUnsigned(int64_t a, int64_t b) { 48 if((uint64_t)a < (uint64_t)b) { 49 return -1; 50 } else if((uint64_t)a > (uint64_t)b) { 51 return 1; 52 } else { 53 return 0; 54 } 55} 56 57// TODO: Merge this with the near-identical version in collationbasedatabuilder.cpp 58/** 59 * Like Java Collections.binarySearch(List, String, Comparator). 60 * 61 * @return the index>=0 where the item was found, 62 * or the index<0 for inserting the string at ~index in sorted order 63 */ 64int32_t 65binarySearch(const int64_t list[], int32_t limit, int64_t ce) { 66 if (limit == 0) { return ~0; } 67 int32_t start = 0; 68 for (;;) { 69 int32_t i = (start + limit) / 2; 70 int32_t cmp = compareInt64AsUnsigned(ce, list[i]); 71 if (cmp == 0) { 72 return i; 73 } else if (cmp < 0) { 74 if (i == start) { 75 return ~start; // insert ce before i 76 } 77 limit = i; 78 } else { 79 if (i == start) { 80 return ~(start + 1); // insert ce after i 81 } 82 start = i; 83 } 84 } 85} 86 87} // namespace 88 89CollationFastLatinBuilder::CollationFastLatinBuilder(UErrorCode &errorCode) 90 : ce0(0), ce1(0), 91 contractionCEs(errorCode), uniqueCEs(errorCode), 92 miniCEs(NULL), 93 firstDigitPrimary(0), firstLatinPrimary(0), lastLatinPrimary(0), 94 firstShortPrimary(0), shortPrimaryOverflow(FALSE), 95 headerLength(0) { 96} 97 98CollationFastLatinBuilder::~CollationFastLatinBuilder() { 99 uprv_free(miniCEs); 100} 101 102UBool 103CollationFastLatinBuilder::forData(const CollationData &data, UErrorCode &errorCode) { 104 if(U_FAILURE(errorCode)) { return FALSE; } 105 if(!result.isEmpty()) { // This builder is not reusable. 106 errorCode = U_INVALID_STATE_ERROR; 107 return FALSE; 108 } 109 if(!loadGroups(data, errorCode)) { return FALSE; } 110 111 // Fast handling of digits. 112 firstShortPrimary = firstDigitPrimary; 113 getCEs(data, errorCode); 114 if(!encodeUniqueCEs(errorCode)) { return FALSE; } 115 if(shortPrimaryOverflow) { 116 // Give digits long mini primaries, 117 // so that there are more short primaries for letters. 118 firstShortPrimary = firstLatinPrimary; 119 resetCEs(); 120 getCEs(data, errorCode); 121 if(!encodeUniqueCEs(errorCode)) { return FALSE; } 122 } 123 // Note: If we still have a short-primary overflow but not a long-primary overflow, 124 // then we could calculate how many more long primaries would fit, 125 // and set the firstShortPrimary to that many after the current firstShortPrimary, 126 // and try again. 127 // However, this might only benefit the en_US_POSIX tailoring, 128 // and it is simpler to suppress building fast Latin data for it in genrb, 129 // or by returning FALSE here if shortPrimaryOverflow. 130 131 UBool ok = !shortPrimaryOverflow && 132 encodeCharCEs(errorCode) && encodeContractions(errorCode); 133 contractionCEs.removeAllElements(); // might reduce heap memory usage 134 uniqueCEs.removeAllElements(); 135 return ok; 136} 137 138UBool 139CollationFastLatinBuilder::loadGroups(const CollationData &data, UErrorCode &errorCode) { 140 if(U_FAILURE(errorCode)) { return FALSE; } 141 headerLength = 1 + NUM_SPECIAL_GROUPS; 142 uint32_t r0 = (CollationFastLatin::VERSION << 8) | headerLength; 143 result.append((UChar)r0); 144 // The first few reordering groups should be special groups 145 // (space, punct, ..., digit) followed by Latn, then Grek and other scripts. 146 for(int32_t i = 0; i < NUM_SPECIAL_GROUPS; ++i) { 147 lastSpecialPrimaries[i] = data.getLastPrimaryForGroup(UCOL_REORDER_CODE_FIRST + i); 148 if(lastSpecialPrimaries[i] == 0) { 149 // missing data 150 return FALSE; 151 } 152 result.append((UChar)0); // reserve a slot for this group 153 } 154 155 firstDigitPrimary = data.getFirstPrimaryForGroup(UCOL_REORDER_CODE_DIGIT); 156 firstLatinPrimary = data.getFirstPrimaryForGroup(USCRIPT_LATIN); 157 lastLatinPrimary = data.getLastPrimaryForGroup(USCRIPT_LATIN); 158 if(firstDigitPrimary == 0 || firstLatinPrimary == 0) { 159 // missing data 160 return FALSE; 161 } 162 return TRUE; 163} 164 165UBool 166CollationFastLatinBuilder::inSameGroup(uint32_t p, uint32_t q) const { 167 // Both or neither need to be encoded as short primaries, 168 // so that we can test only one and use the same bit mask. 169 if(p >= firstShortPrimary) { 170 return q >= firstShortPrimary; 171 } else if(q >= firstShortPrimary) { 172 return FALSE; 173 } 174 // Both or neither must be potentially-variable, 175 // so that we can test only one and determine if both are variable. 176 uint32_t lastVariablePrimary = lastSpecialPrimaries[NUM_SPECIAL_GROUPS - 1]; 177 if(p > lastVariablePrimary) { 178 return q > lastVariablePrimary; 179 } else if(q > lastVariablePrimary) { 180 return FALSE; 181 } 182 // Both will be encoded with long mini primaries. 183 // They must be in the same special reordering group, 184 // so that we can test only one and determine if both are variable. 185 U_ASSERT(p != 0 && q != 0); 186 for(int32_t i = 0;; ++i) { // will terminate 187 uint32_t lastPrimary = lastSpecialPrimaries[i]; 188 if(p <= lastPrimary) { 189 return q <= lastPrimary; 190 } else if(q <= lastPrimary) { 191 return FALSE; 192 } 193 } 194} 195 196void 197CollationFastLatinBuilder::resetCEs() { 198 contractionCEs.removeAllElements(); 199 uniqueCEs.removeAllElements(); 200 shortPrimaryOverflow = FALSE; 201 result.truncate(headerLength); 202} 203 204void 205CollationFastLatinBuilder::getCEs(const CollationData &data, UErrorCode &errorCode) { 206 if(U_FAILURE(errorCode)) { return; } 207 int32_t i = 0; 208 for(UChar c = 0;; ++i, ++c) { 209 if(c == CollationFastLatin::LATIN_LIMIT) { 210 c = CollationFastLatin::PUNCT_START; 211 } else if(c == CollationFastLatin::PUNCT_LIMIT) { 212 break; 213 } 214 const CollationData *d; 215 uint32_t ce32 = data.getCE32(c); 216 if(ce32 == Collation::FALLBACK_CE32) { 217 d = data.base; 218 ce32 = d->getCE32(c); 219 } else { 220 d = &data; 221 } 222 if(getCEsFromCE32(*d, c, ce32, errorCode)) { 223 charCEs[i][0] = ce0; 224 charCEs[i][1] = ce1; 225 addUniqueCE(ce0, errorCode); 226 addUniqueCE(ce1, errorCode); 227 } else { 228 // bail out for c 229 charCEs[i][0] = ce0 = Collation::NO_CE; 230 charCEs[i][1] = ce1 = 0; 231 } 232 if(c == 0 && !isContractionCharCE(ce0)) { 233 // Always map U+0000 to a contraction. 234 // Write a contraction list with only a default value if there is no real contraction. 235 U_ASSERT(contractionCEs.isEmpty()); 236 addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode); 237 charCEs[0][0] = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG; 238 charCEs[0][1] = 0; 239 } 240 } 241 // Terminate the last contraction list. 242 contractionCEs.addElement(CollationFastLatin::CONTR_CHAR_MASK, errorCode); 243} 244 245UBool 246CollationFastLatinBuilder::getCEsFromCE32(const CollationData &data, UChar32 c, uint32_t ce32, 247 UErrorCode &errorCode) { 248 if(U_FAILURE(errorCode)) { return FALSE; } 249 ce32 = data.getFinalCE32(ce32); 250 ce1 = 0; 251 if(Collation::isSimpleOrLongCE32(ce32)) { 252 ce0 = Collation::ceFromCE32(ce32); 253 } else { 254 switch(Collation::tagFromCE32(ce32)) { 255 case Collation::LATIN_EXPANSION_TAG: 256 ce0 = Collation::latinCE0FromCE32(ce32); 257 ce1 = Collation::latinCE1FromCE32(ce32); 258 break; 259 case Collation::EXPANSION32_TAG: { 260 const uint32_t *ce32s = data.ce32s + Collation::indexFromCE32(ce32); 261 int32_t length = Collation::lengthFromCE32(ce32); 262 if(length <= 2) { 263 ce0 = Collation::ceFromCE32(ce32s[0]); 264 if(length == 2) { 265 ce1 = Collation::ceFromCE32(ce32s[1]); 266 } 267 break; 268 } else { 269 return FALSE; 270 } 271 } 272 case Collation::EXPANSION_TAG: { 273 const int64_t *ces = data.ces + Collation::indexFromCE32(ce32); 274 int32_t length = Collation::lengthFromCE32(ce32); 275 if(length <= 2) { 276 ce0 = ces[0]; 277 if(length == 2) { 278 ce1 = ces[1]; 279 } 280 break; 281 } else { 282 return FALSE; 283 } 284 } 285 // Note: We could support PREFIX_TAG (assert c>=0) 286 // by recursing on its default CE32 and checking that none of the prefixes starts 287 // with a fast Latin character. 288 // However, currently (2013) there are only the L-before-middle-dot 289 // prefix mappings in the Latin range, and those would be rejected anyway. 290 case Collation::CONTRACTION_TAG: 291 U_ASSERT(c >= 0); 292 return getCEsFromContractionCE32(data, ce32, errorCode); 293 case Collation::OFFSET_TAG: 294 U_ASSERT(c >= 0); 295 ce0 = data.getCEFromOffsetCE32(c, ce32); 296 break; 297 default: 298 return FALSE; 299 } 300 } 301 // A mapping can be completely ignorable. 302 if(ce0 == 0) { return ce1 == 0; } 303 // We do not support an ignorable ce0 unless it is completely ignorable. 304 uint32_t p0 = (uint32_t)(ce0 >> 32); 305 if(p0 == 0) { return FALSE; } 306 // We only support primaries up to the Latin script. 307 if(p0 > lastLatinPrimary) { return FALSE; } 308 // We support non-common secondary and case weights only together with short primaries. 309 uint32_t lower32_0 = (uint32_t)ce0; 310 if(p0 < firstShortPrimary) { 311 uint32_t sc0 = lower32_0 & Collation::SECONDARY_AND_CASE_MASK; 312 if(sc0 != Collation::COMMON_SECONDARY_CE) { return FALSE; } 313 } 314 // No below-common tertiary weights. 315 if((lower32_0 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; } 316 if(ce1 != 0) { 317 // Both primaries must be in the same group, 318 // or both must get short mini primaries, 319 // or a short-primary CE is followed by a secondary CE. 320 // This is so that we can test the first primary and use the same mask for both, 321 // and determine for both whether they are variable. 322 uint32_t p1 = (uint32_t)(ce1 >> 32); 323 if(p1 == 0 ? p0 < firstShortPrimary : !inSameGroup(p0, p1)) { return FALSE; } 324 uint32_t lower32_1 = (uint32_t)ce1; 325 // No tertiary CEs. 326 if((lower32_1 >> 16) == 0) { return FALSE; } 327 // We support non-common secondary and case weights 328 // only for secondary CEs or together with short primaries. 329 if(p1 != 0 && p1 < firstShortPrimary) { 330 uint32_t sc1 = lower32_1 & Collation::SECONDARY_AND_CASE_MASK; 331 if(sc1 != Collation::COMMON_SECONDARY_CE) { return FALSE; } 332 } 333 // No below-common tertiary weights. 334 if((lower32_1 & Collation::ONLY_TERTIARY_MASK) < Collation::COMMON_WEIGHT16) { return FALSE; } 335 } 336 // No quaternary weights. 337 if(((ce0 | ce1) & Collation::QUATERNARY_MASK) != 0) { return FALSE; } 338 return TRUE; 339} 340 341UBool 342CollationFastLatinBuilder::getCEsFromContractionCE32(const CollationData &data, uint32_t ce32, 343 UErrorCode &errorCode) { 344 if(U_FAILURE(errorCode)) { return FALSE; } 345 const UChar *p = data.contexts + Collation::indexFromCE32(ce32); 346 ce32 = CollationData::readCE32(p); // Default if no suffix match. 347 // Since the original ce32 is not a prefix mapping, 348 // the default ce32 must not be another contraction. 349 U_ASSERT(!Collation::isContractionCE32(ce32)); 350 int32_t contractionIndex = contractionCEs.size(); 351 if(getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) { 352 addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, ce0, ce1, errorCode); 353 } else { 354 // Bail out for c-without-contraction. 355 addContractionEntry(CollationFastLatin::CONTR_CHAR_MASK, Collation::NO_CE, 0, errorCode); 356 } 357 // Handle an encodable contraction unless the next contraction is too long 358 // and starts with the same character. 359 int32_t prevX = -1; 360 UBool addContraction = FALSE; 361 UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); 362 while(suffixes.next(errorCode)) { 363 const UnicodeString &suffix = suffixes.getString(); 364 int32_t x = CollationFastLatin::getCharIndex(suffix.charAt(0)); 365 if(x < 0) { continue; } // ignore anything but fast Latin text 366 if(x == prevX) { 367 if(addContraction) { 368 // Bail out for all contractions starting with this character. 369 addContractionEntry(x, Collation::NO_CE, 0, errorCode); 370 addContraction = FALSE; 371 } 372 continue; 373 } 374 if(addContraction) { 375 addContractionEntry(prevX, ce0, ce1, errorCode); 376 } 377 ce32 = (uint32_t)suffixes.getValue(); 378 if(suffix.length() == 1 && getCEsFromCE32(data, U_SENTINEL, ce32, errorCode)) { 379 addContraction = TRUE; 380 } else { 381 addContractionEntry(x, Collation::NO_CE, 0, errorCode); 382 addContraction = FALSE; 383 } 384 prevX = x; 385 } 386 if(addContraction) { 387 addContractionEntry(prevX, ce0, ce1, errorCode); 388 } 389 if(U_FAILURE(errorCode)) { return FALSE; } 390 // Note: There might not be any fast Latin contractions, but 391 // we need to enter contraction handling anyway so that we can bail out 392 // when there is a non-fast-Latin character following. 393 // For example: Danish &Y<<u+umlaut, when we compare Y vs. u\u0308 we need to see the 394 // following umlaut and bail out, rather than return the difference of Y vs. u. 395 ce0 = ((int64_t)Collation::NO_CE_PRIMARY << 32) | CONTRACTION_FLAG | contractionIndex; 396 ce1 = 0; 397 return TRUE; 398} 399 400void 401CollationFastLatinBuilder::addContractionEntry(int32_t x, int64_t cce0, int64_t cce1, 402 UErrorCode &errorCode) { 403 contractionCEs.addElement(x, errorCode); 404 contractionCEs.addElement(cce0, errorCode); 405 contractionCEs.addElement(cce1, errorCode); 406 addUniqueCE(cce0, errorCode); 407 addUniqueCE(cce1, errorCode); 408} 409 410void 411CollationFastLatinBuilder::addUniqueCE(int64_t ce, UErrorCode &errorCode) { 412 if(U_FAILURE(errorCode)) { return; } 413 if(ce == 0 || (uint32_t)(ce >> 32) == Collation::NO_CE_PRIMARY) { return; } 414 ce &= ~(int64_t)Collation::CASE_MASK; // blank out case bits 415 int32_t i = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce); 416 if(i < 0) { 417 uniqueCEs.insertElementAt(ce, ~i, errorCode); 418 } 419} 420 421uint32_t 422CollationFastLatinBuilder::getMiniCE(int64_t ce) const { 423 ce &= ~(int64_t)Collation::CASE_MASK; // blank out case bits 424 int32_t index = binarySearch(uniqueCEs.getBuffer(), uniqueCEs.size(), ce); 425 U_ASSERT(index >= 0); 426 return miniCEs[index]; 427} 428 429UBool 430CollationFastLatinBuilder::encodeUniqueCEs(UErrorCode &errorCode) { 431 if(U_FAILURE(errorCode)) { return FALSE; } 432 uprv_free(miniCEs); 433 miniCEs = (uint16_t *)uprv_malloc(uniqueCEs.size() * 2); 434 if(miniCEs == NULL) { 435 errorCode = U_MEMORY_ALLOCATION_ERROR; 436 return FALSE; 437 } 438 int32_t group = 0; 439 uint32_t lastGroupPrimary = lastSpecialPrimaries[group]; 440 // The lowest unique CE must be at least a secondary CE. 441 U_ASSERT(((uint32_t)uniqueCEs.elementAti(0) >> 16) != 0); 442 uint32_t prevPrimary = 0; 443 uint32_t prevSecondary = 0; 444 uint32_t pri = 0; 445 uint32_t sec = 0; 446 uint32_t ter = CollationFastLatin::COMMON_TER; 447 for(int32_t i = 0; i < uniqueCEs.size(); ++i) { 448 int64_t ce = uniqueCEs.elementAti(i); 449 // Note: At least one of the p/s/t weights changes from one unique CE to the next. 450 // (uniqueCEs does not store case bits.) 451 uint32_t p = (uint32_t)(ce >> 32); 452 if(p != prevPrimary) { 453 while(p > lastGroupPrimary) { 454 U_ASSERT(pri <= CollationFastLatin::MAX_LONG); 455 // Set the group's header entry to the 456 // last "long primary" in or before the group. 457 result.setCharAt(1 + group, (UChar)pri); 458 if(++group < NUM_SPECIAL_GROUPS) { 459 lastGroupPrimary = lastSpecialPrimaries[group]; 460 } else { 461 lastGroupPrimary = 0xffffffff; 462 break; 463 } 464 } 465 if(p < firstShortPrimary) { 466 if(pri == 0) { 467 pri = CollationFastLatin::MIN_LONG; 468 } else if(pri < CollationFastLatin::MAX_LONG) { 469 pri += CollationFastLatin::LONG_INC; 470 } else { 471#if DEBUG_COLLATION_FAST_LATIN_BUILDER 472 printf("long-primary overflow for %08x\n", p); 473#endif 474 miniCEs[i] = CollationFastLatin::BAIL_OUT; 475 continue; 476 } 477 } else { 478 if(pri < CollationFastLatin::MIN_SHORT) { 479 pri = CollationFastLatin::MIN_SHORT; 480 } else if(pri < (CollationFastLatin::MAX_SHORT - CollationFastLatin::SHORT_INC)) { 481 // Reserve the highest primary weight for U+FFFF. 482 pri += CollationFastLatin::SHORT_INC; 483 } else { 484#if DEBUG_COLLATION_FAST_LATIN_BUILDER 485 printf("short-primary overflow for %08x\n", p); 486#endif 487 shortPrimaryOverflow = TRUE; 488 miniCEs[i] = CollationFastLatin::BAIL_OUT; 489 continue; 490 } 491 } 492 prevPrimary = p; 493 prevSecondary = Collation::COMMON_WEIGHT16; 494 sec = CollationFastLatin::COMMON_SEC; 495 ter = CollationFastLatin::COMMON_TER; 496 } 497 uint32_t lower32 = (uint32_t)ce; 498 uint32_t s = lower32 >> 16; 499 if(s != prevSecondary) { 500 if(pri == 0) { 501 if(sec == 0) { 502 sec = CollationFastLatin::MIN_SEC_HIGH; 503 } else if(sec < CollationFastLatin::MAX_SEC_HIGH) { 504 sec += CollationFastLatin::SEC_INC; 505 } else { 506 miniCEs[i] = CollationFastLatin::BAIL_OUT; 507 continue; 508 } 509 prevSecondary = s; 510 ter = CollationFastLatin::COMMON_TER; 511 } else if(s < Collation::COMMON_WEIGHT16) { 512 if(sec == CollationFastLatin::COMMON_SEC) { 513 sec = CollationFastLatin::MIN_SEC_BEFORE; 514 } else if(sec < CollationFastLatin::MAX_SEC_BEFORE) { 515 sec += CollationFastLatin::SEC_INC; 516 } else { 517 miniCEs[i] = CollationFastLatin::BAIL_OUT; 518 continue; 519 } 520 } else if(s == Collation::COMMON_WEIGHT16) { 521 sec = CollationFastLatin::COMMON_SEC; 522 } else { 523 if(sec < CollationFastLatin::MIN_SEC_AFTER) { 524 sec = CollationFastLatin::MIN_SEC_AFTER; 525 } else if(sec < CollationFastLatin::MAX_SEC_AFTER) { 526 sec += CollationFastLatin::SEC_INC; 527 } else { 528 miniCEs[i] = CollationFastLatin::BAIL_OUT; 529 continue; 530 } 531 } 532 prevSecondary = s; 533 ter = CollationFastLatin::COMMON_TER; 534 } 535 U_ASSERT((lower32 & Collation::CASE_MASK) == 0); // blanked out in uniqueCEs 536 uint32_t t = lower32 & Collation::ONLY_TERTIARY_MASK; 537 if(t > Collation::COMMON_WEIGHT16) { 538 if(ter < CollationFastLatin::MAX_TER_AFTER) { 539 ++ter; 540 } else { 541 miniCEs[i] = CollationFastLatin::BAIL_OUT; 542 continue; 543 } 544 } 545 if(CollationFastLatin::MIN_LONG <= pri && pri <= CollationFastLatin::MAX_LONG) { 546 U_ASSERT(sec == CollationFastLatin::COMMON_SEC); 547 miniCEs[i] = (uint16_t)(pri | ter); 548 } else { 549 miniCEs[i] = (uint16_t)(pri | sec | ter); 550 } 551 } 552#if DEBUG_COLLATION_FAST_LATIN_BUILDER 553 printf("last mini primary: %04x\n", pri); 554#endif 555#if DEBUG_COLLATION_FAST_LATIN_BUILDER >= 2 556 for(int32_t i = 0; i < uniqueCEs.size(); ++i) { 557 int64_t ce = uniqueCEs.elementAti(i); 558 printf("unique CE 0x%016lx -> 0x%04x\n", ce, miniCEs[i]); 559 } 560#endif 561 return U_SUCCESS(errorCode); 562} 563 564UBool 565CollationFastLatinBuilder::encodeCharCEs(UErrorCode &errorCode) { 566 if(U_FAILURE(errorCode)) { return FALSE; } 567 int32_t miniCEsStart = result.length(); 568 for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) { 569 result.append((UChar)0); // initialize to completely ignorable 570 } 571 int32_t indexBase = result.length(); 572 for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) { 573 int64_t ce = charCEs[i][0]; 574 if(isContractionCharCE(ce)) { continue; } // defer contraction 575 uint32_t miniCE = encodeTwoCEs(ce, charCEs[i][1]); 576 if(miniCE > 0xffff) { 577 // Note: There is a chance that this new expansion is the same as a previous one, 578 // and if so, then we could reuse the other expansion. 579 // However, that seems unlikely. 580 int32_t expansionIndex = result.length() - indexBase; 581 if(expansionIndex > (int32_t)CollationFastLatin::INDEX_MASK) { 582 miniCE = CollationFastLatin::BAIL_OUT; 583 } else { 584 result.append((UChar)(miniCE >> 16)).append((UChar)miniCE); 585 miniCE = CollationFastLatin::EXPANSION | expansionIndex; 586 } 587 } 588 result.setCharAt(miniCEsStart + i, (UChar)miniCE); 589 } 590 return U_SUCCESS(errorCode); 591} 592 593UBool 594CollationFastLatinBuilder::encodeContractions(UErrorCode &errorCode) { 595 // We encode all contraction lists so that the first word of a list 596 // terminates the previous list, and we only need one additional terminator at the end. 597 if(U_FAILURE(errorCode)) { return FALSE; } 598 int32_t indexBase = headerLength + CollationFastLatin::NUM_FAST_CHARS; 599 int32_t firstContractionIndex = result.length(); 600 for(int32_t i = 0; i < CollationFastLatin::NUM_FAST_CHARS; ++i) { 601 int64_t ce = charCEs[i][0]; 602 if(!isContractionCharCE(ce)) { continue; } 603 int32_t contractionIndex = result.length() - indexBase; 604 if(contractionIndex > (int32_t)CollationFastLatin::INDEX_MASK) { 605 result.setCharAt(headerLength + i, CollationFastLatin::BAIL_OUT); 606 continue; 607 } 608 UBool firstTriple = TRUE; 609 for(int32_t index = (int32_t)ce & 0x7fffffff;; index += 3) { 610 int32_t x = contractionCEs.elementAti(index); 611 if((uint32_t)x == CollationFastLatin::CONTR_CHAR_MASK && !firstTriple) { break; } 612 int64_t cce0 = contractionCEs.elementAti(index + 1); 613 int64_t cce1 = contractionCEs.elementAti(index + 2); 614 uint32_t miniCE = encodeTwoCEs(cce0, cce1); 615 if(miniCE == CollationFastLatin::BAIL_OUT) { 616 result.append((UChar)(x | (1 << CollationFastLatin::CONTR_LENGTH_SHIFT))); 617 } else if(miniCE <= 0xffff) { 618 result.append((UChar)(x | (2 << CollationFastLatin::CONTR_LENGTH_SHIFT))); 619 result.append((UChar)miniCE); 620 } else { 621 result.append((UChar)(x | (3 << CollationFastLatin::CONTR_LENGTH_SHIFT))); 622 result.append((UChar)(miniCE >> 16)).append((UChar)miniCE); 623 } 624 firstTriple = FALSE; 625 } 626 // Note: There is a chance that this new contraction list is the same as a previous one, 627 // and if so, then we could truncate the result and reuse the other list. 628 // However, that seems unlikely. 629 result.setCharAt(headerLength + i, 630 (UChar)(CollationFastLatin::CONTRACTION | contractionIndex)); 631 } 632 if(result.length() > firstContractionIndex) { 633 // Terminate the last contraction list. 634 result.append((UChar)CollationFastLatin::CONTR_CHAR_MASK); 635 } 636 if(result.isBogus()) { 637 errorCode = U_MEMORY_ALLOCATION_ERROR; 638 return FALSE; 639 } 640#if DEBUG_COLLATION_FAST_LATIN_BUILDER 641 printf("** fast Latin %d * 2 = %d bytes\n", result.length(), result.length() * 2); 642 puts(" header & below-digit groups map"); 643 int32_t i = 0; 644 for(; i < headerLength; ++i) { 645 printf(" %04x", result[i]); 646 } 647 printf("\n char mini CEs"); 648 U_ASSERT(CollationFastLatin::NUM_FAST_CHARS % 16 == 0); 649 for(; i < indexBase; i += 16) { 650 UChar32 c = i - headerLength; 651 if(c >= CollationFastLatin::LATIN_LIMIT) { 652 c = CollationFastLatin::PUNCT_START + c - CollationFastLatin::LATIN_LIMIT; 653 } 654 printf("\n %04x:", c); 655 for(int32_t j = 0; j < 16; ++j) { 656 printf(" %04x", result[i + j]); 657 } 658 } 659 printf("\n expansions & contractions"); 660 for(; i < result.length(); ++i) { 661 if((i - indexBase) % 16 == 0) { puts(""); } 662 printf(" %04x", result[i]); 663 } 664 puts(""); 665#endif 666 return TRUE; 667} 668 669uint32_t 670CollationFastLatinBuilder::encodeTwoCEs(int64_t first, int64_t second) const { 671 if(first == 0) { 672 return 0; // completely ignorable 673 } 674 if(first == Collation::NO_CE) { 675 return CollationFastLatin::BAIL_OUT; 676 } 677 U_ASSERT((uint32_t)(first >> 32) != Collation::NO_CE_PRIMARY); 678 679 uint32_t miniCE = getMiniCE(first); 680 if(miniCE == CollationFastLatin::BAIL_OUT) { return miniCE; } 681 if(miniCE >= CollationFastLatin::MIN_SHORT) { 682 // Extract & copy the case bits. 683 // Shift them from normal CE bits 15..14 to mini CE bits 4..3. 684 uint32_t c = (((uint32_t)first & Collation::CASE_MASK) >> (14 - 3)); 685 // Only in mini CEs: Ignorable case bits = 0, lowercase = 1. 686 c += CollationFastLatin::LOWER_CASE; 687 miniCE |= c; 688 } 689 if(second == 0) { return miniCE; } 690 691 uint32_t miniCE1 = getMiniCE(second); 692 if(miniCE1 == CollationFastLatin::BAIL_OUT) { return miniCE1; } 693 694 uint32_t case1 = (uint32_t)second & Collation::CASE_MASK; 695 if(miniCE >= CollationFastLatin::MIN_SHORT && 696 (miniCE & CollationFastLatin::SECONDARY_MASK) == CollationFastLatin::COMMON_SEC) { 697 // Try to combine the two mini CEs into one. 698 uint32_t sec1 = miniCE1 & CollationFastLatin::SECONDARY_MASK; 699 uint32_t ter1 = miniCE1 & CollationFastLatin::TERTIARY_MASK; 700 if(sec1 >= CollationFastLatin::MIN_SEC_HIGH && case1 == 0 && 701 ter1 == CollationFastLatin::COMMON_TER) { 702 // sec1>=sec_high implies pri1==0. 703 return (miniCE & ~CollationFastLatin::SECONDARY_MASK) | sec1; 704 } 705 } 706 707 if(miniCE1 <= CollationFastLatin::SECONDARY_MASK || CollationFastLatin::MIN_SHORT <= miniCE1) { 708 // Secondary CE, or a CE with a short primary, copy the case bits. 709 case1 = (case1 >> (14 - 3)) + CollationFastLatin::LOWER_CASE; 710 miniCE1 |= case1; 711 } 712 return (miniCE << 16) | miniCE1; 713} 714 715U_NAMESPACE_END 716 717#endif // !UCONFIG_NO_COLLATION 718