// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* collationsets.cpp
*
* created on: 2013feb09
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ucharstrie.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/ustringtrie.h"
#include "collation.h"
#include "collationdata.h"
#include "collationsets.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "utf16collationiterator.h"
#include "utrie2.h"

U_NAMESPACE_BEGIN

U_CDECL_BEGIN

// Range callback passed to utrie2_enum() by TailoredSet::forData().
// context is the TailoredSet being filled (passed as "this" by forData()).
// Ranges whose value is FALLBACK_CE32 are skipped and TRUE is returned;
// every other range is forwarded to TailoredSet::handleCE32(),
// whose result (U_SUCCESS of the set's errorCode) becomes the return value.
static UBool U_CALLCONV
enumTailoredRange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
    if(ce32 == Collation::FALLBACK_CE32) {
        return TRUE;  // fallback to base, not tailored
    }
    TailoredSet *ts = (TailoredSet *)context;
    return ts->handleCE32(start, end, ce32);
}

U_CDECL_END

// Enumerates the trie of the tailoring data d and compares each non-fallback
// mapping with the corresponding mapping in d->base,
// adding differing code points and strings to the tailored set.
// Does nothing if ec is already a failure code.
// d->base must not be NULL (asserted).
// On return, ec receives the internal errorCode.
void
TailoredSet::forData(const CollationData *d, UErrorCode &ec) {
    if(U_FAILURE(ec)) { return; }
    errorCode = ec;  // Preserve info & warning codes.
    data = d;
    baseData = d->base;
    U_ASSERT(baseData != NULL);
    utrie2_enum(data->trie, NULL, enumTailoredRange, this);
    ec = errorCode;
}

// Handles one trie range [start, end] that maps to ce32 in the tailoring data.
// A special ce32 is first resolved through data->getIndirectCE32();
// if that yields FALLBACK_CE32 the range is not tailored and is skipped.
// Otherwise each code point in the range is compared with its base mapping.
// Returns U_SUCCESS(errorCode).
UBool
TailoredSet::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
    U_ASSERT(ce32 != Collation::FALLBACK_CE32);
    if(Collation::isSpecialCE32(ce32)) {
        ce32 = data->getIndirectCE32(ce32);
        if(ce32 == Collation::FALLBACK_CE32) {
            return U_SUCCESS(errorCode);
        }
    }
    do {
        uint32_t baseCE32 = baseData->getFinalCE32(baseData->getCE32(start));
        // Do not just continue if ce32 == baseCE32 because
        // contractions and expansions in different data objects
        // normally differ even if they have the same data offsets.
        if(Collation::isSelfContainedCE32(ce32) && Collation::isSelfContainedCE32(baseCE32)) {
            // fastpath
            if(ce32 != baseCE32) {
                tailored->add(start);
            }
        } else {
            compare(start, ce32, baseCE32);
        }
    } while(++start <= end);
    return U_SUCCESS(errorCode);
}

// Compares the tailoring mapping ce32 with the base mapping baseCE32 for c
// and records differences in the tailored set.
//
// Steps, in order:
// 1. Prefix CE32s: the default CE32 is read from the contexts array and
//    the prefix tries (at p + 2 / q + 2) are either compared pairwise
//    (both sides have prefixes) or added wholesale (only one side has them).
// 2. Contraction CE32s: handled the same way for suffixes.
// 3. The remaining (non-contextual) CE32s are compared by tag and contents;
//    on a difference, add(c) records c together with the current
//    prefix/suffix context.
void
TailoredSet::compare(UChar32 c, uint32_t ce32, uint32_t baseCE32) {
    if(Collation::isPrefixCE32(ce32)) {
        const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
        ce32 = data->getFinalCE32(CollationData::readCE32(p));
        if(Collation::isPrefixCE32(baseCE32)) {
            const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
            baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
            comparePrefixes(c, p + 2, q + 2);
        } else {
            // Only the tailoring has prefixes: all of them are tailored.
            addPrefixes(data, c, p + 2);
        }
    } else if(Collation::isPrefixCE32(baseCE32)) {
        // Only the base has prefixes: all of them count as differences.
        const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
        baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
        addPrefixes(baseData, c, q + 2);
    }

    if(Collation::isContractionCE32(ce32)) {
        const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
        // With CONTRACT_SINGLE_CP_NO_MATCH set there is no default mapping
        // to compare for the single code point; use NO_CE32 instead.
        if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
            ce32 = Collation::NO_CE32;
        } else {
            ce32 = data->getFinalCE32(CollationData::readCE32(p));
        }
        if(Collation::isContractionCE32(baseCE32)) {
            const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
            if((baseCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
                baseCE32 = Collation::NO_CE32;
            } else {
                baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
            }
            compareContractions(c, p + 2, q + 2);
        } else {
            // Only the tailoring has contractions: all of them are tailored.
            addContractions(c, p + 2);
        }
    } else if(Collation::isContractionCE32(baseCE32)) {
        // Only the base has contractions: all of them count as differences.
        const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
        baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
        addContractions(c, q + 2);
    }

    // tag / baseTag: the special-CE32 tag, or -1 for a non-special CE32.
    int32_t tag;
    if(Collation::isSpecialCE32(ce32)) {
        tag = Collation::tagFromCE32(ce32);
        U_ASSERT(tag != Collation::PREFIX_TAG);
        U_ASSERT(tag != Collation::CONTRACTION_TAG);
        // Currently, the tailoring data builder does not write offset tags.
        // They might be useful for saving space,
        // but they would complicate the builder,
        // and in tailorings we assume that performance of tailored characters is more important.
        U_ASSERT(tag != Collation::OFFSET_TAG);
    } else {
        tag = -1;
    }
    int32_t baseTag;
    if(Collation::isSpecialCE32(baseCE32)) {
        baseTag = Collation::tagFromCE32(baseCE32);
        U_ASSERT(baseTag != Collation::PREFIX_TAG);
        U_ASSERT(baseTag != Collation::CONTRACTION_TAG);
    } else {
        baseTag = -1;
    }

    // Non-contextual mappings, expansions, etc.
    if(baseTag == Collation::OFFSET_TAG) {
        // We might be comparing a tailoring CE which is a copy of
        // a base offset-tag CE, via the [optimize [set]] syntax
        // or when a single-character mapping was copied for tailored contractions.
        // Offset tags always result in long-primary CEs,
        // with common secondary/tertiary weights.
        if(!Collation::isLongPrimaryCE32(ce32)) {
            add(c);
            return;
        }
        int64_t dataCE = baseData->ces[Collation::indexFromCE32(baseCE32)];
        uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
        if(Collation::primaryFromLongPrimaryCE32(ce32) != p) {
            add(c);
            return;
        }
        // No return here: execution continues with the tag comparison below.
    }

    if(tag != baseTag) {
        add(c);
        return;
    }

    if(tag == Collation::EXPANSION32_TAG) {
        // Compare the two CE32 sequences element by element.
        const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
        int32_t length = Collation::lengthFromCE32(ce32);

        const uint32_t *baseCE32s = baseData->ce32s + Collation::indexFromCE32(baseCE32);
        int32_t baseLength = Collation::lengthFromCE32(baseCE32);

        if(length != baseLength) {
            add(c);
            return;
        }
        for(int32_t i = 0; i < length; ++i) {
            if(ce32s[i] != baseCE32s[i]) {
                add(c);
                break;
            }
        }
    } else if(tag == Collation::EXPANSION_TAG) {
        // Compare the two 64-bit CE sequences element by element.
        const int64_t *ces = data->ces + Collation::indexFromCE32(ce32);
        int32_t length = Collation::lengthFromCE32(ce32);

        const int64_t *baseCEs = baseData->ces + Collation::indexFromCE32(baseCE32);
        int32_t baseLength = Collation::lengthFromCE32(baseCE32);

        if(length != baseLength) {
            add(c);
            return;
        }
        for(int32_t i = 0; i < length; ++i) {
            if(ces[i] != baseCEs[i]) {
                add(c);
                break;
            }
        }
    } else if(tag == Collation::HANGUL_TAG) {
        // c is added if any of the Jamo it decomposes into
        // is already in the tailored set.
        UChar jamos[3];
        int32_t length = Hangul::decompose(c, jamos);
        if(tailored->contains(jamos[0]) || tailored->contains(jamos[1]) ||
                (length == 3 && tailored->contains(jamos[2]))) {
            add(c);
        }
    } else if(ce32 != baseCE32) {
        add(c);
    }
}

// Walks the tailoring prefix trie (p) and the base prefix trie (q) side by side.
// A prefix present on only one side is added via addPrefix();
// a prefix present on both sides has its two values compared via compare()
// while that prefix is set as the current context.
// NOTE(review): the merge presumably relies on both iterators returning
// strings in the order used by UnicodeString::compare() -- confirm.
void
TailoredSet::comparePrefixes(UChar32 c, const UChar *p, const UChar *q) {
    // Parallel iteration over prefixes of both tables.
    UCharsTrie::Iterator prefixes(p, 0, errorCode);
    UCharsTrie::Iterator basePrefixes(q, 0, errorCode);
    const UnicodeString *tp = NULL;  // Tailoring prefix.
    const UnicodeString *bp = NULL;  // Base prefix.
    // Use a string with a U+FFFF as the limit sentinel.
    // U+FFFF is untailorable and will not occur in prefixes.
    UnicodeString none((UChar)0xffff);
    for(;;) {
        // A NULL pointer means "fetch the next string from that iterator";
        // &none means that iterator is exhausted.
        if(tp == NULL) {
            if(prefixes.next(errorCode)) {
                tp = &prefixes.getString();
            } else {
                tp = &none;
            }
        }
        if(bp == NULL) {
            if(basePrefixes.next(errorCode)) {
                bp = &basePrefixes.getString();
            } else {
                bp = &none;
            }
        }
        if(tp == &none && bp == &none) { break; }
        int32_t cmp = tp->compare(*bp);
        if(cmp < 0) {
            // tp occurs in the tailoring but not in the base.
            addPrefix(data, *tp, c, (uint32_t)prefixes.getValue());
            tp = NULL;
        } else if(cmp > 0) {
            // bp occurs in the base but not in the tailoring.
            addPrefix(baseData, *bp, c, (uint32_t)basePrefixes.getValue());
            bp = NULL;
        } else {
            // Same prefix on both sides: compare the mappings under it.
            setPrefix(*tp);
            compare(c, (uint32_t)prefixes.getValue(), (uint32_t)basePrefixes.getValue());
            resetPrefix();
            tp = NULL;
            bp = NULL;
        }
    }
}

// Walks the tailoring suffix trie (p) and the base suffix trie (q) side by side.
// A suffix present on only one side is added via addSuffix();
// a suffix present on both sides has its two values compared via compare()
// while the member "suffix" points to it.
// NOTE(review): as in comparePrefixes(), this presumably relies on both
// iterators returning strings in UnicodeString::compare() order -- confirm.
void
TailoredSet::compareContractions(UChar32 c, const UChar *p, const UChar *q) {
    // Parallel iteration over suffixes of both tables.
    UCharsTrie::Iterator suffixes(p, 0, errorCode);
    UCharsTrie::Iterator baseSuffixes(q, 0, errorCode);
    const UnicodeString *ts = NULL;  // Tailoring suffix.
    const UnicodeString *bs = NULL;  // Base suffix.
    // Use a string with two U+FFFF as the limit sentinel.
    // U+FFFF is untailorable and will not occur in contractions except maybe
    // as a single suffix character for a root-collator boundary contraction.
    UnicodeString none((UChar)0xffff);
    none.append((UChar)0xffff);
    for(;;) {
        // A NULL pointer means "fetch the next string from that iterator";
        // &none means that iterator is exhausted.
        if(ts == NULL) {
            if(suffixes.next(errorCode)) {
                ts = &suffixes.getString();
            } else {
                ts = &none;
            }
        }
        if(bs == NULL) {
            if(baseSuffixes.next(errorCode)) {
                bs = &baseSuffixes.getString();
            } else {
                bs = &none;
            }
        }
        if(ts == &none && bs == &none) { break; }
        int32_t cmp = ts->compare(*bs);
        if(cmp < 0) {
            // ts occurs in the tailoring but not in the base.
            addSuffix(c, *ts);
            ts = NULL;
        } else if(cmp > 0) {
            // bs occurs in the base but not in the tailoring.
            addSuffix(c, *bs);
            bs = NULL;
        } else {
            // Same suffix on both sides: compare the mappings under it.
            suffix = ts;
            compare(c, (uint32_t)suffixes.getValue(), (uint32_t)baseSuffixes.getValue());
            suffix = NULL;
            ts = NULL;
            bs = NULL;
        }
    }
}

// Adds every prefix in the prefix trie at p (belonging to data object d)
// for code point c, via addPrefix().
void
TailoredSet::addPrefixes(const CollationData *d, UChar32 c, const UChar *p) {
    UCharsTrie::Iterator prefixes(p, 0, errorCode);
    while(prefixes.next(errorCode)) {
        addPrefix(d, prefixes.getString(), c, (uint32_t)prefixes.getValue());
    }
}

// Adds the string unreversedPrefix + c to the tailored set, with pfx set as
// the current prefix for the duration of the call.
// If the prefix's value ce32 (resolved via d->getFinalCE32()) is a contraction,
// all of its suffix strings are added first.
// NOTE(review): unreversedPrefix is presumably derived from pfx by setPrefix(),
// which is defined outside this file -- confirm.
void
TailoredSet::addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32) {
    setPrefix(pfx);
    ce32 = d->getFinalCE32(ce32);
    if(Collation::isContractionCE32(ce32)) {
        const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
        addContractions(c, p + 2);
    }
    tailored->add(UnicodeString(unreversedPrefix).append(c));
    resetPrefix();
}

// Adds every suffix in the suffix trie at p for code point c, via addSuffix().
void
TailoredSet::addContractions(UChar32 c, const UChar *p) {
    UCharsTrie::Iterator suffixes(p, 0, errorCode);
    while(suffixes.next(errorCode)) {
        addSuffix(c, suffixes.getString());
    }
}

// Adds the string unreversedPrefix + c + sfx to the tailored set.
void
TailoredSet::addSuffix(UChar32 c, const UnicodeString &sfx) {
    tailored->add(UnicodeString(unreversedPrefix).append(c).append(sfx));
}

// Adds c to the tailored set in its current context:
// as a single code point when there is neither a prefix nor a suffix,
// otherwise as the string unreversedPrefix + c (+ *suffix if set).
void
TailoredSet::add(UChar32 c) {
    if(unreversedPrefix.isEmpty() && suffix == NULL) {
        tailored->add(c);
    } else {
        UnicodeString s(unreversedPrefix);
        s.append(c);
        if(suffix != NULL) {
            s.append(*suffix);
        }
        tailored->add(s);
    }
}

// Out-of-line definition of the CESink destructor; it has an empty body.
ContractionsAndExpansions::CESink::~CESink() {}

U_CDECL_BEGIN

// Range callback passed to utrie2_enum() by ContractionsAndExpansions::forData().
// context is the ContractionsAndExpansions object (passed as "this").
// Behavior depends on cne->checkTailored:
//   == 0: the range is handled as is.
//    < 0: fallback ranges are skipped (returns TRUE); other ranges are
//         recorded in cne->tailored and then handled.
//    > 0: code points already in cne->tailored are excluded; a range that
//         overlaps cne->tailored is split into its untailored sub-ranges.
// Returns TRUE for skipped ranges, otherwise U_SUCCESS(cne->errorCode).
static UBool U_CALLCONV
enumCnERange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
    ContractionsAndExpansions *cne = (ContractionsAndExpansions *)context;
    if(cne->checkTailored == 0) {
        // There is no tailoring.
        // No need to collect nor check the tailored set.
    } else if(cne->checkTailored < 0) {
        // Collect the set of code points with mappings in the tailoring data.
        if(ce32 == Collation::FALLBACK_CE32) {
            return TRUE;  // fallback to base, not tailored
        } else {
            cne->tailored.add(start, end);
        }
        // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
    } else if(start == end) {
        if(cne->tailored.contains(start)) {
            return TRUE;
        }
    } else if(cne->tailored.containsSome(start, end)) {
        // cne->ranges is used as scratch space: [start, end] minus the tailored set.
        cne->ranges.set(start, end).removeAll(cne->tailored);
        int32_t count = cne->ranges.getRangeCount();
        for(int32_t i = 0; i < count; ++i) {
            cne->handleCE32(cne->ranges.getRangeStart(i), cne->ranges.getRangeEnd(i), ce32);
        }
        return U_SUCCESS(cne->errorCode);
    }
    cne->handleCE32(start, end, ce32);
    return U_SUCCESS(cne->errorCode);
}

U_CDECL_END

// Enumerates all mappings of d and, if d has a base, all base mappings
// for code points that are not mapped in d itself.
// Does nothing if ec is already a failure code.
// On return, ec receives the internal errorCode.
void
ContractionsAndExpansions::forData(const CollationData *d, UErrorCode &ec) {
    if(U_FAILURE(ec)) { return; }
    errorCode = ec;  // Preserve info & warning codes.
    // Add all from the data, can be tailoring or base.
    if(d->base != NULL) {
        // First pass over a tailoring: also collect the tailored code points.
        checkTailored = -1;
    }
    data = d;
    utrie2_enum(data->trie, NULL, enumCnERange, this);
    if(d->base == NULL || U_FAILURE(errorCode)) {
        ec = errorCode;
        return;
    }
    // Add all from the base data but only for un-tailored code points.
    tailored.freeze();
    checkTailored = 1;
    data = d->base;
    utrie2_enum(data->trie, NULL, enumCnERange, this);
    ec = errorCode;
}

// Handles the mapping of the single code point c.
// If d maps c to FALLBACK_CE32, the mapping is looked up in d->base instead.
// NOTE(review): d->base is dereferenced without a NULL check in that case;
// presumably only tailoring data contains FALLBACK_CE32 -- confirm.
// Does nothing if ec is already a failure code.
// On return, ec receives the internal errorCode.
void
ContractionsAndExpansions::forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec) {
    if(U_FAILURE(ec)) { return; }
    errorCode = ec;  // Preserve info & warning codes.
    uint32_t ce32 = d->getCE32(c);
    if(ce32 == Collation::FALLBACK_CE32) {
        d = d->base;
        ce32 = d->getCE32(c);
    }
    data = d;
    handleCE32(c, c, ce32);
    ec = errorCode;
}

// Processes the mapping ce32 for the code point range [start, end].
// CEs and expansions are reported to sink (if not NULL); ranges or strings
// with expansions are recorded via addExpansions(); prefix and contraction
// CE32s are delegated to handlePrefixes() / handleContractions().
// The loop repeats only for tags that redirect to another CE32
// (DIGIT_TAG, U0000_TAG); every other case returns.
void
ContractionsAndExpansions::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
    for(;;) {
        if((ce32 & 0xff) < Collation::SPECIAL_CE32_LOW_BYTE) {
            // !isSpecialCE32()
            if(sink != NULL) {
                sink->handleCE(Collation::ceFromSimpleCE32(ce32));
            }
            return;
        }
        switch(Collation::tagFromCE32(ce32)) {
        case Collation::FALLBACK_TAG:
            return;
        case Collation::RESERVED_TAG_3:
        case Collation::BUILDER_DATA_TAG:
        case Collation::LEAD_SURROGATE_TAG:
            // These tags are treated as an internal error here;
            // an existing failure code is not overwritten.
            if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; }
            return;
        case Collation::LONG_PRIMARY_TAG:
            if(sink != NULL) {
                sink->handleCE(Collation::ceFromLongPrimaryCE32(ce32));
            }
            return;
        case Collation::LONG_SECONDARY_TAG:
            if(sink != NULL) {
                sink->handleCE(Collation::ceFromLongSecondaryCE32(ce32));
            }
            return;
        case Collation::LATIN_EXPANSION_TAG:
            if(sink != NULL) {
                ces[0] = Collation::latinCE0FromCE32(ce32);
                ces[1] = Collation::latinCE1FromCE32(ce32);
                sink->handleExpansion(ces, 2);
            }
            // Optimization: If we have a prefix,
            // then the relevant strings have been added already.
            if(unreversedPrefix.isEmpty()) {
                addExpansions(start, end);
            }
            return;
        case Collation::EXPANSION32_TAG:
            if(sink != NULL) {
                // Convert the CE32 sequence into 64-bit CEs in the ces buffer.
                const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
                int32_t length = Collation::lengthFromCE32(ce32);
                for(int32_t i = 0; i < length; ++i) {
                    ces[i] = Collation::ceFromCE32(*ce32s++);
                }
                sink->handleExpansion(ces, length);
            }
            // Optimization: If we have a prefix,
            // then the relevant strings have been added already.
            if(unreversedPrefix.isEmpty()) {
                addExpansions(start, end);
            }
            return;
        case Collation::EXPANSION_TAG:
            if(sink != NULL) {
                int32_t length = Collation::lengthFromCE32(ce32);
                sink->handleExpansion(data->ces + Collation::indexFromCE32(ce32), length);
            }
            // Optimization: If we have a prefix,
            // then the relevant strings have been added already.
            if(unreversedPrefix.isEmpty()) {
                addExpansions(start, end);
            }
            return;
        case Collation::PREFIX_TAG:
            handlePrefixes(start, end, ce32);
            return;
        case Collation::CONTRACTION_TAG:
            handleContractions(start, end, ce32);
            return;
        case Collation::DIGIT_TAG:
            // Fetch the non-numeric-collation CE32 and continue.
            ce32 = data->ce32s[Collation::indexFromCE32(ce32)];
            break;
        case Collation::U0000_TAG:
            U_ASSERT(start == 0 && end == 0);
            // Fetch the normal ce32 for U+0000 and continue.
            ce32 = data->ce32s[0];
            break;
        case Collation::HANGUL_TAG:
            if(sink != NULL) {
                // TODO: This should be optimized,
                // especially if [start..end] is the complete Hangul range. (assert that)
                // Each code point in the range is run through a collation
                // iterator as a one-unit UTF-16 string to obtain its CEs.
                UTF16CollationIterator iter(data, FALSE, NULL, NULL, NULL);
                UChar hangul[1] = { 0 };
                for(UChar32 c = start; c <= end; ++c) {
                    hangul[0] = (UChar)c;
                    iter.setText(hangul, hangul + 1);
                    int32_t length = iter.fetchCEs(errorCode);
                    if(U_FAILURE(errorCode)) { return; }
                    // Ignore the terminating non-CE.
                    U_ASSERT(length >= 2 && iter.getCE(length - 1) == Collation::NO_CE);
                    sink->handleExpansion(iter.getCEs(), length - 1);
                }
            }
            // Optimization: If we have a prefix,
            // then the relevant strings have been added already.
            if(unreversedPrefix.isEmpty()) {
                addExpansions(start, end);
            }
            return;
        case Collation::OFFSET_TAG:
            // Currently no need to send offset CEs to the sink.
            return;
        case Collation::IMPLICIT_TAG:
            // Currently no need to send implicit CEs to the sink.
            return;
        }
    }
}

// Handles a prefix CE32 for [start, end]:
// first the default mapping (used when no prefix matches), then, if the
// addPrefixes flag is set, each prefix in the trie at p + 2.
// For each prefix, the prefixed strings are added to both the contractions
// and the expansions sets before the prefix's own mapping is handled.
void
ContractionsAndExpansions::handlePrefixes(
        UChar32 start, UChar32 end, uint32_t ce32) {
    const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
    ce32 = CollationData::readCE32(p);  // Default if no prefix match.
    handleCE32(start, end, ce32);
    if(!addPrefixes) { return; }
    UCharsTrie::Iterator prefixes(p + 2, 0, errorCode);
    while(prefixes.next(errorCode)) {
        setPrefix(prefixes.getString());
        // Prefix/pre-context mappings are special kinds of contractions
        // that always yield expansions.
        addStrings(start, end, contractions);
        addStrings(start, end, expansions);
        handleCE32(start, end, (uint32_t)prefixes.getValue());
    }
    resetPrefix();
}

// Handles a contraction CE32 for [start, end]:
// first the default mapping (unless CONTRACT_SINGLE_CP_NO_MATCH is set),
// then each suffix in the trie at p + 2.
// While a suffix is processed, the member "suffix" points to the iterator's
// current string; it is reset to NULL when the loop is done.
void
ContractionsAndExpansions::handleContractions(
        UChar32 start, UChar32 end, uint32_t ce32) {
    const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
    if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
        // No match on the single code point.
        // We are underneath a prefix, and the default mapping is just
        // a fallback to the mappings for a shorter prefix.
        U_ASSERT(!unreversedPrefix.isEmpty());
    } else {
        ce32 = CollationData::readCE32(p);  // Default if no suffix match.
        U_ASSERT(!Collation::isContractionCE32(ce32));
        handleCE32(start, end, ce32);
    }
    UCharsTrie::Iterator suffixes(p + 2, 0, errorCode);
    while(suffixes.next(errorCode)) {
        suffix = &suffixes.getString();
        addStrings(start, end, contractions);
        // Under a prefix, the contraction strings are also added to expansions.
        if(!unreversedPrefix.isEmpty()) {
            addStrings(start, end, expansions);
        }
        handleCE32(start, end, (uint32_t)suffixes.getValue());
    }
    suffix = NULL;
}

// Records [start, end] as having expansions:
// as a code point range when there is neither a prefix nor a suffix,
// otherwise as one string per code point via addStrings().
// Does nothing for the range case if the expansions set is NULL.
void
ContractionsAndExpansions::addExpansions(UChar32 start, UChar32 end) {
    if(unreversedPrefix.isEmpty() && suffix == NULL) {
        if(expansions != NULL) {
            expansions->add(start, end);
        }
    } else {
        addStrings(start, end, expansions);
    }
}

// For each code point cp in [start, end], adds the string
// unreversedPrefix + cp (+ *suffix if set) to set.
// Does nothing if set is NULL.
void
ContractionsAndExpansions::addStrings(UChar32 start, UChar32 end, UnicodeSet *set) {
    if(set == NULL) { return; }
    UnicodeString s(unreversedPrefix);
    do {
        s.append(start);
        if(suffix != NULL) {
            s.append(*suffix);
        }
        set->add(s);
        // Cut s back to just the prefix for the next code point.
        s.truncate(unreversedPrefix.length());
    } while(++start <= end);
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION