1/* 2******************************************************************************* 3* Copyright (C) 1996-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* rulebasedcollator.cpp 7* 8* (replaced the former tblcoll.cpp) 9* 10* created on: 2012feb14 with new and old collation code 11* created by: Markus W. Scherer 12*/ 13 14#include "unicode/utypes.h" 15 16#if !UCONFIG_NO_COLLATION 17 18#include "unicode/coll.h" 19#include "unicode/coleitr.h" 20#include "unicode/localpointer.h" 21#include "unicode/locid.h" 22#include "unicode/sortkey.h" 23#include "unicode/tblcoll.h" 24#include "unicode/ucol.h" 25#include "unicode/uiter.h" 26#include "unicode/uloc.h" 27#include "unicode/uniset.h" 28#include "unicode/unistr.h" 29#include "unicode/usetiter.h" 30#include "unicode/utf8.h" 31#include "unicode/uversion.h" 32#include "bocsu.h" 33#include "charstr.h" 34#include "cmemory.h" 35#include "collation.h" 36#include "collationcompare.h" 37#include "collationdata.h" 38#include "collationdatareader.h" 39#include "collationfastlatin.h" 40#include "collationiterator.h" 41#include "collationkeys.h" 42#include "collationroot.h" 43#include "collationsets.h" 44#include "collationsettings.h" 45#include "collationtailoring.h" 46#include "cstring.h" 47#include "uassert.h" 48#include "ucol_imp.h" 49#include "uhash.h" 50#include "uitercollationiterator.h" 51#include "ustr_imp.h" 52#include "utf16collationiterator.h" 53#include "utf8collationiterator.h" 54#include "uvectr64.h" 55 56#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 57 58U_NAMESPACE_BEGIN 59 60namespace { 61 62class FixedSortKeyByteSink : public SortKeyByteSink { 63public: 64 FixedSortKeyByteSink(char *dest, int32_t destCapacity) 65 : SortKeyByteSink(dest, destCapacity) {} 66 virtual ~FixedSortKeyByteSink(); 67 68private: 69 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 70 virtual UBool Resize(int32_t appendCapacity, int32_t length); 71}; 72 73FixedSortKeyByteSink::~FixedSortKeyByteSink() {} 74 75void 76FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) { 77 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 78 // Fill the buffer completely. 79 int32_t available = capacity_ - length; 80 if (available > 0) { 81 uprv_memcpy(buffer_ + length, bytes, available); 82 } 83} 84 85UBool 86FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) { 87 return FALSE; 88} 89 90} // namespace 91 92// Not in an anonymous namespace, so that it can be a friend of CollationKey. 93class CollationKeyByteSink : public SortKeyByteSink { 94public: 95 CollationKeyByteSink(CollationKey &key) 96 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()), 97 key_(key) {} 98 virtual ~CollationKeyByteSink(); 99 100private: 101 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 102 virtual UBool Resize(int32_t appendCapacity, int32_t length); 103 104 CollationKey &key_; 105}; 106 107CollationKeyByteSink::~CollationKeyByteSink() {} 108 109void 110CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) { 111 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 112 if (Resize(n, length)) { 113 uprv_memcpy(buffer_ + length, bytes, n); 114 } 115} 116 117UBool 118CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) { 119 if (buffer_ == NULL) { 120 return FALSE; // allocation failed before already 121 } 122 int32_t newCapacity = 2 * capacity_; 123 int32_t altCapacity = length + 2 * appendCapacity; 124 if (newCapacity < altCapacity) { 125 newCapacity = altCapacity; 126 } 127 if (newCapacity < 200) { 128 newCapacity = 200; 129 } 130 uint8_t *newBuffer = key_.reallocate(newCapacity, length); 131 if (newBuffer == NULL) { 132 SetNotOk(); 133 return FALSE; 134 } 135 buffer_ = reinterpret_cast<char *>(newBuffer); 136 capacity_ = newCapacity; 137 return TRUE; 138} 139 140RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other) 141 : Collator(other), 142 data(other.data), 143 settings(other.settings), 144 tailoring(other.tailoring), 145 validLocale(other.validLocale), 146 explicitlySetAttributes(other.explicitlySetAttributes), 147 actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) { 148 settings->addRef(); 149 tailoring->addRef(); 150} 151 152RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length, 153 const RuleBasedCollator *base, UErrorCode &errorCode) 154 : data(NULL), 155 settings(NULL), 156 tailoring(NULL), 157 validLocale(""), 158 explicitlySetAttributes(0), 159 actualLocaleIsSameAsValid(FALSE) { 160 if(U_FAILURE(errorCode)) { return; } 161 if(bin == NULL || length <= 0 || base == NULL) { 162 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 163 return; 164 } 165 const CollationTailoring *root = CollationRoot::getRoot(errorCode); 166 if(U_FAILURE(errorCode)) { return; } 167 if(base->tailoring != root) { 168 errorCode = U_UNSUPPORTED_ERROR; 169 return; 170 } 171 LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings)); 172 if(t.isNull() || t->isBogus()) { 173 errorCode = U_MEMORY_ALLOCATION_ERROR; 174 return; 175 } 176 CollationDataReader::read(base->tailoring, bin, length, *t, errorCode); 177 if(U_FAILURE(errorCode)) { return; } 178 t->actualLocale.setToBogus(); 179 adoptTailoring(t.orphan()); 180} 181 182RuleBasedCollator::RuleBasedCollator(const CollationTailoring *t, const Locale &vl) 183 : data(t->data), 184 settings(t->settings), 185 tailoring(t), 186 validLocale(vl), 187 explicitlySetAttributes(0), 188 actualLocaleIsSameAsValid(FALSE) { 189 settings->addRef(); 190 tailoring->addRef(); 191} 192 193RuleBasedCollator::~RuleBasedCollator() { 194 SharedObject::clearPtr(settings); 195 SharedObject::clearPtr(tailoring); 196} 197 198void 199RuleBasedCollator::adoptTailoring(CollationTailoring *t) { 200 U_ASSERT(settings == NULL && data == NULL && tailoring == NULL); 201 data = t->data; 202 settings = t->settings; 203 settings->addRef(); 204 t->addRef(); 205 tailoring = t; 206 validLocale = t->actualLocale; 207 actualLocaleIsSameAsValid = FALSE; 208} 209 210Collator * 211RuleBasedCollator::clone() const { 212 return new RuleBasedCollator(*this); 213} 214 215RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) { 216 if(this == &other) { return *this; } 217 SharedObject::copyPtr(other.settings, settings); 218 SharedObject::copyPtr(other.tailoring, tailoring); 219 data = tailoring->data; 220 validLocale = other.validLocale; 221 explicitlySetAttributes = other.explicitlySetAttributes; 222 actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid; 223 return *this; 224} 225 226UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator) 227 228UBool 229RuleBasedCollator::operator==(const Collator& other) const { 230 if(this == &other) { return TRUE; } 231 if(!Collator::operator==(other)) { return FALSE; } 232 const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other); 233 if(*settings != *o.settings) { return FALSE; } 234 if(data == o.data) { return TRUE; } 235 UBool thisIsRoot = data->base == NULL; 236 UBool otherIsRoot = o.data->base == NULL; 237 U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be == 238 if(thisIsRoot != otherIsRoot) { return FALSE; } 239 if((thisIsRoot || !tailoring->rules.isEmpty()) && 240 (otherIsRoot || !o.tailoring->rules.isEmpty())) { 241 // Shortcut: If both collators have valid rule strings, then compare those. 242 if(tailoring->rules == o.tailoring->rules) { return TRUE; } 243 } 244 // Different rule strings can result in the same or equivalent tailoring. 245 // The rule strings are optional in ICU resource bundles, although included by default. 246 // cloneBinary() drops the rule string. 247 UErrorCode errorCode = U_ZERO_ERROR; 248 LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode)); 249 LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode)); 250 if(U_FAILURE(errorCode)) { return FALSE; } 251 if(*thisTailored != *otherTailored) { return FALSE; } 252 // For completeness, we should compare all of the mappings; 253 // or we should create a list of strings, sort it with one collator, 254 // and check if both collators compare adjacent strings the same 255 // (order & strength, down to quaternary); or similar. 256 // Testing equality of collators seems unusual. 257 return TRUE; 258} 259 260int32_t 261RuleBasedCollator::hashCode() const { 262 int32_t h = settings->hashCode(); 263 if(data->base == NULL) { return h; } // root collator 264 // Do not rely on the rule string, see comments in operator==(). 265 UErrorCode errorCode = U_ZERO_ERROR; 266 LocalPointer<UnicodeSet> set(getTailoredSet(errorCode)); 267 if(U_FAILURE(errorCode)) { return 0; } 268 UnicodeSetIterator iter(*set); 269 while(iter.next() && !iter.isString()) { 270 h ^= data->getCE32(iter.getCodepoint()); 271 } 272 return h; 273} 274 275void 276RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid, 277 const Locale &actual) { 278 if(actual == tailoring->actualLocale) { 279 actualLocaleIsSameAsValid = FALSE; 280 } else { 281 U_ASSERT(actual == valid); 282 actualLocaleIsSameAsValid = TRUE; 283 } 284 // Do not modify tailoring.actualLocale: 285 // We cannot be sure that that would be thread-safe. 286 validLocale = valid; 287 (void)requested; // Ignore, see also ticket #10477. 288} 289 290Locale 291RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const { 292 if(U_FAILURE(errorCode)) { 293 return Locale::getRoot(); 294 } 295 switch(type) { 296 case ULOC_ACTUAL_LOCALE: 297 return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale; 298 case ULOC_VALID_LOCALE: 299 case ULOC_REQUESTED_LOCALE: // TODO: Drop this, see ticket #10477. 300 return validLocale; 301 default: 302 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 303 return Locale::getRoot(); 304 } 305} 306 307const char * 308RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const { 309 if(U_FAILURE(errorCode)) { 310 return NULL; 311 } 312 const Locale *result; 313 switch(type) { 314 case ULOC_ACTUAL_LOCALE: 315 result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale; 316 break; 317 case ULOC_VALID_LOCALE: 318 case ULOC_REQUESTED_LOCALE: // TODO: Drop this, see ticket #10477. 319 result = &validLocale; 320 break; 321 default: 322 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 323 return NULL; 324 } 325 if(result->isBogus()) { return NULL; } 326 const char *id = result->getName(); 327 return id[0] == 0 ? "root" : id; 328} 329 330const UnicodeString& 331RuleBasedCollator::getRules() const { 332 return tailoring->rules; 333} 334 335void 336RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const { 337 if(delta == UCOL_TAILORING_ONLY) { 338 buffer = tailoring->rules; 339 return; 340 } 341 // UCOL_FULL_RULES 342 buffer.remove(); 343 CollationLoader::appendRootRules(buffer); 344 buffer.append(tailoring->rules).getTerminatedBuffer(); 345} 346 347void 348RuleBasedCollator::getVersion(UVersionInfo version) const { 349 uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH); 350 version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4); 351} 352 353UnicodeSet * 354RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const { 355 if(U_FAILURE(errorCode)) { return NULL; } 356 UnicodeSet *tailored = new UnicodeSet(); 357 if(tailored == NULL) { 358 errorCode = U_MEMORY_ALLOCATION_ERROR; 359 return NULL; 360 } 361 if(data->base != NULL) { 362 TailoredSet(tailored).forData(data, errorCode); 363 if(U_FAILURE(errorCode)) { 364 delete tailored; 365 return NULL; 366 } 367 } 368 return tailored; 369} 370 371void 372RuleBasedCollator::internalGetContractionsAndExpansions( 373 UnicodeSet *contractions, UnicodeSet *expansions, 374 UBool addPrefixes, UErrorCode &errorCode) const { 375 if(U_FAILURE(errorCode)) { return; } 376 if(contractions != NULL) { 377 contractions->clear(); 378 } 379 if(expansions != NULL) { 380 expansions->clear(); 381 } 382 ContractionsAndExpansions(contractions, expansions, NULL, addPrefixes).forData(data, errorCode); 383} 384 385void 386RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const { 387 if(U_FAILURE(errorCode)) { return; } 388 ContractionsAndExpansions(&set, NULL, NULL, FALSE).forCodePoint(data, c, errorCode); 389} 390 391const CollationSettings & 392RuleBasedCollator::getDefaultSettings() const { 393 return *tailoring->settings; 394} 395 396UColAttributeValue 397RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const { 398 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } 399 int32_t option; 400 switch(attr) { 401 case UCOL_FRENCH_COLLATION: 402 option = CollationSettings::BACKWARD_SECONDARY; 403 break; 404 case UCOL_ALTERNATE_HANDLING: 405 return settings->getAlternateHandling(); 406 case UCOL_CASE_FIRST: 407 return settings->getCaseFirst(); 408 case UCOL_CASE_LEVEL: 409 option = CollationSettings::CASE_LEVEL; 410 break; 411 case UCOL_NORMALIZATION_MODE: 412 option = CollationSettings::CHECK_FCD; 413 break; 414 case UCOL_STRENGTH: 415 return (UColAttributeValue)settings->getStrength(); 416 case UCOL_HIRAGANA_QUATERNARY_MODE: 417 // Deprecated attribute, unsettable. 418 return UCOL_OFF; 419 case UCOL_NUMERIC_COLLATION: 420 option = CollationSettings::NUMERIC; 421 break; 422 default: 423 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 424 return UCOL_DEFAULT; 425 } 426 return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON; 427} 428 429void 430RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value, 431 UErrorCode &errorCode) { 432 UColAttributeValue oldValue = getAttribute(attr, errorCode); 433 if(U_FAILURE(errorCode)) { return; } 434 if(value == oldValue) { 435 setAttributeExplicitly(attr); 436 return; 437 } 438 const CollationSettings &defaultSettings = getDefaultSettings(); 439 if(settings == &defaultSettings) { 440 if(value == UCOL_DEFAULT) { 441 setAttributeDefault(attr); 442 return; 443 } 444 } 445 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 446 if(ownedSettings == NULL) { 447 errorCode = U_MEMORY_ALLOCATION_ERROR; 448 return; 449 } 450 451 switch(attr) { 452 case UCOL_FRENCH_COLLATION: 453 ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value, 454 defaultSettings.options, errorCode); 455 break; 456 case UCOL_ALTERNATE_HANDLING: 457 ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode); 458 break; 459 case UCOL_CASE_FIRST: 460 ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode); 461 break; 462 case UCOL_CASE_LEVEL: 463 ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value, 464 defaultSettings.options, errorCode); 465 break; 466 case UCOL_NORMALIZATION_MODE: 467 ownedSettings->setFlag(CollationSettings::CHECK_FCD, value, 468 defaultSettings.options, errorCode); 469 break; 470 case UCOL_STRENGTH: 471 ownedSettings->setStrength(value, defaultSettings.options, errorCode); 472 break; 473 case UCOL_HIRAGANA_QUATERNARY_MODE: 474 // Deprecated attribute. Check for valid values but do not change anything. 475 if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) { 476 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 477 } 478 break; 479 case UCOL_NUMERIC_COLLATION: 480 ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode); 481 break; 482 default: 483 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 484 break; 485 } 486 if(U_FAILURE(errorCode)) { return; } 487 setFastLatinOptions(*ownedSettings); 488 if(value == UCOL_DEFAULT) { 489 setAttributeDefault(attr); 490 } else { 491 setAttributeExplicitly(attr); 492 } 493} 494 495Collator & 496RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) { 497 if(U_FAILURE(errorCode)) { return *this; } 498 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1. 499 int32_t value; 500 if(group == UCOL_REORDER_CODE_DEFAULT) { 501 value = UCOL_DEFAULT; 502 } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) { 503 value = group - UCOL_REORDER_CODE_FIRST; 504 } else { 505 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 506 return *this; 507 } 508 CollationSettings::MaxVariable oldValue = settings->getMaxVariable(); 509 if(value == oldValue) { 510 setAttributeExplicitly(ATTR_VARIABLE_TOP); 511 return *this; 512 } 513 const CollationSettings &defaultSettings = getDefaultSettings(); 514 if(settings == &defaultSettings) { 515 if(value == UCOL_DEFAULT) { 516 setAttributeDefault(ATTR_VARIABLE_TOP); 517 return *this; 518 } 519 } 520 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 521 if(ownedSettings == NULL) { 522 errorCode = U_MEMORY_ALLOCATION_ERROR; 523 return *this; 524 } 525 526 if(group == UCOL_REORDER_CODE_DEFAULT) { 527 group = (UColReorderCode)(UCOL_REORDER_CODE_FIRST + defaultSettings.getMaxVariable()); 528 } 529 uint32_t varTop = data->getLastPrimaryForGroup(group); 530 U_ASSERT(varTop != 0); 531 ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode); 532 if(U_FAILURE(errorCode)) { return *this; } 533 ownedSettings->variableTop = varTop; 534 setFastLatinOptions(*ownedSettings); 535 if(value == UCOL_DEFAULT) { 536 setAttributeDefault(ATTR_VARIABLE_TOP); 537 } else { 538 setAttributeExplicitly(ATTR_VARIABLE_TOP); 539 } 540 return *this; 541} 542 543UColReorderCode 544RuleBasedCollator::getMaxVariable() const { 545 return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + settings->getMaxVariable()); 546} 547 548uint32_t 549RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const { 550 return settings->variableTop; 551} 552 553uint32_t 554RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &errorCode) { 555 if(U_FAILURE(errorCode)) { return 0; } 556 if(varTop == NULL && len !=0) { 557 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 558 return 0; 559 } 560 if(len < 0) { len = u_strlen(varTop); } 561 if(len == 0) { 562 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 563 return 0; 564 } 565 UBool numeric = settings->isNumeric(); 566 int64_t ce1, ce2; 567 if(settings->dontCheckFCD()) { 568 UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len); 569 ce1 = ci.nextCE(errorCode); 570 ce2 = ci.nextCE(errorCode); 571 } else { 572 FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len); 573 ce1 = ci.nextCE(errorCode); 574 ce2 = ci.nextCE(errorCode); 575 } 576 if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) { 577 errorCode = U_CE_NOT_FOUND_ERROR; 578 return 0; 579 } 580 setVariableTop((uint32_t)(ce1 >> 32), errorCode); 581 return settings->variableTop; 582} 583 584uint32_t 585RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) { 586 return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode); 587} 588 589void 590RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) { 591 if(U_FAILURE(errorCode)) { return; } 592 if(varTop != settings->variableTop) { 593 // Pin the variable top to the end of the reordering group which contains it. 594 // Only a few special groups are supported. 595 int32_t group = data->getGroupForPrimary(varTop); 596 if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) { 597 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 598 return; 599 } 600 uint32_t v = data->getLastPrimaryForGroup(group); 601 U_ASSERT(v != 0 && v >= varTop); 602 varTop = v; 603 if(varTop != settings->variableTop) { 604 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 605 if(ownedSettings == NULL) { 606 errorCode = U_MEMORY_ALLOCATION_ERROR; 607 return; 608 } 609 ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST, 610 getDefaultSettings().options, errorCode); 611 if(U_FAILURE(errorCode)) { return; } 612 ownedSettings->variableTop = varTop; 613 setFastLatinOptions(*ownedSettings); 614 } 615 } 616 if(varTop == getDefaultSettings().variableTop) { 617 setAttributeDefault(ATTR_VARIABLE_TOP); 618 } else { 619 setAttributeExplicitly(ATTR_VARIABLE_TOP); 620 } 621} 622 623int32_t 624RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity, 625 UErrorCode &errorCode) const { 626 if(U_FAILURE(errorCode)) { return 0; } 627 if(capacity < 0 || (dest == NULL && capacity > 0)) { 628 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 629 return 0; 630 } 631 int32_t length = settings->reorderCodesLength; 632 if(length == 0) { return 0; } 633 if(length > capacity) { 634 errorCode = U_BUFFER_OVERFLOW_ERROR; 635 return length; 636 } 637 uprv_memcpy(dest, settings->reorderCodes, length * 4); 638 return length; 639} 640 641void 642RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length, 643 UErrorCode &errorCode) { 644 if(U_FAILURE(errorCode)) { return; } 645 if(length < 0 || (reorderCodes == NULL && length > 0)) { 646 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 647 return; 648 } 649 if(length == settings->reorderCodesLength && 650 uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) { 651 return; 652 } 653 const CollationSettings &defaultSettings = getDefaultSettings(); 654 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) { 655 if(settings != &defaultSettings) { 656 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 657 if(ownedSettings == NULL) { 658 errorCode = U_MEMORY_ALLOCATION_ERROR; 659 return; 660 } 661 ownedSettings->aliasReordering(defaultSettings.reorderCodes, 662 defaultSettings.reorderCodesLength, 663 defaultSettings.reorderTable); 664 setFastLatinOptions(*ownedSettings); 665 } 666 return; 667 } 668 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 669 if(ownedSettings == NULL) { 670 errorCode = U_MEMORY_ALLOCATION_ERROR; 671 return; 672 } 673 if(length == 0) { 674 ownedSettings->resetReordering(); 675 } else { 676 uint8_t reorderTable[256]; 677 data->makeReorderTable(reorderCodes, length, reorderTable, errorCode); 678 if(U_FAILURE(errorCode)) { return; } 679 if(!ownedSettings->setReordering(reorderCodes, length, reorderTable)) { 680 errorCode = U_MEMORY_ALLOCATION_ERROR; 681 return; 682 } 683 } 684 setFastLatinOptions(*ownedSettings); 685} 686 687void 688RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const { 689 ownedSettings.fastLatinOptions = CollationFastLatin::getOptions( 690 data, ownedSettings, 691 ownedSettings.fastLatinPrimaries, LENGTHOF(ownedSettings.fastLatinPrimaries)); 692} 693 694UCollationResult 695RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right, 696 UErrorCode &errorCode) const { 697 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 698 return doCompare(left.getBuffer(), left.length(), 699 right.getBuffer(), right.length(), errorCode); 700} 701 702UCollationResult 703RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right, 704 int32_t length, UErrorCode &errorCode) const { 705 if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; } 706 if(length < 0) { 707 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 708 return UCOL_EQUAL; 709 } 710 int32_t leftLength = left.length(); 711 int32_t rightLength = right.length(); 712 if(leftLength > length) { leftLength = length; } 713 if(rightLength > length) { rightLength = length; } 714 return doCompare(left.getBuffer(), leftLength, 715 right.getBuffer(), rightLength, errorCode); 716} 717 718UCollationResult 719RuleBasedCollator::compare(const UChar *left, int32_t leftLength, 720 const UChar *right, int32_t rightLength, 721 UErrorCode &errorCode) const { 722 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 723 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) { 724 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 725 return UCOL_EQUAL; 726 } 727 // Make sure both or neither strings have a known length. 728 // We do not optimize for mixed length/termination. 729 if(leftLength >= 0) { 730 if(rightLength < 0) { rightLength = u_strlen(right); } 731 } else { 732 if(rightLength >= 0) { leftLength = u_strlen(left); } 733 } 734 return doCompare(left, leftLength, right, rightLength, errorCode); 735} 736 737UCollationResult 738RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right, 739 UErrorCode &errorCode) const { 740 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 741 const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data()); 742 const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data()); 743 if((leftBytes == NULL && !left.empty()) || (rightBytes == NULL && !right.empty())) { 744 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 745 return UCOL_EQUAL; 746 } 747 return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode); 748} 749 750UCollationResult 751RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength, 752 const char *right, int32_t rightLength, 753 UErrorCode &errorCode) const { 754 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 755 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) { 756 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 757 return UCOL_EQUAL; 758 } 759 // Make sure both or neither strings have a known length. 760 // We do not optimize for mixed length/termination. 761 if(leftLength >= 0) { 762 if(rightLength < 0) { rightLength = uprv_strlen(right); } 763 } else { 764 if(rightLength >= 0) { leftLength = uprv_strlen(left); } 765 } 766 return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength, 767 reinterpret_cast<const uint8_t *>(right), rightLength, errorCode); 768} 769 770namespace { 771 772/** 773 * Abstract iterator for identical-level string comparisons. 774 * Returns FCD code points and handles temporary switching to NFD. 775 */ 776class NFDIterator { 777public: 778 NFDIterator() : index(-1), length(0) {} 779 virtual ~NFDIterator() {} 780 /** 781 * Returns the next code point from the internal normalization buffer, 782 * or else the next text code point. 783 * Returns -1 at the end of the text. 784 */ 785 UChar32 nextCodePoint() { 786 if(index >= 0) { 787 if(index == length) { 788 index = -1; 789 } else { 790 UChar32 c; 791 U16_NEXT_UNSAFE(decomp, index, c); 792 return c; 793 } 794 } 795 return nextRawCodePoint(); 796 } 797 /** 798 * @param nfcImpl 799 * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint() 800 * @return the first code point in c's decomposition, 801 * or c itself if it was decomposed already or if it does not decompose 802 */ 803 UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) { 804 if(index >= 0) { return c; } 805 decomp = nfcImpl.getDecomposition(c, buffer, length); 806 if(decomp == NULL) { return c; } 807 index = 0; 808 U16_NEXT_UNSAFE(decomp, index, c); 809 return c; 810 } 811protected: 812 /** 813 * Returns the next text code point in FCD order. 814 * Returns -1 at the end of the text. 815 */ 816 virtual UChar32 nextRawCodePoint() = 0; 817private: 818 const UChar *decomp; 819 UChar buffer[4]; 820 int32_t index; 821 int32_t length; 822}; 823 824class UTF16NFDIterator : public NFDIterator { 825public: 826 UTF16NFDIterator(const UChar *text, const UChar *textLimit) : s(text), limit(textLimit) {} 827protected: 828 virtual UChar32 nextRawCodePoint() { 829 if(s == limit) { return U_SENTINEL; } 830 UChar32 c = *s++; 831 if(limit == NULL && c == 0) { 832 s = NULL; 833 return U_SENTINEL; 834 } 835 UChar trail; 836 if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) { 837 ++s; 838 c = U16_GET_SUPPLEMENTARY(c, trail); 839 } 840 return c; 841 } 842 843 const UChar *s; 844 const UChar *limit; 845}; 846 847class FCDUTF16NFDIterator : public UTF16NFDIterator { 848public: 849 FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const UChar *text, const UChar *textLimit) 850 : UTF16NFDIterator(NULL, NULL) { 851 UErrorCode errorCode = U_ZERO_ERROR; 852 const UChar *spanLimit = nfcImpl.makeFCD(text, textLimit, NULL, errorCode); 853 if(U_FAILURE(errorCode)) { return; } 854 if(spanLimit == textLimit || (textLimit == NULL && *spanLimit == 0)) { 855 s = text; 856 limit = spanLimit; 857 } else { 858 str.setTo(text, (int32_t)(spanLimit - text)); 859 { 860 ReorderingBuffer buffer(nfcImpl, str); 861 if(buffer.init(str.length(), errorCode)) { 862 nfcImpl.makeFCD(spanLimit, textLimit, &buffer, errorCode); 863 } 864 } 865 if(U_SUCCESS(errorCode)) { 866 s = str.getBuffer(); 867 limit = s + str.length(); 868 } 869 } 870 } 871private: 872 UnicodeString str; 873}; 874 875class UTF8NFDIterator : public NFDIterator { 876public: 877 UTF8NFDIterator(const uint8_t *text, int32_t textLength) 878 : s(text), pos(0), length(textLength) {} 879protected: 880 virtual UChar32 nextRawCodePoint() { 881 if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; } 882 UChar32 c; 883 U8_NEXT_OR_FFFD(s, pos, length, c); 884 return c; 885 } 886 887 const uint8_t *s; 888 int32_t pos; 889 int32_t length; 890}; 891 892class FCDUTF8NFDIterator : public NFDIterator { 893public: 894 FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength) 895 : u8ci(data, FALSE, text, 0, textLength) {} 896protected: 897 virtual UChar32 nextRawCodePoint() { 898 UErrorCode errorCode = U_ZERO_ERROR; 899 return u8ci.nextCodePoint(errorCode); 900 } 901private: 902 FCDUTF8CollationIterator u8ci; 903}; 904 905class UIterNFDIterator : public NFDIterator { 906public: 907 UIterNFDIterator(UCharIterator &it) : iter(it) {} 908protected: 909 virtual UChar32 nextRawCodePoint() { 910 return uiter_next32(&iter); 911 } 912private: 913 UCharIterator &iter; 914}; 915 916class FCDUIterNFDIterator : public NFDIterator { 917public: 918 FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex) 919 : uici(data, FALSE, it, startIndex) {} 920protected: 921 virtual UChar32 nextRawCodePoint() { 922 UErrorCode errorCode = U_ZERO_ERROR; 923 return uici.nextCodePoint(errorCode); 924 } 925private: 926 FCDUIterCollationIterator uici; 927}; 928 929UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl, 930 NFDIterator &left, NFDIterator &right) { 931 for(;;) { 932 // Fetch the next FCD code point from each string. 933 UChar32 leftCp = left.nextCodePoint(); 934 UChar32 rightCp = right.nextCodePoint(); 935 if(leftCp == rightCp) { 936 if(leftCp < 0) { break; } 937 continue; 938 } 939 // If they are different, then decompose each and compare again. 940 if(leftCp < 0) { 941 leftCp = -2; // end of string 942 } else if(leftCp == 0xfffe) { 943 leftCp = -1; // U+FFFE: merge separator 944 } else { 945 leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp); 946 } 947 if(rightCp < 0) { 948 rightCp = -2; // end of string 949 } else if(rightCp == 0xfffe) { 950 rightCp = -1; // U+FFFE: merge separator 951 } else { 952 rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp); 953 } 954 if(leftCp < rightCp) { return UCOL_LESS; } 955 if(leftCp > rightCp) { return UCOL_GREATER; } 956 } 957 return UCOL_EQUAL; 958} 959 960} // namespace 961 962UCollationResult 963RuleBasedCollator::doCompare(const UChar *left, int32_t leftLength, 964 const UChar *right, int32_t rightLength, 965 UErrorCode &errorCode) const { 966 // U_FAILURE(errorCode) checked by caller. 967 if(left == right && leftLength == rightLength) { 968 return UCOL_EQUAL; 969 } 970 971 // Identical-prefix test. 972 const UChar *leftLimit; 973 const UChar *rightLimit; 974 int32_t equalPrefixLength = 0; 975 if(leftLength < 0) { 976 leftLimit = NULL; 977 rightLimit = NULL; 978 UChar c; 979 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) { 980 if(c == 0) { return UCOL_EQUAL; } 981 ++equalPrefixLength; 982 } 983 } else { 984 leftLimit = left + leftLength; 985 rightLimit = right + rightLength; 986 for(;;) { 987 if(equalPrefixLength == leftLength) { 988 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; } 989 break; 990 } else if(equalPrefixLength == rightLength || 991 left[equalPrefixLength] != right[equalPrefixLength]) { 992 break; 993 } 994 ++equalPrefixLength; 995 } 996 } 997 998 UBool numeric = settings->isNumeric(); 999 if(equalPrefixLength > 0) { 1000 if((equalPrefixLength != leftLength && 1001 data->isUnsafeBackward(left[equalPrefixLength], numeric)) || 1002 (equalPrefixLength != rightLength && 1003 data->isUnsafeBackward(right[equalPrefixLength], numeric))) { 1004 // Identical prefix: Back up to the start of a contraction or reordering sequence. 1005 while(--equalPrefixLength > 0 && 1006 data->isUnsafeBackward(left[equalPrefixLength], numeric)) {} 1007 } 1008 // Notes: 1009 // - A longer string can compare equal to a prefix of it if only ignorables follow. 1010 // - With a backward level, a longer string can compare less-than a prefix of it. 1011 1012 // Pass the actual start of each string into the CollationIterators, 1013 // plus the equalPrefixLength position, 1014 // so that prefix matches back into the equal prefix work. 1015 } 1016 1017 int32_t result; 1018 int32_t fastLatinOptions = settings->fastLatinOptions; 1019 if(fastLatinOptions >= 0 && 1020 (equalPrefixLength == leftLength || 1021 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) && 1022 (equalPrefixLength == rightLength || 1023 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) { 1024 if(leftLength >= 0) { 1025 result = CollationFastLatin::compareUTF16(data->fastLatinTable, 1026 settings->fastLatinPrimaries, 1027 fastLatinOptions, 1028 left + equalPrefixLength, 1029 leftLength - equalPrefixLength, 1030 right + equalPrefixLength, 1031 rightLength - equalPrefixLength); 1032 } else { 1033 result = CollationFastLatin::compareUTF16(data->fastLatinTable, 1034 settings->fastLatinPrimaries, 1035 fastLatinOptions, 1036 left + equalPrefixLength, -1, 1037 right + equalPrefixLength, -1); 1038 } 1039 } else { 1040 result = CollationFastLatin::BAIL_OUT_RESULT; 1041 } 1042 1043 if(result == CollationFastLatin::BAIL_OUT_RESULT) { 1044 if(settings->dontCheckFCD()) { 1045 UTF16CollationIterator leftIter(data, numeric, 1046 left, left + equalPrefixLength, leftLimit); 1047 UTF16CollationIterator rightIter(data, numeric, 1048 right, right + equalPrefixLength, rightLimit); 1049 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1050 } else { 1051 FCDUTF16CollationIterator leftIter(data, numeric, 1052 left, left + equalPrefixLength, leftLimit); 1053 FCDUTF16CollationIterator rightIter(data, numeric, 1054 right, right + equalPrefixLength, rightLimit); 1055 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1056 } 1057 } 1058 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) { 1059 return (UCollationResult)result; 1060 } 1061 1062 // Note: If NUL-terminated, we could get the actual limits from the iterators now. 1063 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience, 1064 // and the benefit seems unlikely to be measurable. 1065 1066 // Compare identical level. 1067 const Normalizer2Impl &nfcImpl = data->nfcImpl; 1068 left += equalPrefixLength; 1069 right += equalPrefixLength; 1070 if(settings->dontCheckFCD()) { 1071 UTF16NFDIterator leftIter(left, leftLimit); 1072 UTF16NFDIterator rightIter(right, rightLimit); 1073 return compareNFDIter(nfcImpl, leftIter, rightIter); 1074 } else { 1075 FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit); 1076 FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit); 1077 return compareNFDIter(nfcImpl, leftIter, rightIter); 1078 } 1079} 1080 1081UCollationResult 1082RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength, 1083 const uint8_t *right, int32_t rightLength, 1084 UErrorCode &errorCode) const { 1085 // U_FAILURE(errorCode) checked by caller. 1086 if(left == right && leftLength == rightLength) { 1087 return UCOL_EQUAL; 1088 } 1089 1090 // Identical-prefix test. 1091 int32_t equalPrefixLength = 0; 1092 if(leftLength < 0) { 1093 uint8_t c; 1094 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) { 1095 if(c == 0) { return UCOL_EQUAL; } 1096 ++equalPrefixLength; 1097 } 1098 } else { 1099 for(;;) { 1100 if(equalPrefixLength == leftLength) { 1101 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; } 1102 break; 1103 } else if(equalPrefixLength == rightLength || 1104 left[equalPrefixLength] != right[equalPrefixLength]) { 1105 break; 1106 } 1107 ++equalPrefixLength; 1108 } 1109 } 1110 // Back up to the start of a partially-equal code point. 1111 if(equalPrefixLength > 0 && 1112 ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) || 1113 (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) { 1114 while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {} 1115 } 1116 1117 UBool numeric = settings->isNumeric(); 1118 if(equalPrefixLength > 0) { 1119 UBool unsafe = FALSE; 1120 if(equalPrefixLength != leftLength) { 1121 int32_t i = equalPrefixLength; 1122 UChar32 c; 1123 U8_NEXT_OR_FFFD(left, i, leftLength, c); 1124 unsafe = data->isUnsafeBackward(c, numeric); 1125 } 1126 if(!unsafe && equalPrefixLength != rightLength) { 1127 int32_t i = equalPrefixLength; 1128 UChar32 c; 1129 U8_NEXT_OR_FFFD(right, i, rightLength, c); 1130 unsafe = data->isUnsafeBackward(c, numeric); 1131 } 1132 if(unsafe) { 1133 // Identical prefix: Back up to the start of a contraction or reordering sequence. 1134 UChar32 c; 1135 do { 1136 U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c); 1137 } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric)); 1138 } 1139 // See the notes in the UTF-16 version. 1140 1141 // Pass the actual start of each string into the CollationIterators, 1142 // plus the equalPrefixLength position, 1143 // so that prefix matches back into the equal prefix work. 1144 } 1145 1146 int32_t result; 1147 int32_t fastLatinOptions = settings->fastLatinOptions; 1148 if(fastLatinOptions >= 0 && 1149 (equalPrefixLength == leftLength || 1150 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) && 1151 (equalPrefixLength == rightLength || 1152 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) { 1153 if(leftLength >= 0) { 1154 result = CollationFastLatin::compareUTF8(data->fastLatinTable, 1155 settings->fastLatinPrimaries, 1156 fastLatinOptions, 1157 left + equalPrefixLength, 1158 leftLength - equalPrefixLength, 1159 right + equalPrefixLength, 1160 rightLength - equalPrefixLength); 1161 } else { 1162 result = CollationFastLatin::compareUTF8(data->fastLatinTable, 1163 settings->fastLatinPrimaries, 1164 fastLatinOptions, 1165 left + equalPrefixLength, -1, 1166 right + equalPrefixLength, -1); 1167 } 1168 } else { 1169 result = CollationFastLatin::BAIL_OUT_RESULT; 1170 } 1171 1172 if(result == CollationFastLatin::BAIL_OUT_RESULT) { 1173 if(settings->dontCheckFCD()) { 1174 UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength); 1175 UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength); 1176 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1177 } else { 1178 FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength); 1179 FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength); 1180 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1181 } 1182 } 1183 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) { 1184 return (UCollationResult)result; 1185 } 1186 1187 // Note: If NUL-terminated, we could get the actual limits from the iterators now. 1188 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience, 1189 // and the benefit seems unlikely to be measurable. 1190 1191 // Compare identical level. 1192 const Normalizer2Impl &nfcImpl = data->nfcImpl; 1193 left += equalPrefixLength; 1194 right += equalPrefixLength; 1195 if(leftLength > 0) { 1196 leftLength -= equalPrefixLength; 1197 rightLength -= equalPrefixLength; 1198 } 1199 if(settings->dontCheckFCD()) { 1200 UTF8NFDIterator leftIter(left, leftLength); 1201 UTF8NFDIterator rightIter(right, rightLength); 1202 return compareNFDIter(nfcImpl, leftIter, rightIter); 1203 } else { 1204 FCDUTF8NFDIterator leftIter(data, left, leftLength); 1205 FCDUTF8NFDIterator rightIter(data, right, rightLength); 1206 return compareNFDIter(nfcImpl, leftIter, rightIter); 1207 } 1208} 1209 1210UCollationResult 1211RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right, 1212 UErrorCode &errorCode) const { 1213 if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; } 1214 UBool numeric = settings->isNumeric(); 1215 1216 // Identical-prefix test. 1217 int32_t equalPrefixLength = 0; 1218 { 1219 UChar32 leftUnit; 1220 UChar32 rightUnit; 1221 while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) { 1222 if(leftUnit < 0) { return UCOL_EQUAL; } 1223 ++equalPrefixLength; 1224 } 1225 1226 // Back out the code units that differed, for the real collation comparison. 1227 if(leftUnit >= 0) { left.previous(&left); } 1228 if(rightUnit >= 0) { right.previous(&right); } 1229 1230 if(equalPrefixLength > 0) { 1231 if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) || 1232 (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) { 1233 // Identical prefix: Back up to the start of a contraction or reordering sequence. 1234 do { 1235 --equalPrefixLength; 1236 leftUnit = left.previous(&left); 1237 right.previous(&right); 1238 } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric)); 1239 } 1240 // See the notes in the UTF-16 version. 1241 } 1242 } 1243 1244 UCollationResult result; 1245 if(settings->dontCheckFCD()) { 1246 UIterCollationIterator leftIter(data, numeric, left); 1247 UIterCollationIterator rightIter(data, numeric, right); 1248 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1249 } else { 1250 FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength); 1251 FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength); 1252 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1253 } 1254 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) { 1255 return result; 1256 } 1257 1258 // Compare identical level. 1259 left.move(&left, equalPrefixLength, UITER_ZERO); 1260 right.move(&right, equalPrefixLength, UITER_ZERO); 1261 const Normalizer2Impl &nfcImpl = data->nfcImpl; 1262 if(settings->dontCheckFCD()) { 1263 UIterNFDIterator leftIter(left); 1264 UIterNFDIterator rightIter(right); 1265 return compareNFDIter(nfcImpl, leftIter, rightIter); 1266 } else { 1267 FCDUIterNFDIterator leftIter(data, left, equalPrefixLength); 1268 FCDUIterNFDIterator rightIter(data, right, equalPrefixLength); 1269 return compareNFDIter(nfcImpl, leftIter, rightIter); 1270 } 1271} 1272 1273CollationKey & 1274RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key, 1275 UErrorCode &errorCode) const { 1276 return getCollationKey(s.getBuffer(), s.length(), key, errorCode); 1277} 1278 1279CollationKey & 1280RuleBasedCollator::getCollationKey(const UChar *s, int32_t length, CollationKey& key, 1281 UErrorCode &errorCode) const { 1282 if(U_FAILURE(errorCode)) { 1283 return key.setToBogus(); 1284 } 1285 if(s == NULL && length != 0) { 1286 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1287 return key.setToBogus(); 1288 } 1289 key.reset(); // resets the "bogus" state 1290 CollationKeyByteSink sink(key); 1291 writeSortKey(s, length, sink, errorCode); 1292 if(U_FAILURE(errorCode)) { 1293 key.setToBogus(); 1294 } else if(key.isBogus()) { 1295 errorCode = U_MEMORY_ALLOCATION_ERROR; 1296 } else { 1297 key.setLength(sink.NumberOfBytesAppended()); 1298 } 1299 return key; 1300} 1301 1302int32_t 1303RuleBasedCollator::getSortKey(const UnicodeString &s, 1304 uint8_t *dest, int32_t capacity) const { 1305 return getSortKey(s.getBuffer(), s.length(), dest, capacity); 1306} 1307 1308int32_t 1309RuleBasedCollator::getSortKey(const UChar *s, int32_t length, 1310 uint8_t *dest, int32_t capacity) const { 1311 if((s == NULL && length != 0) || capacity < 0 || (dest == NULL && capacity > 0)) { 1312 return 0; 1313 } 1314 uint8_t noDest[1] = { 0 }; 1315 if(dest == NULL) { 1316 // Distinguish pure preflighting from an allocation error. 1317 dest = noDest; 1318 capacity = 0; 1319 } 1320 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity); 1321 UErrorCode errorCode = U_ZERO_ERROR; 1322 writeSortKey(s, length, sink, errorCode); 1323 return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0; 1324} 1325 1326void 1327RuleBasedCollator::writeSortKey(const UChar *s, int32_t length, 1328 SortKeyByteSink &sink, UErrorCode &errorCode) const { 1329 if(U_FAILURE(errorCode)) { return; } 1330 const UChar *limit = (length >= 0) ? s + length : NULL; 1331 UBool numeric = settings->isNumeric(); 1332 CollationKeys::LevelCallback callback; 1333 if(settings->dontCheckFCD()) { 1334 UTF16CollationIterator iter(data, numeric, s, s, limit); 1335 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings, 1336 sink, Collation::PRIMARY_LEVEL, 1337 callback, TRUE, errorCode); 1338 } else { 1339 FCDUTF16CollationIterator iter(data, numeric, s, s, limit); 1340 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings, 1341 sink, Collation::PRIMARY_LEVEL, 1342 callback, TRUE, errorCode); 1343 } 1344 if(settings->getStrength() == UCOL_IDENTICAL) { 1345 writeIdenticalLevel(s, limit, sink, errorCode); 1346 } 1347 static const char terminator = 0; // TERMINATOR_BYTE 1348 sink.Append(&terminator, 1); 1349} 1350 1351void 1352RuleBasedCollator::writeIdenticalLevel(const UChar *s, const UChar *limit, 1353 SortKeyByteSink &sink, UErrorCode &errorCode) const { 1354 // NFD quick check 1355 const UChar *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, NULL, errorCode); 1356 if(U_FAILURE(errorCode)) { return; } 1357 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); 1358 UChar32 prev = 0; 1359 if(nfdQCYesLimit != s) { 1360 prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink); 1361 } 1362 // Is there non-NFD text? 1363 int32_t destLengthEstimate; 1364 if(limit != NULL) { 1365 if(nfdQCYesLimit == limit) { return; } 1366 destLengthEstimate = (int32_t)(limit - nfdQCYesLimit); 1367 } else { 1368 // s is NUL-terminated 1369 if(*nfdQCYesLimit == 0) { return; } 1370 destLengthEstimate = -1; 1371 } 1372 UnicodeString nfd; 1373 data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode); 1374 u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink); 1375} 1376 1377namespace { 1378 1379/** 1380 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary() 1381 * with an instance of this callback class. 1382 * When another level is about to be written, the callback 1383 * records the level and the number of bytes that will be written until 1384 * the sink (which is actually a FixedSortKeyByteSink) fills up. 1385 * 1386 * When internalNextSortKeyPart() is called again, it restarts with the last level 1387 * and ignores as many bytes as were written previously for that level. 1388 */ 1389class PartLevelCallback : public CollationKeys::LevelCallback { 1390public: 1391 PartLevelCallback(const SortKeyByteSink &s) 1392 : sink(s), level(Collation::PRIMARY_LEVEL) { 1393 levelCapacity = sink.GetRemainingCapacity(); 1394 } 1395 virtual ~PartLevelCallback() {} 1396 virtual UBool needToWrite(Collation::Level l) { 1397 if(!sink.Overflowed()) { 1398 // Remember a level that will be at least partially written. 1399 level = l; 1400 levelCapacity = sink.GetRemainingCapacity(); 1401 return TRUE; 1402 } else { 1403 return FALSE; 1404 } 1405 } 1406 Collation::Level getLevel() const { return level; } 1407 int32_t getLevelCapacity() const { return levelCapacity; } 1408 1409private: 1410 const SortKeyByteSink &sink; 1411 Collation::Level level; 1412 int32_t levelCapacity; 1413}; 1414 1415} // namespace 1416 1417int32_t 1418RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2], 1419 uint8_t *dest, int32_t count, UErrorCode &errorCode) const { 1420 if(U_FAILURE(errorCode)) { return 0; } 1421 if(iter == NULL || state == NULL || count < 0 || (count > 0 && dest == NULL)) { 1422 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1423 return 0; 1424 } 1425 if(count == 0) { return 0; } 1426 1427 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count); 1428 sink.IgnoreBytes((int32_t)state[1]); 1429 iter->move(iter, 0, UITER_START); 1430 1431 Collation::Level level = (Collation::Level)state[0]; 1432 if(level <= Collation::QUATERNARY_LEVEL) { 1433 UBool numeric = settings->isNumeric(); 1434 PartLevelCallback callback(sink); 1435 if(settings->dontCheckFCD()) { 1436 UIterCollationIterator ci(data, numeric, *iter); 1437 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings, 1438 sink, level, callback, FALSE, errorCode); 1439 } else { 1440 FCDUIterCollationIterator ci(data, numeric, *iter, 0); 1441 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings, 1442 sink, level, callback, FALSE, errorCode); 1443 } 1444 if(U_FAILURE(errorCode)) { return 0; } 1445 if(sink.NumberOfBytesAppended() > count) { 1446 state[0] = (uint32_t)callback.getLevel(); 1447 state[1] = (uint32_t)callback.getLevelCapacity(); 1448 return count; 1449 } 1450 // All of the normal levels are done. 1451 if(settings->getStrength() == UCOL_IDENTICAL) { 1452 level = Collation::IDENTICAL_LEVEL; 1453 iter->move(iter, 0, UITER_START); 1454 } 1455 // else fall through to setting ZERO_LEVEL 1456 } 1457 1458 if(level == Collation::IDENTICAL_LEVEL) { 1459 int32_t levelCapacity = sink.GetRemainingCapacity(); 1460 UnicodeString s; 1461 for(;;) { 1462 UChar32 c = iter->next(iter); 1463 if(c < 0) { break; } 1464 s.append((UChar)c); 1465 } 1466 const UChar *sArray = s.getBuffer(); 1467 writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode); 1468 if(U_FAILURE(errorCode)) { return 0; } 1469 if(sink.NumberOfBytesAppended() > count) { 1470 state[0] = (uint32_t)level; 1471 state[1] = (uint32_t)levelCapacity; 1472 return count; 1473 } 1474 } 1475 1476 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes. 1477 state[0] = (uint32_t)Collation::ZERO_LEVEL; 1478 state[1] = 0; 1479 int32_t length = sink.NumberOfBytesAppended(); 1480 int32_t i = length; 1481 while(i < count) { dest[i++] = 0; } 1482 return length; 1483} 1484 1485void 1486RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces, 1487 UErrorCode &errorCode) const { 1488 if(U_FAILURE(errorCode)) { return; } 1489 const UChar *s = str.getBuffer(); 1490 const UChar *limit = s + str.length(); 1491 UBool numeric = settings->isNumeric(); 1492 if(settings->dontCheckFCD()) { 1493 UTF16CollationIterator iter(data, numeric, s, s, limit); 1494 int64_t ce; 1495 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) { 1496 ces.addElement(ce, errorCode); 1497 } 1498 } else { 1499 FCDUTF16CollationIterator iter(data, numeric, s, s, limit); 1500 int64_t ce; 1501 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) { 1502 ces.addElement(ce, errorCode); 1503 } 1504 } 1505} 1506 1507namespace { 1508 1509void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length, 1510 UErrorCode &errorCode) { 1511 if(U_FAILURE(errorCode) || length == 0) { return; } 1512 if(!s.isEmpty()) { 1513 s.append('_', errorCode); 1514 } 1515 s.append(letter, errorCode); 1516 for(int32_t i = 0; i < length; ++i) { 1517 s.append(uprv_toupper(subtag[i]), errorCode); 1518 } 1519} 1520 1521void appendAttribute(CharString &s, char letter, UColAttributeValue value, 1522 UErrorCode &errorCode) { 1523 if(U_FAILURE(errorCode)) { return; } 1524 if(!s.isEmpty()) { 1525 s.append('_', errorCode); 1526 } 1527 static const char *valueChars = "1234...........IXO..SN..LU......"; 1528 s.append(letter, errorCode); 1529 s.append(valueChars[value], errorCode); 1530} 1531 1532} // namespace 1533 1534int32_t 1535RuleBasedCollator::internalGetShortDefinitionString(const char *locale, 1536 char *buffer, int32_t capacity, 1537 UErrorCode &errorCode) const { 1538 if(U_FAILURE(errorCode)) { return 0; } 1539 if(buffer == NULL ? capacity != 0 : capacity < 0) { 1540 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1541 return 0; 1542 } 1543 if(locale == NULL) { 1544 locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode); 1545 } 1546 1547 char resultLocale[ULOC_FULLNAME_CAPACITY + 1]; 1548 int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY, 1549 "collation", locale, 1550 NULL, &errorCode); 1551 if(U_FAILURE(errorCode)) { return 0; } 1552 if(length == 0) { 1553 uprv_strcpy(resultLocale, "root"); 1554 } else { 1555 resultLocale[length] = 0; 1556 } 1557 1558 // Append items in alphabetic order of their short definition letters. 1559 CharString result; 1560 char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY]; 1561 1562 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) { 1563 appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode); 1564 } 1565 // ATTR_VARIABLE_TOP not supported because 'B' was broken. 1566 // See ICU tickets #10372 and #10386. 1567 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) { 1568 appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode); 1569 } 1570 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) { 1571 appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode); 1572 } 1573 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) { 1574 appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode); 1575 } 1576 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) { 1577 appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode); 1578 } 1579 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default. 1580 length = uloc_getKeywordValue(resultLocale, "collation", subtag, LENGTHOF(subtag), &errorCode); 1581 appendSubtag(result, 'K', subtag, length, errorCode); 1582 length = uloc_getLanguage(resultLocale, subtag, LENGTHOF(subtag), &errorCode); 1583 appendSubtag(result, 'L', subtag, length, errorCode); 1584 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) { 1585 appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode); 1586 } 1587 length = uloc_getCountry(resultLocale, subtag, LENGTHOF(subtag), &errorCode); 1588 appendSubtag(result, 'R', subtag, length, errorCode); 1589 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) { 1590 appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode); 1591 } 1592 length = uloc_getVariant(resultLocale, subtag, LENGTHOF(subtag), &errorCode); 1593 appendSubtag(result, 'V', subtag, length, errorCode); 1594 length = uloc_getScript(resultLocale, subtag, LENGTHOF(subtag), &errorCode); 1595 appendSubtag(result, 'Z', subtag, length, errorCode); 1596 1597 if(U_FAILURE(errorCode)) { return 0; } 1598 if(result.length() <= capacity) { 1599 uprv_memcpy(buffer, result.data(), result.length()); 1600 } 1601 return u_terminateChars(buffer, capacity, result.length(), &errorCode); 1602} 1603 1604UBool 1605RuleBasedCollator::isUnsafe(UChar32 c) const { 1606 return data->isUnsafeBackward(c, settings->isNumeric()); 1607} 1608 1609void 1610RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) { 1611 t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode); 1612} 1613 1614UBool 1615RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const { 1616 umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode); 1617 return U_SUCCESS(errorCode); 1618} 1619 1620CollationElementIterator * 1621RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const { 1622 UErrorCode errorCode = U_ZERO_ERROR; 1623 if(!initMaxExpansions(errorCode)) { return NULL; } 1624 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode); 1625 if(U_FAILURE(errorCode)) { 1626 delete cei; 1627 return NULL; 1628 } 1629 return cei; 1630} 1631 1632CollationElementIterator * 1633RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const { 1634 UErrorCode errorCode = U_ZERO_ERROR; 1635 if(!initMaxExpansions(errorCode)) { return NULL; } 1636 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode); 1637 if(U_FAILURE(errorCode)) { 1638 delete cei; 1639 return NULL; 1640 } 1641 return cei; 1642} 1643 1644int32_t 1645RuleBasedCollator::getMaxExpansion(int32_t order) const { 1646 UErrorCode errorCode = U_ZERO_ERROR; 1647 (void)initMaxExpansions(errorCode); 1648 return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order); 1649} 1650 1651U_NAMESPACE_END 1652 1653#endif // !UCONFIG_NO_COLLATION 1654