1/* 2 ****************************************************************************** 3 * Copyright (C) 1996-2010, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ****************************************************************************** 6 */ 7 8/** 9 * File tblcoll.cpp 10 * 11 * Created by: Helena Shih 12 * 13 * Modification History: 14 * 15 * Date Name Description 16 * 2/5/97 aliu Added streamIn and streamOut methods. Added 17 * constructor which reads RuleBasedCollator object from 18 * a binary file. Added writeToFile method which streams 19 * RuleBasedCollator out to a binary file. The streamIn 20 * and streamOut methods use istream and ostream objects 21 * in binary mode. 22 * 2/11/97 aliu Moved declarations out of for loop initializer. 23 * Added Mac compatibility #ifdef for ios::nocreate. 24 * 2/12/97 aliu Modified to use TableCollationData sub-object to 25 * hold invariant data. 26 * 2/13/97 aliu Moved several methods into this class from Collation. 27 * Added a private RuleBasedCollator(Locale&) constructor, 28 * to be used by Collator::getInstance(). General 29 * clean up. Made use of UErrorCode variables consistent. 30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy 31 * constructor and getDynamicClassID. 32 * 3/5/97 aliu Changed compaction cycle to improve performance. We 33 * use the maximum allowable value which is kBlockCount. 34 * Modified getRules() to load rules dynamically. Changed 35 * constructFromFile() call to accomodate this (added 36 * parameter to specify whether binary loading is to 37 * take place). 38 * 05/06/97 helena Added memory allocation error check. 39 * 6/20/97 helena Java class name change. 40 * 6/23/97 helena Adding comments to make code more readable. 41 * 09/03/97 helena Added createCollationKeyValues(). 42 * 06/26/98 erm Changes for CollationKeys using byte arrays. 
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java 44 * 04/23/99 stephen Removed EDecompositionMode, merged with 45 * Normalizer::EMode 46 * 06/14/99 stephen Removed kResourceBundleSuffix 47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx 48 * files are no longer used. 49 * 11/02/99 helena Collator performance enhancements. Special case 50 * for NO_OP situations. 51 * 11/17/99 srl More performance enhancements. Inlined some internal functions. 52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator 53 * to implementation file. 54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h) 55 */ 56 57#include "unicode/utypeinfo.h" // for 'typeid' to work 58 59#include "unicode/utypes.h" 60 61#if !UCONFIG_NO_COLLATION 62 63#include "unicode/tblcoll.h" 64#include "unicode/coleitr.h" 65#include "unicode/ures.h" 66#include "unicode/uset.h" 67#include "ucol_imp.h" 68#include "uresimp.h" 69#include "uhash.h" 70#include "cmemory.h" 71#include "cstring.h" 72#include "putilimp.h" 73 74/* public RuleBasedCollator constructor ---------------------------------- */ 75 76U_NAMESPACE_BEGIN 77 78/** 79* Copy constructor, aliasing, not write-through 80*/ 81RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that) 82: Collator(that) 83, dataIsOwned(FALSE) 84, isWriteThroughAlias(FALSE) 85, ucollator(NULL) 86{ 87 RuleBasedCollator::operator=(that); 88} 89 90RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 91 UErrorCode& status) : 92dataIsOwned(FALSE) 93{ 94 construct(rules, 95 UCOL_DEFAULT_STRENGTH, 96 UCOL_DEFAULT, 97 status); 98} 99 100RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 101 ECollationStrength collationStrength, 102 UErrorCode& status) : dataIsOwned(FALSE) 103{ 104 construct(rules, 105 getUCollationStrength(collationStrength), 106 UCOL_DEFAULT, 107 status); 108} 109 110RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 111 UColAttributeValue 
decompositionMode, 112 UErrorCode& status) : 113dataIsOwned(FALSE) 114{ 115 construct(rules, 116 UCOL_DEFAULT_STRENGTH, 117 decompositionMode, 118 status); 119} 120 121RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 122 ECollationStrength collationStrength, 123 UColAttributeValue decompositionMode, 124 UErrorCode& status) : dataIsOwned(FALSE) 125{ 126 construct(rules, 127 getUCollationStrength(collationStrength), 128 decompositionMode, 129 status); 130} 131RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length, 132 const RuleBasedCollator *base, 133 UErrorCode &status) : 134dataIsOwned(TRUE), 135isWriteThroughAlias(FALSE) 136{ 137 ucollator = ucol_openBinary(bin, length, base->ucollator, &status); 138} 139 140void 141RuleBasedCollator::setRuleStringFromCollator() 142{ 143 int32_t length; 144 const UChar *r = ucol_getRules(ucollator, &length); 145 146 if (r && length > 0) { 147 // alias the rules string 148 urulestring.setTo(TRUE, r, length); 149 } 150 else { 151 urulestring.truncate(0); // Clear string. 
152 } 153} 154 155// not aliasing, not write-through 156void 157RuleBasedCollator::construct(const UnicodeString& rules, 158 UColAttributeValue collationStrength, 159 UColAttributeValue decompositionMode, 160 UErrorCode& status) 161{ 162 ucollator = ucol_openRules(rules.getBuffer(), rules.length(), 163 decompositionMode, collationStrength, 164 NULL, &status); 165 166 dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it 167 isWriteThroughAlias = FALSE; 168 169 if(ucollator == NULL) { 170 if(U_SUCCESS(status)) { 171 status = U_MEMORY_ALLOCATION_ERROR; 172 } 173 return; // Failure 174 } 175 176 setRuleStringFromCollator(); 177} 178 179/* RuleBasedCollator public destructor ----------------------------------- */ 180 181RuleBasedCollator::~RuleBasedCollator() 182{ 183 if (dataIsOwned) 184 { 185 ucol_close(ucollator); 186 } 187 ucollator = 0; 188} 189 190/* RuleBaseCollator public methods --------------------------------------- */ 191 192UBool RuleBasedCollator::operator==(const Collator& that) const 193{ 194 /* only checks for address equals here */ 195 if (Collator::operator==(that)) 196 return TRUE; 197 198 if (typeid(*this) != typeid(that)) 199 return FALSE; /* not the same class */ 200 201 RuleBasedCollator& thatAlias = (RuleBasedCollator&)that; 202 203 // weiv: use C function, commented code below is wrong 204 return ucol_equals(this->ucollator, thatAlias.ucollator); 205 /* 206 synwee : orginal code does not check for data compatibility 207 */ 208 /* 209 if (ucollator != thatAlias.ucollator) 210 return FALSE; 211 212 return TRUE; 213 */ 214} 215 216UBool RuleBasedCollator::operator!=(const Collator& other) const 217{ 218 return !(*this == other); 219} 220 221// aliasing, not write-through 222RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that) 223{ 224 if (this != &that) 225 { 226 if (dataIsOwned) 227 { 228 ucol_close(ucollator); 229 } 230 231 urulestring.truncate(0); // empty the rule string 232 dataIsOwned = TRUE; 
233 isWriteThroughAlias = FALSE; 234 235 UErrorCode intStatus = U_ZERO_ERROR; 236 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE; 237 ucollator = ucol_safeClone(that.ucollator, NULL, &buffersize, 238 &intStatus); 239 if (U_SUCCESS(intStatus)) { 240 setRuleStringFromCollator(); 241 } 242 } 243 return *this; 244} 245 246// aliasing, not write-through 247Collator* RuleBasedCollator::clone() const 248{ 249 return new RuleBasedCollator(*this); 250} 251 252CollationElementIterator* RuleBasedCollator::createCollationElementIterator 253 (const UnicodeString& source) const 254{ 255 UErrorCode status = U_ZERO_ERROR; 256 CollationElementIterator *result = new CollationElementIterator(source, this, 257 status); 258 if (U_FAILURE(status)) { 259 delete result; 260 return NULL; 261 } 262 263 return result; 264} 265 266/** 267* Create a CollationElementIterator object that will iterate over the 268* elements in a string, using the collation rules defined in this 269* RuleBasedCollator 270*/ 271CollationElementIterator* RuleBasedCollator::createCollationElementIterator 272 (const CharacterIterator& source) const 273{ 274 UErrorCode status = U_ZERO_ERROR; 275 CollationElementIterator *result = new CollationElementIterator(source, this, 276 status); 277 278 if (U_FAILURE(status)) { 279 delete result; 280 return NULL; 281 } 282 283 return result; 284} 285 286/** 287* Return a string representation of this collator's rules. The string can 288* later be passed to the constructor that takes a UnicodeString argument, 289* which will construct a collator that's functionally identical to this one. 290* You can also allow users to edit the string in order to change the collation 291* data, or you can print it out for inspection, or whatever. 
292*/ 293const UnicodeString& RuleBasedCollator::getRules() const 294{ 295 return urulestring; 296} 297 298void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) 299{ 300 int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1); 301 302 if (rulesize > 0) { 303 UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) ); 304 if(rules != NULL) { 305 ucol_getRulesEx(ucollator, delta, rules, rulesize); 306 buffer.setTo(rules, rulesize); 307 uprv_free(rules); 308 } else { // couldn't allocate 309 buffer.remove(); 310 } 311 } 312 else { 313 buffer.remove(); 314 } 315} 316 317UnicodeSet * 318RuleBasedCollator::getTailoredSet(UErrorCode &status) const 319{ 320 if(U_FAILURE(status)) { 321 return NULL; 322 } 323 return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status); 324} 325 326 327void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const 328{ 329 if (versionInfo!=NULL){ 330 ucol_getVersion(ucollator, versionInfo); 331 } 332} 333 334Collator::EComparisonResult RuleBasedCollator::compare( 335 const UnicodeString& source, 336 const UnicodeString& target, 337 int32_t length) const 338{ 339 UErrorCode status = U_ZERO_ERROR; 340 return getEComparisonResult(compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status)); 341} 342 343UCollationResult RuleBasedCollator::compare( 344 const UnicodeString& source, 345 const UnicodeString& target, 346 int32_t length, 347 UErrorCode &status) const 348{ 349 return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status); 350} 351 352Collator::EComparisonResult RuleBasedCollator::compare(const UChar* source, 353 int32_t sourceLength, 354 const UChar* target, 355 int32_t targetLength) 356 const 357{ 358 return getEComparisonResult(ucol_strcoll(ucollator, source, sourceLength, 359 target, targetLength)); 360} 361 362UCollationResult 
RuleBasedCollator::compare(const UChar* source, 363 int32_t sourceLength, 364 const UChar* target, 365 int32_t targetLength, 366 UErrorCode &status) const 367{ 368 if(U_SUCCESS(status)) { 369 return ucol_strcoll(ucollator, source, sourceLength, target, targetLength); 370 } else { 371 return UCOL_EQUAL; 372 } 373} 374 375/** 376* Compare two strings using this collator 377*/ 378Collator::EComparisonResult RuleBasedCollator::compare( 379 const UnicodeString& source, 380 const UnicodeString& target) const 381{ 382 return getEComparisonResult(ucol_strcoll(ucollator, source.getBuffer(), source.length(), 383 target.getBuffer(), target.length())); 384} 385 386UCollationResult RuleBasedCollator::compare( 387 const UnicodeString& source, 388 const UnicodeString& target, 389 UErrorCode &status) const 390{ 391 if(U_SUCCESS(status)) { 392 return ucol_strcoll(ucollator, source.getBuffer(), source.length(), 393 target.getBuffer(), target.length()); 394 } else { 395 return UCOL_EQUAL; 396 } 397} 398 399UCollationResult RuleBasedCollator::compare(UCharIterator &sIter, 400 UCharIterator &tIter, 401 UErrorCode &status) const { 402 if(U_SUCCESS(status)) { 403 return ucol_strcollIter(ucollator, &sIter, &tIter, &status); 404 } else { 405 return UCOL_EQUAL; 406 } 407} 408 409/** 410* Retrieve a collation key for the specified string. The key can be compared 411* with other collation keys using a bitwise comparison (e.g. memcmp) to find 412* the ordering of their respective source strings. This is handy when doing a 413* sort, where each sort key must be compared many times. 414* 415* The basic algorithm here is to find all of the collation elements for each 416* character in the source string, convert them to an ASCII representation, and 417* put them into the collation key. But it's trickier than that. 
* Each collation element in a string has three components: primary ('A' vs 'B'),
* secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
* at the end of a string takes precedence over a secondary or tertiary
* difference earlier in the string.
*
* To account for this, we put all of the primary orders at the beginning of
* the string, followed by the secondary and tertiary orders. Each set of
* orders is terminated by nulls so that a key for a string which is an initial
* substring of another key will compare less without any special case.
*
* Here's a hypothetical example, with the collation element represented as a
* three-digit number, one digit for primary, one for secondary, etc.
*
* String:              A     a     B    \u00C9
* Collation Elements: 101   100   201  511
* Collation Key:      1125<null>0001<null>1011<null>
*
* To make things even trickier, secondary differences (accent marks) are
* compared starting at the *end* of the string in languages with French
* secondary ordering. But when comparing the accent marks on a single base
* character, they are compared from the beginning. To handle this, we reverse
* all of the accents that belong to each base character, then we reverse the
* entire string of secondary orderings at the end.
441*/ 442CollationKey& RuleBasedCollator::getCollationKey( 443 const UnicodeString& source, 444 CollationKey& sortkey, 445 UErrorCode& status) const 446{ 447 return getCollationKey(source.getBuffer(), source.length(), sortkey, status); 448} 449 450CollationKey& RuleBasedCollator::getCollationKey(const UChar* source, 451 int32_t sourceLen, 452 CollationKey& sortkey, 453 UErrorCode& status) const 454{ 455 if (U_FAILURE(status)) 456 { 457 return sortkey.setToBogus(); 458 } 459 460 if ((!source) || (sourceLen == 0)) { 461 return sortkey.reset(); 462 } 463 464 uint8_t *result; 465 int32_t resultLen = ucol_getSortKeyWithAllocation(ucollator, 466 source, sourceLen, 467 &result, 468 &status); 469 sortkey.adopt(result, resultLen); 470 return sortkey; 471} 472 473/** 474 * Return the maximum length of any expansion sequences that end with the 475 * specified comparison order. 476 * @param order a collation order returned by previous or next. 477 * @return the maximum length of any expansion seuences ending with the 478 * specified order or 1 if collation order does not occur at the end of any 479 * expansion sequence. 
480 * @see CollationElementIterator#getMaxExpansion 481 */ 482int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const 483{ 484 uint8_t result; 485 UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result); 486 return result; 487} 488 489uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length, 490 UErrorCode &status) 491{ 492 return ucol_cloneRuleData(ucollator, &length, &status); 493} 494 495 496int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) 497{ 498 return ucol_cloneBinary(ucollator, buffer, capacity, &status); 499} 500 501void RuleBasedCollator::setAttribute(UColAttribute attr, 502 UColAttributeValue value, 503 UErrorCode &status) 504{ 505 if (U_FAILURE(status)) 506 return; 507 checkOwned(); 508 ucol_setAttribute(ucollator, attr, value, &status); 509} 510 511UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, 512 UErrorCode &status) 513{ 514 if (U_FAILURE(status)) 515 return UCOL_DEFAULT; 516 return ucol_getAttribute(ucollator, attr, &status); 517} 518 519uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) { 520 checkOwned(); 521 return ucol_setVariableTop(ucollator, varTop, len, &status); 522} 523 524uint32_t RuleBasedCollator::setVariableTop(const UnicodeString varTop, UErrorCode &status) { 525 checkOwned(); 526 return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status); 527} 528 529void RuleBasedCollator::setVariableTop(const uint32_t varTop, UErrorCode &status) { 530 checkOwned(); 531 ucol_restoreVariableTop(ucollator, varTop, &status); 532} 533 534uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const { 535 return ucol_getVariableTop(ucollator, &status); 536} 537 538Collator* RuleBasedCollator::safeClone(void) 539{ 540 UErrorCode intStatus = U_ZERO_ERROR; 541 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE; 542 UCollator *ucol = ucol_safeClone(ucollator, NULL, &buffersize, 543 &intStatus); 544 if 
(U_FAILURE(intStatus)) { 545 return NULL; 546 } 547 548 RuleBasedCollator *result = new RuleBasedCollator(); 549 // Null pointer check 550 if (result != NULL) { 551 result->ucollator = ucol; 552 result->dataIsOwned = TRUE; 553 result->isWriteThroughAlias = FALSE; 554 setRuleStringFromCollator(); 555 } 556 557 return result; 558} 559 560 561int32_t RuleBasedCollator::getSortKey(const UnicodeString& source, 562 uint8_t *result, int32_t resultLength) 563 const 564{ 565 return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength); 566} 567 568int32_t RuleBasedCollator::getSortKey(const UChar *source, 569 int32_t sourceLength, uint8_t *result, 570 int32_t resultLength) const 571{ 572 return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength); 573} 574 575Collator::ECollationStrength RuleBasedCollator::getStrength(void) const 576{ 577 UErrorCode intStatus = U_ZERO_ERROR; 578 return getECollationStrength(ucol_getAttribute(ucollator, UCOL_STRENGTH, 579 &intStatus)); 580} 581 582void RuleBasedCollator::setStrength(ECollationStrength newStrength) 583{ 584 checkOwned(); 585 UErrorCode intStatus = U_ZERO_ERROR; 586 UCollationStrength strength = getUCollationStrength(newStrength); 587 ucol_setAttribute(ucollator, UCOL_STRENGTH, strength, &intStatus); 588} 589 590int32_t RuleBasedCollator::getReorderCodes(int32_t *dest, 591 int32_t destCapacity, 592 UErrorCode& status) const 593{ 594 return ucol_getReorderCodes(ucollator, dest, destCapacity, &status); 595} 596 597void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, 598 int32_t reorderCodesLength, 599 UErrorCode& status) 600{ 601 ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status); 602} 603 604 605/** 606* Create a hash code for this collation. Just hash the main rule table -- that 607* should be good enough for almost any use. 
608*/ 609int32_t RuleBasedCollator::hashCode() const 610{ 611 int32_t length; 612 const UChar *rules = ucol_getRules(ucollator, &length); 613 return uhash_hashUCharsN(rules, length); 614} 615 616/** 617* return the locale of this collator 618*/ 619const Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const { 620 const char *result = ucol_getLocaleByType(ucollator, type, &status); 621 if(result == NULL) { 622 Locale res(""); 623 res.setToBogus(); 624 return res; 625 } else { 626 return Locale(result); 627 } 628} 629 630void 631RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) { 632 checkOwned(); 633 char* rloc = uprv_strdup(requestedLocale.getName()); 634 if (rloc) { 635 char* vloc = uprv_strdup(validLocale.getName()); 636 if (vloc) { 637 char* aloc = uprv_strdup(actualLocale.getName()); 638 if (aloc) { 639 ucol_setReqValidLocales(ucollator, rloc, vloc, aloc); 640 return; 641 } 642 uprv_free(vloc); 643 } 644 uprv_free(rloc); 645 } 646} 647 648// RuleBaseCollatorNew private constructor ---------------------------------- 649 650RuleBasedCollator::RuleBasedCollator() 651 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 652{ 653} 654 655RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale, 656 UErrorCode& status) 657 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 658{ 659 if (U_FAILURE(status)) 660 return; 661 662 /* 663 Try to load, in order: 664 1. The desired locale's collation. 665 2. A fallback of the desired locale. 666 3. The default locale's collation. 667 4. A fallback of the default locale. 668 5. The default collation rules, which contains en_US collation rules. 
669 670 To reiterate, we try: 671 Specific: 672 language+country+variant 673 language+country 674 language 675 Default: 676 language+country+variant 677 language+country 678 language 679 Root: (aka DEFAULTRULES) 680 steps 1-5 are handled by resource bundle fallback mechanism. 681 however, in a very unprobable situation that no resource bundle 682 data exists, step 5 is repeated with hardcoded default rules. 683 */ 684 685 setUCollator(desiredLocale, status); 686 687 if (U_FAILURE(status)) 688 { 689 status = U_ZERO_ERROR; 690 691 setUCollator(kRootLocaleName, status); 692 if (status == U_ZERO_ERROR) { 693 status = U_USING_DEFAULT_WARNING; 694 } 695 } 696 697 if (U_SUCCESS(status)) 698 { 699 setRuleStringFromCollator(); 700 } 701} 702 703void 704RuleBasedCollator::setUCollator(const char *locale, 705 UErrorCode &status) 706{ 707 if (U_FAILURE(status)) 708 return; 709 if (ucollator && dataIsOwned) 710 ucol_close(ucollator); 711 ucollator = ucol_open_internal(locale, &status); 712 dataIsOwned = TRUE; 713 isWriteThroughAlias = FALSE; 714} 715 716 717void 718RuleBasedCollator::checkOwned() { 719 if (!(dataIsOwned || isWriteThroughAlias)) { 720 UErrorCode status = U_ZERO_ERROR; 721 ucollator = ucol_safeClone(ucollator, NULL, NULL, &status); 722 setRuleStringFromCollator(); 723 dataIsOwned = TRUE; 724 isWriteThroughAlias = FALSE; 725 } 726} 727 728UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator) 729 730U_NAMESPACE_END 731 732#endif /* #if !UCONFIG_NO_COLLATION */ 733