1/* 2 ****************************************************************************** 3 * Copyright (C) 1996-2012, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ****************************************************************************** 6 */ 7 8/** 9 * File tblcoll.cpp 10 * 11 * Created by: Helena Shih 12 * 13 * Modification History: 14 * 15 * Date Name Description 16 * 2/5/97 aliu Added streamIn and streamOut methods. Added 17 * constructor which reads RuleBasedCollator object from 18 * a binary file. Added writeToFile method which streams 19 * RuleBasedCollator out to a binary file. The streamIn 20 * and streamOut methods use istream and ostream objects 21 * in binary mode. 22 * 2/11/97 aliu Moved declarations out of for loop initializer. 23 * Added Mac compatibility #ifdef for ios::nocreate. 24 * 2/12/97 aliu Modified to use TableCollationData sub-object to 25 * hold invariant data. 26 * 2/13/97 aliu Moved several methods into this class from Collation. 27 * Added a private RuleBasedCollator(Locale&) constructor, 28 * to be used by Collator::getInstance(). General 29 * clean up. Made use of UErrorCode variables consistent. 30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy 31 * constructor and getDynamicClassID. 32 * 3/5/97 aliu Changed compaction cycle to improve performance. We 33 * use the maximum allowable value which is kBlockCount. 34 * Modified getRules() to load rules dynamically. Changed 35 * constructFromFile() call to accomodate this (added 36 * parameter to specify whether binary loading is to 37 * take place). 38 * 05/06/97 helena Added memory allocation error check. 39 * 6/20/97 helena Java class name change. 40 * 6/23/97 helena Adding comments to make code more readable. 41 * 09/03/97 helena Added createCollationKeyValues(). 42 * 06/26/98 erm Changes for CollationKeys using byte arrays. 43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java 44 * 04/23/99 stephen Removed EDecompositionMode, merged with 45 * Normalizer::EMode 46 * 06/14/99 stephen Removed kResourceBundleSuffix 47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx 48 * files are no longer used. 49 * 11/02/99 helena Collator performance enhancements. Special case 50 * for NO_OP situations. 51 * 11/17/99 srl More performance enhancements. Inlined some internal functions. 52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator 53 * to implementation file. 54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h) 55 */ 56 57#include "unicode/utypes.h" 58 59#if !UCONFIG_NO_COLLATION 60 61#include "unicode/tblcoll.h" 62#include "unicode/coleitr.h" 63#include "unicode/ures.h" 64#include "unicode/uset.h" 65#include "ucol_imp.h" 66#include "uresimp.h" 67#include "uhash.h" 68#include "cmemory.h" 69#include "cstring.h" 70#include "putilimp.h" 71#include "ustr_imp.h" 72 73/* public RuleBasedCollator constructor ---------------------------------- */ 74 75U_NAMESPACE_BEGIN 76 77/** 78* Copy constructor, aliasing, not write-through 79*/ 80RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that) 81: Collator(that) 82, dataIsOwned(FALSE) 83, isWriteThroughAlias(FALSE) 84, ucollator(NULL) 85{ 86 RuleBasedCollator::operator=(that); 87} 88 89RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 90 UErrorCode& status) : 91dataIsOwned(FALSE) 92{ 93 construct(rules, 94 UCOL_DEFAULT_STRENGTH, 95 UCOL_DEFAULT, 96 status); 97} 98 99RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 100 ECollationStrength collationStrength, 101 UErrorCode& status) : dataIsOwned(FALSE) 102{ 103 construct(rules, 104 (UColAttributeValue)collationStrength, 105 UCOL_DEFAULT, 106 status); 107} 108 109RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 110 UColAttributeValue decompositionMode, 111 UErrorCode& status) : 112dataIsOwned(FALSE) 113{ 114 construct(rules, 115 UCOL_DEFAULT_STRENGTH, 116 decompositionMode, 117 status); 118} 119 120RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 121 ECollationStrength collationStrength, 122 UColAttributeValue decompositionMode, 123 UErrorCode& status) : dataIsOwned(FALSE) 124{ 125 construct(rules, 126 (UColAttributeValue)collationStrength, 127 decompositionMode, 128 status); 129} 130RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length, 131 const RuleBasedCollator *base, 132 UErrorCode &status) : 133dataIsOwned(TRUE), 134isWriteThroughAlias(FALSE) 135{ 136 ucollator = ucol_openBinary(bin, length, base->ucollator, &status); 137} 138 139void 140RuleBasedCollator::setRuleStringFromCollator() 141{ 142 int32_t length; 143 const UChar *r = ucol_getRules(ucollator, &length); 144 145 if (r && length > 0) { 146 // alias the rules string 147 urulestring.setTo(TRUE, r, length); 148 } 149 else { 150 urulestring.truncate(0); // Clear string. 151 } 152} 153 154// not aliasing, not write-through 155void 156RuleBasedCollator::construct(const UnicodeString& rules, 157 UColAttributeValue collationStrength, 158 UColAttributeValue decompositionMode, 159 UErrorCode& status) 160{ 161 ucollator = ucol_openRules(rules.getBuffer(), rules.length(), 162 decompositionMode, collationStrength, 163 NULL, &status); 164 165 dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it 166 isWriteThroughAlias = FALSE; 167 168 if(ucollator == NULL) { 169 if(U_SUCCESS(status)) { 170 status = U_MEMORY_ALLOCATION_ERROR; 171 } 172 return; // Failure 173 } 174 175 setRuleStringFromCollator(); 176} 177 178/* RuleBasedCollator public destructor ----------------------------------- */ 179 180RuleBasedCollator::~RuleBasedCollator() 181{ 182 if (dataIsOwned) 183 { 184 ucol_close(ucollator); 185 } 186 ucollator = 0; 187} 188 189/* RuleBaseCollator public methods --------------------------------------- */ 190 191UBool RuleBasedCollator::operator==(const Collator& that) const 192{ 193 /* only checks for address equals here */ 194 if (this == &that) { 195 return TRUE; 196 } 197 if (!Collator::operator==(that)) { 198 return FALSE; /* not the same class */ 199 } 200 201 RuleBasedCollator& thatAlias = (RuleBasedCollator&)that; 202 203 return ucol_equals(this->ucollator, thatAlias.ucollator); 204} 205 206// aliasing, not write-through 207RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that) 208{ 209 if (this == &that) { return *this; } 210 211 UErrorCode intStatus = U_ZERO_ERROR; 212 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE; 213 UCollator *ucol = ucol_safeClone(that.ucollator, NULL, &buffersize, &intStatus); 214 if (U_FAILURE(intStatus)) { return *this; } 215 216 if (dataIsOwned) { 217 ucol_close(ucollator); 218 } 219 ucollator = ucol; 220 dataIsOwned = TRUE; 221 isWriteThroughAlias = FALSE; 222 setRuleStringFromCollator(); 223 return *this; 224} 225 226// aliasing, not write-through 227Collator* RuleBasedCollator::clone() const 228{ 229 RuleBasedCollator* coll = new RuleBasedCollator(*this); 230 // There is a small chance that the internal ucol_safeClone() call fails. 231 if (coll != NULL && coll->ucollator == NULL) { 232 delete coll; 233 return NULL; 234 } 235 return coll; 236} 237 238 239CollationElementIterator* RuleBasedCollator::createCollationElementIterator 240 (const UnicodeString& source) const 241{ 242 UErrorCode status = U_ZERO_ERROR; 243 CollationElementIterator *result = new CollationElementIterator(source, this, 244 status); 245 if (U_FAILURE(status)) { 246 delete result; 247 return NULL; 248 } 249 250 return result; 251} 252 253/** 254* Create a CollationElementIterator object that will iterate over the 255* elements in a string, using the collation rules defined in this 256* RuleBasedCollator 257*/ 258CollationElementIterator* RuleBasedCollator::createCollationElementIterator 259 (const CharacterIterator& source) const 260{ 261 UErrorCode status = U_ZERO_ERROR; 262 CollationElementIterator *result = new CollationElementIterator(source, this, 263 status); 264 265 if (U_FAILURE(status)) { 266 delete result; 267 return NULL; 268 } 269 270 return result; 271} 272 273/** 274* Return a string representation of this collator's rules. The string can 275* later be passed to the constructor that takes a UnicodeString argument, 276* which will construct a collator that's functionally identical to this one. 277* You can also allow users to edit the string in order to change the collation 278* data, or you can print it out for inspection, or whatever. 279*/ 280const UnicodeString& RuleBasedCollator::getRules() const 281{ 282 return urulestring; 283} 284 285void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) 286{ 287 int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1); 288 289 if (rulesize > 0) { 290 UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) ); 291 if(rules != NULL) { 292 ucol_getRulesEx(ucollator, delta, rules, rulesize); 293 buffer.setTo(rules, rulesize); 294 uprv_free(rules); 295 } else { // couldn't allocate 296 buffer.remove(); 297 } 298 } 299 else { 300 buffer.remove(); 301 } 302} 303 304UnicodeSet * 305RuleBasedCollator::getTailoredSet(UErrorCode &status) const 306{ 307 if(U_FAILURE(status)) { 308 return NULL; 309 } 310 return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status); 311} 312 313 314void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const 315{ 316 if (versionInfo!=NULL){ 317 ucol_getVersion(ucollator, versionInfo); 318 } 319} 320 321/** 322* Compare two strings using this collator 323*/ 324UCollationResult RuleBasedCollator::compare( 325 const UnicodeString& source, 326 const UnicodeString& target, 327 int32_t length, 328 UErrorCode &status) const 329{ 330 return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status); 331} 332 333UCollationResult RuleBasedCollator::compare(const UChar* source, 334 int32_t sourceLength, 335 const UChar* target, 336 int32_t targetLength, 337 UErrorCode &status) const 338{ 339 if(U_SUCCESS(status)) { 340 return ucol_strcoll(ucollator, source, sourceLength, target, targetLength); 341 } else { 342 return UCOL_EQUAL; 343 } 344} 345 346UCollationResult RuleBasedCollator::compare( 347 const UnicodeString& source, 348 const UnicodeString& target, 349 UErrorCode &status) const 350{ 351 if(U_SUCCESS(status)) { 352 return ucol_strcoll(ucollator, source.getBuffer(), source.length(), 353 target.getBuffer(), target.length()); 354 } else { 355 return UCOL_EQUAL; 356 } 357} 358 359UCollationResult RuleBasedCollator::compare(UCharIterator &sIter, 360 UCharIterator &tIter, 361 UErrorCode &status) const { 362 if(U_SUCCESS(status)) { 363 return ucol_strcollIter(ucollator, &sIter, &tIter, &status); 364 } else { 365 return UCOL_EQUAL; 366 } 367} 368 369/** 370* Retrieve a collation key for the specified string. The key can be compared 371* with other collation keys using a bitwise comparison (e.g. memcmp) to find 372* the ordering of their respective source strings. This is handy when doing a 373* sort, where each sort key must be compared many times. 374* 375* The basic algorithm here is to find all of the collation elements for each 376* character in the source string, convert them to an ASCII representation, and 377* put them into the collation key. But it's trickier than that. Each 378* collation element in a string has three components: primary ('A' vs 'B'), 379* secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference 380* at the end of a string takes precedence over a secondary or tertiary 381* difference earlier in the string. 382* 383* To account for this, we put all of the primary orders at the beginning of 384* the string, followed by the secondary and tertiary orders. Each set of 385* orders is terminated by nulls so that a key for a string which is a initial 386* substring of another key will compare less without any special case. 387* 388* Here's a hypothetical example, with the collation element represented as a 389* three-digit number, one digit for primary, one for secondary, etc. 390* 391* String: A a B \u00C9 392* Collation Elements: 101 100 201 511 393* Collation Key: 1125<null>0001<null>1011<null> 394* 395* To make things even trickier, secondary differences (accent marks) are 396* compared starting at the *end* of the string in languages with French 397* secondary ordering. But when comparing the accent marks on a single base 398* character, they are compared from the beginning. To handle this, we reverse 399* all of the accents that belong to each base character, then we reverse the 400* entire string of secondary orderings at the end. 401*/ 402CollationKey& RuleBasedCollator::getCollationKey( 403 const UnicodeString& source, 404 CollationKey& sortkey, 405 UErrorCode& status) const 406{ 407 return getCollationKey(source.getBuffer(), source.length(), sortkey, status); 408} 409 410CollationKey& RuleBasedCollator::getCollationKey(const UChar* source, 411 int32_t sourceLen, 412 CollationKey& sortkey, 413 UErrorCode& status) const 414{ 415 if (U_FAILURE(status)) { 416 return sortkey.setToBogus(); 417 } 418 if (sourceLen < -1 || (source == NULL && sourceLen != 0)) { 419 status = U_ILLEGAL_ARGUMENT_ERROR; 420 return sortkey.setToBogus(); 421 } 422 423 if (sourceLen < 0) { 424 sourceLen = u_strlen(source); 425 } 426 if (sourceLen == 0) { 427 return sortkey.reset(); 428 } 429 430 int32_t resultLen = ucol_getCollationKey(ucollator, source, sourceLen, sortkey, status); 431 432 if (U_SUCCESS(status)) { 433 sortkey.setLength(resultLen); 434 } else { 435 sortkey.setToBogus(); 436 } 437 return sortkey; 438} 439 440/** 441 * Return the maximum length of any expansion sequences that end with the 442 * specified comparison order. 443 * @param order a collation order returned by previous or next. 444 * @return the maximum length of any expansion seuences ending with the 445 * specified order or 1 if collation order does not occur at the end of any 446 * expansion sequence. 447 * @see CollationElementIterator#getMaxExpansion 448 */ 449int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const 450{ 451 uint8_t result; 452 UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result); 453 return result; 454} 455 456uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length, 457 UErrorCode &status) 458{ 459 return ucol_cloneRuleData(ucollator, &length, &status); 460} 461 462 463int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) 464{ 465 return ucol_cloneBinary(ucollator, buffer, capacity, &status); 466} 467 468void RuleBasedCollator::setAttribute(UColAttribute attr, 469 UColAttributeValue value, 470 UErrorCode &status) 471{ 472 if (U_FAILURE(status)) 473 return; 474 checkOwned(); 475 ucol_setAttribute(ucollator, attr, value, &status); 476} 477 478UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, 479 UErrorCode &status) const 480{ 481 if (U_FAILURE(status)) 482 return UCOL_DEFAULT; 483 return ucol_getAttribute(ucollator, attr, &status); 484} 485 486uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) { 487 checkOwned(); 488 return ucol_setVariableTop(ucollator, varTop, len, &status); 489} 490 491uint32_t RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &status) { 492 checkOwned(); 493 return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status); 494} 495 496void RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &status) { 497 checkOwned(); 498 ucol_restoreVariableTop(ucollator, varTop, &status); 499} 500 501uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const { 502 return ucol_getVariableTop(ucollator, &status); 503} 504 505int32_t RuleBasedCollator::getSortKey(const UnicodeString& source, 506 uint8_t *result, int32_t resultLength) 507 const 508{ 509 return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength); 510} 511 512int32_t RuleBasedCollator::getSortKey(const UChar *source, 513 int32_t sourceLength, uint8_t *result, 514 int32_t resultLength) const 515{ 516 return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength); 517} 518 519int32_t RuleBasedCollator::getReorderCodes(int32_t *dest, 520 int32_t destCapacity, 521 UErrorCode& status) const 522{ 523 return ucol_getReorderCodes(ucollator, dest, destCapacity, &status); 524} 525 526void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, 527 int32_t reorderCodesLength, 528 UErrorCode& status) 529{ 530 checkOwned(); 531 ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status); 532} 533 534int32_t RuleBasedCollator::getEquivalentReorderCodes(int32_t reorderCode, 535 int32_t* dest, 536 int32_t destCapacity, 537 UErrorCode& status) 538{ 539 return ucol_getEquivalentReorderCodes(reorderCode, dest, destCapacity, &status); 540} 541 542/** 543* Create a hash code for this collation. Just hash the main rule table -- that 544* should be good enough for almost any use. 545*/ 546int32_t RuleBasedCollator::hashCode() const 547{ 548 int32_t length; 549 const UChar *rules = ucol_getRules(ucollator, &length); 550 return ustr_hashUCharsN(rules, length); 551} 552 553/** 554* return the locale of this collator 555*/ 556Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const { 557 const char *result = ucol_getLocaleByType(ucollator, type, &status); 558 if(result == NULL) { 559 Locale res(""); 560 res.setToBogus(); 561 return res; 562 } else { 563 return Locale(result); 564 } 565} 566 567void 568RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) { 569 checkOwned(); 570 char* rloc = uprv_strdup(requestedLocale.getName()); 571 if (rloc) { 572 char* vloc = uprv_strdup(validLocale.getName()); 573 if (vloc) { 574 char* aloc = uprv_strdup(actualLocale.getName()); 575 if (aloc) { 576 ucol_setReqValidLocales(ucollator, rloc, vloc, aloc); 577 return; 578 } 579 uprv_free(vloc); 580 } 581 uprv_free(rloc); 582 } 583} 584 585// RuleBaseCollatorNew private constructor ---------------------------------- 586 587RuleBasedCollator::RuleBasedCollator() 588 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 589{ 590} 591 592RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale, 593 UErrorCode& status) 594 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 595{ 596 if (U_FAILURE(status)) 597 return; 598 599 /* 600 Try to load, in order: 601 1. The desired locale's collation. 602 2. A fallback of the desired locale. 603 3. The default locale's collation. 604 4. A fallback of the default locale. 605 5. The default collation rules, which contains en_US collation rules. 606 607 To reiterate, we try: 608 Specific: 609 language+country+variant 610 language+country 611 language 612 Default: 613 language+country+variant 614 language+country 615 language 616 Root: (aka DEFAULTRULES) 617 steps 1-5 are handled by resource bundle fallback mechanism. 618 however, in a very unprobable situation that no resource bundle 619 data exists, step 5 is repeated with hardcoded default rules. 620 */ 621 622 setUCollator(desiredLocale, status); 623 624 if (U_FAILURE(status)) 625 { 626 status = U_ZERO_ERROR; 627 628 setUCollator(kRootLocaleName, status); 629 if (status == U_ZERO_ERROR) { 630 status = U_USING_DEFAULT_WARNING; 631 } 632 } 633 634 if (U_SUCCESS(status)) 635 { 636 setRuleStringFromCollator(); 637 } 638} 639 640void 641RuleBasedCollator::setUCollator(const char *locale, 642 UErrorCode &status) 643{ 644 if (U_FAILURE(status)) { 645 return; 646 } 647 if (ucollator && dataIsOwned) 648 ucol_close(ucollator); 649 ucollator = ucol_open_internal(locale, &status); 650 dataIsOwned = TRUE; 651 isWriteThroughAlias = FALSE; 652} 653 654 655void 656RuleBasedCollator::checkOwned() { 657 if (!(dataIsOwned || isWriteThroughAlias)) { 658 UErrorCode status = U_ZERO_ERROR; 659 ucollator = ucol_safeClone(ucollator, NULL, NULL, &status); 660 setRuleStringFromCollator(); 661 dataIsOwned = TRUE; 662 isWriteThroughAlias = FALSE; 663 } 664} 665 666 667int32_t RuleBasedCollator::internalGetShortDefinitionString(const char *locale, 668 char *buffer, 669 int32_t capacity, 670 UErrorCode &status) const { 671 /* simply delegate */ 672 return ucol_getShortDefinitionString(ucollator, locale, buffer, capacity, &status); 673} 674 675 676UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator) 677 678U_NAMESPACE_END 679 680#endif /* #if !UCONFIG_NO_COLLATION */ 681