ucol.cpp revision 64339d36f8bd4db5025fe2988eda22b491a9219c
1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4******************************************************************************* 5* Copyright (C) 1996-2015, International Business Machines 6* Corporation and others. All Rights Reserved. 7******************************************************************************* 8* file name: ucol.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* Modification history 14* Date Name Comments 15* 1996-1999 various members of ICU team maintained C API for collation framework 16* 02/16/2001 synwee Added internal method getPrevSpecialCE 17* 03/01/2001 synwee Added maxexpansion functionality. 18* 03/16/2001 weiv Collation framework is rewritten in C and made UCA compliant 19* 2012-2014 markus Rewritten in C++ again. 20*/ 21 22#include "unicode/utypes.h" 23 24#if !UCONFIG_NO_COLLATION 25 26#include "unicode/coll.h" 27#include "unicode/tblcoll.h" 28#include "unicode/bytestream.h" 29#include "unicode/coleitr.h" 30#include "unicode/ucoleitr.h" 31#include "unicode/ustring.h" 32#include "cmemory.h" 33#include "collation.h" 34#include "cstring.h" 35#include "putilimp.h" 36#include "uassert.h" 37#include "utracimp.h" 38 39U_NAMESPACE_USE 40 41U_CAPI UCollator* U_EXPORT2 42ucol_openBinary(const uint8_t *bin, int32_t length, 43 const UCollator *base, 44 UErrorCode *status) 45{ 46 if(U_FAILURE(*status)) { return NULL; } 47 RuleBasedCollator *coll = new RuleBasedCollator( 48 bin, length, 49 RuleBasedCollator::rbcFromUCollator(base), 50 *status); 51 if(coll == NULL) { 52 *status = U_MEMORY_ALLOCATION_ERROR; 53 return NULL; 54 } 55 if(U_FAILURE(*status)) { 56 delete coll; 57 return NULL; 58 } 59 return coll->toUCollator(); 60} 61 62U_CAPI int32_t U_EXPORT2 63ucol_cloneBinary(const UCollator *coll, 64 uint8_t *buffer, int32_t capacity, 65 UErrorCode *status) 66{ 67 if(U_FAILURE(*status)) { 68 return 0; 69 } 70 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); 71 if(rbc == NULL && coll != NULL) { 72 *status = U_UNSUPPORTED_ERROR; 73 return 0; 74 } 75 return rbc->cloneBinary(buffer, capacity, *status); 76} 77 78U_CAPI UCollator* U_EXPORT2 79ucol_safeClone(const UCollator *coll, void * /*stackBuffer*/, int32_t * pBufferSize, UErrorCode *status) 80{ 81 if (status == NULL || U_FAILURE(*status)){ 82 return NULL; 83 } 84 if (coll == NULL) { 85 *status = U_ILLEGAL_ARGUMENT_ERROR; 86 return NULL; 87 } 88 if (pBufferSize != NULL) { 89 int32_t inputSize = *pBufferSize; 90 *pBufferSize = 1; 91 if (inputSize == 0) { 92 return NULL; // preflighting for deprecated functionality 93 } 94 } 95 Collator *newColl = Collator::fromUCollator(coll)->clone(); 96 if (newColl == NULL) { 97 *status = U_MEMORY_ALLOCATION_ERROR; 98 } else { 99 *status = U_SAFECLONE_ALLOCATED_WARNING; 100 } 101 return newColl->toUCollator(); 102} 103 104U_CAPI void U_EXPORT2 105ucol_close(UCollator *coll) 106{ 107 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 108 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 109 if(coll != NULL) { 110 delete Collator::fromUCollator(coll); 111 } 112 UTRACE_EXIT(); 113} 114 115U_CAPI int32_t U_EXPORT2 116ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 117 const uint8_t *src2, int32_t src2Length, 118 uint8_t *dest, int32_t destCapacity) { 119 /* check arguments */ 120 if( src1==NULL || src1Length<-1 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 121 src2==NULL || src2Length<-1 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 122 destCapacity<0 || (destCapacity>0 && dest==NULL) 123 ) { 124 /* error, attempt to write a zero byte and return 0 */ 125 if(dest!=NULL && destCapacity>0) { 126 *dest=0; 127 } 128 return 0; 129 } 130 131 /* check lengths and capacity */ 132 if(src1Length<0) { 133 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 134 } 135 if(src2Length<0) { 136 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 137 } 138 139 int32_t destLength=src1Length+src2Length; 140 if(destLength>destCapacity) { 141 /* the merged sort key does not fit into the destination */ 142 return destLength; 143 } 144 145 /* merge the sort keys with the same number of levels */ 146 uint8_t *p=dest; 147 for(;;) { 148 /* copy level from src1 not including 00 or 01 */ 149 uint8_t b; 150 while((b=*src1)>=2) { 151 ++src1; 152 *p++=b; 153 } 154 155 /* add a 02 merge separator */ 156 *p++=2; 157 158 /* copy level from src2 not including 00 or 01 */ 159 while((b=*src2)>=2) { 160 ++src2; 161 *p++=b; 162 } 163 164 /* if both sort keys have another level, then add a 01 level separator and continue */ 165 if(*src1==1 && *src2==1) { 166 ++src1; 167 ++src2; 168 *p++=1; 169 } else { 170 break; 171 } 172 } 173 174 /* 175 * here, at least one sort key is finished now, but the other one 176 * might have some contents left from containing more levels; 177 * that contents is just appended to the result 178 */ 179 if(*src1!=0) { 180 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 181 src2=src1; 182 } 183 /* append src2, "the other, unfinished sort key" */ 184 while((*p++=*src2++)!=0) {} 185 186 /* the actual length might be less than destLength if either sort key contained illegally embedded zero bytes */ 187 return (int32_t)(p-dest); 188} 189 190U_CAPI int32_t U_EXPORT2 191ucol_getSortKey(const UCollator *coll, 192 const UChar *source, 193 int32_t sourceLength, 194 uint8_t *result, 195 int32_t resultLength) 196{ 197 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 198 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 199 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 200 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); 201 } 202 203 int32_t keySize = Collator::fromUCollator(coll)-> 204 getSortKey(source, sourceLength, result, resultLength); 205 206 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 207 UTRACE_EXIT_VALUE(keySize); 208 return keySize; 209} 210 211U_CAPI int32_t U_EXPORT2 212ucol_nextSortKeyPart(const UCollator *coll, 213 UCharIterator *iter, 214 uint32_t state[2], 215 uint8_t *dest, int32_t count, 216 UErrorCode *status) 217{ 218 /* error checking */ 219 if(status==NULL || U_FAILURE(*status)) { 220 return 0; 221 } 222 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 223 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 224 coll, iter, state[0], state[1], dest, count); 225 226 int32_t i = Collator::fromUCollator(coll)-> 227 internalNextSortKeyPart(iter, state, dest, count, *status); 228 229 // Return number of meaningful sortkey bytes. 230 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 231 dest,i, state[0], state[1]); 232 UTRACE_EXIT_VALUE_STATUS(i, *status); 233 return i; 234} 235 236/** 237 * Produce a bound for a given sortkey and a number of levels. 238 */ 239U_CAPI int32_t U_EXPORT2 240ucol_getBound(const uint8_t *source, 241 int32_t sourceLength, 242 UColBoundMode boundType, 243 uint32_t noOfLevels, 244 uint8_t *result, 245 int32_t resultLength, 246 UErrorCode *status) 247{ 248 // consistency checks 249 if(status == NULL || U_FAILURE(*status)) { 250 return 0; 251 } 252 if(source == NULL) { 253 *status = U_ILLEGAL_ARGUMENT_ERROR; 254 return 0; 255 } 256 257 int32_t sourceIndex = 0; 258 // Scan the string until we skip enough of the key OR reach the end of the key 259 do { 260 sourceIndex++; 261 if(source[sourceIndex] == Collation::LEVEL_SEPARATOR_BYTE) { 262 noOfLevels--; 263 } 264 } while (noOfLevels > 0 265 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 266 267 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 268 && noOfLevels > 0) { 269 *status = U_SORT_KEY_TOO_SHORT_WARNING; 270 } 271 272 273 // READ ME: this code assumes that the values for boundType 274 // enum will not changes. They are set so that the enum value 275 // corresponds to the number of extra bytes each bound type 276 // needs. 277 if(result != NULL && resultLength >= sourceIndex+boundType) { 278 uprv_memcpy(result, source, sourceIndex); 279 switch(boundType) { 280 // Lower bound just gets terminated. No extra bytes 281 case UCOL_BOUND_LOWER: // = 0 282 break; 283 // Upper bound needs one extra byte 284 case UCOL_BOUND_UPPER: // = 1 285 result[sourceIndex++] = 2; 286 break; 287 // Upper long bound needs two extra bytes 288 case UCOL_BOUND_UPPER_LONG: // = 2 289 result[sourceIndex++] = 0xFF; 290 result[sourceIndex++] = 0xFF; 291 break; 292 default: 293 *status = U_ILLEGAL_ARGUMENT_ERROR; 294 return 0; 295 } 296 result[sourceIndex++] = 0; 297 298 return sourceIndex; 299 } else { 300 return sourceIndex+boundType+1; 301 } 302} 303 304U_CAPI void U_EXPORT2 305ucol_setMaxVariable(UCollator *coll, UColReorderCode group, UErrorCode *pErrorCode) { 306 if(U_FAILURE(*pErrorCode)) { return; } 307 Collator::fromUCollator(coll)->setMaxVariable(group, *pErrorCode); 308} 309 310U_CAPI UColReorderCode U_EXPORT2 311ucol_getMaxVariable(const UCollator *coll) { 312 return Collator::fromUCollator(coll)->getMaxVariable(); 313} 314 315U_CAPI uint32_t U_EXPORT2 316ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 317 if(U_FAILURE(*status) || coll == NULL) { 318 return 0; 319 } 320 return Collator::fromUCollator(coll)->setVariableTop(varTop, len, *status); 321} 322 323U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 324 if(U_FAILURE(*status) || coll == NULL) { 325 return 0; 326 } 327 return Collator::fromUCollator(coll)->getVariableTop(*status); 328} 329 330U_CAPI void U_EXPORT2 331ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 332 if(U_FAILURE(*status) || coll == NULL) { 333 return; 334 } 335 Collator::fromUCollator(coll)->setVariableTop(varTop, *status); 336} 337 338U_CAPI void U_EXPORT2 339ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 340 if(U_FAILURE(*status) || coll == NULL) { 341 return; 342 } 343 344 Collator::fromUCollator(coll)->setAttribute(attr, value, *status); 345} 346 347U_CAPI UColAttributeValue U_EXPORT2 348ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { 349 if(U_FAILURE(*status) || coll == NULL) { 350 return UCOL_DEFAULT; 351 } 352 353 return Collator::fromUCollator(coll)->getAttribute(attr, *status); 354} 355 356U_CAPI void U_EXPORT2 357ucol_setStrength( UCollator *coll, 358 UCollationStrength strength) 359{ 360 UErrorCode status = U_ZERO_ERROR; 361 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); 362} 363 364U_CAPI UCollationStrength U_EXPORT2 365ucol_getStrength(const UCollator *coll) 366{ 367 UErrorCode status = U_ZERO_ERROR; 368 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); 369} 370 371U_CAPI int32_t U_EXPORT2 372ucol_getReorderCodes(const UCollator *coll, 373 int32_t *dest, 374 int32_t destCapacity, 375 UErrorCode *status) { 376 if (U_FAILURE(*status)) { 377 return 0; 378 } 379 380 return Collator::fromUCollator(coll)->getReorderCodes(dest, destCapacity, *status); 381} 382 383U_CAPI void U_EXPORT2 384ucol_setReorderCodes(UCollator* coll, 385 const int32_t* reorderCodes, 386 int32_t reorderCodesLength, 387 UErrorCode *status) { 388 if (U_FAILURE(*status)) { 389 return; 390 } 391 392 Collator::fromUCollator(coll)->setReorderCodes(reorderCodes, reorderCodesLength, *status); 393} 394 395U_CAPI int32_t U_EXPORT2 396ucol_getEquivalentReorderCodes(int32_t reorderCode, 397 int32_t* dest, 398 int32_t destCapacity, 399 UErrorCode *pErrorCode) { 400 return Collator::getEquivalentReorderCodes(reorderCode, dest, destCapacity, *pErrorCode); 401} 402 403U_CAPI void U_EXPORT2 404ucol_getVersion(const UCollator* coll, 405 UVersionInfo versionInfo) 406{ 407 Collator::fromUCollator(coll)->getVersion(versionInfo); 408} 409 410U_CAPI UCollationResult U_EXPORT2 411ucol_strcollIter( const UCollator *coll, 412 UCharIterator *sIter, 413 UCharIterator *tIter, 414 UErrorCode *status) 415{ 416 if(!status || U_FAILURE(*status)) { 417 return UCOL_EQUAL; 418 } 419 420 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); 421 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter); 422 423 if(sIter == NULL || tIter == NULL || coll == NULL) { 424 *status = U_ILLEGAL_ARGUMENT_ERROR; 425 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 426 return UCOL_EQUAL; 427 } 428 429 UCollationResult result = Collator::fromUCollator(coll)->compare(*sIter, *tIter, *status); 430 431 UTRACE_EXIT_VALUE_STATUS(result, *status); 432 return result; 433} 434 435 436/* */ 437/* ucol_strcoll Main public API string comparison function */ 438/* */ 439U_CAPI UCollationResult U_EXPORT2 440ucol_strcoll( const UCollator *coll, 441 const UChar *source, 442 int32_t sourceLength, 443 const UChar *target, 444 int32_t targetLength) 445{ 446 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 447 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 448 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 449 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); 450 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, targetLength); 451 } 452 453 UErrorCode status = U_ZERO_ERROR; 454 UCollationResult returnVal = Collator::fromUCollator(coll)-> 455 compare(source, sourceLength, target, targetLength, status); 456 UTRACE_EXIT_VALUE_STATUS(returnVal, status); 457 return returnVal; 458} 459 460U_CAPI UCollationResult U_EXPORT2 461ucol_strcollUTF8( 462 const UCollator *coll, 463 const char *source, 464 int32_t sourceLength, 465 const char *target, 466 int32_t targetLength, 467 UErrorCode *status) 468{ 469 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8); 470 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 471 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 472 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength); 473 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength); 474 } 475 476 if (U_FAILURE(*status)) { 477 /* do nothing */ 478 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status); 479 return UCOL_EQUAL; 480 } 481 482 UCollationResult returnVal = Collator::fromUCollator(coll)->internalCompareUTF8( 483 source, sourceLength, target, targetLength, *status); 484 UTRACE_EXIT_VALUE_STATUS(returnVal, *status); 485 return returnVal; 486} 487 488 489/* convenience function for comparing strings */ 490U_CAPI UBool U_EXPORT2 491ucol_greater( const UCollator *coll, 492 const UChar *source, 493 int32_t sourceLength, 494 const UChar *target, 495 int32_t targetLength) 496{ 497 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 498 == UCOL_GREATER); 499} 500 501/* convenience function for comparing strings */ 502U_CAPI UBool U_EXPORT2 503ucol_greaterOrEqual( const UCollator *coll, 504 const UChar *source, 505 int32_t sourceLength, 506 const UChar *target, 507 int32_t targetLength) 508{ 509 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 510 != UCOL_LESS); 511} 512 513/* convenience function for comparing strings */ 514U_CAPI UBool U_EXPORT2 515ucol_equal( const UCollator *coll, 516 const UChar *source, 517 int32_t sourceLength, 518 const UChar *target, 519 int32_t targetLength) 520{ 521 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 522 == UCOL_EQUAL); 523} 524 525U_CAPI void U_EXPORT2 526ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 527 const Collator *c = Collator::fromUCollator(coll); 528 if(c != NULL) { 529 UVersionInfo v; 530 c->getVersion(v); 531 // Note: This is tied to how the current implementation encodes the UCA version 532 // in the overall getVersion(). 533 // Alternatively, we could load the root collator and get at lower-level data from there. 534 // Either way, it will reflect the input collator's UCA version only 535 // if it is a known implementation. 536 // It would be cleaner to make this a virtual Collator method. 537 info[0] = v[1] >> 3; 538 info[1] = v[1] & 7; 539 info[2] = v[2] >> 6; 540 info[3] = 0; 541 } 542} 543 544U_CAPI const UChar * U_EXPORT2 545ucol_getRules(const UCollator *coll, int32_t *length) { 546 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); 547 // OK to crash if coll==NULL: We do not want to check "this" pointers. 548 if(rbc != NULL || coll == NULL) { 549 const UnicodeString &rules = rbc->getRules(); 550 U_ASSERT(rules.getBuffer()[rules.length()] == 0); 551 *length = rules.length(); 552 return rules.getBuffer(); 553 } 554 static const UChar _NUL = 0; 555 *length = 0; 556 return &_NUL; 557} 558 559U_CAPI int32_t U_EXPORT2 560ucol_getRulesEx(const UCollator *coll, UColRuleOption delta, UChar *buffer, int32_t bufferLen) { 561 UnicodeString rules; 562 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); 563 if(rbc != NULL || coll == NULL) { 564 rbc->getRules(delta, rules); 565 } 566 if(buffer != NULL && bufferLen > 0) { 567 UErrorCode errorCode = U_ZERO_ERROR; 568 return rules.extract(buffer, bufferLen, errorCode); 569 } else { 570 return rules.length(); 571 } 572} 573 574U_CAPI const char * U_EXPORT2 575ucol_getLocale(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) { 576 return ucol_getLocaleByType(coll, type, status); 577} 578 579U_CAPI const char * U_EXPORT2 580ucol_getLocaleByType(const UCollator *coll, ULocDataLocaleType type, UErrorCode *status) { 581 if(U_FAILURE(*status)) { 582 return NULL; 583 } 584 UTRACE_ENTRY(UTRACE_UCOL_GETLOCALE); 585 UTRACE_DATA1(UTRACE_INFO, "coll=%p", coll); 586 587 const char *result; 588 const RuleBasedCollator *rbc = RuleBasedCollator::rbcFromUCollator(coll); 589 if(rbc == NULL && coll != NULL) { 590 *status = U_UNSUPPORTED_ERROR; 591 result = NULL; 592 } else { 593 result = rbc->internalGetLocaleID(type, *status); 594 } 595 596 UTRACE_DATA1(UTRACE_INFO, "result = %s", result); 597 UTRACE_EXIT_STATUS(*status); 598 return result; 599} 600 601U_CAPI USet * U_EXPORT2 602ucol_getTailoredSet(const UCollator *coll, UErrorCode *status) { 603 if(U_FAILURE(*status)) { 604 return NULL; 605 } 606 UnicodeSet *set = Collator::fromUCollator(coll)->getTailoredSet(*status); 607 if(U_FAILURE(*status)) { 608 delete set; 609 return NULL; 610 } 611 return set->toUSet(); 612} 613 614U_CAPI UBool U_EXPORT2 615ucol_equals(const UCollator *source, const UCollator *target) { 616 return source == target || 617 (*Collator::fromUCollator(source)) == (*Collator::fromUCollator(target)); 618} 619 620#endif /* #if !UCONFIG_NO_COLLATION */ 621