1/* 2****************************************************************************** 3* Copyright (C) 2001-2009, International Business Machines 4* Corporation and others. All Rights Reserved. 5****************************************************************************** 6* 7* File ucoleitr.cpp 8* 9* Modification History: 10* 11* Date Name Description 12* 02/15/2001 synwee Modified all methods to process its own function 13* instead of calling the equivalent c++ api (coleitr.h) 14******************************************************************************/ 15 16#include "unicode/utypes.h" 17 18#if !UCONFIG_NO_COLLATION 19 20#include "unicode/ucoleitr.h" 21#include "unicode/ustring.h" 22#include "unicode/sortkey.h" 23#include "unicode/uobject.h" 24#include "ucol_imp.h" 25#include "cmemory.h" 26 27U_NAMESPACE_USE 28 29#define BUFFER_LENGTH 100 30 31#define DEFAULT_BUFFER_SIZE 16 32#define BUFFER_GROW 8 33 34#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 35 36#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 37 38#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 39 40#define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0]) 41 42#define DELETE_ARRAY(array) uprv_free((void *) (array)) 43 44typedef struct collIterate collIterator; 45 46struct RCEI 47{ 48 uint32_t ce; 49 int32_t low; 50 int32_t high; 51}; 52 53U_NAMESPACE_BEGIN 54 55struct RCEBuffer 56{ 57 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; 58 RCEI *buffer; 59 int32_t bufferIndex; 60 int32_t bufferSize; 61 62 RCEBuffer(); 63 ~RCEBuffer(); 64 65 UBool empty() const; 66 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh); 67 const RCEI *get(); 68}; 69 70RCEBuffer::RCEBuffer() 71{ 72 buffer = defaultBuffer; 73 bufferIndex = 0; 74 bufferSize = DEFAULT_BUFFER_SIZE; 75} 76 77RCEBuffer::~RCEBuffer() 78{ 79 if (buffer != defaultBuffer) { 80 DELETE_ARRAY(buffer); 81 } 82} 83 84UBool RCEBuffer::empty() const 85{ 86 return bufferIndex <= 0; 87} 88 89void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh) 90{ 91 if (bufferIndex >= bufferSize) { 92 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); 93 94 ARRAY_COPY(newBuffer, buffer, bufferSize); 95 96 if (buffer != defaultBuffer) { 97 DELETE_ARRAY(buffer); 98 } 99 100 buffer = newBuffer; 101 bufferSize += BUFFER_GROW; 102 } 103 104 buffer[bufferIndex].ce = ce; 105 buffer[bufferIndex].low = ixLow; 106 buffer[bufferIndex].high = ixHigh; 107 108 bufferIndex += 1; 109} 110 111const RCEI *RCEBuffer::get() 112{ 113 if (bufferIndex > 0) { 114 return &buffer[--bufferIndex]; 115 } 116 117 return NULL; 118} 119 120struct PCEI 121{ 122 uint64_t ce; 123 int32_t low; 124 int32_t high; 125}; 126 127struct PCEBuffer 128{ 129 PCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; 130 PCEI *buffer; 131 int32_t bufferIndex; 132 int32_t bufferSize; 133 134 PCEBuffer(); 135 ~PCEBuffer(); 136 137 void reset(); 138 UBool empty() const; 139 void put(uint64_t ce, int32_t ixLow, int32_t ixHigh); 140 const PCEI *get(); 141}; 142 143PCEBuffer::PCEBuffer() 144{ 145 buffer = defaultBuffer; 146 bufferIndex = 0; 147 bufferSize = DEFAULT_BUFFER_SIZE; 148} 149 150PCEBuffer::~PCEBuffer() 151{ 152 if (buffer != defaultBuffer) { 153 DELETE_ARRAY(buffer); 154 } 155} 156 157void PCEBuffer::reset() 158{ 159 bufferIndex = 0; 160} 161 162UBool PCEBuffer::empty() const 163{ 164 return bufferIndex <= 0; 165} 166 167void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh) 168{ 169 if (bufferIndex >= bufferSize) { 170 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); 171 172 ARRAY_COPY(newBuffer, buffer, bufferSize); 173 174 if (buffer != defaultBuffer) { 175 DELETE_ARRAY(buffer); 176 } 177 178 buffer = newBuffer; 179 bufferSize += BUFFER_GROW; 180 } 181 182 buffer[bufferIndex].ce = ce; 183 buffer[bufferIndex].low = ixLow; 184 buffer[bufferIndex].high = ixHigh; 185 186 bufferIndex += 1; 187} 188 189const PCEI *PCEBuffer::get() 190{ 191 if (bufferIndex > 0) { 192 return &buffer[--bufferIndex]; 193 } 194 195 return NULL; 196} 197 198/* 199 * This inherits from UObject so that 200 * it can be allocated by new and the 201 * constructor for PCEBuffer is called. 202 */ 203struct UCollationPCE : public UObject 204{ 205 PCEBuffer pceBuffer; 206 UCollationStrength strength; 207 UBool toShift; 208 UBool isShifted; 209 uint32_t variableTop; 210 211 UCollationPCE(UCollationElements *elems); 212 ~UCollationPCE(); 213 214 void init(const UCollator *coll); 215 216 virtual UClassID getDynamicClassID() const; 217 static UClassID getStaticClassID(); 218}; 219 220UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE) 221 222UCollationPCE::UCollationPCE(UCollationElements *elems) 223{ 224 init(elems->iteratordata_.coll); 225} 226 227void UCollationPCE::init(const UCollator *coll) 228{ 229 UErrorCode status = U_ZERO_ERROR; 230 231 strength = ucol_getStrength(coll); 232 toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 233 isShifted = FALSE; 234 variableTop = coll->variableTopValue << 16; 235} 236 237UCollationPCE::~UCollationPCE() 238{ 239 // nothing to do 240} 241 242 243U_NAMESPACE_END 244 245 246inline uint64_t processCE(UCollationElements *elems, uint32_t ce) 247{ 248 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 249 250 // This is clean, but somewhat slow... 251 // We could apply the mask to ce and then 252 // just get all three orders... 253 switch(elems->pce->strength) { 254 default: 255 tertiary = ucol_tertiaryOrder(ce); 256 /* note fall-through */ 257 258 case UCOL_SECONDARY: 259 secondary = ucol_secondaryOrder(ce); 260 /* note fall-through */ 261 262 case UCOL_PRIMARY: 263 primary = ucol_primaryOrder(ce); 264 } 265 266 // **** This should probably handle continuations too. **** 267 // **** That means that we need 24 bits for the primary **** 268 // **** instead of the 16 that we're currently using. **** 269 // **** So we can lay out the 64 bits as: 24.12.12.16. **** 270 // **** Another complication with continuations is that **** 271 // **** the *second* CE is marked as a continuation, so **** 272 // **** we always have to peek ahead to know how long **** 273 // **** the primary is... **** 274 if (elems->pce->toShift && (elems->pce->variableTop > ce && primary != 0) 275 || (elems->pce->isShifted && primary == 0)) { 276 277 if (primary == 0) { 278 return UCOL_IGNORABLE; 279 } 280 281 if (elems->pce->strength >= UCOL_QUATERNARY) { 282 quaternary = primary; 283 } 284 285 primary = secondary = tertiary = 0; 286 elems->pce->isShifted = TRUE; 287 } else { 288 if (elems->pce->strength >= UCOL_QUATERNARY) { 289 quaternary = 0xFFFF; 290 } 291 292 elems->pce->isShifted = FALSE; 293 } 294 295 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 296} 297 298U_CAPI void U_EXPORT2 299uprv_init_pce(const UCollationElements *elems) 300{ 301 if (elems->pce != NULL) { 302 elems->pce->init(elems->iteratordata_.coll); 303 } 304} 305 306 307 308/* public methods ---------------------------------------------------- */ 309 310U_CAPI UCollationElements* U_EXPORT2 311ucol_openElements(const UCollator *coll, 312 const UChar *text, 313 int32_t textLength, 314 UErrorCode *status) 315{ 316 UCollationElements *result; 317 318 if (U_FAILURE(*status)) { 319 return NULL; 320 } 321 322 result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements)); 323 /* test for NULL */ 324 if (result == NULL) { 325 *status = U_MEMORY_ALLOCATION_ERROR; 326 return NULL; 327 } 328 329 result->reset_ = TRUE; 330 result->isWritable = FALSE; 331 result->pce = NULL; 332 333 if (text == NULL) { 334 textLength = 0; 335 } 336 uprv_init_collIterate(coll, text, textLength, &result->iteratordata_); 337 338 return result; 339} 340 341 342U_CAPI void U_EXPORT2 343ucol_closeElements(UCollationElements *elems) 344{ 345 if (elems != NULL) { 346 collIterate *ci = &elems->iteratordata_; 347 348 if (ci != NULL) { 349 if (ci->writableBuffer != ci->stackWritableBuffer) { 350 uprv_free(ci->writableBuffer); 351 } 352 353 if (ci->extendCEs) { 354 uprv_free(ci->extendCEs); 355 } 356 357 if (ci->offsetBuffer) { 358 uprv_free(ci->offsetBuffer); 359 } 360 } 361 362 if (elems->isWritable && elems->iteratordata_.string != NULL) 363 { 364 uprv_free(elems->iteratordata_.string); 365 } 366 367 if (elems->pce != NULL) { 368 delete elems->pce; 369 } 370 371 uprv_free(elems); 372 } 373} 374 375U_CAPI void U_EXPORT2 376ucol_reset(UCollationElements *elems) 377{ 378 collIterate *ci = &(elems->iteratordata_); 379 elems->reset_ = TRUE; 380 ci->pos = ci->string; 381 if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) { 382 ci->endp = ci->string + u_strlen(ci->string); 383 } 384 ci->CEpos = ci->toReturn = ci->CEs; 385 ci->flags = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN; 386 if (ci->coll->normalizationMode == UCOL_ON) { 387 ci->flags |= UCOL_ITER_NORM; 388 } 389 390 if (ci->stackWritableBuffer != ci->writableBuffer) { 391 uprv_free(ci->writableBuffer); 392 ci->writableBuffer = ci->stackWritableBuffer; 393 ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; 394 } 395 ci->fcdPosition = NULL; 396 397 //ci->offsetReturn = ci->offsetStore = NULL; 398 ci->offsetRepeatCount = ci->offsetRepeatValue = 0; 399} 400 401U_CAPI void U_EXPORT2 402ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status) 403{ 404 if (U_FAILURE(*status)) { 405 return; 406 } 407 408 if (elems == NULL) { 409 *status = U_ILLEGAL_ARGUMENT_ERROR; 410 return; 411 } 412 413 elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT; 414} 415 416U_CAPI int32_t U_EXPORT2 417ucol_next(UCollationElements *elems, 418 UErrorCode *status) 419{ 420 int32_t result; 421 if (U_FAILURE(*status)) { 422 return UCOL_NULLORDER; 423 } 424 425 elems->reset_ = FALSE; 426 427 result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll, 428 &elems->iteratordata_, 429 status); 430 431 if (result == UCOL_NO_MORE_CES) { 432 result = UCOL_NULLORDER; 433 } 434 return result; 435} 436 437U_CAPI int64_t U_EXPORT2 438ucol_nextProcessed(UCollationElements *elems, 439 int32_t *ixLow, 440 int32_t *ixHigh, 441 UErrorCode *status) 442{ 443 const UCollator *coll = elems->iteratordata_.coll; 444 int64_t result = UCOL_IGNORABLE; 445 uint32_t low = 0, high = 0; 446 447 if (U_FAILURE(*status)) { 448 return UCOL_PROCESSED_NULLORDER; 449 } 450 451 if (elems->pce == NULL) { 452 elems->pce = new UCollationPCE(elems); 453 } else { 454 elems->pce->pceBuffer.reset(); 455 } 456 457 elems->reset_ = FALSE; 458 459 do { 460 low = ucol_getOffset(elems); 461 uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status); 462 high = ucol_getOffset(elems); 463 464 if (ce == UCOL_NO_MORE_CES) { 465 result = UCOL_PROCESSED_NULLORDER; 466 break; 467 } 468 469 result = processCE(elems, ce); 470 } while (result == UCOL_IGNORABLE); 471 472 if (ixLow != NULL) { 473 *ixLow = low; 474 } 475 476 if (ixHigh != NULL) { 477 *ixHigh = high; 478 } 479 480 return result; 481} 482 483U_CAPI int32_t U_EXPORT2 484ucol_previous(UCollationElements *elems, 485 UErrorCode *status) 486{ 487 if(U_FAILURE(*status)) { 488 return UCOL_NULLORDER; 489 } 490 else 491 { 492 int32_t result; 493 494 if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) { 495 if (elems->iteratordata_.endp == NULL) { 496 elems->iteratordata_.endp = elems->iteratordata_.string + 497 u_strlen(elems->iteratordata_.string); 498 elems->iteratordata_.flags |= UCOL_ITER_HASLEN; 499 } 500 elems->iteratordata_.pos = elems->iteratordata_.endp; 501 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; 502 } 503 504 elems->reset_ = FALSE; 505 506 result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll, 507 &(elems->iteratordata_), 508 status); 509 510 if (result == UCOL_NO_MORE_CES) { 511 result = UCOL_NULLORDER; 512 } 513 514 return result; 515 } 516} 517 518U_CAPI int64_t U_EXPORT2 519ucol_previousProcessed(UCollationElements *elems, 520 int32_t *ixLow, 521 int32_t *ixHigh, 522 UErrorCode *status) 523{ 524 const UCollator *coll = elems->iteratordata_.coll; 525 int64_t result = UCOL_IGNORABLE; 526 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 527 // UCollationStrength strength = ucol_getStrength(coll); 528 // UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; 529 // uint32_t variableTop = coll->variableTopValue; 530 int32_t low = 0, high = 0; 531 532 if (U_FAILURE(*status)) { 533 return UCOL_PROCESSED_NULLORDER; 534 } 535 536 if (elems->reset_ && 537 (elems->iteratordata_.pos == elems->iteratordata_.string)) { 538 if (elems->iteratordata_.endp == NULL) { 539 elems->iteratordata_.endp = elems->iteratordata_.string + 540 u_strlen(elems->iteratordata_.string); 541 elems->iteratordata_.flags |= UCOL_ITER_HASLEN; 542 } 543 544 elems->iteratordata_.pos = elems->iteratordata_.endp; 545 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; 546 } 547 548 if (elems->pce == NULL) { 549 elems->pce = new UCollationPCE(elems); 550 } else { 551 //elems->pce->pceBuffer.reset(); 552 } 553 554 elems->reset_ = FALSE; 555 556 while (elems->pce->pceBuffer.empty()) { 557 // buffer raw CEs up to non-ignorable primary 558 RCEBuffer rceb; 559 uint32_t ce; 560 561 // **** do we need to reset rceb, or will it always be empty at this point **** 562 do { 563 high = ucol_getOffset(elems); 564 ce = ucol_getPrevCE(coll, &elems->iteratordata_, status); 565 low = ucol_getOffset(elems); 566 567 if (ce == UCOL_NO_MORE_CES) { 568 if (! rceb.empty()) { 569 break; 570 } 571 572 goto finish; 573 } 574 575 rceb.put(ce, low, high); 576 } while ((ce & UCOL_PRIMARYMASK) == 0); 577 578 // process the raw CEs 579 while (! rceb.empty()) { 580 const RCEI *rcei = rceb.get(); 581 582 result = processCE(elems, rcei->ce); 583 584 if (result != UCOL_IGNORABLE) { 585 elems->pce->pceBuffer.put(result, rcei->low, rcei->high); 586 } 587 } 588 } 589 590finish: 591 if (elems->pce->pceBuffer.empty()) { 592 // **** Is -1 the right value for ixLow, ixHigh? **** 593 if (ixLow != NULL) { 594 *ixLow = -1; 595 } 596 597 if (ixHigh != NULL) { 598 *ixHigh = -1 599 ; 600 } 601 return UCOL_PROCESSED_NULLORDER; 602 } 603 604 const PCEI *pcei = elems->pce->pceBuffer.get(); 605 606 if (ixLow != NULL) { 607 *ixLow = pcei->low; 608 } 609 610 if (ixHigh != NULL) { 611 *ixHigh = pcei->high; 612 } 613 614 return pcei->ce; 615} 616 617U_CAPI int32_t U_EXPORT2 618ucol_getMaxExpansion(const UCollationElements *elems, 619 int32_t order) 620{ 621 uint8_t result; 622 623#if 0 624 UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result); 625#else 626 const UCollator *coll = elems->iteratordata_.coll; 627 const uint32_t *start; 628 const uint32_t *limit; 629 const uint32_t *mid; 630 uint32_t strengthMask = 0; 631 uint32_t mOrder = (uint32_t) order; 632 633 switch (coll->strength) 634 { 635 default: 636 strengthMask |= UCOL_TERTIARYORDERMASK; 637 /* fall through */ 638 639 case UCOL_SECONDARY: 640 strengthMask |= UCOL_SECONDARYORDERMASK; 641 /* fall through */ 642 643 case UCOL_PRIMARY: 644 strengthMask |= UCOL_PRIMARYORDERMASK; 645 } 646 647 mOrder &= strengthMask; 648 start = (coll)->endExpansionCE; 649 limit = (coll)->lastEndExpansionCE; 650 651 while (start < limit - 1) { 652 mid = start + ((limit - start) >> 1); 653 if (mOrder <= (*mid & strengthMask)) { 654 limit = mid; 655 } else { 656 start = mid; 657 } 658 } 659 660 // FIXME: with a masked search, there might be more than one hit, 661 // so we need to look forward and backward from the match to find all 662 // of the hits... 663 if ((*start & strengthMask) == mOrder) { 664 result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE)); 665 } else if ((*limit & strengthMask) == mOrder) { 666 result = *(coll->expansionCESize + (limit - coll->endExpansionCE)); 667 } else if ((mOrder & 0xFFFF) == 0x00C0) { 668 result = 2; 669 } else { 670 result = 1; 671 } 672#endif 673 674 return result; 675} 676 677U_CAPI void U_EXPORT2 678ucol_setText( UCollationElements *elems, 679 const UChar *text, 680 int32_t textLength, 681 UErrorCode *status) 682{ 683 if (U_FAILURE(*status)) { 684 return; 685 } 686 687 if (elems->isWritable && elems->iteratordata_.string != NULL) 688 { 689 uprv_free(elems->iteratordata_.string); 690 } 691 692 if (text == NULL) { 693 textLength = 0; 694 } 695 696 elems->isWritable = FALSE; 697 698 /* free offset buffer to avoid memory leak before initializing. */ 699 ucol_freeOffsetBuffer(&(elems->iteratordata_)); 700 uprv_init_collIterate(elems->iteratordata_.coll, text, textLength, 701 &elems->iteratordata_); 702 703 elems->reset_ = TRUE; 704} 705 706U_CAPI int32_t U_EXPORT2 707ucol_getOffset(const UCollationElements *elems) 708{ 709 const collIterate *ci = &(elems->iteratordata_); 710 711 if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) { 712 return ci->offsetRepeatValue; 713 } 714 715 if (ci->offsetReturn != NULL) { 716 return *ci->offsetReturn; 717 } 718 719 // while processing characters in normalization buffer getOffset will 720 // return the next non-normalized character. 721 // should be inline with the old implementation since the old codes uses 722 // nextDecomp in normalizer which also decomposes the string till the 723 // first base character is found. 724 if (ci->flags & UCOL_ITER_INNORMBUF) { 725 if (ci->fcdPosition == NULL) { 726 return 0; 727 } 728 return (int32_t)(ci->fcdPosition - ci->string); 729 } 730 else { 731 return (int32_t)(ci->pos - ci->string); 732 } 733} 734 735U_CAPI void U_EXPORT2 736ucol_setOffset(UCollationElements *elems, 737 int32_t offset, 738 UErrorCode *status) 739{ 740 if (U_FAILURE(*status)) { 741 return; 742 } 743 744 // this methods will clean up any use of the writable buffer and points to 745 // the original string 746 collIterate *ci = &(elems->iteratordata_); 747 ci->pos = ci->string + offset; 748 ci->CEpos = ci->toReturn = ci->CEs; 749 if (ci->flags & UCOL_ITER_INNORMBUF) { 750 ci->flags = ci->origFlags; 751 } 752 if ((ci->flags & UCOL_ITER_HASLEN) == 0) { 753 ci->endp = ci->string + u_strlen(ci->string); 754 ci->flags |= UCOL_ITER_HASLEN; 755 } 756 ci->fcdPosition = NULL; 757 elems->reset_ = FALSE; 758 759 ci->offsetReturn = NULL; 760 ci->offsetStore = ci->offsetBuffer; 761 ci->offsetRepeatCount = ci->offsetRepeatValue = 0; 762} 763 764U_CAPI int32_t U_EXPORT2 765ucol_primaryOrder (int32_t order) 766{ 767 order &= UCOL_PRIMARYMASK; 768 return (order >> UCOL_PRIMARYORDERSHIFT); 769} 770 771U_CAPI int32_t U_EXPORT2 772ucol_secondaryOrder (int32_t order) 773{ 774 order &= UCOL_SECONDARYMASK; 775 return (order >> UCOL_SECONDARYORDERSHIFT); 776} 777 778U_CAPI int32_t U_EXPORT2 779ucol_tertiaryOrder (int32_t order) 780{ 781 return (order & UCOL_TERTIARYMASK); 782} 783 784 785void ucol_freeOffsetBuffer(collIterate *s) { 786 if (s != NULL && s->offsetBuffer != NULL) { 787 uprv_free(s->offsetBuffer); 788 s->offsetBuffer = NULL; 789 s->offsetBufferSize = 0; 790 } 791} 792 793#endif /* #if !UCONFIG_NO_COLLATION */ 794