1/* 2****************************************************************************** 3* Copyright (C) 2001-2011, International Business Machines 4* Corporation and others. All Rights Reserved. 5****************************************************************************** 6* 7* File ucoleitr.cpp 8* 9* Modification History: 10* 11* Date Name Description 12* 02/15/2001 synwee Modified all methods to process its own function 13* instead of calling the equivalent c++ api (coleitr.h) 14******************************************************************************/ 15 16#include "unicode/utypes.h" 17 18#if !UCONFIG_NO_COLLATION 19 20#include "unicode/ucoleitr.h" 21#include "unicode/ustring.h" 22#include "unicode/sortkey.h" 23#include "unicode/uobject.h" 24#include "ucol_imp.h" 25#include "cmemory.h" 26 27U_NAMESPACE_USE 28 29#define BUFFER_LENGTH 100 30 31#define DEFAULT_BUFFER_SIZE 16 32#define BUFFER_GROW 8 33 34#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 35 36#define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 37 38#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 39 40#define GROW_ARRAY(array, newSize) uprv_realloc((void *) (array), (newSize) * sizeof (array)[0]) 41 42#define DELETE_ARRAY(array) uprv_free((void *) (array)) 43 44typedef struct icu::collIterate collIterator; 45 46struct RCEI 47{ 48 uint32_t ce; 49 int32_t low; 50 int32_t high; 51}; 52 53U_NAMESPACE_BEGIN 54 55struct RCEBuffer 56{ 57 RCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; 58 RCEI *buffer; 59 int32_t bufferIndex; 60 int32_t bufferSize; 61 62 RCEBuffer(); 63 ~RCEBuffer(); 64 65 UBool empty() const; 66 void put(uint32_t ce, int32_t ixLow, int32_t ixHigh); 67 const RCEI *get(); 68}; 69 70RCEBuffer::RCEBuffer() 71{ 72 buffer = defaultBuffer; 73 bufferIndex = 0; 74 bufferSize = DEFAULT_BUFFER_SIZE; 75} 76 77RCEBuffer::~RCEBuffer() 78{ 79 if (buffer != defaultBuffer) { 80 DELETE_ARRAY(buffer); 81 } 82} 83 84UBool RCEBuffer::empty() const 85{ 86 return bufferIndex <= 0; 87} 88 89void RCEBuffer::put(uint32_t ce, int32_t ixLow, int32_t ixHigh) 90{ 91 if (bufferIndex >= bufferSize) { 92 RCEI *newBuffer = NEW_ARRAY(RCEI, bufferSize + BUFFER_GROW); 93 94 ARRAY_COPY(newBuffer, buffer, bufferSize); 95 96 if (buffer != defaultBuffer) { 97 DELETE_ARRAY(buffer); 98 } 99 100 buffer = newBuffer; 101 bufferSize += BUFFER_GROW; 102 } 103 104 buffer[bufferIndex].ce = ce; 105 buffer[bufferIndex].low = ixLow; 106 buffer[bufferIndex].high = ixHigh; 107 108 bufferIndex += 1; 109} 110 111const RCEI *RCEBuffer::get() 112{ 113 if (bufferIndex > 0) { 114 return &buffer[--bufferIndex]; 115 } 116 117 return NULL; 118} 119 120struct PCEI 121{ 122 uint64_t ce; 123 int32_t low; 124 int32_t high; 125}; 126 127struct PCEBuffer 128{ 129 PCEI defaultBuffer[DEFAULT_BUFFER_SIZE]; 130 PCEI *buffer; 131 int32_t bufferIndex; 132 int32_t bufferSize; 133 134 PCEBuffer(); 135 ~PCEBuffer(); 136 137 void reset(); 138 UBool empty() const; 139 void put(uint64_t ce, int32_t ixLow, int32_t ixHigh); 140 const PCEI *get(); 141}; 142 143PCEBuffer::PCEBuffer() 144{ 145 buffer = defaultBuffer; 146 bufferIndex = 0; 147 bufferSize = DEFAULT_BUFFER_SIZE; 148} 149 150PCEBuffer::~PCEBuffer() 151{ 152 if (buffer != defaultBuffer) { 153 DELETE_ARRAY(buffer); 154 } 155} 156 157void PCEBuffer::reset() 158{ 159 bufferIndex = 0; 160} 161 162UBool PCEBuffer::empty() const 163{ 164 return bufferIndex <= 0; 165} 166 167void PCEBuffer::put(uint64_t ce, int32_t ixLow, int32_t ixHigh) 168{ 169 if (bufferIndex >= bufferSize) { 170 PCEI *newBuffer = NEW_ARRAY(PCEI, bufferSize + BUFFER_GROW); 171 172 ARRAY_COPY(newBuffer, buffer, bufferSize); 173 174 if (buffer != defaultBuffer) { 175 DELETE_ARRAY(buffer); 176 } 177 178 buffer = newBuffer; 179 bufferSize += BUFFER_GROW; 180 } 181 182 buffer[bufferIndex].ce = ce; 183 buffer[bufferIndex].low = ixLow; 184 buffer[bufferIndex].high = ixHigh; 185 186 bufferIndex += 1; 187} 188 189const PCEI *PCEBuffer::get() 190{ 191 if (bufferIndex > 0) { 192 return &buffer[--bufferIndex]; 193 } 194 195 return NULL; 196} 197 198/* 199 * This inherits from UObject so that 200 * it can be allocated by new and the 201 * constructor for PCEBuffer is called. 202 */ 203struct UCollationPCE : public UObject 204{ 205 PCEBuffer pceBuffer; 206 UCollationStrength strength; 207 UBool toShift; 208 UBool isShifted; 209 uint32_t variableTop; 210 211 UCollationPCE(UCollationElements *elems); 212 ~UCollationPCE(); 213 214 void init(const UCollator *coll); 215 216 virtual UClassID getDynamicClassID() const; 217 static UClassID getStaticClassID(); 218}; 219 220UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UCollationPCE) 221 222UCollationPCE::UCollationPCE(UCollationElements *elems) 223{ 224 init(elems->iteratordata_.coll); 225} 226 227void UCollationPCE::init(const UCollator *coll) 228{ 229 UErrorCode status = U_ZERO_ERROR; 230 231 strength = ucol_getStrength(coll); 232 toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 233 isShifted = FALSE; 234 variableTop = coll->variableTopValue << 16; 235} 236 237UCollationPCE::~UCollationPCE() 238{ 239 // nothing to do 240} 241 242 243U_NAMESPACE_END 244 245 246inline uint64_t processCE(UCollationElements *elems, uint32_t ce) 247{ 248 uint64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 249 250 // This is clean, but somewhat slow... 251 // We could apply the mask to ce and then 252 // just get all three orders... 253 switch(elems->pce->strength) { 254 default: 255 tertiary = ucol_tertiaryOrder(ce); 256 /* note fall-through */ 257 258 case UCOL_SECONDARY: 259 secondary = ucol_secondaryOrder(ce); 260 /* note fall-through */ 261 262 case UCOL_PRIMARY: 263 primary = ucol_primaryOrder(ce); 264 } 265 266 // **** This should probably handle continuations too. **** 267 // **** That means that we need 24 bits for the primary **** 268 // **** instead of the 16 that we're currently using. **** 269 // **** So we can lay out the 64 bits as: 24.12.12.16. **** 270 // **** Another complication with continuations is that **** 271 // **** the *second* CE is marked as a continuation, so **** 272 // **** we always have to peek ahead to know how long **** 273 // **** the primary is... **** 274 if ((elems->pce->toShift && elems->pce->variableTop > ce && primary != 0) 275 || (elems->pce->isShifted && primary == 0)) { 276 277 if (primary == 0) { 278 return UCOL_IGNORABLE; 279 } 280 281 if (elems->pce->strength >= UCOL_QUATERNARY) { 282 quaternary = primary; 283 } 284 285 primary = secondary = tertiary = 0; 286 elems->pce->isShifted = TRUE; 287 } else { 288 if (elems->pce->strength >= UCOL_QUATERNARY) { 289 quaternary = 0xFFFF; 290 } 291 292 elems->pce->isShifted = FALSE; 293 } 294 295 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 296} 297 298U_CAPI void U_EXPORT2 299uprv_init_pce(const UCollationElements *elems) 300{ 301 if (elems->pce != NULL) { 302 elems->pce->init(elems->iteratordata_.coll); 303 } 304} 305 306 307 308/* public methods ---------------------------------------------------- */ 309 310U_CAPI UCollationElements* U_EXPORT2 311ucol_openElements(const UCollator *coll, 312 const UChar *text, 313 int32_t textLength, 314 UErrorCode *status) 315{ 316 if (U_FAILURE(*status)) { 317 return NULL; 318 } 319 320 UCollationElements *result = new UCollationElements; 321 if (result == NULL) { 322 *status = U_MEMORY_ALLOCATION_ERROR; 323 return NULL; 324 } 325 326 result->reset_ = TRUE; 327 result->isWritable = FALSE; 328 result->pce = NULL; 329 330 if (text == NULL) { 331 textLength = 0; 332 } 333 uprv_init_collIterate(coll, text, textLength, &result->iteratordata_, status); 334 335 return result; 336} 337 338 339U_CAPI void U_EXPORT2 340ucol_closeElements(UCollationElements *elems) 341{ 342 if (elems != NULL) { 343 collIterate *ci = &elems->iteratordata_; 344 345 if (ci->extendCEs) { 346 uprv_free(ci->extendCEs); 347 } 348 349 if (ci->offsetBuffer) { 350 uprv_free(ci->offsetBuffer); 351 } 352 353 if (elems->isWritable && elems->iteratordata_.string != NULL) 354 { 355 uprv_free((UChar *)elems->iteratordata_.string); 356 } 357 358 if (elems->pce != NULL) { 359 delete elems->pce; 360 } 361 362 delete elems; 363 } 364} 365 366U_CAPI void U_EXPORT2 367ucol_reset(UCollationElements *elems) 368{ 369 collIterate *ci = &(elems->iteratordata_); 370 elems->reset_ = TRUE; 371 ci->pos = ci->string; 372 if ((ci->flags & UCOL_ITER_HASLEN) == 0 || ci->endp == NULL) { 373 ci->endp = ci->string + u_strlen(ci->string); 374 } 375 ci->CEpos = ci->toReturn = ci->CEs; 376 ci->flags = (ci->flags & UCOL_FORCE_HAN_IMPLICIT) | UCOL_ITER_HASLEN; 377 if (ci->coll->normalizationMode == UCOL_ON) { 378 ci->flags |= UCOL_ITER_NORM; 379 } 380 381 ci->writableBuffer.remove(); 382 ci->fcdPosition = NULL; 383 384 //ci->offsetReturn = ci->offsetStore = NULL; 385 ci->offsetRepeatCount = ci->offsetRepeatValue = 0; 386} 387 388U_CAPI void U_EXPORT2 389ucol_forceHanImplicit(UCollationElements *elems, UErrorCode *status) 390{ 391 if (U_FAILURE(*status)) { 392 return; 393 } 394 395 if (elems == NULL) { 396 *status = U_ILLEGAL_ARGUMENT_ERROR; 397 return; 398 } 399 400 elems->iteratordata_.flags |= UCOL_FORCE_HAN_IMPLICIT; 401} 402 403U_CAPI int32_t U_EXPORT2 404ucol_next(UCollationElements *elems, 405 UErrorCode *status) 406{ 407 int32_t result; 408 if (U_FAILURE(*status)) { 409 return UCOL_NULLORDER; 410 } 411 412 elems->reset_ = FALSE; 413 414 result = (int32_t)ucol_getNextCE(elems->iteratordata_.coll, 415 &elems->iteratordata_, 416 status); 417 418 if (result == UCOL_NO_MORE_CES) { 419 result = UCOL_NULLORDER; 420 } 421 return result; 422} 423 424U_CAPI int64_t U_EXPORT2 425ucol_nextProcessed(UCollationElements *elems, 426 int32_t *ixLow, 427 int32_t *ixHigh, 428 UErrorCode *status) 429{ 430 const UCollator *coll = elems->iteratordata_.coll; 431 int64_t result = UCOL_IGNORABLE; 432 uint32_t low = 0, high = 0; 433 434 if (U_FAILURE(*status)) { 435 return UCOL_PROCESSED_NULLORDER; 436 } 437 438 if (elems->pce == NULL) { 439 elems->pce = new UCollationPCE(elems); 440 } else { 441 elems->pce->pceBuffer.reset(); 442 } 443 444 elems->reset_ = FALSE; 445 446 do { 447 low = ucol_getOffset(elems); 448 uint32_t ce = (uint32_t) ucol_getNextCE(coll, &elems->iteratordata_, status); 449 high = ucol_getOffset(elems); 450 451 if (ce == UCOL_NO_MORE_CES) { 452 result = UCOL_PROCESSED_NULLORDER; 453 break; 454 } 455 456 result = processCE(elems, ce); 457 } while (result == UCOL_IGNORABLE); 458 459 if (ixLow != NULL) { 460 *ixLow = low; 461 } 462 463 if (ixHigh != NULL) { 464 *ixHigh = high; 465 } 466 467 return result; 468} 469 470U_CAPI int32_t U_EXPORT2 471ucol_previous(UCollationElements *elems, 472 UErrorCode *status) 473{ 474 if(U_FAILURE(*status)) { 475 return UCOL_NULLORDER; 476 } 477 else 478 { 479 int32_t result; 480 481 if (elems->reset_ && (elems->iteratordata_.pos == elems->iteratordata_.string)) { 482 if (elems->iteratordata_.endp == NULL) { 483 elems->iteratordata_.endp = elems->iteratordata_.string + 484 u_strlen(elems->iteratordata_.string); 485 elems->iteratordata_.flags |= UCOL_ITER_HASLEN; 486 } 487 elems->iteratordata_.pos = elems->iteratordata_.endp; 488 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; 489 } 490 491 elems->reset_ = FALSE; 492 493 result = (int32_t)ucol_getPrevCE(elems->iteratordata_.coll, 494 &(elems->iteratordata_), 495 status); 496 497 if (result == UCOL_NO_MORE_CES) { 498 result = UCOL_NULLORDER; 499 } 500 501 return result; 502 } 503} 504 505U_CAPI int64_t U_EXPORT2 506ucol_previousProcessed(UCollationElements *elems, 507 int32_t *ixLow, 508 int32_t *ixHigh, 509 UErrorCode *status) 510{ 511 const UCollator *coll = elems->iteratordata_.coll; 512 int64_t result = UCOL_IGNORABLE; 513 // int64_t primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 514 // UCollationStrength strength = ucol_getStrength(coll); 515 // UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, status) == UCOL_SHIFTED; 516 // uint32_t variableTop = coll->variableTopValue; 517 int32_t low = 0, high = 0; 518 519 if (U_FAILURE(*status)) { 520 return UCOL_PROCESSED_NULLORDER; 521 } 522 523 if (elems->reset_ && 524 (elems->iteratordata_.pos == elems->iteratordata_.string)) { 525 if (elems->iteratordata_.endp == NULL) { 526 elems->iteratordata_.endp = elems->iteratordata_.string + 527 u_strlen(elems->iteratordata_.string); 528 elems->iteratordata_.flags |= UCOL_ITER_HASLEN; 529 } 530 531 elems->iteratordata_.pos = elems->iteratordata_.endp; 532 elems->iteratordata_.fcdPosition = elems->iteratordata_.endp; 533 } 534 535 if (elems->pce == NULL) { 536 elems->pce = new UCollationPCE(elems); 537 } else { 538 //elems->pce->pceBuffer.reset(); 539 } 540 541 elems->reset_ = FALSE; 542 543 while (elems->pce->pceBuffer.empty()) { 544 // buffer raw CEs up to non-ignorable primary 545 RCEBuffer rceb; 546 uint32_t ce; 547 548 // **** do we need to reset rceb, or will it always be empty at this point **** 549 do { 550 high = ucol_getOffset(elems); 551 ce = ucol_getPrevCE(coll, &elems->iteratordata_, status); 552 low = ucol_getOffset(elems); 553 554 if (ce == UCOL_NO_MORE_CES) { 555 if (! rceb.empty()) { 556 break; 557 } 558 559 goto finish; 560 } 561 562 rceb.put(ce, low, high); 563 } while ((ce & UCOL_PRIMARYMASK) == 0); 564 565 // process the raw CEs 566 while (! rceb.empty()) { 567 const RCEI *rcei = rceb.get(); 568 569 result = processCE(elems, rcei->ce); 570 571 if (result != UCOL_IGNORABLE) { 572 elems->pce->pceBuffer.put(result, rcei->low, rcei->high); 573 } 574 } 575 } 576 577finish: 578 if (elems->pce->pceBuffer.empty()) { 579 // **** Is -1 the right value for ixLow, ixHigh? **** 580 if (ixLow != NULL) { 581 *ixLow = -1; 582 } 583 584 if (ixHigh != NULL) { 585 *ixHigh = -1 586 ; 587 } 588 return UCOL_PROCESSED_NULLORDER; 589 } 590 591 const PCEI *pcei = elems->pce->pceBuffer.get(); 592 593 if (ixLow != NULL) { 594 *ixLow = pcei->low; 595 } 596 597 if (ixHigh != NULL) { 598 *ixHigh = pcei->high; 599 } 600 601 return pcei->ce; 602} 603 604U_CAPI int32_t U_EXPORT2 605ucol_getMaxExpansion(const UCollationElements *elems, 606 int32_t order) 607{ 608 uint8_t result; 609 610#if 0 611 UCOL_GETMAXEXPANSION(elems->iteratordata_.coll, (uint32_t)order, result); 612#else 613 const UCollator *coll = elems->iteratordata_.coll; 614 const uint32_t *start; 615 const uint32_t *limit; 616 const uint32_t *mid; 617 uint32_t strengthMask = 0; 618 uint32_t mOrder = (uint32_t) order; 619 620 switch (coll->strength) 621 { 622 default: 623 strengthMask |= UCOL_TERTIARYORDERMASK; 624 /* fall through */ 625 626 case UCOL_SECONDARY: 627 strengthMask |= UCOL_SECONDARYORDERMASK; 628 /* fall through */ 629 630 case UCOL_PRIMARY: 631 strengthMask |= UCOL_PRIMARYORDERMASK; 632 } 633 634 mOrder &= strengthMask; 635 start = (coll)->endExpansionCE; 636 limit = (coll)->lastEndExpansionCE; 637 638 while (start < limit - 1) { 639 mid = start + ((limit - start) >> 1); 640 if (mOrder <= (*mid & strengthMask)) { 641 limit = mid; 642 } else { 643 start = mid; 644 } 645 } 646 647 // FIXME: with a masked search, there might be more than one hit, 648 // so we need to look forward and backward from the match to find all 649 // of the hits... 650 if ((*start & strengthMask) == mOrder) { 651 result = *((coll)->expansionCESize + (start - (coll)->endExpansionCE)); 652 } else if ((*limit & strengthMask) == mOrder) { 653 result = *(coll->expansionCESize + (limit - coll->endExpansionCE)); 654 } else if ((mOrder & 0xFFFF) == 0x00C0) { 655 result = 2; 656 } else { 657 result = 1; 658 } 659#endif 660 661 return result; 662} 663 664U_CAPI void U_EXPORT2 665ucol_setText( UCollationElements *elems, 666 const UChar *text, 667 int32_t textLength, 668 UErrorCode *status) 669{ 670 if (U_FAILURE(*status)) { 671 return; 672 } 673 674 if (elems->isWritable && elems->iteratordata_.string != NULL) 675 { 676 uprv_free((UChar *)elems->iteratordata_.string); 677 } 678 679 if (text == NULL) { 680 textLength = 0; 681 } 682 683 elems->isWritable = FALSE; 684 685 /* free offset buffer to avoid memory leak before initializing. */ 686 ucol_freeOffsetBuffer(&(elems->iteratordata_)); 687 /* Ensure that previously allocated extendCEs is freed before setting to NULL. */ 688 if (elems->iteratordata_.extendCEs != NULL) { 689 uprv_free(elems->iteratordata_.extendCEs); 690 } 691 uprv_init_collIterate(elems->iteratordata_.coll, text, textLength, 692 &elems->iteratordata_, status); 693 694 elems->reset_ = TRUE; 695} 696 697U_CAPI int32_t U_EXPORT2 698ucol_getOffset(const UCollationElements *elems) 699{ 700 const collIterate *ci = &(elems->iteratordata_); 701 702 if (ci->offsetRepeatCount > 0 && ci->offsetRepeatValue != 0) { 703 return ci->offsetRepeatValue; 704 } 705 706 if (ci->offsetReturn != NULL) { 707 return *ci->offsetReturn; 708 } 709 710 // while processing characters in normalization buffer getOffset will 711 // return the next non-normalized character. 712 // should be inline with the old implementation since the old codes uses 713 // nextDecomp in normalizer which also decomposes the string till the 714 // first base character is found. 715 if (ci->flags & UCOL_ITER_INNORMBUF) { 716 if (ci->fcdPosition == NULL) { 717 return 0; 718 } 719 return (int32_t)(ci->fcdPosition - ci->string); 720 } 721 else { 722 return (int32_t)(ci->pos - ci->string); 723 } 724} 725 726U_CAPI void U_EXPORT2 727ucol_setOffset(UCollationElements *elems, 728 int32_t offset, 729 UErrorCode *status) 730{ 731 if (U_FAILURE(*status)) { 732 return; 733 } 734 735 // this methods will clean up any use of the writable buffer and points to 736 // the original string 737 collIterate *ci = &(elems->iteratordata_); 738 ci->pos = ci->string + offset; 739 ci->CEpos = ci->toReturn = ci->CEs; 740 if (ci->flags & UCOL_ITER_INNORMBUF) { 741 ci->flags = ci->origFlags; 742 } 743 if ((ci->flags & UCOL_ITER_HASLEN) == 0) { 744 ci->endp = ci->string + u_strlen(ci->string); 745 ci->flags |= UCOL_ITER_HASLEN; 746 } 747 ci->fcdPosition = NULL; 748 elems->reset_ = FALSE; 749 750 ci->offsetReturn = NULL; 751 ci->offsetStore = ci->offsetBuffer; 752 ci->offsetRepeatCount = ci->offsetRepeatValue = 0; 753} 754 755U_CAPI int32_t U_EXPORT2 756ucol_primaryOrder (int32_t order) 757{ 758 order &= UCOL_PRIMARYMASK; 759 return (order >> UCOL_PRIMARYORDERSHIFT); 760} 761 762U_CAPI int32_t U_EXPORT2 763ucol_secondaryOrder (int32_t order) 764{ 765 order &= UCOL_SECONDARYMASK; 766 return (order >> UCOL_SECONDARYORDERSHIFT); 767} 768 769U_CAPI int32_t U_EXPORT2 770ucol_tertiaryOrder (int32_t order) 771{ 772 return (order & UCOL_TERTIARYMASK); 773} 774 775 776void ucol_freeOffsetBuffer(collIterate *s) { 777 if (s != NULL && s->offsetBuffer != NULL) { 778 uprv_free(s->offsetBuffer); 779 s->offsetBuffer = NULL; 780 s->offsetBufferSize = 0; 781 } 782} 783 784#endif /* #if !UCONFIG_NO_COLLATION */ 785