/*
*******************************************************************************
*   Copyright (C) 1996-2009, International Business Machines
*   Corporation and others.  All Rights Reserved.
*******************************************************************************
*   file name:  ucol.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
* Modification history
* Date        Name      Comments
* 1996-1999   various members of ICU team maintained C API for collation framework
* 02/16/2001  synwee    Added internal method getPrevSpecialCE
* 03/01/2001  synwee    Added maxexpansion functionality.
* 03/16/2001  weiv      Collation framework is rewritten in C and made UCA compliant
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/coleitr.h"
#include "unicode/unorm.h"
#include "unicode/udata.h"
#include "unicode/ustring.h"

#include "ucol_imp.h"
#include "bocsu.h"

#include "unormimp.h"
#include "unorm_it.h"
#include "umutex.h"
#include "cmemory.h"
#include "ucln_in.h"
#include "cstring.h"
#include "utracimp.h"
#include "putilimp.h"
#include "uassert.h"

#ifdef UCOL_DEBUG
#include <stdio.h>
#endif

U_NAMESPACE_USE

#define LAST_BYTE_MASK_ 0xFF
#define SECOND_LAST_BYTE_SHIFT_ 8

#define ZERO_CC_LIMIT_ 0xC0

// This is a static pointer to the normalizer's FCD trie index.
// It is always the same between calls to u_cleanup,
// and therefore writing to it is not synchronized.
// It is reset in ucol_cleanup.
static const uint16_t *fcdTrieIndex=NULL;
// Code points at fcdHighStart and above have a zero FCD value.
58static UChar32 fcdHighStart = 0; 59 60// These are values from UCA required for 61// implicit generation and supressing sort key compression 62// they should regularly be in the UCA, but if one 63// is running without UCA, it could be a problem 64static const int32_t maxRegularPrimary = 0xA0; 65static const int32_t minImplicitPrimary = 0xE0; 66static const int32_t maxImplicitPrimary = 0xE4; 67 68U_CDECL_BEGIN 69static UBool U_CALLCONV 70ucol_cleanup(void) 71{ 72 fcdTrieIndex = NULL; 73 return TRUE; 74} 75 76static int32_t U_CALLCONV 77_getFoldingOffset(uint32_t data) { 78 return (int32_t)(data&0xFFFFFF); 79} 80 81U_CDECL_END 82 83static 84inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString, 85 int32_t sourceLen, collIterate *s) 86{ 87 (s)->string = (s)->pos = (UChar *)(sourceString); 88 (s)->origFlags = 0; 89 (s)->flags = 0; 90 if (sourceLen >= 0) { 91 s->flags |= UCOL_ITER_HASLEN; 92 (s)->endp = (UChar *)sourceString+sourceLen; 93 } 94 else { 95 /* change to enable easier checking for end of string for fcdpositon */ 96 (s)->endp = NULL; 97 } 98 (s)->extendCEs = NULL; 99 (s)->extendCEsSize = 0; 100 (s)->CEpos = (s)->toReturn = (s)->CEs; 101 (s)->offsetBuffer = NULL; 102 (s)->offsetBufferSize = 0; 103 (s)->offsetReturn = (s)->offsetStore = NULL; 104 (s)->offsetRepeatCount = (s)->offsetRepeatValue = 0; 105 (s)->writableBuffer = (s)->stackWritableBuffer; 106 (s)->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE; 107 (s)->coll = (collator); 108 (s)->fcdPosition = 0; 109 if(collator->normalizationMode == UCOL_ON) { 110 (s)->flags |= UCOL_ITER_NORM; 111 } 112 if(collator->hiraganaQ == UCOL_ON && collator->strength >= UCOL_QUATERNARY) { 113 (s)->flags |= UCOL_HIRAGANA_Q; 114 } 115 (s)->iterator = NULL; 116 //(s)->iteratorIndex = 0; 117} 118 119U_CAPI void U_EXPORT2 120uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, 121 int32_t sourceLen, collIterate *s){ 122 /* Out-of-line version for use from other files. 
*/ 123 IInit_collIterate(collator, sourceString, sourceLen, s); 124} 125 126/** 127* Backup the state of the collIterate struct data 128* @param data collIterate to backup 129* @param backup storage 130*/ 131static 132inline void backupState(const collIterate *data, collIterateState *backup) 133{ 134 backup->fcdPosition = data->fcdPosition; 135 backup->flags = data->flags; 136 backup->origFlags = data->origFlags; 137 backup->pos = data->pos; 138 backup->bufferaddress = data->writableBuffer; 139 backup->buffersize = data->writableBufSize; 140 backup->iteratorMove = 0; 141 backup->iteratorIndex = 0; 142 if(data->iterator != NULL) { 143 //backup->iteratorIndex = data->iterator->getIndex(data->iterator, UITER_CURRENT); 144 backup->iteratorIndex = data->iterator->getState(data->iterator); 145 // no we try to fixup if we're using a normalizing iterator and we get UITER_NO_STATE 146 if(backup->iteratorIndex == UITER_NO_STATE) { 147 while((backup->iteratorIndex = data->iterator->getState(data->iterator)) == UITER_NO_STATE) { 148 backup->iteratorMove++; 149 data->iterator->move(data->iterator, -1, UITER_CURRENT); 150 } 151 data->iterator->move(data->iterator, backup->iteratorMove, UITER_CURRENT); 152 } 153 } 154} 155 156/** 157* Loads the state into the collIterate struct data 158* @param data collIterate to backup 159* @param backup storage 160* @param forwards boolean to indicate if forwards iteration is used, 161* false indicates backwards iteration 162*/ 163static 164inline void loadState(collIterate *data, const collIterateState *backup, 165 UBool forwards) 166{ 167 UErrorCode status = U_ZERO_ERROR; 168 data->flags = backup->flags; 169 data->origFlags = backup->origFlags; 170 if(data->iterator != NULL) { 171 //data->iterator->move(data->iterator, backup->iteratorIndex, UITER_ZERO); 172 data->iterator->setState(data->iterator, backup->iteratorIndex, &status); 173 if(backup->iteratorMove != 0) { 174 data->iterator->move(data->iterator, backup->iteratorMove, 
UITER_CURRENT); 175 } 176 } 177 data->pos = backup->pos; 178 179 if ((data->flags & UCOL_ITER_INNORMBUF) && 180 data->writableBuffer != backup->bufferaddress) { 181 /* 182 this is when a new buffer has been reallocated and we'll have to 183 calculate the new position. 184 note the new buffer has to contain the contents of the old buffer. 185 */ 186 if (forwards) { 187 data->pos = data->writableBuffer + 188 (data->pos - backup->bufferaddress); 189 } 190 else { 191 /* backwards direction */ 192 uint32_t temp = backup->buffersize - 193 (data->pos - backup->bufferaddress); 194 data->pos = data->writableBuffer + (data->writableBufSize - temp); 195 } 196 } 197 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 198 /* 199 this is alittle tricky. 200 if we are initially not in the normalization buffer, even if we 201 normalize in the later stage, the data in the buffer will be 202 ignored, since we skip back up to the data string. 203 however if we are already in the normalization buffer, any 204 further normalization will pull data into the normalization 205 buffer and modify the fcdPosition. 206 since we are keeping the data in the buffer for use, the 207 fcdPosition can not be reverted back. 208 arrgghh.... 209 */ 210 data->fcdPosition = backup->fcdPosition; 211 } 212} 213 214 215/* 216* collIter_eos() 217* Checks for a collIterate being positioned at the end of 218* its source string. 219* 220*/ 221static 222inline UBool collIter_eos(collIterate *s) { 223 if(s->flags & UCOL_USE_ITERATOR) { 224 return !(s->iterator->hasNext(s->iterator)); 225 } 226 if ((s->flags & UCOL_ITER_HASLEN) == 0 && *s->pos != 0) { 227 // Null terminated string, but not at null, so not at end. 228 // Whether in main or normalization buffer doesn't matter. 229 return FALSE; 230 } 231 232 // String with length. Can't be in normalization buffer, which is always 233 // null termintated. 
234 if (s->flags & UCOL_ITER_HASLEN) { 235 return (s->pos == s->endp); 236 } 237 238 // We are at a null termination, could be either normalization buffer or main string. 239 if ((s->flags & UCOL_ITER_INNORMBUF) == 0) { 240 // At null at end of main string. 241 return TRUE; 242 } 243 244 // At null at end of normalization buffer. Need to check whether there there are 245 // any characters left in the main buffer. 246 if(s->origFlags & UCOL_USE_ITERATOR) { 247 return !(s->iterator->hasNext(s->iterator)); 248 } else if ((s->origFlags & UCOL_ITER_HASLEN) == 0) { 249 // Null terminated main string. fcdPosition is the 'return' position into main buf. 250 return (*s->fcdPosition == 0); 251 } 252 else { 253 // Main string with an end pointer. 254 return s->fcdPosition == s->endp; 255 } 256} 257 258/* 259* collIter_bos() 260* Checks for a collIterate being positioned at the start of 261* its source string. 262* 263*/ 264static 265inline UBool collIter_bos(collIterate *source) { 266 // if we're going backwards, we need to know whether there is more in the 267 // iterator, even if we are in the side buffer 268 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 269 return !source->iterator->hasPrevious(source->iterator); 270 } 271 if (source->pos <= source->string || 272 ((source->flags & UCOL_ITER_INNORMBUF) && 273 *(source->pos - 1) == 0 && source->fcdPosition == NULL)) { 274 return TRUE; 275 } 276 return FALSE; 277} 278 279/*static 280inline UBool collIter_SimpleBos(collIterate *source) { 281 // if we're going backwards, we need to know whether there is more in the 282 // iterator, even if we are in the side buffer 283 if(source->flags & UCOL_USE_ITERATOR || source->origFlags & UCOL_USE_ITERATOR) { 284 return !source->iterator->hasPrevious(source->iterator); 285 } 286 if (source->pos == source->string) { 287 return TRUE; 288 } 289 return FALSE; 290}*/ 291 //return (data->pos == data->string) || 292 293 294/** 295* Checks and free writable 
buffer if it is not the original stack buffer 296* in collIterate. This function does not reassign the writable buffer. 297* @param data collIterate struct to determine and free the writable buffer 298*/ 299static 300inline void freeHeapWritableBuffer(collIterate *data) 301{ 302 if (data->writableBuffer != data->stackWritableBuffer) { 303 uprv_free(data->writableBuffer); 304 } 305} 306 307 308/****************************************************************************/ 309/* Following are the open/close functions */ 310/* */ 311/****************************************************************************/ 312 313static UCollator* 314ucol_initFromBinary(const uint8_t *bin, int32_t length, 315 const UCollator *base, 316 UCollator *fillIn, 317 UErrorCode *status) 318{ 319 UCollator *result = fillIn; 320 if(U_FAILURE(*status)) { 321 return NULL; 322 } 323 /* 324 if(base == NULL) { 325 // we don't support null base yet 326 *status = U_ILLEGAL_ARGUMENT_ERROR; 327 return NULL; 328 } 329 */ 330 // We need these and we could be running without UCA 331 uprv_uca_initImplicitConstants(status); 332 UCATableHeader *colData = (UCATableHeader *)bin; 333 // do we want version check here? 
We're trying to figure out whether collators are compatible 334 if((base && (uprv_memcmp(colData->UCAVersion, base->image->UCAVersion, sizeof(UVersionInfo)) != 0 || 335 uprv_memcmp(colData->UCDVersion, base->image->UCDVersion, sizeof(UVersionInfo)) != 0)) || 336 colData->version[0] != UCOL_BUILDER_VERSION) 337 { 338 *status = U_COLLATOR_VERSION_MISMATCH; 339 return NULL; 340 } 341 else { 342 if((uint32_t)length > (paddedsize(sizeof(UCATableHeader)) + paddedsize(sizeof(UColOptionSet)))) { 343 result = ucol_initCollator((const UCATableHeader *)bin, result, base, status); 344 if(U_FAILURE(*status)){ 345 return NULL; 346 } 347 result->hasRealData = TRUE; 348 } 349 else { 350 if(base) { 351 result = ucol_initCollator(base->image, result, base, status); 352 ucol_setOptionsFromHeader(result, (UColOptionSet *)(bin+((const UCATableHeader *)bin)->options), status); 353 if(U_FAILURE(*status)){ 354 return NULL; 355 } 356 result->hasRealData = FALSE; 357 } 358 else { 359 *status = U_USELESS_COLLATOR_ERROR; 360 return NULL; 361 } 362 } 363 result->freeImageOnClose = FALSE; 364 } 365 result->actualLocale = NULL; 366 result->validLocale = NULL; 367 result->requestedLocale = NULL; 368 result->rules = NULL; 369 result->rulesLength = 0; 370 result->freeRulesOnClose = FALSE; 371 result->ucaRules = NULL; 372 return result; 373} 374 375U_CAPI UCollator* U_EXPORT2 376ucol_openBinary(const uint8_t *bin, int32_t length, 377 const UCollator *base, 378 UErrorCode *status) 379{ 380 return ucol_initFromBinary(bin, length, base, NULL, status); 381} 382 383U_CAPI int32_t U_EXPORT2 384ucol_cloneBinary(const UCollator *coll, 385 uint8_t *buffer, int32_t capacity, 386 UErrorCode *status) 387{ 388 int32_t length = 0; 389 if(U_FAILURE(*status)) { 390 return length; 391 } 392 if(capacity < 0) { 393 *status = U_ILLEGAL_ARGUMENT_ERROR; 394 return length; 395 } 396 if(coll->hasRealData == TRUE) { 397 length = coll->image->size; 398 if(length <= capacity) { 399 uprv_memcpy(buffer, coll->image, length); 
400 } else { 401 *status = U_BUFFER_OVERFLOW_ERROR; 402 } 403 } else { 404 length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 405 if(length <= capacity) { 406 /* build the UCATableHeader with minimal entries */ 407 /* do not copy the header from the UCA file because its values are wrong! */ 408 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 409 410 /* reset everything */ 411 uprv_memset(buffer, 0, length); 412 413 /* set the tailoring-specific values */ 414 UCATableHeader *myData = (UCATableHeader *)buffer; 415 myData->size = length; 416 417 /* offset for the options, the only part of the data that is present after the header */ 418 myData->options = sizeof(UCATableHeader); 419 420 /* need to always set the expansion value for an upper bound of the options */ 421 myData->expansion = myData->options + sizeof(UColOptionSet); 422 423 myData->magic = UCOL_HEADER_MAGIC; 424 myData->isBigEndian = U_IS_BIG_ENDIAN; 425 myData->charSetFamily = U_CHARSET_FAMILY; 426 427 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 428 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 429 430 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 431 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 432 uprv_memcpy(myData->formatVersion, coll->image->formatVersion, sizeof(UVersionInfo)); 433 myData->jamoSpecial = coll->image->jamoSpecial; 434 435 /* copy the collator options */ 436 uprv_memcpy(buffer+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 437 } else { 438 *status = U_BUFFER_OVERFLOW_ERROR; 439 } 440 } 441 return length; 442} 443 444U_CAPI UCollator* U_EXPORT2 445ucol_safeClone(const UCollator *coll, void *stackBuffer, int32_t * pBufferSize, UErrorCode *status) 446{ 447 UCollator * localCollator; 448 int32_t bufferSizeNeeded = (int32_t)sizeof(UCollator); 449 char 
*stackBufferChars = (char *)stackBuffer; 450 int32_t imageSize = 0; 451 int32_t rulesSize = 0; 452 int32_t rulesPadding = 0; 453 uint8_t *image; 454 UChar *rules; 455 UBool colAllocated = FALSE; 456 UBool imageAllocated = FALSE; 457 458 if (status == NULL || U_FAILURE(*status)){ 459 return 0; 460 } 461 if ((stackBuffer && !pBufferSize) || !coll){ 462 *status = U_ILLEGAL_ARGUMENT_ERROR; 463 return 0; 464 } 465 if (coll->rules && coll->freeRulesOnClose) { 466 rulesSize = (int32_t)(coll->rulesLength + 1)*sizeof(UChar); 467 rulesPadding = (int32_t)(bufferSizeNeeded % sizeof(UChar)); 468 bufferSizeNeeded += rulesSize + rulesPadding; 469 } 470 471 if (stackBuffer && *pBufferSize <= 0){ /* 'preflighting' request - set needed size into *pBufferSize */ 472 *pBufferSize = bufferSizeNeeded; 473 return 0; 474 } 475 476 /* Pointers on 64-bit platforms need to be aligned 477 * on a 64-bit boundry in memory. 478 */ 479 if (U_ALIGNMENT_OFFSET(stackBuffer) != 0) { 480 int32_t offsetUp = (int32_t)U_ALIGNMENT_OFFSET_UP(stackBufferChars); 481 if (*pBufferSize > offsetUp) { 482 *pBufferSize -= offsetUp; 483 stackBufferChars += offsetUp; 484 } 485 else { 486 /* prevent using the stack buffer but keep the size > 0 so that we do not just preflight */ 487 *pBufferSize = 1; 488 } 489 } 490 stackBuffer = (void *)stackBufferChars; 491 492 if (stackBuffer == NULL || *pBufferSize < bufferSizeNeeded) { 493 /* allocate one here...*/ 494 stackBufferChars = (char *)uprv_malloc(bufferSizeNeeded); 495 // Null pointer check. 
496 if (stackBufferChars == NULL) { 497 *status = U_MEMORY_ALLOCATION_ERROR; 498 return NULL; 499 } 500 colAllocated = TRUE; 501 if (U_SUCCESS(*status)) { 502 *status = U_SAFECLONE_ALLOCATED_WARNING; 503 } 504 } 505 localCollator = (UCollator *)stackBufferChars; 506 rules = (UChar *)(stackBufferChars + sizeof(UCollator) + rulesPadding); 507 { 508 UErrorCode tempStatus = U_ZERO_ERROR; 509 imageSize = ucol_cloneBinary(coll, NULL, 0, &tempStatus); 510 } 511 if (coll->freeImageOnClose) { 512 image = (uint8_t *)uprv_malloc(imageSize); 513 // Null pointer check 514 if (image == NULL) { 515 *status = U_MEMORY_ALLOCATION_ERROR; 516 return NULL; 517 } 518 ucol_cloneBinary(coll, image, imageSize, status); 519 imageAllocated = TRUE; 520 } 521 else { 522 image = (uint8_t *)coll->image; 523 } 524 localCollator = ucol_initFromBinary(image, imageSize, coll->UCA, localCollator, status); 525 if (U_FAILURE(*status)) { 526 return NULL; 527 } 528 529 if (coll->rules) { 530 if (coll->freeRulesOnClose) { 531 localCollator->rules = u_strcpy(rules, coll->rules); 532 //bufferEnd += rulesSize; 533 } 534 else { 535 localCollator->rules = coll->rules; 536 } 537 localCollator->freeRulesOnClose = FALSE; 538 localCollator->rulesLength = coll->rulesLength; 539 } 540 541 int32_t i; 542 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 543 ucol_setAttribute(localCollator, (UColAttribute)i, ucol_getAttribute(coll, (UColAttribute)i, status), status); 544 } 545 // zero copies of pointers 546 localCollator->actualLocale = NULL; 547 localCollator->validLocale = NULL; 548 localCollator->requestedLocale = NULL; 549 localCollator->ucaRules = coll->ucaRules; // There should only be one copy here. 
550 localCollator->freeOnClose = colAllocated; 551 localCollator->freeImageOnClose = imageAllocated; 552 return localCollator; 553} 554 555U_CAPI void U_EXPORT2 556ucol_close(UCollator *coll) 557{ 558 UTRACE_ENTRY_OC(UTRACE_UCOL_CLOSE); 559 UTRACE_DATA1(UTRACE_INFO, "coll = %p", coll); 560 if(coll != NULL) { 561 // these are always owned by each UCollator struct, 562 // so we always free them 563 if(coll->validLocale != NULL) { 564 uprv_free(coll->validLocale); 565 } 566 if(coll->actualLocale != NULL) { 567 uprv_free(coll->actualLocale); 568 } 569 if(coll->requestedLocale != NULL) { 570 uprv_free(coll->requestedLocale); 571 } 572 if(coll->latinOneCEs != NULL) { 573 uprv_free(coll->latinOneCEs); 574 } 575 if(coll->options != NULL && coll->freeOptionsOnClose) { 576 uprv_free(coll->options); 577 } 578 if(coll->rules != NULL && coll->freeRulesOnClose) { 579 uprv_free((UChar *)coll->rules); 580 } 581 if(coll->image != NULL && coll->freeImageOnClose) { 582 uprv_free((UCATableHeader *)coll->image); 583 } 584 585 /* Here, it would be advisable to close: */ 586 /* - UData for UCA (unless we stuff it in the root resb */ 587 /* Again, do we need additional housekeeping... HMMM! */ 588 UTRACE_DATA1(UTRACE_INFO, "coll->freeOnClose: %d", coll->freeOnClose); 589 if(coll->freeOnClose){ 590 /* for safeClone, if freeOnClose is FALSE, 591 don't free the other instance data */ 592 uprv_free(coll); 593 } 594 } 595 UTRACE_EXIT(); 596} 597 598/* This one is currently used by genrb & tests. After constructing from rules (tailoring),*/ 599/* you should be able to get the binary chunk to write out... 
Doesn't look very full now */ 600U_CFUNC uint8_t* U_EXPORT2 601ucol_cloneRuleData(const UCollator *coll, int32_t *length, UErrorCode *status) 602{ 603 uint8_t *result = NULL; 604 if(U_FAILURE(*status)) { 605 return NULL; 606 } 607 if(coll->hasRealData == TRUE) { 608 *length = coll->image->size; 609 result = (uint8_t *)uprv_malloc(*length); 610 /* test for NULL */ 611 if (result == NULL) { 612 *status = U_MEMORY_ALLOCATION_ERROR; 613 return NULL; 614 } 615 uprv_memcpy(result, coll->image, *length); 616 } else { 617 *length = (int32_t)(paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet))); 618 result = (uint8_t *)uprv_malloc(*length); 619 /* test for NULL */ 620 if (result == NULL) { 621 *status = U_MEMORY_ALLOCATION_ERROR; 622 return NULL; 623 } 624 625 /* build the UCATableHeader with minimal entries */ 626 /* do not copy the header from the UCA file because its values are wrong! */ 627 /* uprv_memcpy(result, UCA->image, sizeof(UCATableHeader)); */ 628 629 /* reset everything */ 630 uprv_memset(result, 0, *length); 631 632 /* set the tailoring-specific values */ 633 UCATableHeader *myData = (UCATableHeader *)result; 634 myData->size = *length; 635 636 /* offset for the options, the only part of the data that is present after the header */ 637 myData->options = sizeof(UCATableHeader); 638 639 /* need to always set the expansion value for an upper bound of the options */ 640 myData->expansion = myData->options + sizeof(UColOptionSet); 641 642 myData->magic = UCOL_HEADER_MAGIC; 643 myData->isBigEndian = U_IS_BIG_ENDIAN; 644 myData->charSetFamily = U_CHARSET_FAMILY; 645 646 /* copy UCA's version; genrb will override all but the builder version with tailoring data */ 647 uprv_memcpy(myData->version, coll->image->version, sizeof(UVersionInfo)); 648 649 uprv_memcpy(myData->UCAVersion, coll->image->UCAVersion, sizeof(UVersionInfo)); 650 uprv_memcpy(myData->UCDVersion, coll->image->UCDVersion, sizeof(UVersionInfo)); 651 uprv_memcpy(myData->formatVersion, 
coll->image->formatVersion, sizeof(UVersionInfo)); 652 myData->jamoSpecial = coll->image->jamoSpecial; 653 654 /* copy the collator options */ 655 uprv_memcpy(result+paddedsize(sizeof(UCATableHeader)), coll->options, sizeof(UColOptionSet)); 656 } 657 return result; 658} 659 660void ucol_setOptionsFromHeader(UCollator* result, UColOptionSet * opts, UErrorCode *status) { 661 if(U_FAILURE(*status)) { 662 return; 663 } 664 result->caseFirst = (UColAttributeValue)opts->caseFirst; 665 result->caseLevel = (UColAttributeValue)opts->caseLevel; 666 result->frenchCollation = (UColAttributeValue)opts->frenchCollation; 667 result->normalizationMode = (UColAttributeValue)opts->normalizationMode; 668 result->strength = (UColAttributeValue)opts->strength; 669 result->variableTopValue = opts->variableTopValue; 670 result->alternateHandling = (UColAttributeValue)opts->alternateHandling; 671 result->hiraganaQ = (UColAttributeValue)opts->hiraganaQ; 672 result->numericCollation = (UColAttributeValue)opts->numericCollation; 673 674 result->caseFirstisDefault = TRUE; 675 result->caseLevelisDefault = TRUE; 676 result->frenchCollationisDefault = TRUE; 677 result->normalizationModeisDefault = TRUE; 678 result->strengthisDefault = TRUE; 679 result->variableTopValueisDefault = TRUE; 680 result->hiraganaQisDefault = TRUE; 681 result->numericCollationisDefault = TRUE; 682 683 ucol_updateInternalState(result, status); 684 685 result->options = opts; 686} 687 688 689/** 690* Approximate determination if a character is at a contraction end. 691* Guaranteed to be TRUE if a character is at the end of a contraction, 692* otherwise it is not deterministic. 
693* @param c character to be determined 694* @param coll collator 695*/ 696static 697inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) { 698 if (c < coll->minContrEndCP) { 699 return FALSE; 700 } 701 702 int32_t hash = c; 703 uint8_t htbyte; 704 if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) { 705 if (U16_IS_TRAIL(c)) { 706 return TRUE; 707 } 708 hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256; 709 } 710 htbyte = coll->contrEndCP[hash>>3]; 711 return (((htbyte >> (hash & 7)) & 1) == 1); 712} 713 714 715 716/* 717* i_getCombiningClass() 718* A fast, at least partly inline version of u_getCombiningClass() 719* This is a candidate for further optimization. Used heavily 720* in contraction processing. 721*/ 722static 723inline uint8_t i_getCombiningClass(UChar32 c, const UCollator *coll) { 724 uint8_t sCC = 0; 725 if ((c >= 0x300 && ucol_unsafeCP(c, coll)) || c > 0xFFFF) { 726 sCC = u_getCombiningClass(c); 727 } 728 return sCC; 729} 730 731UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, const UCollator *UCA, UErrorCode *status) { 732 UChar c; 733 UCollator *result = fillIn; 734 if(U_FAILURE(*status) || image == NULL) { 735 return NULL; 736 } 737 738 if(result == NULL) { 739 result = (UCollator *)uprv_malloc(sizeof(UCollator)); 740 if(result == NULL) { 741 *status = U_MEMORY_ALLOCATION_ERROR; 742 return result; 743 } 744 result->freeOnClose = TRUE; 745 } else { 746 result->freeOnClose = FALSE; 747 } 748 749 // init FCD data 750 if (fcdTrieIndex == NULL) { 751 // The result is constant, until the library is reloaded. 
752 fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 753 ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup); 754 } 755 756 result->image = image; 757 result->mapping.getFoldingOffset = _getFoldingOffset; 758 const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; 759 utrie_unserialize(&result->mapping, mapping, result->image->endExpansionCE - result->image->mappingPosition, status); 760 if(U_FAILURE(*status)) { 761 if(result->freeOnClose == TRUE) { 762 uprv_free(result); 763 result = NULL; 764 } 765 return result; 766 } 767 768 /*result->latinOneMapping = (uint32_t*)((uint8_t*)result->image+result->image->latinOneMapping);*/ 769 result->latinOneMapping = UTRIE_GET32_LATIN1(&result->mapping); 770 result->contractionCEs = (uint32_t*)((uint8_t*)result->image+result->image->contractionCEs); 771 result->contractionIndex = (UChar*)((uint8_t*)result->image+result->image->contractionIndex); 772 result->expansion = (uint32_t*)((uint8_t*)result->image+result->image->expansion); 773 774 result->options = (UColOptionSet*)((uint8_t*)result->image+result->image->options); 775 result->freeOptionsOnClose = FALSE; 776 777 /* set attributes */ 778 result->caseFirst = (UColAttributeValue)result->options->caseFirst; 779 result->caseLevel = (UColAttributeValue)result->options->caseLevel; 780 result->frenchCollation = (UColAttributeValue)result->options->frenchCollation; 781 result->normalizationMode = (UColAttributeValue)result->options->normalizationMode; 782 result->strength = (UColAttributeValue)result->options->strength; 783 result->variableTopValue = result->options->variableTopValue; 784 result->alternateHandling = (UColAttributeValue)result->options->alternateHandling; 785 result->hiraganaQ = (UColAttributeValue)result->options->hiraganaQ; 786 result->numericCollation = (UColAttributeValue)result->options->numericCollation; 787 788 result->caseFirstisDefault = TRUE; 789 result->caseLevelisDefault = TRUE; 790 
result->frenchCollationisDefault = TRUE; 791 result->normalizationModeisDefault = TRUE; 792 result->strengthisDefault = TRUE; 793 result->variableTopValueisDefault = TRUE; 794 result->alternateHandlingisDefault = TRUE; 795 result->hiraganaQisDefault = TRUE; 796 result->numericCollationisDefault = TRUE; 797 798 /*result->scriptOrder = NULL;*/ 799 800 result->rules = NULL; 801 result->rulesLength = 0; 802 result->freeRulesOnClose = FALSE; 803 804 /* get the version info from UCATableHeader and populate the Collator struct*/ 805 result->dataVersion[0] = result->image->version[0]; /* UCA Builder version*/ 806 result->dataVersion[1] = result->image->version[1]; /* UCA Tailoring rules version*/ 807 result->dataVersion[2] = 0; 808 result->dataVersion[3] = 0; 809 810 result->unsafeCP = (uint8_t *)result->image + result->image->unsafeCP; 811 result->minUnsafeCP = 0; 812 for (c=0; c<0x300; c++) { // Find the smallest unsafe char. 813 if (ucol_unsafeCP(c, result)) break; 814 } 815 result->minUnsafeCP = c; 816 817 result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP; 818 result->minContrEndCP = 0; 819 for (c=0; c<0x300; c++) { // Find the Contraction-ending char. 820 if (ucol_contractionEndCP(c, result)) break; 821 } 822 result->minContrEndCP = c; 823 824 /* max expansion tables */ 825 result->endExpansionCE = (uint32_t*)((uint8_t*)result->image + 826 result->image->endExpansionCE); 827 result->lastEndExpansionCE = result->endExpansionCE + 828 result->image->endExpansionCECount - 1; 829 result->expansionCESize = (uint8_t*)result->image + 830 result->image->expansionCESize; 831 832 833 //result->errorCode = *status; 834 835 result->latinOneCEs = NULL; 836 837 result->latinOneRegenTable = FALSE; 838 result->latinOneFailed = FALSE; 839 result->UCA = UCA; 840 841 ucol_updateInternalState(result, status); 842 843 /* Normally these will be set correctly later. This is the default if you use UCA or the default. 
*/ 844 result->ucaRules = NULL; 845 result->actualLocale = NULL; 846 result->validLocale = NULL; 847 result->requestedLocale = NULL; 848 result->hasRealData = FALSE; // real data lives in .dat file... 849 result->freeImageOnClose = FALSE; 850 851 return result; 852} 853 854/* new Mark's code */ 855 856/** 857 * For generation of Implicit CEs 858 * @author Davis 859 * 860 * Cleaned up so that changes can be made more easily. 861 * Old values: 862# First Implicit: E26A792D 863# Last Implicit: E3DC70C0 864# First CJK: E0030300 865# Last CJK: E0A9DD00 866# First CJK_A: E0A9DF00 867# Last CJK_A: E0DE3100 868 */ 869/* Following is a port of Mark's code for new treatment of implicits. 870 * It is positioned here, since ucol_initUCA need to initialize the 871 * variables below according to the data in the fractional UCA. 872 */ 873 874/** 875 * Function used to: 876 * a) collapse the 2 different Han ranges from UCA into one (in the right order), and 877 * b) bump any non-CJK characters by 10FFFF. 878 * The relevant blocks are: 879 * A: 4E00..9FFF; CJK Unified Ideographs 880 * F900..FAFF; CJK Compatibility Ideographs 881 * B: 3400..4DBF; CJK Unified Ideographs Extension A 882 * 20000..XX; CJK Unified Ideographs Extension B (and others later on) 883 * As long as 884 * no new B characters are allocated between 4E00 and FAFF, and 885 * no new A characters are outside of this range, 886 * (very high probability) this simple code will work. 887 * The reordered blocks are: 888 * Block1 is CJK 889 * Block2 is CJK_COMPAT_USED 890 * Block3 is CJK_A 891 * (all contiguous) 892 * Any other CJK gets its normal code point 893 * Any non-CJK gets +10FFFF 894 * When we reorder Block1, we make sure that it is at the very start, 895 * so that it will use a 3-byte form. 896 * Warning: the we only pick up the compatibility characters that are 897 * NOT decomposed, so that block is smaller! 
898 */ 899 900// CONSTANTS 901static const UChar32 902 NON_CJK_OFFSET = 0x110000, 903 UCOL_MAX_INPUT = 0x220001; // 2 * Unicode range + 2 904 905/** 906 * Precomputed by initImplicitConstants() 907 */ 908static int32_t 909 final3Multiplier = 0, 910 final4Multiplier = 0, 911 final3Count = 0, 912 final4Count = 0, 913 medialCount = 0, 914 min3Primary = 0, 915 min4Primary = 0, 916 max4Primary = 0, 917 minTrail = 0, 918 maxTrail = 0, 919 max3Trail = 0, 920 max4Trail = 0, 921 min4Boundary = 0; 922 923static const UChar32 924 CJK_BASE = 0x4E00, 925 CJK_LIMIT = 0x9FFF+1, 926 CJK_COMPAT_USED_BASE = 0xFA0E, 927 CJK_COMPAT_USED_LIMIT = 0xFA2F+1, 928 CJK_A_BASE = 0x3400, 929 CJK_A_LIMIT = 0x4DBF+1, 930 CJK_B_BASE = 0x20000, 931 CJK_B_LIMIT = 0x2A6DF+1; 932 933static UChar32 swapCJK(UChar32 i) { 934 935 if (i >= CJK_BASE) { 936 if (i < CJK_LIMIT) return i - CJK_BASE; 937 938 if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET; 939 940 if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE 941 + (CJK_LIMIT - CJK_BASE); 942 if (i < CJK_B_BASE) return i + NON_CJK_OFFSET; 943 944 if (i < CJK_B_LIMIT) return i; // non-BMP-CJK 945 946 return i + NON_CJK_OFFSET; // non-CJK 947 } 948 if (i < CJK_A_BASE) return i + NON_CJK_OFFSET; 949 950 if (i < CJK_A_LIMIT) return i - CJK_A_BASE 951 + (CJK_LIMIT - CJK_BASE) 952 + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 953 return i + NON_CJK_OFFSET; // non-CJK 954} 955 956U_CAPI UChar32 U_EXPORT2 957uprv_uca_getRawFromCodePoint(UChar32 i) { 958 return swapCJK(i)+1; 959} 960 961U_CAPI UChar32 U_EXPORT2 962uprv_uca_getCodePointFromRaw(UChar32 i) { 963 i--; 964 UChar32 result = 0; 965 if(i >= NON_CJK_OFFSET) { 966 result = i - NON_CJK_OFFSET; 967 } else if(i >= CJK_B_BASE) { 968 result = i; 969 } else if(i < CJK_A_LIMIT + (CJK_LIMIT - CJK_BASE) + (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { // rest of CJKs, compacted 970 if(i < CJK_LIMIT - CJK_BASE) { 971 result = i + CJK_BASE; 972 } else if(i < (CJK_LIMIT - CJK_BASE) + 
(CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE)) { 973 result = i + CJK_COMPAT_USED_BASE - (CJK_LIMIT - CJK_BASE); 974 } else { 975 result = i + CJK_A_BASE - (CJK_LIMIT - CJK_BASE) - (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE); 976 } 977 } else { 978 result = -1; 979 } 980 return result; 981} 982 983// GET IMPLICIT PRIMARY WEIGHTS 984// Return value is left justified primary key 985U_CAPI uint32_t U_EXPORT2 986uprv_uca_getImplicitFromRaw(UChar32 cp) { 987 /* 988 if (cp < 0 || cp > UCOL_MAX_INPUT) { 989 throw new IllegalArgumentException("Code point out of range " + Utility.hex(cp)); 990 } 991 */ 992 int32_t last0 = cp - min4Boundary; 993 if (last0 < 0) { 994 int32_t last1 = cp / final3Count; 995 last0 = cp % final3Count; 996 997 int32_t last2 = last1 / medialCount; 998 last1 %= medialCount; 999 1000 last0 = minTrail + last0*final3Multiplier; // spread out, leaving gap at start 1001 last1 = minTrail + last1; // offset 1002 last2 = min3Primary + last2; // offset 1003 /* 1004 if (last2 >= min4Primary) { 1005 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last2)); 1006 } 1007 */ 1008 return (last2 << 24) + (last1 << 16) + (last0 << 8); 1009 } else { 1010 int32_t last1 = last0 / final4Count; 1011 last0 %= final4Count; 1012 1013 int32_t last2 = last1 / medialCount; 1014 last1 %= medialCount; 1015 1016 int32_t last3 = last2 / medialCount; 1017 last2 %= medialCount; 1018 1019 last0 = minTrail + last0*final4Multiplier; // spread out, leaving gap at start 1020 last1 = minTrail + last1; // offset 1021 last2 = minTrail + last2; // offset 1022 last3 = min4Primary + last3; // offset 1023 /* 1024 if (last3 > max4Primary) { 1025 throw new IllegalArgumentException("4-byte out of range: " + Utility.hex(cp) + ", " + Utility.hex(last3)); 1026 } 1027 */ 1028 return (last3 << 24) + (last2 << 16) + (last1 << 8) + last0; 1029 } 1030} 1031 1032static uint32_t U_EXPORT2 1033uprv_uca_getImplicitPrimary(UChar32 cp) { 1034 //if (DEBUG) 
System.out.println("Incoming: " + Utility.hex(cp)); 1035 1036 cp = swapCJK(cp); 1037 cp++; 1038 // we now have a range of numbers from 0 to 21FFFF. 1039 1040 //if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp)); 1041 1042 return uprv_uca_getImplicitFromRaw(cp); 1043} 1044 1045/** 1046 * Converts implicit CE into raw integer ("code point") 1047 * @param implicit 1048 * @return -1 if illegal format 1049 */ 1050U_CAPI UChar32 U_EXPORT2 1051uprv_uca_getRawFromImplicit(uint32_t implicit) { 1052 UChar32 result; 1053 UChar32 b3 = implicit & 0xFF; 1054 UChar32 b2 = (implicit >> 8) & 0xFF; 1055 UChar32 b1 = (implicit >> 16) & 0xFF; 1056 UChar32 b0 = (implicit >> 24) & 0xFF; 1057 1058 // simple parameter checks 1059 if (b0 < min3Primary || b0 > max4Primary 1060 || b1 < minTrail || b1 > maxTrail) 1061 return -1; 1062 // normal offsets 1063 b1 -= minTrail; 1064 1065 // take care of the final values, and compose 1066 if (b0 < min4Primary) { 1067 if (b2 < minTrail || b2 > max3Trail || b3 != 0) 1068 return -1; 1069 b2 -= minTrail; 1070 UChar32 remainder = b2 % final3Multiplier; 1071 if (remainder != 0) 1072 return -1; 1073 b0 -= min3Primary; 1074 b2 /= final3Multiplier; 1075 result = ((b0 * medialCount) + b1) * final3Count + b2; 1076 } else { 1077 if (b2 < minTrail || b2 > maxTrail 1078 || b3 < minTrail || b3 > max4Trail) 1079 return -1; 1080 b2 -= minTrail; 1081 b3 -= minTrail; 1082 UChar32 remainder = b3 % final4Multiplier; 1083 if (remainder != 0) 1084 return -1; 1085 b3 /= final4Multiplier; 1086 b0 -= min4Primary; 1087 result = (((b0 * medialCount) + b1) * medialCount + b2) * final4Count + b3 + min4Boundary; 1088 } 1089 // final check 1090 if (result < 0 || result > UCOL_MAX_INPUT) 1091 return -1; 1092 return result; 1093} 1094 1095 1096static inline int32_t divideAndRoundUp(int a, int b) { 1097 return 1 + (a-1)/b; 1098} 1099 1100/* this function is either called from initUCA or from genUCA before 1101 * doing canonical closure for the UCA. 
1102 */ 1103 1104/** 1105 * Set up to generate implicits. 1106 * Maintenance Note: this function may end up being called more than once, due 1107 * to threading races during initialization. Make sure that 1108 * none of the Constants is ever transiently assigned an 1109 * incorrect value. 1110 * @param minPrimary 1111 * @param maxPrimary 1112 * @param minTrail final byte 1113 * @param maxTrail final byte 1114 * @param gap3 the gap we leave for tailoring for 3-byte forms 1115 * @param gap4 the gap we leave for tailoring for 4-byte forms 1116 */ 1117static void initImplicitConstants(int minPrimary, int maxPrimary, 1118 int minTrailIn, int maxTrailIn, 1119 int gap3, int primaries3count, 1120 UErrorCode *status) { 1121 // some simple parameter checks 1122 if ((minPrimary < 0 || minPrimary >= maxPrimary || maxPrimary > 0xFF) 1123 || (minTrailIn < 0 || minTrailIn >= maxTrailIn || maxTrailIn > 0xFF) 1124 || (primaries3count < 1)) 1125 { 1126 *status = U_ILLEGAL_ARGUMENT_ERROR; 1127 return; 1128 }; 1129 1130 minTrail = minTrailIn; 1131 maxTrail = maxTrailIn; 1132 1133 min3Primary = minPrimary; 1134 max4Primary = maxPrimary; 1135 // compute constants for use later. 1136 // number of values we can use in trailing bytes 1137 // leave room for empty values between AND above, e.g. if gap = 2 1138 // range 3..7 => +3 -4 -5 -6 -7: so 1 value 1139 // range 3..8 => +3 -4 -5 +6 -7 -8: so 2 values 1140 // range 3..9 => +3 -4 -5 +6 -7 -8 -9: so 2 values 1141 final3Multiplier = gap3 + 1; 1142 final3Count = (maxTrail - minTrail + 1) / final3Multiplier; 1143 max3Trail = minTrail + (final3Count - 1) * final3Multiplier; 1144 1145 // medials can use full range 1146 medialCount = (maxTrail - minTrail + 1); 1147 // find out how many values fit in each form 1148 int32_t threeByteCount = medialCount * final3Count; 1149 // now determine where the 3/4 boundary is. 
1150 // we use 3 bytes below the boundary, and 4 above 1151 int32_t primariesAvailable = maxPrimary - minPrimary + 1; 1152 int32_t primaries4count = primariesAvailable - primaries3count; 1153 1154 1155 int32_t min3ByteCoverage = primaries3count * threeByteCount; 1156 min4Primary = minPrimary + primaries3count; 1157 min4Boundary = min3ByteCoverage; 1158 // Now expand out the multiplier for the 4 bytes, and redo. 1159 1160 int32_t totalNeeded = UCOL_MAX_INPUT - min4Boundary; 1161 int32_t neededPerPrimaryByte = divideAndRoundUp(totalNeeded, primaries4count); 1162 int32_t neededPerFinalByte = divideAndRoundUp(neededPerPrimaryByte, medialCount * medialCount); 1163 int32_t gap4 = (maxTrail - minTrail - 1) / neededPerFinalByte; 1164 if (gap4 < 1) { 1165 *status = U_ILLEGAL_ARGUMENT_ERROR; 1166 return; 1167 } 1168 final4Multiplier = gap4 + 1; 1169 final4Count = neededPerFinalByte; 1170 max4Trail = minTrail + (final4Count - 1) * final4Multiplier; 1171} 1172 1173 /** 1174 * Supply parameters for generating implicit CEs 1175 */ 1176U_CAPI void U_EXPORT2 1177uprv_uca_initImplicitConstants(UErrorCode *status) { 1178 // 13 is the largest 4-byte gap we can use without getting 2 four-byte forms. 1179 //initImplicitConstants(minPrimary, maxPrimary, 0x04, 0xFE, 1, 1, status); 1180 initImplicitConstants(minImplicitPrimary, maxImplicitPrimary, 0x04, 0xFE, 1, 1, status); 1181} 1182 1183 1184/* collIterNormalize Incremental Normalization happens here. */ 1185/* pick up the range of chars identifed by FCD, */ 1186/* normalize it into the collIterate's writable buffer, */ 1187/* switch the collIterate's state to use the writable buffer. 
*/ 1188/* */ 1189static 1190void collIterNormalize(collIterate *collationSource) 1191{ 1192 UErrorCode status = U_ZERO_ERROR; 1193 1194 int32_t normLen; 1195 UChar *srcP = collationSource->pos - 1; /* Start of chars to normalize */ 1196 UChar *endP = collationSource->fcdPosition; /* End of region to normalize+1 */ 1197 1198 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize, 1199 srcP, (int32_t)(endP - srcP), 1200 FALSE, 0, 1201 &status); 1202 if(status == U_BUFFER_OVERFLOW_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) { 1203 // reallocate and terminate 1204 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, 1205 &collationSource->writableBuffer, 1206 (int32_t *)&collationSource->writableBufSize, normLen + 1, 1207 0) 1208 ) { 1209#ifdef UCOL_DEBUG 1210 fprintf(stderr, "collIterNormalize(), out of memory\n"); 1211#endif 1212 return; 1213 } 1214 status = U_ZERO_ERROR; 1215 normLen = unorm_decompose(collationSource->writableBuffer, (int32_t)collationSource->writableBufSize, 1216 srcP, (int32_t)(endP - srcP), 1217 FALSE, 0, 1218 &status); 1219 } 1220 if (U_FAILURE(status)) { 1221#ifdef UCOL_DEBUG 1222 fprintf(stderr, "collIterNormalize(), unorm_decompose() failed, status = %s\n", u_errorName(status)); 1223#endif 1224 return; 1225 } 1226 1227 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { 1228 collationSource->flags |= UCOL_ITER_ALLOCATED; 1229 } 1230 collationSource->pos = collationSource->writableBuffer; 1231 collationSource->origFlags = collationSource->flags; 1232 collationSource->flags |= UCOL_ITER_INNORMBUF; 1233 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1234} 1235 1236 1237// This function takes the iterator and extracts normalized stuff up to the next boundary 1238// It is similar in the end results to the collIterNormalize, but for the cases when we 1239// use an iterator 1240/*static 1241inline void 
normalizeIterator(collIterate *collationSource) { 1242 UErrorCode status = U_ZERO_ERROR; 1243 UBool wasNormalized = FALSE; 1244 //int32_t iterIndex = collationSource->iterator->getIndex(collationSource->iterator, UITER_CURRENT); 1245 uint32_t iterIndex = collationSource->iterator->getState(collationSource->iterator); 1246 int32_t normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1247 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1248 if(status == U_BUFFER_OVERFLOW_ERROR || normLen == (int32_t)collationSource->writableBufSize) { 1249 // reallocate and terminate 1250 if(!u_growBufferFromStatic(collationSource->stackWritableBuffer, 1251 &collationSource->writableBuffer, 1252 (int32_t *)&collationSource->writableBufSize, normLen + 1, 1253 0) 1254 ) { 1255 #ifdef UCOL_DEBUG 1256 fprintf(stderr, "normalizeIterator(), out of memory\n"); 1257 #endif 1258 return; 1259 } 1260 status = U_ZERO_ERROR; 1261 //collationSource->iterator->move(collationSource->iterator, iterIndex, UITER_ZERO); 1262 collationSource->iterator->setState(collationSource->iterator, iterIndex, &status); 1263 normLen = unorm_next(collationSource->iterator, collationSource->writableBuffer, 1264 (int32_t)collationSource->writableBufSize, UNORM_FCD, 0, TRUE, &wasNormalized, &status); 1265 } 1266 // Terminate the buffer - we already checked that it is big enough 1267 collationSource->writableBuffer[normLen] = 0; 1268 if(collationSource->writableBuffer != collationSource->stackWritableBuffer) { 1269 collationSource->flags |= UCOL_ITER_ALLOCATED; 1270 } 1271 collationSource->pos = collationSource->writableBuffer; 1272 collationSource->origFlags = collationSource->flags; 1273 collationSource->flags |= UCOL_ITER_INNORMBUF; 1274 collationSource->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 1275}*/ 1276 1277 1278/* Incremental FCD check and normalize */ 1279/* Called from getNextCE when normalization state is suspect. 
*/ 1280/* When entering, the state is known to be this: */ 1281/* o We are working in the main buffer of the collIterate, not the side */ 1282/* writable buffer. When in the side buffer, normalization mode is always off, */ 1283/* so we won't get here. */ 1284/* o The leading combining class from the current character is 0 or */ 1285/* the trailing combining class of the previous char was zero. */ 1286/* True because the previous call to this function will have always exited */ 1287/* that way, and we get called for every char where cc might be non-zero. */ 1288static 1289inline UBool collIterFCD(collIterate *collationSource) { 1290 const UChar *srcP, *endP; 1291 uint8_t leadingCC; 1292 uint8_t prevTrailingCC = 0; 1293 uint16_t fcd; 1294 UBool needNormalize = FALSE; 1295 1296 srcP = collationSource->pos-1; 1297 1298 if (collationSource->flags & UCOL_ITER_HASLEN) { 1299 endP = collationSource->endp; 1300 } else { 1301 endP = NULL; 1302 } 1303 1304 // Get the trailing combining class of the current character. If it's zero, 1305 // we are OK. 1306 /* trie access */ 1307 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP); 1308 if (fcd != 0) { 1309 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1310 1311 if (prevTrailingCC != 0) { 1312 // The current char has a non-zero trailing CC. Scan forward until we find 1313 // a char with a leading cc of zero. 1314 while (endP == NULL || srcP != endP) 1315 { 1316 const UChar *savedSrcP = srcP; 1317 1318 /* trie access */ 1319 fcd = unorm_nextFCD16(fcdTrieIndex, fcdHighStart, srcP, endP); 1320 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1321 if (leadingCC == 0) { 1322 srcP = savedSrcP; // Hit char that is not part of combining sequence. 1323 // back up over it. (Could be surrogate pair!) 
1324 break; 1325 } 1326 1327 if (leadingCC < prevTrailingCC) { 1328 needNormalize = TRUE; 1329 } 1330 1331 prevTrailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1332 } 1333 } 1334 } 1335 1336 collationSource->fcdPosition = (UChar *)srcP; 1337 1338 return needNormalize; 1339} 1340 1341/****************************************************************************/ 1342/* Following are the CE retrieval functions */ 1343/* */ 1344/****************************************************************************/ 1345 1346static uint32_t getImplicit(UChar32 cp, collIterate *collationSource); 1347static uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource); 1348 1349/* there should be a macro version of this function in the header file */ 1350/* This is the first function that tries to fetch a collation element */ 1351/* If it's not succesfull or it encounters a more difficult situation */ 1352/* some more sofisticated and slower functions are invoked */ 1353static 1354inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { 1355 uint32_t order = 0; 1356 if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */ 1357 order = *(collationSource->toReturn++); /* if so, return them */ 1358 if(collationSource->CEpos == collationSource->toReturn) { 1359 collationSource->CEpos = collationSource->toReturn = collationSource->extendCEs ? collationSource->extendCEs : collationSource->CEs; 1360 } 1361 return order; 1362 } 1363 1364 UChar ch = 0; 1365 collationSource->offsetReturn = NULL; 1366 1367 for (;;) /* Loop handles case when incremental normalize switches */ 1368 { /* to or from the side buffer / original string, and we */ 1369 /* need to start again to get the next character. 
*/ 1370 1371 if ((collationSource->flags & (UCOL_ITER_HASLEN | UCOL_ITER_INNORMBUF | UCOL_ITER_NORM | UCOL_HIRAGANA_Q | UCOL_USE_ITERATOR)) == 0) 1372 { 1373 // The source string is null terminated and we're not working from the side buffer, 1374 // and we're not normalizing. This is the fast path. 1375 // (We can be in the side buffer for Thai pre-vowel reordering even when not normalizing.) 1376 ch = *collationSource->pos++; 1377 if (ch != 0) { 1378 break; 1379 } 1380 else { 1381 return UCOL_NO_MORE_CES; 1382 } 1383 } 1384 1385 if (collationSource->flags & UCOL_ITER_HASLEN) { 1386 // Normal path for strings when length is specified. 1387 // (We can't be in side buffer because it is always null terminated.) 1388 if (collationSource->pos >= collationSource->endp) { 1389 // Ran off of the end of the main source string. We're done. 1390 return UCOL_NO_MORE_CES; 1391 } 1392 ch = *collationSource->pos++; 1393 } 1394 else if(collationSource->flags & UCOL_USE_ITERATOR) { 1395 UChar32 iterCh = collationSource->iterator->next(collationSource->iterator); 1396 if(iterCh == U_SENTINEL) { 1397 return UCOL_NO_MORE_CES; 1398 } 1399 ch = (UChar)iterCh; 1400 } 1401 else 1402 { 1403 // Null terminated string. 1404 ch = *collationSource->pos++; 1405 if (ch == 0) { 1406 // Ran off end of buffer. 1407 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { 1408 // Ran off end of main string. backing up one character. 1409 collationSource->pos--; 1410 return UCOL_NO_MORE_CES; 1411 } 1412 else 1413 { 1414 // Hit null in the normalize side buffer. 1415 // Usually this means the end of the normalized data, 1416 // except for one odd case: a null followed by combining chars, 1417 // which is the case if we are at the start of the buffer. 1418 if (collationSource->pos == collationSource->writableBuffer+1) { 1419 break; 1420 } 1421 1422 // Null marked end of side buffer. 1423 // Revert to the main string and 1424 // loop back to top to try again to get a character. 
1425 collationSource->pos = collationSource->fcdPosition; 1426 collationSource->flags = collationSource->origFlags; 1427 continue; 1428 } 1429 } 1430 } 1431 1432 if(collationSource->flags&UCOL_HIRAGANA_Q) { 1433 /* Codepoints \u3099-\u309C are both Hiragana and Katakana. Set the flag 1434 * based on whether the previous codepoint was Hiragana or Katakana. 1435 */ 1436 if(((ch>=0x3040 && ch<=0x3096) || (ch >= 0x309d && ch <= 0x309f)) || 1437 ((collationSource->flags & UCOL_WAS_HIRAGANA) && (ch >= 0x3099 && ch <= 0x309C))) { 1438 collationSource->flags |= UCOL_WAS_HIRAGANA; 1439 } else { 1440 collationSource->flags &= ~UCOL_WAS_HIRAGANA; 1441 } 1442 } 1443 1444 // We've got a character. See if there's any fcd and/or normalization stuff to do. 1445 // Note that UCOL_ITER_NORM flag is always zero when we are in the side buffer. 1446 if ((collationSource->flags & UCOL_ITER_NORM) == 0) { 1447 break; 1448 } 1449 1450 if (collationSource->fcdPosition >= collationSource->pos) { 1451 // An earlier FCD check has already covered the current character. 1452 // We can go ahead and process this char. 1453 break; 1454 } 1455 1456 if (ch < ZERO_CC_LIMIT_ ) { 1457 // Fast fcd safe path. Trailing combining class == 0. This char is OK. 1458 break; 1459 } 1460 1461 if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) { 1462 // We need to peek at the next character in order to tell if we are FCD 1463 if ((collationSource->flags & UCOL_ITER_HASLEN) && collationSource->pos >= collationSource->endp) { 1464 // We are at the last char of source string. 1465 // It is always OK for FCD check. 1466 break; 1467 } 1468 1469 // Not at last char of source string (or we'll check against terminating null). Do the FCD fast test 1470 if (*collationSource->pos < NFC_ZERO_CC_BLOCK_LIMIT_) { 1471 break; 1472 } 1473 } 1474 1475 1476 // Need a more complete FCD check and possible normalization. 
1477 if (collIterFCD(collationSource)) { 1478 collIterNormalize(collationSource); 1479 } 1480 if ((collationSource->flags & UCOL_ITER_INNORMBUF) == 0) { 1481 // No normalization was needed. Go ahead and process the char we already had. 1482 break; 1483 } 1484 1485 // Some normalization happened. Next loop iteration will pick up a char 1486 // from the normalization buffer. 1487 1488 } // end for (;;) 1489 1490 1491 if (ch <= 0xFF) { 1492 /* For latin-1 characters we never need to fall back to the UCA table */ 1493 /* because all of the UCA data is replicated in the latinOneMapping array */ 1494 order = coll->latinOneMapping[ch]; 1495 if (order > UCOL_NOT_FOUND) { 1496 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); 1497 } 1498 } 1499 else 1500 { 1501 // Always use UCA for Han, Hangul 1502 // (Han extension A is before main Han block) 1503 // **** Han compatibility chars ?? **** 1504 if ((collationSource->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 && 1505 (ch >= UCOL_FIRST_HAN_A && ch <= UCOL_LAST_HANGUL)) { 1506 if (ch > UCOL_LAST_HAN && ch < UCOL_FIRST_HANGUL) { 1507 // between the two target ranges; do normal lookup 1508 // **** this range is YI, Modifier tone letters, **** 1509 // **** Latin-D, Syloti Nagari, Phagas-pa. **** 1510 // **** Latin-D might be tailored, so we need to **** 1511 // **** do the normal lookup for these guys. 
**** 1512 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 1513 } else { 1514 // in one of the target ranges; use UCA 1515 order = UCOL_NOT_FOUND; 1516 } 1517 } else { 1518 order = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 1519 } 1520 1521 if(order > UCOL_NOT_FOUND) { /* if a CE is special */ 1522 order = ucol_prv_getSpecialCE(coll, ch, order, collationSource, status); /* and try to get the special CE */ 1523 } 1524 1525 if(order == UCOL_NOT_FOUND && coll->UCA) { /* We couldn't find a good CE in the tailoring */ 1526 /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ 1527 order = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); 1528 1529 if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ 1530 order = ucol_prv_getSpecialCE(coll->UCA, ch, order, collationSource, status); 1531 } 1532 } 1533 } 1534 if(order == UCOL_NOT_FOUND) { 1535 order = getImplicit(ch, collationSource); 1536 } 1537 return order; /* return the CE */ 1538} 1539 1540/* ucol_getNextCE, out-of-line version for use from other files. */ 1541U_CAPI uint32_t U_EXPORT2 1542ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status) { 1543 return ucol_IGetNextCE(coll, collationSource, status); 1544} 1545 1546 1547/** 1548* Incremental previous normalization happens here. Pick up the range of chars 1549* identifed by FCD, normalize it into the collIterate's writable buffer, 1550* switch the collIterate's state to use the writable buffer. 
1551* @param data collation iterator data 1552*/ 1553static 1554void collPrevIterNormalize(collIterate *data) 1555{ 1556 UErrorCode status = U_ZERO_ERROR; 1557 UChar *pEnd = data->pos; /* End normalize + 1 */ 1558 UChar *pStart; 1559 uint32_t normLen; 1560 UChar *pStartNorm; 1561 1562 /* Start normalize */ 1563 if (data->fcdPosition == NULL) { 1564 pStart = data->string; 1565 } 1566 else { 1567 pStart = data->fcdPosition + 1; 1568 } 1569 1570 normLen = unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, 1571 data->writableBuffer, 0, &status); 1572 1573 if (data->writableBufSize <= normLen) { 1574 freeHeapWritableBuffer(data); 1575 data->writableBuffer = (UChar *)uprv_malloc((normLen + 1) * 1576 sizeof(UChar)); 1577 if(data->writableBuffer == NULL) { // something is wrong here, return 1578 data->writableBufSize = 0; // Reset writableBufSize 1579 return; 1580 } 1581 data->flags |= UCOL_ITER_ALLOCATED; 1582 /* to handle the zero termination */ 1583 data->writableBufSize = normLen + 1; 1584 } 1585 status = U_ZERO_ERROR; 1586 /* 1587 this puts the null termination infront of the normalized string instead 1588 of the end 1589 */ 1590 pStartNorm = data->writableBuffer + (data->writableBufSize - normLen); 1591 *(pStartNorm - 1) = 0; 1592 unorm_normalize(pStart, (pEnd - pStart) + 1, UNORM_NFD, 0, pStartNorm, 1593 normLen, &status); 1594 1595 if (data->offsetBuffer == NULL) { 1596 int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? 
normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE; 1597 1598 data->offsetBufferSize = len; 1599 data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len); 1600 data->offsetStore = data->offsetBuffer; 1601 } else if(data->offsetBufferSize < (int32_t) normLen) { 1602 int32_t storeIX = data->offsetStore - data->offsetBuffer; 1603 int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1)); 1604 1605 if (tob != NULL) { 1606 data->offsetBuffer = tob; 1607 data->offsetStore = &data->offsetBuffer[storeIX]; 1608 data->offsetBufferSize = normLen + 1; 1609 } 1610 } 1611 1612 /* 1613 * The usual case at this point is that we've got a base 1614 * character followed by marks that were normalized. If 1615 * fcdPosition is NULL, that means that we backed up to 1616 * the beginning of the string and there's no base character. 1617 * 1618 * Forward processing will usually normalize when it sees 1619 * the first mark, so that mark will get it's natural offset 1620 * and the rest will get the offset of the character following 1621 * the marks. The base character will also get its natural offset. 1622 * 1623 * We write the offset of the base character, if there is one, 1624 * followed by the offset of the first mark and then the offsets 1625 * of the rest of the marks. 1626 */ 1627 int32_t firstMarkOffset = 0; 1628 int32_t trailOffset = data->pos - data->string + 1; 1629 int32_t trailCount = normLen - 1; 1630 1631 if (data->fcdPosition != NULL) { 1632 int32_t baseOffset = data->fcdPosition - data->string; 1633 UChar baseChar = *data->fcdPosition; 1634 1635 firstMarkOffset = baseOffset + 1; 1636 1637 /* 1638 * If the base character is the start of a contraction, forward processing 1639 * will normalize the marks while checking for the contraction, which means 1640 * that the offset of the first mark will the same as the other marks. 
1641 * 1642 * **** THIS IS PROBABLY NOT A COMPLETE TEST **** 1643 */ 1644 if (baseChar >= 0x100) { 1645 uint32_t baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->mapping, baseChar); 1646 1647 if (baseOrder == UCOL_NOT_FOUND && data->coll->UCA) { 1648 baseOrder = UTRIE_GET32_FROM_LEAD(&data->coll->UCA->mapping, baseChar); 1649 } 1650 1651 if (baseOrder > UCOL_NOT_FOUND && getCETag(baseOrder) == CONTRACTION_TAG) { 1652 firstMarkOffset = trailOffset; 1653 } 1654 } 1655 1656 *(data->offsetStore++) = baseOffset; 1657 } 1658 1659 *(data->offsetStore++) = firstMarkOffset; 1660 1661 for (int32_t i = 0; i < trailCount; i += 1) { 1662 *(data->offsetStore++) = trailOffset; 1663 } 1664 1665 data->offsetRepeatValue = trailOffset; 1666 1667 data->offsetReturn = data->offsetStore - 1; 1668 if (data->offsetReturn == data->offsetBuffer) { 1669 data->offsetStore = data->offsetBuffer; 1670 } 1671 1672 data->pos = data->writableBuffer + data->writableBufSize; 1673 data->origFlags = data->flags; 1674 data->flags |= UCOL_ITER_INNORMBUF; 1675 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 1676} 1677 1678 1679/** 1680* Incremental FCD check for previous iteration and normalize. Called from 1681* getPrevCE when normalization state is suspect. 1682* When entering, the state is known to be this: 1683* o We are working in the main buffer of the collIterate, not the side 1684* writable buffer. When in the side buffer, normalization mode is always 1685* off, so we won't get here. 1686* o The leading combining class from the current character is 0 or the 1687* trailing combining class of the previous char was zero. 1688* True because the previous call to this function will have always exited 1689* that way, and we get called for every char where cc might be non-zero. 
1690* @param data collation iterate struct 1691* @return normalization status, TRUE for normalization to be done, FALSE 1692* otherwise 1693*/ 1694static 1695inline UBool collPrevIterFCD(collIterate *data) 1696{ 1697 const UChar *src, *start; 1698 uint8_t leadingCC; 1699 uint8_t trailingCC = 0; 1700 uint16_t fcd; 1701 UBool result = FALSE; 1702 1703 start = data->string; 1704 src = data->pos + 1; 1705 1706 /* Get the trailing combining class of the current character. */ 1707 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); 1708 1709 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1710 1711 if (leadingCC != 0) { 1712 /* 1713 The current char has a non-zero leading combining class. 1714 Scan backward until we find a char with a trailing cc of zero. 1715 */ 1716 for (;;) 1717 { 1718 if (start == src) { 1719 data->fcdPosition = NULL; 1720 return result; 1721 } 1722 1723 fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, start, src); 1724 1725 trailingCC = (uint8_t)(fcd & LAST_BYTE_MASK_); 1726 1727 if (trailingCC == 0) { 1728 break; 1729 } 1730 1731 if (leadingCC < trailingCC) { 1732 result = TRUE; 1733 } 1734 1735 leadingCC = (uint8_t)(fcd >> SECOND_LAST_BYTE_SHIFT_); 1736 } 1737 } 1738 1739 data->fcdPosition = (UChar *)src; 1740 1741 return result; 1742} 1743 1744/** gets a character from the string at a given offset 1745 * Handles both normal and iterative cases. 1746 * No error checking - caller beware! 
1747 */ 1748inline static 1749UChar peekCharacter(collIterate *source, int32_t offset) { 1750 if(source->pos != NULL) { 1751 return *(source->pos + offset); 1752 } else if(source->iterator != NULL) { 1753 if(offset != 0) { 1754 source->iterator->move(source->iterator, offset, UITER_CURRENT); 1755 UChar toReturn = (UChar)source->iterator->next(source->iterator); 1756 source->iterator->move(source->iterator, -offset-1, UITER_CURRENT); 1757 return toReturn; 1758 } else { 1759 return (UChar)source->iterator->current(source->iterator); 1760 } 1761 } else { 1762 return (UChar)U_SENTINEL; 1763 } 1764} 1765 1766/** 1767* Determines if we are at the start of the data string in the backwards 1768* collation iterator 1769* @param data collation iterator 1770* @return TRUE if we are at the start 1771*/ 1772static 1773inline UBool isAtStartPrevIterate(collIterate *data) { 1774 if(data->pos == NULL && data->iterator != NULL) { 1775 return !data->iterator->hasPrevious(data->iterator); 1776 } 1777 //return (collIter_bos(data)) || 1778 return (data->pos == data->string) || 1779 ((data->flags & UCOL_ITER_INNORMBUF) && 1780 *(data->pos - 1) == 0 && data->fcdPosition == NULL); 1781} 1782 1783static 1784inline void goBackOne(collIterate *data) { 1785# if 0 1786 // somehow, it looks like we need to keep iterator synced up 1787 // at all times, as above. 1788 if(data->pos) { 1789 data->pos--; 1790 } 1791 if(data->iterator) { 1792 data->iterator->previous(data->iterator); 1793 } 1794#endif 1795 if(data->iterator && (data->flags & UCOL_USE_ITERATOR)) { 1796 data->iterator->previous(data->iterator); 1797 } 1798 if(data->pos) { 1799 data->pos --; 1800 } 1801} 1802 1803/** 1804* Inline function that gets a simple CE. 1805* So what it does is that it will first check the expansion buffer. If the 1806* expansion buffer is not empty, ie the end pointer to the expansion buffer 1807* is different from the string pointer, we return the collation element at the 1808* return pointer and decrement it. 
* For more complicated CEs it resorts to ucol_prv_getSpecialPrevCE.
* @param coll collator data
* @param data collation iterator struct
* @param status error status
* @return the previous collation element, or UCOL_NO_MORE_CES once the start
*         of the source has been reached
*/
static
inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
                               UErrorCode *status)
{
    uint32_t result = (uint32_t)UCOL_NULLORDER;

    /* Offset bookkeeping: consume one pending repeat, or step the offset
       return pointer back; once it reaches the start of the offset buffer
       the buffer is reset. */
    if (data->offsetReturn != NULL) {
        if (data->offsetRepeatCount > 0) {
            data->offsetRepeatCount -= 1;
        } else {
            if (data->offsetReturn == data->offsetBuffer) {
                data->offsetReturn = NULL;
                data->offsetStore  = data->offsetBuffer;
            } else {
                data->offsetReturn -= 1;
            }
        }
    }

    /* CEs are still buffered (in extendCEs when it exists, otherwise in the
       inline CEs array): hand back the one in front of toReturn. */
    if ((data->extendCEs && data->toReturn > data->extendCEs) ||
            (!data->extendCEs && data->toReturn > data->CEs))
    {
        data->toReturn -= 1;
        result = *(data->toReturn);
        /* Buffer drained: let CEpos restart from the same place. */
        if (data->CEs == data->toReturn || data->extendCEs == data->toReturn) {
            data->CEpos = data->toReturn;
        }
    }
    else {
        UChar ch = 0;

        /*
        Loop handles case when incremental normalize switches to or from the
        side buffer / original string, and we need to start again to get the
        next character.
        */
        for (;;) {
            if (data->flags & UCOL_ITER_HASLEN) {
                /*
                Normal path for strings when length is specified.
                Not in side buffer because it is always null terminated.
                */
                if (data->pos <= data->string) {
                    /* End of the main source string */
                    return UCOL_NO_MORE_CES;
                }
                data->pos --;
                ch = *data->pos;
            }
            // we are using an iterator to go back. Pray for us!
            else if (data->flags & UCOL_USE_ITERATOR) {
                UChar32 iterCh = data->iterator->previous(data->iterator);
                if(iterCh == U_SENTINEL) {
                    return UCOL_NO_MORE_CES;
                } else {
                    ch = (UChar)iterCh;
                }
            }
            else {
                data->pos --;
                ch = *data->pos;
                /* we are in the side buffer. */
                if (ch == 0) {
                    /*
                    At the start of the normalize side buffer.
                    Go back to string.
                    Because pointer points to the last accessed character,
                    hence we have to increment it by one here.
                    */
                    data->flags = data->origFlags;
                    data->offsetRepeatValue = 0;

                    if (data->fcdPosition == NULL) {
                        /* Nothing left in the source string either. */
                        data->pos = data->string;
                        return UCOL_NO_MORE_CES;
                    }
                    else {
                        data->pos   = data->fcdPosition + 1;
                    }

                    continue;
                }
            }

            /* Record whether the character just read lies in U+3040..U+309F. */
            if(data->flags&UCOL_HIRAGANA_Q) {
                if(ch>=0x3040 && ch<=0x309f) {
                    data->flags |= UCOL_WAS_HIRAGANA;
                } else {
                    data->flags &= ~UCOL_WAS_HIRAGANA;
                }
            }

            /*
            * got a character to determine if there's fcd and/or normalization
            * stuff to do.
            * if the current character is not fcd.
            * if current character is at the start of the string
            * Trailing combining class == 0.
            * Note if pos is in the writablebuffer, norm is always 0
            */
            if (ch < ZERO_CC_LIMIT_ ||
              // this should propel us out of the loop in the iterator case
                (data->flags & UCOL_ITER_NORM) == 0 ||
                (data->fcdPosition != NULL && data->fcdPosition <= data->pos)
                || data->string == data->pos) {
                break;
            }

            if (ch < NFC_ZERO_CC_BLOCK_LIMIT_) {
                /* if next character is FCD */
                if (data->pos == data->string) {
                    /* First char of string is always OK for FCD check */
                    break;
                }

                /* Not first char of string, do the FCD fast test */
                if (*(data->pos - 1) < NFC_ZERO_CC_BLOCK_LIMIT_) {
                    break;
                }
            }

            /* Need a more complete FCD check and possible normalization. */
            if (collPrevIterFCD(data)) {
                collPrevIterNormalize(data);
            }

            if ((data->flags & UCOL_ITER_INNORMBUF) == 0) {
                /*  No normalization. Go ahead and process the char. */
                break;
            }

            /*
            Some normalization happened.
            Next loop picks up a char from the normalization buffer.
            */
        }

        /* attempt to handle contractions, after removal of the backwards
        contraction
        */
        if (ucol_contractionEndCP(ch, coll) && !isAtStartPrevIterate(data)) {
            result = ucol_prv_getSpecialPrevCE(coll, ch, UCOL_CONTRACTION, data, status);
        } else {
            if (ch <= 0xFF) {
                /* Code units up to 0xFF are looked up in the latinOneMapping table. */
                result = coll->latinOneMapping[ch];
            }
            else {
                // Always use UCA for [3400..9FFF], [AC00..D7AF]
                // **** [FA0E..FA2F] ?? ****
                if ((data->flags & UCOL_FORCE_HAN_IMPLICIT) != 0 &&
                    (ch >= 0x3400 && ch <= 0xD7AF)) {
                    if (ch > 0x9FFF && ch < 0xAC00) {
                        // between the two target ranges; do normal lookup
                        // **** this range is YI, Modifier tone letters, ****
                        // **** Latin-D, Syloti Nagari, Phagas-pa.       ****
                        // **** Latin-D might be tailored, so we need to ****
                        // **** do the normal lookup for these guys.     ****
                        result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                    } else {
                        result = UCOL_NOT_FOUND;
                    }
                } else {
                    result = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch);
                }
            }
            /* Values above UCOL_NOT_FOUND are specials that need further resolution. */
            if (result > UCOL_NOT_FOUND) {
                result = ucol_prv_getSpecialPrevCE(coll, ch, result, data, status);
            }
            if (result == UCOL_NOT_FOUND) { // Not found in master list
                if (!isAtStartPrevIterate(data) &&
                    ucol_contractionEndCP(ch, data->coll))
                {
                    result = UCOL_CONTRACTION;
                } else {
                    /* Fall back to the UCA table when one is attached. */
                    if(coll->UCA) {
                        result = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch);
                    }
                }

                if (result > UCOL_NOT_FOUND) {
                    if(coll->UCA) {
                        result = ucol_prv_getSpecialPrevCE(coll->UCA, ch, result, data, status);
                    }
                }
            }
        }

        /* Still nothing: generate an implicit CE for the character. */
        if(result == UCOL_NOT_FOUND) {
            result = getPrevImplicit(ch, data);
        }
    }

    return result;
}


/*   ucol_getPrevCE, out-of-line version for use from other files.
*/ 2011U_CFUNC uint32_t U_EXPORT2 2012ucol_getPrevCE(const UCollator *coll, collIterate *data, 2013 UErrorCode *status) { 2014 return ucol_IGetPrevCE(coll, data, status); 2015} 2016 2017 2018/* this should be connected to special Jamo handling */ 2019U_CFUNC uint32_t U_EXPORT2 2020ucol_getFirstCE(const UCollator *coll, UChar u, UErrorCode *status) { 2021 collIterate colIt; 2022 uint32_t order; 2023 IInit_collIterate(coll, &u, 1, &colIt); 2024 order = ucol_IGetNextCE(coll, &colIt, status); 2025 /*UCOL_GETNEXTCE(order, coll, colIt, status);*/ 2026 return order; 2027} 2028 2029/** 2030* Inserts the argument character into the end of the buffer pushing back the 2031* null terminator. 2032* @param data collIterate struct data 2033* @param pNull pointer to the null termination 2034* @param ch character to be appended 2035* @return the position of the new addition 2036*/ 2037static 2038inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar ch) 2039{ 2040 uint32_t size = data->writableBufSize; 2041 UChar *newbuffer; 2042 static const uint32_t INCSIZE = 5; 2043 2044 if ((data->writableBuffer + size) > (pNull + 1)) { 2045 *pNull = ch; 2046 *(pNull + 1) = 0; 2047 return pNull; 2048 } 2049 2050 /* 2051 buffer will always be null terminated at the end. 2052 giving extra space since it is likely that more characters will be added. 2053 */ 2054 size += INCSIZE; 2055 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size); 2056 if(newbuffer != NULL) { // something wrong, but no status 2057 uprv_memcpy(newbuffer, data->writableBuffer, 2058 data->writableBufSize * sizeof(UChar)); 2059 2060 freeHeapWritableBuffer(data); 2061 data->writableBufSize = size; 2062 data->writableBuffer = newbuffer; 2063 2064 newbuffer = newbuffer + data->writableBufSize; 2065 *newbuffer = ch; 2066 *(newbuffer + 1) = 0; 2067 } 2068 return newbuffer; 2069} 2070 2071/** 2072* Inserts the argument string into the end of the buffer pushing back the 2073* null terminator. 
2074* @param data collIterate struct data 2075* @param pNull pointer to the null termination 2076* @param string to be appended 2077* @param length of the string to be appended 2078* @return the position of the new addition 2079*/ 2080static 2081inline UChar * insertBufferEnd(collIterate *data, UChar *pNull, UChar *str, 2082 int32_t length) 2083{ 2084 uint32_t size = pNull - data->writableBuffer; 2085 UChar *newbuffer; 2086 2087 if (data->writableBuffer + data->writableBufSize > pNull + length + 1) { 2088 uprv_memcpy(pNull, str, length * sizeof(UChar)); 2089 *(pNull + length) = 0; 2090 return pNull; 2091 } 2092 2093 /* 2094 buffer will always be null terminated at the end. 2095 giving extra space since it is likely that more characters will be added. 2096 */ 2097 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * (size + length + 1)); 2098 if(newbuffer != NULL) { 2099 uprv_memcpy(newbuffer, data->writableBuffer, size * sizeof(UChar)); 2100 uprv_memcpy(newbuffer + size, str, length * sizeof(UChar)); 2101 2102 freeHeapWritableBuffer(data); 2103 data->writableBufSize = size + length + 1; 2104 data->writableBuffer = newbuffer; 2105 } 2106 2107 return newbuffer; 2108} 2109 2110/** 2111* Special normalization function for contraction in the forwards iterator. 2112* This normalization sequence will place the current character at source->pos 2113* and its following normalized sequence into the buffer. 2114* The fcd position, pos will be changed. 2115* pos will now point to positions in the buffer. 2116* Flags will be changed accordingly. 
2117* @param data collation iterator data 2118*/ 2119static 2120inline void normalizeNextContraction(collIterate *data) 2121{ 2122 UChar *buffer = data->writableBuffer; 2123 uint32_t buffersize = data->writableBufSize; 2124 uint32_t strsize; 2125 UErrorCode status = U_ZERO_ERROR; 2126 /* because the pointer points to the next character */ 2127 UChar *pStart = data->pos - 1; 2128 UChar *pEnd; 2129 uint32_t normLen; 2130 UChar *pStartNorm; 2131 2132 if ((data->flags & UCOL_ITER_INNORMBUF) == 0) { 2133 *data->writableBuffer = *(pStart - 1); 2134 strsize = 1; 2135 } 2136 else { 2137 strsize = u_strlen(data->writableBuffer); 2138 } 2139 2140 pEnd = data->fcdPosition; 2141 2142 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, buffer, 0, 2143 &status); 2144 2145 if (buffersize <= normLen + strsize) { 2146 uint32_t size = strsize + normLen + 1; 2147 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar)); 2148 if(temp != NULL) { 2149 uprv_memcpy(temp, buffer, sizeof(UChar) * strsize); 2150 freeHeapWritableBuffer(data); 2151 data->writableBuffer = temp; 2152 data->writableBufSize = size; 2153 data->flags |= UCOL_ITER_ALLOCATED; 2154 } else { 2155 return; // Avoid writing past bound of buffer->writableBuffer. 2156 } 2157 } 2158 2159 status = U_ZERO_ERROR; 2160 pStartNorm = buffer + strsize; 2161 /* null-termination will be added here */ 2162 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, 2163 normLen + 1, &status); 2164 2165 data->pos = data->writableBuffer + strsize; 2166 data->origFlags = data->flags; 2167 data->flags |= UCOL_ITER_INNORMBUF; 2168 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2169} 2170 2171/** 2172* Contraction character management function that returns the next character 2173* for the forwards iterator. 2174* Does nothing if the next character is in buffer and not the first character 2175* in it. 2176* Else it checks next character in data string to see if it is normalizable. 
2177* If it is not, the character is simply copied into the buffer, else 2178* the whole normalized substring is copied into the buffer, including the 2179* current character. 2180* @param data collation element iterator data 2181* @return next character 2182*/ 2183static 2184inline UChar getNextNormalizedChar(collIterate *data) 2185{ 2186 UChar nextch; 2187 UChar ch; 2188 // Here we need to add the iterator code. One problem is the way 2189 // end of string is handled. If we just return next char, it could 2190 // be the sentinel. Most of the cases already check for this, but we 2191 // need to be sure. 2192 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 ) { 2193 /* if no normalization and not in buffer. */ 2194 if(data->flags & UCOL_USE_ITERATOR) { 2195 return (UChar)data->iterator->next(data->iterator); 2196 } else { 2197 return *(data->pos ++); 2198 } 2199 } 2200 2201 //if (data->flags & UCOL_ITER_NORM && data->flags & UCOL_USE_ITERATOR) { 2202 //normalizeIterator(data); 2203 //} 2204 2205 UChar *pEndWritableBuffer = NULL; 2206 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); 2207 if ((innormbuf && *data->pos != 0) || 2208 (data->fcdPosition != NULL && !innormbuf && 2209 data->pos < data->fcdPosition)) { 2210 /* 2211 if next character is in normalized buffer, no further normalization 2212 is required 2213 */ 2214 return *(data->pos ++); 2215 } 2216 2217 if (data->flags & UCOL_ITER_HASLEN) { 2218 /* in data string */ 2219 if (data->pos + 1 == data->endp) { 2220 return *(data->pos ++); 2221 } 2222 } 2223 else { 2224 if (innormbuf) { 2225 // inside the normalization buffer, but at the end 2226 // (since we encountered zero). This means, in the 2227 // case we're using char iterator, that we need to 2228 // do another round of normalization. 
2229 //if(data->origFlags & UCOL_USE_ITERATOR) { 2230 // we need to restore original flags, 2231 // otherwise, we'll lose them 2232 //data->flags = data->origFlags; 2233 //normalizeIterator(data); 2234 //return *(data->pos++); 2235 //} else { 2236 /* 2237 in writable buffer, at this point fcdPosition can not be 2238 pointing to the end of the data string. see contracting tag. 2239 */ 2240 if(data->fcdPosition) { 2241 if (*(data->fcdPosition + 1) == 0 || 2242 data->fcdPosition + 1 == data->endp) { 2243 /* at the end of the string, dump it into the normalizer */ 2244 data->pos = insertBufferEnd(data, data->pos, 2245 *(data->fcdPosition)) + 1; 2246 // Check if data->pos received a null pointer 2247 if (data->pos == NULL) { 2248 return (UChar)-1; // Return to indicate error. 2249 } 2250 return *(data->fcdPosition ++); 2251 } 2252 pEndWritableBuffer = data->pos; 2253 data->pos = data->fcdPosition; 2254 } else if(data->origFlags & UCOL_USE_ITERATOR) { 2255 // if we are here, we're using a normalizing iterator. 2256 // we should just continue further. 2257 data->flags = data->origFlags; 2258 data->pos = NULL; 2259 return (UChar)data->iterator->next(data->iterator); 2260 } 2261 //} 2262 } 2263 else { 2264 if (*(data->pos + 1) == 0) { 2265 return *(data->pos ++); 2266 } 2267 } 2268 } 2269 2270 ch = *data->pos ++; 2271 nextch = *data->pos; 2272 2273 /* 2274 * if the current character is not fcd. 2275 * Trailing combining class == 0. 2276 */ 2277 if ((data->fcdPosition == NULL || data->fcdPosition < data->pos) && 2278 (nextch >= NFC_ZERO_CC_BLOCK_LIMIT_ || 2279 ch >= NFC_ZERO_CC_BLOCK_LIMIT_)) { 2280 /* 2281 Need a more complete FCD check and possible normalization. 
2282 normalize substring will be appended to buffer 2283 */ 2284 if (collIterFCD(data)) { 2285 normalizeNextContraction(data); 2286 return *(data->pos ++); 2287 } 2288 else if (innormbuf) { 2289 /* fcdposition shifted even when there's no normalization, if we 2290 don't input the rest into this, we'll get the wrong position when 2291 we reach the end of the writableBuffer */ 2292 int32_t length = data->fcdPosition - data->pos + 1; 2293 data->pos = insertBufferEnd(data, pEndWritableBuffer, 2294 data->pos - 1, length); 2295 // Check if data->pos received a null pointer 2296 if (data->pos == NULL) { 2297 return (UChar)-1; // Return to indicate error. 2298 } 2299 return *(data->pos ++); 2300 } 2301 } 2302 2303 if (innormbuf) { 2304 /* 2305 no normalization is to be done hence only one character will be 2306 appended to the buffer. 2307 */ 2308 data->pos = insertBufferEnd(data, pEndWritableBuffer, ch) + 1; 2309 // Check if data->pos received a null pointer 2310 if (data->pos == NULL) { 2311 return (UChar)-1; // Return to indicate error. 2312 } 2313 } 2314 2315 /* points back to the pos in string */ 2316 return ch; 2317} 2318 2319 2320 2321/** 2322* Function to copy the buffer into writableBuffer and sets the fcd position to 2323* the correct position 2324* @param source data string source 2325* @param buffer character buffer 2326* @param tempdb current position in buffer that has been used up 2327*/ 2328static 2329inline void setDiscontiguosAttribute(collIterate *source, UChar *buffer, 2330 UChar *tempdb) 2331{ 2332 /* okay confusing part here. to ensure that the skipped characters are 2333 considered later, we need to place it in the appropriate position in the 2334 normalization buffer and reassign the pos pointer. simple case if pos 2335 reside in string, simply copy to normalization buffer and 2336 fcdposition = pos, pos = start of normalization buffer. 
if pos in 2337 normalization buffer, we'll insert the copy infront of pos and point pos 2338 to the start of the normalization buffer. why am i doing these copies? 2339 well, so that the whole chunk of codes in the getNextCE, ucol_prv_getSpecialCE does 2340 not require any changes, which be really painful. */ 2341 uint32_t length = u_strlen(buffer);; 2342 if (source->flags & UCOL_ITER_INNORMBUF) { 2343 u_strcpy(tempdb, source->pos); 2344 } 2345 else { 2346 source->fcdPosition = source->pos; 2347 source->origFlags = source->flags; 2348 source->flags |= UCOL_ITER_INNORMBUF; 2349 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN | UCOL_USE_ITERATOR); 2350 } 2351 2352 if (length >= source->writableBufSize) { 2353 freeHeapWritableBuffer(source); 2354 source->writableBuffer = 2355 (UChar *)uprv_malloc((length + 1) * sizeof(UChar)); 2356 if(source->writableBuffer == NULL) { 2357 source->writableBufSize = 0; // Reset size 2358 return; 2359 } 2360 source->writableBufSize = length; 2361 } 2362 2363 u_strcpy(source->writableBuffer, buffer); 2364 source->pos = source->writableBuffer; 2365} 2366 2367/** 2368* Function to get the discontiguos collation element within the source. 2369* Note this function will set the position to the appropriate places. 
2370* @param coll current collator used 2371* @param source data string source 2372* @param constart index to the start character in the contraction table 2373* @return discontiguos collation element offset 2374*/ 2375static 2376uint32_t getDiscontiguous(const UCollator *coll, collIterate *source, 2377 const UChar *constart) 2378{ 2379 /* source->pos currently points to the second combining character after 2380 the start character */ 2381 UChar *temppos = source->pos; 2382 UChar buffer[4*UCOL_MAX_BUFFER]; 2383 UChar *tempdb = buffer; 2384 const UChar *tempconstart = constart; 2385 uint8_t tempflags = source->flags; 2386 UBool multicontraction = FALSE; 2387 UChar *tempbufferpos = 0; 2388 collIterateState discState; 2389 2390 backupState(source, &discState); 2391 2392 //*tempdb = *(source->pos - 1); 2393 *tempdb = peekCharacter(source, -1); 2394 tempdb++; 2395 for (;;) { 2396 UChar *UCharOffset; 2397 UChar schar, 2398 tchar; 2399 uint32_t result; 2400 2401 if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp) 2402 || (peekCharacter(source, 0) == 0 && 2403 //|| (*source->pos == 0 && 2404 ((source->flags & UCOL_ITER_INNORMBUF) == 0 || 2405 source->fcdPosition == NULL || 2406 source->fcdPosition == source->endp || 2407 *(source->fcdPosition) == 0 || 2408 u_getCombiningClass(*(source->fcdPosition)) == 0)) || 2409 /* end of string in null terminated string or stopped by a 2410 null character, note fcd does not always point to a base 2411 character after the discontiguos change */ 2412 u_getCombiningClass(peekCharacter(source, 0)) == 0) { 2413 //u_getCombiningClass(*(source->pos)) == 0) { 2414 //constart = (UChar *)coll->image + getContractOffset(CE); 2415 if (multicontraction) { 2416 *tempbufferpos = 0; 2417 source->pos = temppos - 1; 2418 setDiscontiguosAttribute(source, buffer, tempdb); 2419 return *(coll->contractionCEs + 2420 (tempconstart - coll->contractionIndex)); 2421 } 2422 constart = tempconstart; 2423 break; 2424 } 2425 2426 UCharOffset = 
(UChar *)(tempconstart + 1); /* skip the backward offset*/ 2427 schar = getNextNormalizedChar(source); 2428 2429 while (schar > (tchar = *UCharOffset)) { 2430 UCharOffset++; 2431 } 2432 2433 if (schar != tchar) { 2434 /* not the correct codepoint. we stuff the current codepoint into 2435 the discontiguos buffer and try the next character */ 2436 *tempdb = schar; 2437 tempdb ++; 2438 continue; 2439 } 2440 else { 2441 if (u_getCombiningClass(schar) == 2442 u_getCombiningClass(peekCharacter(source, -2))) { 2443 //u_getCombiningClass(*(source->pos - 2))) { 2444 *tempdb = schar; 2445 tempdb ++; 2446 continue; 2447 } 2448 result = *(coll->contractionCEs + 2449 (UCharOffset - coll->contractionIndex)); 2450 } 2451 *tempdb = 0; 2452 2453 if (result == UCOL_NOT_FOUND) { 2454 break; 2455 } else if (isContraction(result)) { 2456 /* this is a multi-contraction*/ 2457 tempconstart = (UChar *)coll->image + getContractOffset(result); 2458 if (*(coll->contractionCEs + (constart - coll->contractionIndex)) 2459 != UCOL_NOT_FOUND) { 2460 multicontraction = TRUE; 2461 temppos = source->pos + 1; 2462 tempbufferpos = buffer + u_strlen(buffer); 2463 } 2464 } else { 2465 setDiscontiguosAttribute(source, buffer, tempdb); 2466 return result; 2467 } 2468 } 2469 2470 /* no problems simply reverting just like that, 2471 if we are in string before getting into this function, points back to 2472 string hence no problem. 2473 if we are in normalization buffer before getting into this function, 2474 since we'll never use another normalization within this function, we 2475 know that fcdposition points to a base character. the normalization buffer 2476 never change, hence this revert works. 
*/ 2477 loadState(source, &discState, TRUE); 2478 goBackOne(source); 2479 2480 //source->pos = temppos - 1; 2481 source->flags = tempflags; 2482 return *(coll->contractionCEs + (constart - coll->contractionIndex)); 2483} 2484 2485static 2486inline UBool isNonChar(UChar32 cp) { 2487 return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF)); 2488} 2489 2490/* now uses Mark's getImplicitPrimary code */ 2491static 2492inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) { 2493 if(isNonChar(cp)) { 2494 return 0; 2495 } 2496 uint32_t r = uprv_uca_getImplicitPrimary(cp); 2497 *(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0; 2498 collationSource->offsetRepeatCount += 1; 2499 return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order' 2500} 2501 2502/** 2503* Inserts the argument character into the front of the buffer replacing the 2504* front null terminator. 2505* @param data collation element iterator data 2506* @param pNull pointer to the null terminator 2507* @param ch character to be appended 2508* @return positon of added character 2509*/ 2510static 2511inline UChar * insertBufferFront(collIterate *data, UChar *pNull, UChar ch) 2512{ 2513 uint32_t size = data->writableBufSize; 2514 UChar *end; 2515 UChar *newbuffer; 2516 static const uint32_t INCSIZE = 5; 2517 2518 if (pNull > data->writableBuffer + 1) { 2519 *pNull = ch; 2520 *(pNull - 1) = 0; 2521 return pNull; 2522 } 2523 2524 /* 2525 buffer will always be null terminated infront. 2526 giving extra space since it is likely that more characters will be added. 
2527 */ 2528 size += INCSIZE; 2529 newbuffer = (UChar *)uprv_malloc(sizeof(UChar) * size); 2530 if(newbuffer == NULL) { 2531 return NULL; 2532 } 2533 end = newbuffer + INCSIZE; 2534 uprv_memcpy(end, data->writableBuffer, 2535 data->writableBufSize * sizeof(UChar)); 2536 *end = ch; 2537 *(end - 1) = 0; 2538 2539 freeHeapWritableBuffer(data); 2540 2541 data->writableBufSize = size; 2542 data->writableBuffer = newbuffer; 2543 return end; 2544} 2545 2546/** 2547* Special normalization function for contraction in the previous iterator. 2548* This normalization sequence will place the current character at source->pos 2549* and its following normalized sequence into the buffer. 2550* The fcd position, pos will be changed. 2551* pos will now point to positions in the buffer. 2552* Flags will be changed accordingly. 2553* @param data collation iterator data 2554*/ 2555static 2556inline void normalizePrevContraction(collIterate *data, UErrorCode *status) 2557{ 2558 uint32_t nulltermsize; 2559 UErrorCode localstatus = U_ZERO_ERROR; 2560 UChar *pEnd = data->pos + 1; /* End normalize + 1 */ 2561 UChar *pStart; 2562 uint32_t normLen; 2563 UChar *pStartNorm; 2564 2565 if (data->flags & UCOL_ITER_HASLEN) { 2566 /* 2567 normalization buffer not used yet, we'll pull down the next 2568 character into the end of the buffer 2569 */ 2570 *(data->writableBuffer + (data->writableBufSize - 1)) = *(data->pos + 1); 2571 nulltermsize = data->writableBufSize - 1; 2572 } 2573 else { 2574 nulltermsize = data->writableBufSize; 2575 UChar *temp = data->writableBuffer + (nulltermsize - 1); 2576 while (*(temp --) != 0) { 2577 nulltermsize --; 2578 } 2579 } 2580 2581 /* Start normalize */ 2582 if (data->fcdPosition == NULL) { 2583 pStart = data->string; 2584 } 2585 else { 2586 pStart = data->fcdPosition + 1; 2587 } 2588 2589 normLen = unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, data->writableBuffer, 0, 2590 &localstatus); 2591 2592 if (nulltermsize <= normLen) { 2593 uint32_t size = 
data->writableBufSize - nulltermsize + normLen + 1; 2594 UChar *temp = (UChar *)uprv_malloc(size * sizeof(UChar)); 2595 if (temp == NULL) { 2596 *status = U_MEMORY_ALLOCATION_ERROR; 2597 return; 2598 } 2599 nulltermsize = normLen + 1; 2600 uprv_memcpy(temp + normLen, data->writableBuffer, 2601 sizeof(UChar) * (data->writableBufSize - nulltermsize)); 2602 freeHeapWritableBuffer(data); 2603 data->writableBuffer = temp; 2604 data->writableBufSize = size; 2605 } 2606 2607 /* 2608 this puts the null termination infront of the normalized string instead 2609 of the end 2610 */ 2611 pStartNorm = data->writableBuffer + (nulltermsize - normLen); 2612 *(pStartNorm - 1) = 0; 2613 unorm_normalize(pStart, pEnd - pStart, UNORM_NFD, 0, pStartNorm, normLen, 2614 status); 2615 2616 data->pos = data->writableBuffer + nulltermsize; 2617 data->origFlags = data->flags; 2618 data->flags |= UCOL_ITER_INNORMBUF; 2619 data->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 2620} 2621 2622/** 2623* Contraction character management function that returns the previous character 2624* for the backwards iterator. 2625* Does nothing if the previous character is in buffer and not the first 2626* character in it. 2627* Else it checks previous character in data string to see if it is 2628* normalizable. 2629* If it is not, the character is simply copied into the buffer, else 2630* the whole normalized substring is copied into the buffer, including the 2631* current character. 2632* @param data collation element iterator data 2633* @return previous character 2634*/ 2635static 2636inline UChar getPrevNormalizedChar(collIterate *data, UErrorCode *status) 2637{ 2638 UChar prevch; 2639 UChar ch; 2640 UChar *start; 2641 UBool innormbuf = (UBool)(data->flags & UCOL_ITER_INNORMBUF); 2642 UChar *pNull = NULL; 2643 if ((data->flags & (UCOL_ITER_NORM | UCOL_ITER_INNORMBUF)) == 0 || 2644 (innormbuf && *(data->pos - 1) != 0)) { 2645 /* 2646 if no normalization. 
2647 if previous character is in normalized buffer, no further normalization 2648 is required 2649 */ 2650 if(data->flags & UCOL_USE_ITERATOR) { 2651 data->iterator->move(data->iterator, -1, UITER_CURRENT); 2652 return (UChar)data->iterator->next(data->iterator); 2653 } else { 2654 return *(data->pos - 1); 2655 } 2656 } 2657 2658 start = data->pos; 2659 if ((data->fcdPosition==NULL)||(data->flags & UCOL_ITER_HASLEN)) { 2660 /* in data string */ 2661 if ((start - 1) == data->string) { 2662 return *(start - 1); 2663 } 2664 start --; 2665 ch = *start; 2666 prevch = *(start - 1); 2667 } 2668 else { 2669 /* 2670 in writable buffer, at this point fcdPosition can not be NULL. 2671 see contracting tag. 2672 */ 2673 if (data->fcdPosition == data->string) { 2674 /* at the start of the string, just dump it into the normalizer */ 2675 insertBufferFront(data, data->pos - 1, *(data->fcdPosition)); 2676 data->fcdPosition = NULL; 2677 return *(data->pos - 1); 2678 } 2679 pNull = data->pos - 1; 2680 start = data->fcdPosition; 2681 ch = *start; 2682 prevch = *(start - 1); 2683 } 2684 /* 2685 * if the current character is not fcd. 2686 * Trailing combining class == 0. 2687 */ 2688 if (data->fcdPosition > start && 2689 (ch >= NFC_ZERO_CC_BLOCK_LIMIT_ || prevch >= NFC_ZERO_CC_BLOCK_LIMIT_)) 2690 { 2691 /* 2692 Need a more complete FCD check and possible normalization. 2693 normalize substring will be appended to buffer 2694 */ 2695 UChar *backuppos = data->pos; 2696 data->pos = start; 2697 if (collPrevIterFCD(data)) { 2698 normalizePrevContraction(data, status); 2699 return *(data->pos - 1); 2700 } 2701 data->pos = backuppos; 2702 data->fcdPosition ++; 2703 } 2704 2705 if (innormbuf) { 2706 /* 2707 no normalization is to be done hence only one character will be 2708 appended to the buffer. 
2709 */ 2710 insertBufferFront(data, pNull, ch); 2711 data->fcdPosition --; 2712 } 2713 2714 return ch; 2715} 2716 2717/* This function handles the special CEs like contractions, expansions, surrogates, Thai */ 2718/* It is called by getNextCE */ 2719 2720/* The following should be even */ 2721#define UCOL_MAX_DIGITS_FOR_NUMBER 254 2722 2723uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, collIterate *source, UErrorCode *status) { 2724 collIterateState entryState; 2725 backupState(source, &entryState); 2726 UChar32 cp = ch; 2727 2728 for (;;) { 2729 // This loop will repeat only in the case of contractions, and only when a contraction 2730 // is found and the first CE resulting from that contraction is itself a special 2731 // (an expansion, for example.) All other special CE types are fully handled the 2732 // first time through, and the loop exits. 2733 2734 const uint32_t *CEOffset = NULL; 2735 switch(getCETag(CE)) { 2736 case NOT_FOUND_TAG: 2737 /* This one is not found, and we'll let somebody else bother about it... no more games */ 2738 return CE; 2739 case SPEC_PROC_TAG: 2740 { 2741 // Special processing is getting a CE that is preceded by a certain prefix 2742 // Currently this is only needed for optimizing Japanese length and iteration marks. 2743 // When we encouter a special processing tag, we go backwards and try to see if 2744 // we have a match. 2745 // Contraction tables are used - so the whole process is not unlike contraction. 2746 // prefix data is stored backwards in the table. 2747 const UChar *UCharOffset; 2748 UChar schar, tchar; 2749 collIterateState prefixState; 2750 backupState(source, &prefixState); 2751 loadState(source, &entryState, TRUE); 2752 goBackOne(source); // We want to look at the point where we entered - actually one 2753 // before that... 
2754 2755 for(;;) { 2756 // This loop will run once per source string character, for as long as we 2757 // are matching a potential contraction sequence 2758 2759 // First we position ourselves at the begining of contraction sequence 2760 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2761 if (collIter_bos(source)) { 2762 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2763 break; 2764 } 2765 schar = getPrevNormalizedChar(source, status); 2766 goBackOne(source); 2767 2768 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2769 UCharOffset++; 2770 } 2771 2772 if (schar == tchar) { 2773 // Found the source string char in the table. 2774 // Pick up the corresponding CE from the table. 2775 CE = *(coll->contractionCEs + 2776 (UCharOffset - coll->contractionIndex)); 2777 } 2778 else 2779 { 2780 // Source string char was not in the table. 2781 // We have not found the prefix. 2782 CE = *(coll->contractionCEs + 2783 (ContractionStart - coll->contractionIndex)); 2784 } 2785 2786 if(!isPrefix(CE)) { 2787 // The source string char was in the contraction table, and the corresponding 2788 // CE is not a prefix CE. We found the prefix, break 2789 // out of loop, this CE will end up being returned. This is the normal 2790 // way out of prefix handling when the source actually contained 2791 // the prefix. 
2792 break; 2793 } 2794 } 2795 if(CE != UCOL_NOT_FOUND) { // we found something and we can merilly continue 2796 loadState(source, &prefixState, TRUE); 2797 if(source->origFlags & UCOL_USE_ITERATOR) { 2798 source->flags = source->origFlags; 2799 } 2800 } else { // prefix search was a failure, we have to backup all the way to the start 2801 loadState(source, &entryState, TRUE); 2802 } 2803 break; 2804 } 2805 case CONTRACTION_TAG: 2806 { 2807 /* This should handle contractions */ 2808 collIterateState state; 2809 backupState(source, &state); 2810 uint32_t firstCE = *(coll->contractionCEs + ((UChar *)coll->image+getContractOffset(CE) - coll->contractionIndex)); //UCOL_NOT_FOUND; 2811 const UChar *UCharOffset; 2812 UChar schar, tchar; 2813 2814 for (;;) { 2815 /* This loop will run once per source string character, for as long as we */ 2816 /* are matching a potential contraction sequence */ 2817 2818 /* First we position ourselves at the begining of contraction sequence */ 2819 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 2820 2821 if (collIter_eos(source)) { 2822 // Ran off the end of the source string. 2823 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 2824 // So we'll pick whatever we have at the point... 2825 if (CE == UCOL_NOT_FOUND) { 2826 // back up the source over all the chars we scanned going into this contraction. 
2827 CE = firstCE; 2828 loadState(source, &state, TRUE); 2829 if(source->origFlags & UCOL_USE_ITERATOR) { 2830 source->flags = source->origFlags; 2831 } 2832 } 2833 break; 2834 } 2835 2836 uint8_t maxCC = (uint8_t)(*(UCharOffset)&0xFF); /*get the discontiguos stuff */ /* skip the backward offset, see above */ 2837 uint8_t allSame = (uint8_t)(*(UCharOffset++)>>8); 2838 2839 schar = getNextNormalizedChar(source); 2840 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 2841 UCharOffset++; 2842 } 2843 2844 if (schar == tchar) { 2845 // Found the source string char in the contraction table. 2846 // Pick up the corresponding CE from the table. 2847 CE = *(coll->contractionCEs + 2848 (UCharOffset - coll->contractionIndex)); 2849 } 2850 else 2851 { 2852 // Source string char was not in contraction table. 2853 // Unless we have a discontiguous contraction, we have finished 2854 // with this contraction. 2855 // in order to do the proper detection, we 2856 // need to see if we're dealing with a supplementary 2857 /* We test whether the next two char are surrogate pairs. 2858 * This test is done if the iterator is not NULL. 2859 * If there is no surrogate pair, the iterator 2860 * goes back one if needed. 
*/ 2861 UChar32 miss = schar; 2862 if (source->iterator) { 2863 UChar32 surrNextChar; /* the next char in the iteration to test */ 2864 int32_t prevPos; /* holds the previous position before move forward of the source iterator */ 2865 if(U16_IS_LEAD(schar) && source->iterator->hasNext(source->iterator)) { 2866 prevPos = source->iterator->index; 2867 surrNextChar = getNextNormalizedChar(source); 2868 if (U16_IS_TRAIL(surrNextChar)) { 2869 miss = U16_GET_SUPPLEMENTARY(schar, surrNextChar); 2870 } else if (prevPos < source->iterator->index){ 2871 goBackOne(source); 2872 } 2873 } 2874 } else if (U16_IS_LEAD(schar)) { 2875 miss = U16_GET_SUPPLEMENTARY(schar, getNextNormalizedChar(source)); 2876 } 2877 2878 uint8_t sCC; 2879 if (miss < 0x300 || 2880 maxCC == 0 || 2881 (sCC = i_getCombiningClass(miss, coll)) == 0 || 2882 sCC>maxCC || 2883 (allSame != 0 && sCC == maxCC) || 2884 collIter_eos(source)) 2885 { 2886 // Contraction can not be discontiguous. 2887 goBackOne(source); // back up the source string by one, 2888 // because the character we just looked at was 2889 // not part of the contraction. */ 2890 if(U_IS_SUPPLEMENTARY(miss)) { 2891 goBackOne(source); 2892 } 2893 CE = *(coll->contractionCEs + 2894 (ContractionStart - coll->contractionIndex)); 2895 } else { 2896 // 2897 // Contraction is possibly discontiguous. 
2898 // Scan more of source string looking for a match 2899 // 2900 UChar tempchar; 2901 /* find the next character if schar is not a base character 2902 and we are not yet at the end of the string */ 2903 tempchar = getNextNormalizedChar(source); 2904 // probably need another supplementary thingie here 2905 goBackOne(source); 2906 if (i_getCombiningClass(tempchar, coll) == 0) { 2907 goBackOne(source); 2908 if(U_IS_SUPPLEMENTARY(miss)) { 2909 goBackOne(source); 2910 } 2911 /* Spit out the last char of the string, wasn't tasty enough */ 2912 CE = *(coll->contractionCEs + 2913 (ContractionStart - coll->contractionIndex)); 2914 } else { 2915 CE = getDiscontiguous(coll, source, ContractionStart); 2916 } 2917 } 2918 } // else after if(schar == tchar) 2919 2920 if(CE == UCOL_NOT_FOUND) { 2921 /* The Source string did not match the contraction that we were checking. */ 2922 /* Back up the source position to undo the effects of having partially */ 2923 /* scanned through what ultimately proved to not be a contraction. */ 2924 loadState(source, &state, TRUE); 2925 CE = firstCE; 2926 break; 2927 } 2928 2929 if(!isContraction(CE)) { 2930 // The source string char was in the contraction table, and the corresponding 2931 // CE is not a contraction CE. We completed the contraction, break 2932 // out of loop, this CE will end up being returned. This is the normal 2933 // way out of contraction handling when the source actually contained 2934 // the contraction. 2935 break; 2936 } 2937 2938 2939 // The source string char was in the contraction table, and the corresponding 2940 // CE is IS a contraction CE. We will continue looping to check the source 2941 // string for the remaining chars in the contraction. 2942 uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex)); 2943 if(tempCE != UCOL_NOT_FOUND) { 2944 // We have scanned a a section of source string for which there is a 2945 // CE from the contraction table. 
Remember the CE and scan position, so 2946 // that we can return to this point if further scanning fails to 2947 // match a longer contraction sequence. 2948 firstCE = tempCE; 2949 2950 goBackOne(source); 2951 backupState(source, &state); 2952 getNextNormalizedChar(source); 2953 2954 // Another way to do this is: 2955 //collIterateState tempState; 2956 //backupState(source, &tempState); 2957 //goBackOne(source); 2958 //backupState(source, &state); 2959 //loadState(source, &tempState, TRUE); 2960 2961 // The problem is that for incomplete contractions we have to remember the previous 2962 // position. Before, the only thing I needed to do was state.pos--; 2963 // After iterator introduction and especially after introduction of normalizing 2964 // iterators, it became much more difficult to decrease the saved state. 2965 // I'm not yet sure which of the two methods above is faster. 2966 } 2967 } // for(;;) 2968 break; 2969 } // case CONTRACTION_TAG: 2970 case LONG_PRIMARY_TAG: 2971 { 2972 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 2973 CE = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 2974 source->offsetRepeatCount += 1; 2975 return CE; 2976 } 2977 case EXPANSION_TAG: 2978 { 2979 /* This should handle expansion. */ 2980 /* NOTE: we can encounter both continuations and expansions in an expansion! 
*/ 2981 /* I have to decide where continuations are going to be dealt with */ 2982 uint32_t size; 2983 uint32_t i; /* general counter */ 2984 2985 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 2986 size = getExpansionCount(CE); 2987 CE = *CEOffset++; 2988 //source->offsetRepeatCount = -1; 2989 2990 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 2991 for(i = 1; i<size; i++) { 2992 *(source->CEpos++) = *CEOffset++; 2993 source->offsetRepeatCount += 1; 2994 } 2995 } else { /* else, we do */ 2996 while(*CEOffset != 0) { 2997 *(source->CEpos++) = *CEOffset++; 2998 source->offsetRepeatCount += 1; 2999 } 3000 } 3001 3002 return CE; 3003 } 3004 case DIGIT_TAG: 3005 { 3006 /* 3007 We do a check to see if we want to collate digits as numbers; if so we generate 3008 a custom collation key. Otherwise we pull out the value stored in the expansion table. 3009 */ 3010 //uint32_t size; 3011 uint32_t i; /* general counter */ 3012 3013 if (source->coll->numericCollation == UCOL_ON){ 3014 collIterateState digitState = {0,0,0,0,0,0,0,0,0}; 3015 UChar32 char32 = 0; 3016 int32_t digVal = 0; 3017 3018 uint32_t digIndx = 0; 3019 uint32_t endIndex = 0; 3020 uint32_t trailingZeroIndex = 0; 3021 3022 uint8_t collateVal = 0; 3023 3024 UBool nonZeroValReached = FALSE; 3025 3026 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3]; // I just need a temporary place to store my generated CEs. 3027 /* 3028 We parse the source string until we hit a char that's NOT a digit. 3029 Use this u_charDigitValue. This might be slow because we have to 3030 handle surrogates... 
3031 */ 3032 /* 3033 if (U16_IS_LEAD(ch)){ 3034 if (!collIter_eos(source)) { 3035 backupState(source, &digitState); 3036 UChar trail = getNextNormalizedChar(source); 3037 if(U16_IS_TRAIL(trail)) { 3038 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 3039 } else { 3040 loadState(source, &digitState, TRUE); 3041 char32 = ch; 3042 } 3043 } else { 3044 char32 = ch; 3045 } 3046 } else { 3047 char32 = ch; 3048 } 3049 digVal = u_charDigitValue(char32); 3050 */ 3051 digVal = u_charDigitValue(cp); // if we have arrived here, we have 3052 // already processed possible supplementaries that trigered the digit tag - 3053 // all supplementaries are marked in the UCA. 3054 /* 3055 We pad a zero in front of the first element anyways. This takes 3056 care of the (probably) most common case where people are sorting things followed 3057 by a single digit 3058 */ 3059 digIndx++; 3060 for(;;){ 3061 // Make sure we have enough space. No longer needed; 3062 // at this point digIndx now has a max value of UCOL_MAX_DIGITS_FOR_NUMBER 3063 // (it has been pre-incremented) so we just ensure that numTempBuf is big enough 3064 // (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 3). 3065 3066 // Skipping over leading zeroes. 3067 if (digVal != 0) { 3068 nonZeroValReached = TRUE; 3069 } 3070 if (nonZeroValReached) { 3071 /* 3072 We parse the digit string into base 100 numbers (this fits into a byte). 3073 We only add to the buffer in twos, thus if we are parsing an odd character, 3074 that serves as the 'tens' digit while the if we are parsing an even one, that 3075 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3076 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3077 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3078 than all the other bytes. 
3079 */ 3080 3081 if (digIndx % 2 == 1){ 3082 collateVal += (uint8_t)digVal; 3083 3084 // We don't enter the low-order-digit case unless we've already seen 3085 // the high order, or for the first digit, which is always non-zero. 3086 if (collateVal != 0) 3087 trailingZeroIndex = 0; 3088 3089 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3090 collateVal = 0; 3091 } 3092 else{ 3093 // We drop the collation value into the buffer so if we need to do 3094 // a "front patch" we don't have to check to see if we're hitting the 3095 // last element. 3096 collateVal = (uint8_t)(digVal * 10); 3097 3098 // Check for trailing zeroes. 3099 if (collateVal == 0) 3100 { 3101 if (!trailingZeroIndex) 3102 trailingZeroIndex = (digIndx/2) + 2; 3103 } 3104 else 3105 trailingZeroIndex = 0; 3106 3107 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3108 } 3109 digIndx++; 3110 } 3111 3112 // Get next character. 3113 if (!collIter_eos(source)){ 3114 ch = getNextNormalizedChar(source); 3115 if (U16_IS_LEAD(ch)){ 3116 if (!collIter_eos(source)) { 3117 backupState(source, &digitState); 3118 UChar trail = getNextNormalizedChar(source); 3119 if(U16_IS_TRAIL(trail)) { 3120 char32 = U16_GET_SUPPLEMENTARY(ch, trail); 3121 } else { 3122 loadState(source, &digitState, TRUE); 3123 char32 = ch; 3124 } 3125 } 3126 } else { 3127 char32 = ch; 3128 } 3129 3130 if ((digVal = u_charDigitValue(char32)) == -1 || digIndx > UCOL_MAX_DIGITS_FOR_NUMBER){ 3131 // Resetting position to point to the next unprocessed char. We 3132 // overshot it when doing our test/set for numbers. 3133 if (char32 > 0xFFFF) { // For surrogates. 3134 loadState(source, &digitState, TRUE); 3135 //goBackOne(source); 3136 } 3137 goBackOne(source); 3138 break; 3139 } 3140 } else { 3141 break; 3142 } 3143 } 3144 3145 if (nonZeroValReached == FALSE){ 3146 digIndx = 2; 3147 numTempBuf[2] = 6; 3148 } 3149 3150 endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ; 3151 if (digIndx % 2 != 0){ 3152 /* 3153 We missed a value. 
Since digIndx isn't even, stuck too many values into the buffer (this is what 3154 we get for padding the first byte with a zero). "Front-patch" now by pushing all nybbles forward. 3155 Doing it this way ensures that at least 50% of the time (statistically speaking) we'll only be doing a 3156 single pass and optimizes for strings with single digits. I'm just assuming that's the more common case. 3157 */ 3158 3159 for(i = 2; i < endIndex; i++){ 3160 numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) + 3161 (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6; 3162 } 3163 --digIndx; 3164 } 3165 3166 // Subtract one off of the last byte. 3167 numTempBuf[endIndex-1] -= 1; 3168 3169 /* 3170 We want to skip over the first two slots in the buffer. The first slot 3171 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 3172 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 3173 */ 3174 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 3175 numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F)); 3176 3177 // Now transfer the collation key to our collIterate struct. 3178 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. 3179 //size = ((endIndex+1) & ~1)/2; 3180 CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 3181 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 3182 UCOL_BYTE_COMMON; // Tertiary weight. 3183 i = 2; // Reset the index into the buffer. 
3184 while(i < endIndex) 3185 { 3186 uint32_t primWeight = numTempBuf[i++] << 8; 3187 if ( i < endIndex) 3188 primWeight |= numTempBuf[i++]; 3189 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 3190 } 3191 3192 } else { 3193 // no numeric mode, we'll just switch to whatever we stashed and continue 3194 CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 3195 CE = *CEOffset++; 3196 break; 3197 } 3198 return CE; 3199 } 3200 /* various implicits optimization */ 3201 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 3202 /* UCA is filled with these. Tailorings are NOT_FOUND */ 3203 return getImplicit(cp, source); 3204 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 3205 // TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit 3206 return getImplicit(cp, source); 3207 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 3208 { 3209 static const uint32_t 3210 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 3211 //const uint32_t LCount = 19; 3212 static const uint32_t VCount = 21; 3213 static const uint32_t TCount = 28; 3214 //const uint32_t NCount = VCount * TCount; // 588 3215 //const uint32_t SCount = LCount * NCount; // 11172 3216 uint32_t L = ch - SBase; 3217 3218 // divide into pieces 3219 3220 uint32_t T = L % TCount; // we do it in this order since some compilers can do % and / in one operation 3221 L /= TCount; 3222 uint32_t V = L % VCount; 3223 L /= VCount; 3224 3225 // offset them 3226 3227 L += LBase; 3228 V += VBase; 3229 T += TBase; 3230 3231 // return the first CE, but first put the rest into the expansion buffer 3232 if (!source->coll->image->jamoSpecial) { // FAST PATH 3233 3234 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 3235 if (T != TBase) { 3236 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 3237 } 3238 3239 return UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 3240 3241 } 
else { // Jamo is Special 3242 // Since Hanguls pass the FCD check, it is 3243 // guaranteed that we won't be in 3244 // the normalization buffer if something like this happens 3245 // However, if we are using a uchar iterator and normalization 3246 // is ON, the Hangul that lead us here is going to be in that 3247 // normalization buffer. Here we want to restore the uchar 3248 // iterator state and pull out of the normalization buffer 3249 if(source->iterator != NULL && source->flags & UCOL_ITER_INNORMBUF) { 3250 source->flags = source->origFlags; // restore the iterator 3251 source->pos = NULL; 3252 } 3253 // Move Jamos into normalization buffer 3254 source->writableBuffer[0] = (UChar)L; 3255 source->writableBuffer[1] = (UChar)V; 3256 if (T != TBase) { 3257 source->writableBuffer[2] = (UChar)T; 3258 source->writableBuffer[3] = 0; 3259 } else { 3260 source->writableBuffer[2] = 0; 3261 } 3262 3263 source->fcdPosition = source->pos; // Indicate where to continue in main input string 3264 // after exhausting the writableBuffer 3265 source->pos = source->writableBuffer; 3266 source->origFlags = source->flags; 3267 source->flags |= UCOL_ITER_INNORMBUF; 3268 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 3269 3270 return(UCOL_IGNORABLE); 3271 } 3272 } 3273 case SURROGATE_TAG: 3274 /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ 3275 /* two things can happen here: next code point can be a trailing surrogate - we will use it */ 3276 /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ 3277 /* we return 0 (completely ignorable - per UCA specification */ 3278 { 3279 UChar trail; 3280 collIterateState state; 3281 backupState(source, &state); 3282 if (collIter_eos(source) || !(U16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { 3283 // we chould have stepped one char forward and it might have turned that it 3284 // was not a trail surrogate. 
In that case, we have to backup. 3285 loadState(source, &state, TRUE); 3286 return 0; 3287 } else { 3288 /* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */ 3289 CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail); 3290 if(CE == UCOL_NOT_FOUND) { // there are tailored surrogates in this block, but not this one. 3291 // We need to backup 3292 loadState(source, &state, TRUE); 3293 return CE; 3294 } 3295 // calculate the supplementary code point value, if surrogate was not tailored 3296 cp = ((((uint32_t)ch)<<10UL)+(trail)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 3297 } 3298 } 3299 break; 3300 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 3301 UChar nextChar; 3302 if( source->flags & UCOL_USE_ITERATOR) { 3303 if(U_IS_TRAIL(nextChar = (UChar)source->iterator->current(source->iterator))) { 3304 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3305 source->iterator->next(source->iterator); 3306 return getImplicit(cp, source); 3307 } else { 3308 return 0; 3309 } 3310 } else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) && 3311 U_IS_TRAIL((nextChar=*source->pos))) { 3312 cp = U16_GET_SUPPLEMENTARY(ch, nextChar); 3313 source->pos++; 3314 return getImplicit(cp, source); 3315 } else { 3316 return 0; /* completely ignorable */ 3317 } 3318 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 3319 return 0; /* broken surrogate sequence */ 3320 case CHARSET_TAG: 3321 /* not yet implemented */ 3322 /* probably after 1.8 */ 3323 return UCOL_NOT_FOUND; 3324 default: 3325 *status = U_INTERNAL_PROGRAM_ERROR; 3326 CE=0; 3327 break; 3328 } 3329 if (CE <= UCOL_NOT_FOUND) break; 3330 } 3331 return CE; 3332} 3333 3334 3335/* now uses Mark's getImplicitPrimary code */ 3336static 3337inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) { 3338 if(isNonChar(cp)) { 3339 return 0; 3340 } 3341 3342 uint32_t r = uprv_uca_getImplicitPrimary(cp); 3343 3344 *(collationSource->CEpos++) = (r & 
UCOL_PRIMARYMASK) | 0x00000505; 3345 collationSource->toReturn = collationSource->CEpos; 3346 3347 if (collationSource->offsetBuffer == NULL) { 3348 collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 3349 collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 3350 collationSource->offsetStore = collationSource->offsetBuffer; 3351 } 3352 3353 // **** doesn't work if using iterator **** 3354 if (collationSource->flags & UCOL_ITER_INNORMBUF) { 3355 collationSource->offsetRepeatCount = 1; 3356 } else { 3357 int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string); 3358 3359 *(collationSource->offsetStore++) = firstOffset; 3360 *(collationSource->offsetStore++) = firstOffset + 1; 3361 3362 collationSource->offsetReturn = collationSource->offsetStore - 1; 3363 *(collationSource->offsetBuffer) = firstOffset; 3364 if (collationSource->offsetReturn == collationSource->offsetBuffer) { 3365 collationSource->offsetStore = collationSource->offsetBuffer; 3366 } 3367 } 3368 3369 return ((r & 0x0000FFFF)<<16) | 0x000000C0; 3370} 3371 3372/** 3373 * This function handles the special CEs like contractions, expansions, 3374 * surrogates, Thai. 
3375 * It is called by both getPrevCE 3376 */ 3377uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE, 3378 collIterate *source, 3379 UErrorCode *status) 3380{ 3381 const uint32_t *CEOffset = NULL; 3382 UChar *UCharOffset = NULL; 3383 UChar schar; 3384 const UChar *constart = NULL; 3385 uint32_t size; 3386 UChar buffer[UCOL_MAX_BUFFER]; 3387 uint32_t *endCEBuffer; 3388 UChar *strbuffer; 3389 int32_t noChars = 0; 3390 int32_t CECount = 0; 3391 3392 for(;;) 3393 { 3394 /* the only ces that loops are thai and contractions */ 3395 switch (getCETag(CE)) 3396 { 3397 case NOT_FOUND_TAG: /* this tag always returns */ 3398 return CE; 3399 3400 case SPEC_PROC_TAG: 3401 { 3402 // Special processing is getting a CE that is preceded by a certain prefix 3403 // Currently this is only needed for optimizing Japanese length and iteration marks. 3404 // When we encouter a special processing tag, we go backwards and try to see if 3405 // we have a match. 3406 // Contraction tables are used - so the whole process is not unlike contraction. 3407 // prefix data is stored backwards in the table. 
3408 const UChar *UCharOffset; 3409 UChar schar, tchar; 3410 collIterateState prefixState; 3411 backupState(source, &prefixState); 3412 for(;;) { 3413 // This loop will run once per source string character, for as long as we 3414 // are matching a potential contraction sequence 3415 3416 // First we position ourselves at the begining of contraction sequence 3417 const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE); 3418 3419 if (collIter_bos(source)) { 3420 CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 3421 break; 3422 } 3423 schar = getPrevNormalizedChar(source, status); 3424 goBackOne(source); 3425 3426 while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 3427 UCharOffset++; 3428 } 3429 3430 if (schar == tchar) { 3431 // Found the source string char in the table. 3432 // Pick up the corresponding CE from the table. 3433 CE = *(coll->contractionCEs + 3434 (UCharOffset - coll->contractionIndex)); 3435 } 3436 else 3437 { 3438 // if there is a completely ignorable code point in the middle of 3439 // a prefix, we need to act as if it's not there 3440 // assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero) 3441 // lone surrogates cannot be set to zero as it would break other processing 3442 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 3443 // it's easy for BMP code points 3444 if(isZeroCE == 0) { 3445 continue; 3446 } else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) { 3447 // for supplementary code points, we have to check the next one 3448 // situations where we are going to ignore 3449 // 1. beginning of the string: schar is a lone surrogate 3450 // 2. schar is a lone surrogate 3451 // 3. schar is a trail surrogate in a valid surrogate sequence 3452 // that is explicitly set to zero. 
3453 if (!collIter_bos(source)) { 3454 UChar lead; 3455 if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) { 3456 isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead); 3457 if(getCETag(isZeroCE) == SURROGATE_TAG) { 3458 uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar); 3459 if(finalCE == 0) { 3460 // this is a real, assigned completely ignorable code point 3461 goBackOne(source); 3462 continue; 3463 } 3464 } 3465 } else { 3466 // lone surrogate, completely ignorable 3467 continue; 3468 } 3469 } else { 3470 // lone surrogate at the beggining, completely ignorable 3471 continue; 3472 } 3473 } 3474 // Source string char was not in the table. 3475 // We have not found the prefix. 3476 CE = *(coll->contractionCEs + 3477 (ContractionStart - coll->contractionIndex)); 3478 } 3479 3480 if(!isPrefix(CE)) { 3481 // The source string char was in the contraction table, and the corresponding 3482 // CE is not a prefix CE. We found the prefix, break 3483 // out of loop, this CE will end up being returned. This is the normal 3484 // way out of prefix handling when the source actually contained 3485 // the prefix. 3486 break; 3487 } 3488 } 3489 loadState(source, &prefixState, TRUE); 3490 break; 3491 } 3492 3493 case CONTRACTION_TAG: 3494 /* to ensure that the backwards and forwards iteration matches, we 3495 take the current region of most possible match and pass it through 3496 the forward iteration. this will ensure that the obstinate problem of 3497 overlapping contractions will not occur. 
3498 */ 3499 schar = peekCharacter(source, 0); 3500 constart = (UChar *)coll->image + getContractOffset(CE); 3501 if (isAtStartPrevIterate(source) 3502 /* commented away contraction end checks after adding the checks 3503 in getPrevCE */) { 3504 /* start of string or this is not the end of any contraction */ 3505 CE = *(coll->contractionCEs + 3506 (constart - coll->contractionIndex)); 3507 break; 3508 } 3509 strbuffer = buffer; 3510 UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1); 3511 *(UCharOffset --) = 0; 3512 noChars = 0; 3513 // have to swap thai characters 3514 while (ucol_unsafeCP(schar, coll)) { 3515 *(UCharOffset) = schar; 3516 noChars++; 3517 UCharOffset --; 3518 schar = getPrevNormalizedChar(source, status); 3519 goBackOne(source); 3520 // TODO: when we exhaust the contraction buffer, 3521 // it needs to get reallocated. The problem is 3522 // that the size depends on the string which is 3523 // not iterated over. However, since we're travelling 3524 // backwards, we already had to set the iterator at 3525 // the end - so we might as well know where we are? 
3526 if (UCharOffset + 1 == buffer) { 3527 /* we have exhausted the buffer */ 3528 int32_t newsize = 0; 3529 if(source->pos) { // actually dealing with a position 3530 newsize = source->pos - source->string + 1; 3531 } else { // iterator 3532 newsize = 4 * UCOL_MAX_BUFFER; 3533 } 3534 strbuffer = (UChar *)uprv_malloc(sizeof(UChar) * 3535 (newsize + UCOL_MAX_BUFFER)); 3536 /* test for NULL */ 3537 if (strbuffer == NULL) { 3538 *status = U_MEMORY_ALLOCATION_ERROR; 3539 return UCOL_NO_MORE_CES; 3540 } 3541 UCharOffset = strbuffer + newsize; 3542 uprv_memcpy(UCharOffset, buffer, 3543 UCOL_MAX_BUFFER * sizeof(UChar)); 3544 UCharOffset --; 3545 } 3546 if ((source->pos && (source->pos == source->string || 3547 ((source->flags & UCOL_ITER_INNORMBUF) && 3548 *(source->pos - 1) == 0 && source->fcdPosition == NULL))) 3549 || (source->iterator && !source->iterator->hasPrevious(source->iterator))) { 3550 break; 3551 } 3552 } 3553 /* adds the initial base character to the string */ 3554 *(UCharOffset) = schar; 3555 noChars++; 3556 3557 int32_t offsetBias; 3558 3559 // **** doesn't work if using iterator **** 3560 if (source->flags & UCOL_ITER_INNORMBUF) { 3561 offsetBias = -1; 3562 } else { 3563 offsetBias = (int32_t)(source->pos - source->string); 3564 } 3565 3566 /* a new collIterate is used to simplify things, since using the current 3567 collIterate will mean that the forward and backwards iteration will 3568 share and change the same buffers. we don't want to get into that. */ 3569 collIterate temp; 3570 int32_t rawOffset; 3571 3572 IInit_collIterate(coll, UCharOffset, noChars, &temp); 3573 temp.flags &= ~UCOL_ITER_NORM; 3574 temp.flags |= source->flags & UCOL_FORCE_HAN_IMPLICIT; 3575 3576 rawOffset = temp.pos - temp.string; // should always be zero? 
3577 CE = ucol_IGetNextCE(coll, &temp, status); 3578 3579 if (source->extendCEs) { 3580 endCEBuffer = source->extendCEs + source->extendCEsSize; 3581 CECount = (source->CEpos - source->extendCEs)/sizeof(uint32_t); 3582 } else { 3583 endCEBuffer = source->CEs + UCOL_EXPAND_CE_BUFFER_SIZE; 3584 CECount = (source->CEpos - source->CEs)/sizeof(uint32_t); 3585 } 3586 3587 if (source->offsetBuffer == NULL) { 3588 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 3589 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 3590 source->offsetStore = source->offsetBuffer; 3591 } 3592 3593 while (CE != UCOL_NO_MORE_CES) { 3594 *(source->CEpos ++) = CE; 3595 3596 if (offsetBias >= 0) { 3597 *(source->offsetStore ++) = rawOffset + offsetBias; 3598 } 3599 3600 CECount++; 3601 if (source->CEpos == endCEBuffer) { 3602 /* ran out of CE space, reallocate to new buffer. 3603 If reallocation fails, reset pointers and bail out, 3604 there's no guarantee of the right character position after 3605 this bail*/ 3606 if (source->extendCEs == NULL) { 3607 source->extendCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t) * 3608 (source->extendCEsSize =UCOL_EXPAND_CE_BUFFER_SIZE + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE)); 3609 if (source->extendCEs == NULL) { 3610 // Handle error later. 3611 CECount = -1; 3612 } else { 3613 source->extendCEs = (uint32_t *)uprv_memcpy(source->extendCEs, source->CEs, UCOL_EXPAND_CE_BUFFER_SIZE * sizeof(uint32_t)); 3614 } 3615 } else { 3616 uint32_t *tempBufCE = (uint32_t *)uprv_realloc(source->extendCEs, 3617 sizeof(uint32_t) * (source->extendCEsSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE)); 3618 if (tempBufCE == NULL) { 3619 // Handle error later. 
3620 CECount = -1; 3621 } 3622 else { 3623 source->extendCEs = tempBufCE; 3624 } 3625 } 3626 3627 if (CECount == -1) { 3628 *status = U_MEMORY_ALLOCATION_ERROR; 3629 source->extendCEsSize = 0; 3630 source->CEpos = source->CEs; 3631 freeHeapWritableBuffer(&temp); 3632 3633 if (strbuffer != buffer) { 3634 uprv_free(strbuffer); 3635 } 3636 3637 return (uint32_t)UCOL_NULLORDER; 3638 } 3639 3640 source->CEpos = source->extendCEs + CECount; 3641 endCEBuffer = source->extendCEs + source->extendCEsSize; 3642 } 3643 3644 if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) { 3645 int32_t storeIX = source->offsetStore - source->offsetBuffer; 3646 int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer, 3647 sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE)); 3648 3649 if (tob != NULL) { 3650 source->offsetBuffer = tob; 3651 source->offsetStore = &source->offsetBuffer[storeIX]; 3652 source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE; 3653 } else { 3654 // memory error... 3655 *status = U_MEMORY_ALLOCATION_ERROR; 3656 source->CEpos = source->CEs; 3657 freeHeapWritableBuffer(&temp); 3658 3659 if (strbuffer != buffer) { 3660 uprv_free(strbuffer); 3661 } 3662 3663 return (uint32_t) UCOL_NULLORDER; 3664 } 3665 } 3666 3667 if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) { 3668 rawOffset = temp.fcdPosition - temp.string; 3669 } else { 3670 rawOffset = temp.pos - temp.string; 3671 } 3672 3673 CE = ucol_IGetNextCE(coll, &temp, status); 3674 } 3675 3676 if (source->offsetRepeatValue != 0) { 3677 if (CECount > noChars) { 3678 source->offsetRepeatCount += temp.offsetRepeatCount; 3679 } else { 3680 // **** does this really skip the right offsets? 
**** 3681 source->offsetReturn -= (noChars - CECount); 3682 } 3683 } 3684 3685 freeHeapWritableBuffer(&temp); 3686 3687 if (strbuffer != buffer) { 3688 uprv_free(strbuffer); 3689 } 3690 3691 if (offsetBias >= 0) { 3692 source->offsetReturn = source->offsetStore - 1; 3693 if (source->offsetReturn == source->offsetBuffer) { 3694 source->offsetStore = source->offsetBuffer; 3695 } 3696 } 3697 3698 source->toReturn = source->CEpos - 1; 3699 if (source->toReturn == source->CEs) { 3700 source->CEpos = source->CEs; 3701 } 3702 3703 return *(source->toReturn); 3704 3705 case LONG_PRIMARY_TAG: 3706 { 3707 *(source->CEpos++) = ((CE & 0xFFFF00) << 8) | (UCOL_BYTE_COMMON << 8) | UCOL_BYTE_COMMON; 3708 *(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER; 3709 source->toReturn = source->CEpos - 1; 3710 3711 if (source->offsetBuffer == NULL) { 3712 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 3713 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 3714 source->offsetStore = source->offsetBuffer; 3715 } 3716 3717 if (source->flags & UCOL_ITER_INNORMBUF) { 3718 source->offsetRepeatCount = 1; 3719 } else { 3720 int32_t firstOffset = (int32_t)(source->pos - source->string); 3721 3722 *(source->offsetStore++) = firstOffset; 3723 *(source->offsetStore++) = firstOffset + 1; 3724 3725 source->offsetReturn = source->offsetStore - 1; 3726 *(source->offsetBuffer) = firstOffset; 3727 if (source->offsetReturn == source->offsetBuffer) { 3728 source->offsetStore = source->offsetBuffer; 3729 } 3730 } 3731 3732 3733 return *(source->toReturn); 3734 } 3735 3736 case EXPANSION_TAG: /* this tag always returns */ 3737 { 3738 /* 3739 This should handle expansion. 3740 NOTE: we can encounter both continuations and expansions in an expansion! 
3741 I have to decide where continuations are going to be dealt with 3742 */ 3743 int32_t firstOffset = (int32_t)(source->pos - source->string); 3744 3745 // **** doesn't work if using iterator **** 3746 if (source->offsetReturn != NULL) { 3747 if (! (source->flags & UCOL_ITER_INNORMBUF) && source->offsetReturn == source->offsetBuffer) { 3748 source->offsetStore = source->offsetBuffer; 3749 }else { 3750 firstOffset = -1; 3751 } 3752 } 3753 3754 if (source->offsetBuffer == NULL) { 3755 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 3756 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 3757 source->offsetStore = source->offsetBuffer; 3758 } 3759 3760 /* find the offset to expansion table */ 3761 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 3762 size = getExpansionCount(CE); 3763 if (size != 0) { 3764 /* 3765 if there are less than 16 elements in expansion, we don't terminate 3766 */ 3767 uint32_t count; 3768 3769 for (count = 0; count < size; count++) { 3770 *(source->CEpos ++) = *CEOffset++; 3771 3772 if (firstOffset >= 0) { 3773 *(source->offsetStore ++) = firstOffset + 1; 3774 } 3775 } 3776 } else { 3777 /* else, we do */ 3778 while (*CEOffset != 0) { 3779 *(source->CEpos ++) = *CEOffset ++; 3780 3781 if (firstOffset >= 0) { 3782 *(source->offsetStore ++) = firstOffset + 1; 3783 } 3784 } 3785 } 3786 3787 if (firstOffset >= 0) { 3788 source->offsetReturn = source->offsetStore - 1; 3789 *(source->offsetBuffer) = firstOffset; 3790 if (source->offsetReturn == source->offsetBuffer) { 3791 source->offsetStore = source->offsetBuffer; 3792 } 3793 } else { 3794 source->offsetRepeatCount += size - 1; 3795 } 3796 3797 source->toReturn = source->CEpos - 1; 3798 // in case of one element expansion, we 3799 // want to immediately return CEpos 3800 if(source->toReturn == source->CEs) { 3801 source->CEpos = source->CEs; 3802 } 3803 3804 return *(source->toReturn); 3805 } 3806 3807 case DIGIT_TAG: 3808 { 3809 /* 
3810 We do a check to see if we want to collate digits as numbers; if so we generate 3811 a custom collation key. Otherwise we pull out the value stored in the expansion table. 3812 */ 3813 //uint32_t size; 3814 uint32_t i; /* general counter */ 3815 3816 if (source->coll->numericCollation == UCOL_ON){ 3817 uint32_t digIndx = 0; 3818 uint32_t endIndex = 0; 3819 uint32_t leadingZeroIndex = 0; 3820 uint32_t trailingZeroCount = 0; 3821 3822 uint8_t collateVal = 0; 3823 3824 UBool nonZeroValReached = FALSE; 3825 3826 uint8_t numTempBuf[UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2]; // I just need a temporary place to store my generated CEs. 3827 /* 3828 We parse the source string until we hit a char that's NOT a digit. 3829 Use this u_charDigitValue. This might be slow because we have to 3830 handle surrogates... 3831 */ 3832 /* 3833 We need to break up the digit string into collection elements of UCOL_MAX_DIGITS_FOR_NUMBER or less, 3834 with any chunks smaller than that being on the right end of the digit string - i.e. the first collation 3835 element we process when going backward. To determine how long that chunk might be, we may need to make 3836 two passes through the loop that collects digits - one to see how long the string is (and how much is 3837 leading zeros) to determine the length of that right-hand chunk, and a second (if the whole string has 3838 more than UCOL_MAX_DIGITS_FOR_NUMBER non-leading-zero digits) to actually process that collation 3839 element chunk after resetting the state to the initialState at the right side of the digit string. 
3840 */ 3841 uint32_t ceLimit = 0; 3842 UChar initial_ch = ch; 3843 collIterateState initialState = {0,0,0,0,0,0,0,0,0}; 3844 backupState(source, &initialState); 3845 3846 for(;;) { 3847 collIterateState state = {0,0,0,0,0,0,0,0,0}; 3848 UChar32 char32 = 0; 3849 int32_t digVal = 0; 3850 3851 if (U16_IS_TRAIL (ch)) { 3852 if (!collIter_bos(source)){ 3853 UChar lead = getPrevNormalizedChar(source, status); 3854 if(U16_IS_LEAD(lead)) { 3855 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3856 goBackOne(source); 3857 } else { 3858 char32 = ch; 3859 } 3860 } else { 3861 char32 = ch; 3862 } 3863 } else { 3864 char32 = ch; 3865 } 3866 digVal = u_charDigitValue(char32); 3867 3868 for(;;) { 3869 // Make sure we have enough space. No longer needed; 3870 // at this point the largest value of digIndx when we need to save data in numTempBuf 3871 // is UCOL_MAX_DIGITS_FOR_NUMBER-1 (digIndx is post-incremented) so we just ensure 3872 // that numTempBuf is big enough (UCOL_MAX_DIGITS_FOR_NUMBER/2 + 2). 3873 3874 // Skip over trailing zeroes, and keep a count of them. 3875 if (digVal != 0) 3876 nonZeroValReached = TRUE; 3877 3878 if (nonZeroValReached) { 3879 /* 3880 We parse the digit string into base 100 numbers (this fits into a byte). 3881 We only add to the buffer in twos, thus if we are parsing an odd character, 3882 that serves as the 'tens' digit while the if we are parsing an even one, that 3883 is the 'ones' digit. We dumped the parsed base 100 value (collateVal) into 3884 a buffer. We multiply each collateVal by 2 (to give us room) and add 5 (to avoid 3885 overlapping magic CE byte values). The last byte we subtract 1 to ensure it is less 3886 than all the other bytes. 3887 3888 Since we're doing in this reverse we want to put the first digit encountered into the 3889 ones place and the second digit encountered into the tens place. 
3890 */ 3891 3892 if ((digIndx + trailingZeroCount) % 2 == 1) { 3893 // High-order digit case (tens place) 3894 collateVal += (uint8_t)(digVal * 10); 3895 3896 // We cannot set leadingZeroIndex unless it has been set for the 3897 // low-order digit. Therefore, all we can do for the high-order 3898 // digit is turn it off, never on. 3899 // The only time we will have a high digit without a low is for 3900 // the very first non-zero digit, so no zero check is necessary. 3901 if (collateVal != 0) 3902 leadingZeroIndex = 0; 3903 3904 // The first pass through, digIndx may exceed the limit, but in that case 3905 // we no longer care about numTempBuf contents since they will be discarded 3906 if ( digIndx < UCOL_MAX_DIGITS_FOR_NUMBER ) { 3907 numTempBuf[(digIndx/2) + 2] = collateVal*2 + 6; 3908 } 3909 collateVal = 0; 3910 } else { 3911 // Low-order digit case (ones place) 3912 collateVal = (uint8_t)digVal; 3913 3914 // Check for leading zeroes. 3915 if (collateVal == 0) { 3916 if (!leadingZeroIndex) 3917 leadingZeroIndex = (digIndx/2) + 2; 3918 } else 3919 leadingZeroIndex = 0; 3920 3921 // No need to write to buffer; the case of a last odd digit 3922 // is handled below. 3923 } 3924 ++digIndx; 3925 } else 3926 ++trailingZeroCount; 3927 3928 if (!collIter_bos(source)) { 3929 ch = getPrevNormalizedChar(source, status); 3930 //goBackOne(source); 3931 if (U16_IS_TRAIL(ch)) { 3932 backupState(source, &state); 3933 if (!collIter_bos(source)) { 3934 goBackOne(source); 3935 UChar lead = getPrevNormalizedChar(source, status); 3936 3937 if(U16_IS_LEAD(lead)) { 3938 char32 = U16_GET_SUPPLEMENTARY(lead,ch); 3939 } else { 3940 loadState(source, &state, FALSE); 3941 char32 = ch; 3942 } 3943 } 3944 } else 3945 char32 = ch; 3946 3947 if ((digVal = u_charDigitValue(char32)) == -1 || (ceLimit > 0 && (digIndx + trailingZeroCount) >= ceLimit)) { 3948 if (char32 > 0xFFFF) {// For surrogates. 
3949 loadState(source, &state, FALSE); 3950 } 3951 // Don't need to "reverse" the goBackOne call, 3952 // as this points to the next position to process.. 3953 //if (char32 > 0xFFFF) // For surrogates. 3954 //getNextNormalizedChar(source); 3955 break; 3956 } 3957 3958 goBackOne(source); 3959 }else 3960 break; 3961 } 3962 3963 if (digIndx + trailingZeroCount <= UCOL_MAX_DIGITS_FOR_NUMBER) { 3964 // our collation element is not too big, go ahead and finish with it 3965 break; 3966 } 3967 // our digit string is too long for a collation element; 3968 // set the limit for it, reset the state and begin again 3969 ceLimit = (digIndx + trailingZeroCount) % UCOL_MAX_DIGITS_FOR_NUMBER; 3970 if ( ceLimit == 0 ) { 3971 ceLimit = UCOL_MAX_DIGITS_FOR_NUMBER; 3972 } 3973 ch = initial_ch; 3974 loadState(source, &initialState, FALSE); 3975 digIndx = endIndex = leadingZeroIndex = trailingZeroCount = 0; 3976 collateVal = 0; 3977 nonZeroValReached = FALSE; 3978 } 3979 3980 if (! nonZeroValReached) { 3981 digIndx = 2; 3982 trailingZeroCount = 0; 3983 numTempBuf[2] = 6; 3984 } 3985 3986 if ((digIndx + trailingZeroCount) % 2 != 0) { 3987 numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6; 3988 digIndx += 1; // The implicit leading zero 3989 } 3990 if (trailingZeroCount % 2 != 0) { 3991 // We had to consume one trailing zero for the low digit 3992 // of the least significant byte 3993 digIndx += 1; // The trailing zero not in the exponent 3994 trailingZeroCount -= 1; 3995 } 3996 3997 endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ; 3998 3999 // Subtract one off of the last byte. Really the first byte here, but it's reversed... 4000 numTempBuf[2] -= 1; 4001 4002 /* 4003 We want to skip over the first two slots in the buffer. The first slot 4004 is reserved for the header byte UCOL_CODAN_PLACEHOLDER. The second slot is for the 4005 sign/exponent byte: 0x80 + (decimalPos/2) & 7f. 
4006 The exponent must be adjusted by the number of leading zeroes, and the number of 4007 trailing zeroes. 4008 */ 4009 numTempBuf[0] = UCOL_CODAN_PLACEHOLDER; 4010 uint32_t exponent = (digIndx+trailingZeroCount)/2; 4011 if (leadingZeroIndex) 4012 exponent -= ((digIndx/2) + 2 - leadingZeroIndex); 4013 numTempBuf[1] = (uint8_t)(0x80 + (exponent & 0x7F)); 4014 4015 // Now transfer the collation key to our collIterate struct. 4016 // The total size for our collation key is endIndx bumped up to the next largest even value divided by two. 4017 //size = ((endIndex+1) & ~1)/2; 4018 *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight 4019 (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight 4020 UCOL_BYTE_COMMON; // Tertiary weight. 4021 i = endIndex - 1; // Reset the index into the buffer. 4022 while(i >= 2) { 4023 uint32_t primWeight = numTempBuf[i--] << 8; 4024 if ( i >= 2) 4025 primWeight |= numTempBuf[i--]; 4026 *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER; 4027 } 4028 4029 source->toReturn = source->CEpos -1; 4030 return *(source->toReturn); 4031 } else { 4032 CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 4033 CE = *(CEOffset++); 4034 break; 4035 } 4036 } 4037 4038 case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/ 4039 { 4040 static const uint32_t 4041 SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; 4042 //const uint32_t LCount = 19; 4043 static const uint32_t VCount = 21; 4044 static const uint32_t TCount = 28; 4045 //const uint32_t NCount = VCount * TCount; /* 588 */ 4046 //const uint32_t SCount = LCount * NCount; /* 11172 */ 4047 4048 uint32_t L = ch - SBase; 4049 /* 4050 divide into pieces. 
4051 we do it in this order since some compilers can do % and / in one 4052 operation 4053 */ 4054 uint32_t T = L % TCount; 4055 L /= TCount; 4056 uint32_t V = L % VCount; 4057 L /= VCount; 4058 4059 /* offset them */ 4060 L += LBase; 4061 V += VBase; 4062 T += TBase; 4063 4064 if (source->offsetBuffer == NULL) { 4065 source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE; 4066 source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE); 4067 source->offsetStore = source->offsetBuffer; 4068 } 4069 4070 int32_t firstOffset = (int32_t)(source->pos - source->string); 4071 4072 *(source->offsetStore++) = firstOffset; 4073 4074 /* 4075 * return the first CE, but first put the rest into the expansion buffer 4076 */ 4077 if (!source->coll->image->jamoSpecial) { 4078 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L); 4079 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V); 4080 *(source->offsetStore++) = firstOffset + 1; 4081 4082 if (T != TBase) { 4083 *(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T); 4084 *(source->offsetStore++) = firstOffset + 1; 4085 } 4086 4087 source->toReturn = source->CEpos - 1; 4088 4089 source->offsetReturn = source->offsetStore - 1; 4090 if (source->offsetReturn == source->offsetBuffer) { 4091 source->offsetStore = source->offsetBuffer; 4092 } 4093 4094 return *(source->toReturn); 4095 } else { 4096 // Since Hanguls pass the FCD check, it is 4097 // guaranteed that we won't be in 4098 // the normalization buffer if something like this happens 4099 // Move Jamos into normalization buffer 4100 /* 4101 Move the Jamos into the 4102 normalization buffer 4103 */ 4104 UChar *tempbuffer = source->writableBuffer + 4105 (source->writableBufSize - 1); 4106 *(tempbuffer) = 0; 4107 if (T != TBase) { 4108 *(tempbuffer - 1) = (UChar)T; 4109 *(tempbuffer - 2) = (UChar)V; 4110 *(tempbuffer - 3) = (UChar)L; 4111 *(tempbuffer - 4) = 0; 4112 } else { 4113 *(tempbuffer - 1) = (UChar)V; 4114 
*(tempbuffer - 2) = (UChar)L; 4115 *(tempbuffer - 3) = 0; 4116 } 4117 4118 /* 4119 Indicate where to continue in main input string after exhausting 4120 the writableBuffer 4121 */ 4122 if (source->pos == source->string) { 4123 source->fcdPosition = NULL; 4124 } else { 4125 source->fcdPosition = source->pos-1; 4126 } 4127 4128 source->pos = tempbuffer; 4129 source->origFlags = source->flags; 4130 source->flags |= UCOL_ITER_INNORMBUF; 4131 source->flags &= ~(UCOL_ITER_NORM | UCOL_ITER_HASLEN); 4132 4133 return(UCOL_IGNORABLE); 4134 } 4135 } 4136 4137 case IMPLICIT_TAG: /* everything that is not defined otherwise */ 4138 return getPrevImplicit(ch, source); 4139 4140 // TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function 4141 case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/ 4142 return getPrevImplicit(ch, source); 4143 4144 case SURROGATE_TAG: /* This is a surrogate pair */ 4145 /* essentialy an engaged lead surrogate. */ 4146 /* if you have encountered it here, it means that a */ 4147 /* broken sequence was encountered and this is an error */ 4148 return 0; 4149 4150 case LEAD_SURROGATE_TAG: /* D800-DBFF*/ 4151 return 0; /* broken surrogate sequence */ 4152 4153 case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/ 4154 { 4155 UChar32 cp = 0; 4156 UChar prevChar; 4157 UChar *prev; 4158 if (isAtStartPrevIterate(source)) { 4159 /* we are at the start of the string, wrong place to be at */ 4160 return 0; 4161 } 4162 if (source->pos != source->writableBuffer) { 4163 prev = source->pos - 1; 4164 } else { 4165 prev = source->fcdPosition; 4166 } 4167 prevChar = *prev; 4168 4169 /* Handles Han and Supplementary characters here.*/ 4170 if (U16_IS_LEAD(prevChar)) { 4171 cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000)); 4172 source->pos = prev; 4173 } else { 4174 return 0; /* completely ignorable */ 4175 } 4176 4177 return getPrevImplicit(cp, source); 4178 } 4179 4180 /* UCA is filled with these. 
Tailorings are NOT_FOUND */ 4181 /* not yet implemented */ 4182 case CHARSET_TAG: /* this tag always returns */ 4183 /* probably after 1.8 */ 4184 return UCOL_NOT_FOUND; 4185 4186 default: /* this tag always returns */ 4187 *status = U_INTERNAL_PROGRAM_ERROR; 4188 CE=0; 4189 break; 4190 } 4191 4192 if (CE <= UCOL_NOT_FOUND) { 4193 break; 4194 } 4195 } 4196 4197 return CE; 4198} 4199 4200/* This should really be a macro */ 4201/* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */ 4202/* anyway */ 4203static 4204uint8_t *reallocateBuffer(uint8_t **secondaries, uint8_t *secStart, uint8_t *second, uint32_t *secSize, uint32_t newSize, UErrorCode *status) { 4205#ifdef UCOL_DEBUG 4206 fprintf(stderr, "."); 4207#endif 4208 uint8_t *newStart = NULL; 4209 uint32_t offset = *secondaries-secStart; 4210 4211 if(secStart==second) { 4212 newStart=(uint8_t*)uprv_malloc(newSize); 4213 if(newStart==NULL) { 4214 *status = U_MEMORY_ALLOCATION_ERROR; 4215 return NULL; 4216 } 4217 uprv_memcpy(newStart, secStart, *secondaries-secStart); 4218 } else { 4219 newStart=(uint8_t*)uprv_realloc(secStart, newSize); 4220 if(newStart==NULL) { 4221 *status = U_MEMORY_ALLOCATION_ERROR; 4222 /* Since we're reallocating, return original reference so we don't loose it. */ 4223 return secStart; 4224 } 4225 } 4226 *secondaries=newStart+offset; 4227 *secSize=newSize; 4228 return newStart; 4229} 4230 4231 4232/* This should really be a macro */ 4233/* This function is used to reverse parts of a buffer. 
We need this operation when doing continuation */ 4234/* secondaries in French */ 4235/* 4236void uprv_ucol_reverse_buffer(uint8_t *start, uint8_t *end) { 4237 uint8_t temp; 4238 while(start<end) { 4239 temp = *start; 4240 *start++ = *end; 4241 *end-- = temp; 4242 } 4243} 4244*/ 4245 4246#define uprv_ucol_reverse_buffer(TYPE, start, end) { \ 4247 TYPE tempA; \ 4248while((start)<(end)) { \ 4249 tempA = *(start); \ 4250 *(start)++ = *(end); \ 4251 *(end)-- = tempA; \ 4252} \ 4253} 4254 4255/****************************************************************************/ 4256/* Following are the sortkey generation functions */ 4257/* */ 4258/****************************************************************************/ 4259 4260/** 4261 * Merge two sort keys. 4262 * This is useful, for example, to combine sort keys from first and last names 4263 * to sort such pairs. 4264 * Merged sort keys consider on each collation level the first part first entirely, 4265 * then the second one. 4266 * It is possible to merge multiple sort keys by consecutively merging 4267 * another one with the intermediate result. 4268 * 4269 * The length of the merge result is the sum of the lengths of the input sort keys 4270 * minus 1. 
4271 * 4272 * @param src1 the first sort key 4273 * @param src1Length the length of the first sort key, including the zero byte at the end; 4274 * can be -1 if the function is to find the length 4275 * @param src2 the second sort key 4276 * @param src2Length the length of the second sort key, including the zero byte at the end; 4277 * can be -1 if the function is to find the length 4278 * @param dest the buffer where the merged sort key is written, 4279 * can be NULL if destCapacity==0 4280 * @param destCapacity the number of bytes in the dest buffer 4281 * @return the length of the merged sort key, src1Length+src2Length-1; 4282 * can be larger than destCapacity, or 0 if an error occurs (only for illegal arguments), 4283 * in which cases the contents of dest is undefined 4284 * 4285 * @draft 4286 */ 4287U_CAPI int32_t U_EXPORT2 4288ucol_mergeSortkeys(const uint8_t *src1, int32_t src1Length, 4289 const uint8_t *src2, int32_t src2Length, 4290 uint8_t *dest, int32_t destCapacity) { 4291 int32_t destLength; 4292 uint8_t b; 4293 4294 /* check arguments */ 4295 if( src1==NULL || src1Length<-2 || src1Length==0 || (src1Length>0 && src1[src1Length-1]!=0) || 4296 src2==NULL || src2Length<-2 || src2Length==0 || (src2Length>0 && src2[src2Length-1]!=0) || 4297 destCapacity<0 || (destCapacity>0 && dest==NULL) 4298 ) { 4299 /* error, attempt to write a zero byte and return 0 */ 4300 if(dest!=NULL && destCapacity>0) { 4301 *dest=0; 4302 } 4303 return 0; 4304 } 4305 4306 /* check lengths and capacity */ 4307 if(src1Length<0) { 4308 src1Length=(int32_t)uprv_strlen((const char *)src1)+1; 4309 } 4310 if(src2Length<0) { 4311 src2Length=(int32_t)uprv_strlen((const char *)src2)+1; 4312 } 4313 4314 destLength=src1Length+src2Length-1; 4315 if(destLength>destCapacity) { 4316 /* the merged sort key does not fit into the destination */ 4317 return destLength; 4318 } 4319 4320 /* merge the sort keys with the same number of levels */ 4321 while(*src1!=0 && *src2!=0) { /* while both have another 
level */ 4322 /* copy level from src1 not including 00 or 01 */ 4323 while((b=*src1)>=2) { 4324 ++src1; 4325 *dest++=b; 4326 } 4327 4328 /* add a 02 merge separator */ 4329 *dest++=2; 4330 4331 /* copy level from src2 not including 00 or 01 */ 4332 while((b=*src2)>=2) { 4333 ++src2; 4334 *dest++=b; 4335 } 4336 4337 /* if both sort keys have another level, then add a 01 level separator and continue */ 4338 if(*src1==1 && *src2==1) { 4339 ++src1; 4340 ++src2; 4341 *dest++=1; 4342 } 4343 } 4344 4345 /* 4346 * here, at least one sort key is finished now, but the other one 4347 * might have some contents left from containing more levels; 4348 * that contents is just appended to the result 4349 */ 4350 if(*src1!=0) { 4351 /* src1 is not finished, therefore *src2==0, and src1 is appended */ 4352 src2=src1; 4353 } 4354 /* append src2, "the other, unfinished sort key" */ 4355 uprv_strcpy((char *)dest, (const char *)src2); 4356 4357 /* trust that neither sort key contained illegally embedded zero bytes */ 4358 return destLength; 4359} 4360 4361/* sortkey API */ 4362U_CAPI int32_t U_EXPORT2 4363ucol_getSortKey(const UCollator *coll, 4364 const UChar *source, 4365 int32_t sourceLength, 4366 uint8_t *result, 4367 int32_t resultLength) 4368{ 4369 UTRACE_ENTRY(UTRACE_UCOL_GET_SORTKEY); 4370 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 4371 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source string = %vh ", coll, source, 4372 ((sourceLength==-1 && source!=NULL) ? u_strlen(source) : sourceLength)); 4373 } 4374 4375 UErrorCode status = U_ZERO_ERROR; 4376 int32_t keySize = 0; 4377 4378 if(source != NULL) { 4379 // source == NULL is actually an error situation, but we would need to 4380 // have an error code to return it. 
Until we introduce a new 4381 // API, it stays like this 4382 4383 /* this uses the function pointer that is set in updateinternalstate */ 4384 /* currently, there are two funcs: */ 4385 /*ucol_calcSortKey(...);*/ 4386 /*ucol_calcSortKeySimpleTertiary(...);*/ 4387 4388 keySize = coll->sortKeyGen(coll, source, sourceLength, &result, resultLength, FALSE, &status); 4389 //if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && result && resultLength > 0) { 4390 // That's not good. Something unusual happened. 4391 // We don't know how much we initialized before we failed. 4392 // NULL terminate for safety. 4393 // We have no way say that we have generated a partial sort key. 4394 //result[0] = 0; 4395 //keySize = 0; 4396 //} 4397 } 4398 UTRACE_DATA2(UTRACE_VERBOSE, "Sort Key = %vb", result, keySize); 4399 UTRACE_EXIT_STATUS(status); 4400 return keySize; 4401} 4402 4403/* this function is called by the C++ API for sortkey generation */ 4404U_CFUNC int32_t 4405ucol_getSortKeyWithAllocation(const UCollator *coll, 4406 const UChar *source, int32_t sourceLength, 4407 uint8_t **pResult, 4408 UErrorCode *pErrorCode) { 4409 *pResult = 0; 4410 return coll->sortKeyGen(coll, source, sourceLength, pResult, 0, TRUE, pErrorCode); 4411} 4412 4413#define UCOL_FSEC_BUF_SIZE 256 4414 4415/* This function tries to get the size of a sortkey. 
It will be invoked if the size of resulting buffer is 0 */ 4416/* or if we run out of space while making a sortkey and want to return ASAP */ 4417int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) { 4418 UErrorCode status = U_ZERO_ERROR; 4419 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 4420 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4421 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4422 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4423 UBool compareIdent = (strength == UCOL_IDENTICAL); 4424 UBool doCase = (coll->caseLevel == UCOL_ON); 4425 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4426 //UBool qShifted = shifted && (compareQuad == 0); 4427 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4428 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4429 uint8_t fSecsBuff[UCOL_FSEC_BUF_SIZE]; 4430 uint8_t *fSecs = fSecsBuff; 4431 uint32_t fSecsLen = 0, fSecsMaxLen = UCOL_FSEC_BUF_SIZE; 4432 uint8_t *frenchStartPtr = NULL, *frenchEndPtr = NULL; 4433 4434 uint32_t variableTopValue = coll->variableTopValue; 4435 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4436 if(doHiragana) { 4437 UCOL_COMMON_BOT4++; 4438 /* allocate one more space for hiragana */ 4439 } 4440 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4441 4442 uint32_t order = UCOL_NO_MORE_CES; 4443 uint8_t primary1 = 0; 4444 uint8_t primary2 = 0; 4445 uint8_t secondary = 0; 4446 uint8_t tertiary = 0; 4447 int32_t caseShift = 0; 4448 uint32_t c2 = 0, c3 = 0, c4 = 0; /* variables for compression */ 4449 4450 uint8_t caseSwitch = coll->caseSwitch; 4451 uint8_t tertiaryMask = coll->tertiaryMask; 4452 uint8_t tertiaryCommon = coll->tertiaryCommon; 4453 4454 UBool wasShifted = FALSE; 4455 UBool 
notIsContinuation = FALSE; 4456 uint8_t leadPrimary = 0; 4457 4458 4459 for(;;) { 4460 order = ucol_IGetNextCE(coll, s, &status); 4461 if(order == UCOL_NO_MORE_CES) { 4462 break; 4463 } 4464 4465 if(order == 0) { 4466 continue; 4467 } 4468 4469 notIsContinuation = !isContinuation(order); 4470 4471 4472 if(notIsContinuation) { 4473 tertiary = (uint8_t)((order & UCOL_BYTE_SIZE_MASK)); 4474 } else { 4475 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4476 } 4477 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4478 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4479 primary1 = (uint8_t)(order >> 8); 4480 4481 4482 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4483 || (!notIsContinuation && wasShifted)) 4484 || (wasShifted && primary1 == 0)) { /* amendment to the UCA says that primary ignorables */ 4485 /* and other ignorables should be removed if following a shifted code point */ 4486 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4487 /* we should just completely ignore it */ 4488 continue; 4489 } 4490 if(compareQuad == 0) { 4491 if(c4 > 0) { 4492 currentSize += (c2/UCOL_BOT_COUNT4)+1; 4493 c4 = 0; 4494 } 4495 currentSize++; 4496 if(primary2 != 0) { 4497 currentSize++; 4498 } 4499 } 4500 wasShifted = TRUE; 4501 } else { 4502 wasShifted = FALSE; 4503 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. 
*/ 4504 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ 4505 /* calculate sortkey size */ 4506 if(primary1 != UCOL_IGNORABLE) { 4507 if(notIsContinuation) { 4508 if(leadPrimary == primary1) { 4509 currentSize++; 4510 } else { 4511 if(leadPrimary != 0) { 4512 currentSize++; 4513 } 4514 if(primary2 == UCOL_IGNORABLE) { 4515 /* one byter, not compressed */ 4516 currentSize++; 4517 leadPrimary = 0; 4518 } 4519 else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY || 4520 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) { 4521 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { 4522 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) 4523 { 4524 /* not compressible */ 4525 leadPrimary = 0; 4526 currentSize+=2; 4527 } 4528 else { /* compress */ 4529 leadPrimary = primary1; 4530 currentSize+=2; 4531 } 4532 } 4533 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 4534 currentSize++; 4535 if(primary2 != UCOL_IGNORABLE) { 4536 currentSize++; 4537 } 4538 } 4539 } 4540 4541 if(secondary > compareSec) { /* I think that != 0 test should be != IGNORABLE */ 4542 if(!isFrenchSec){ 4543 if (secondary == UCOL_COMMON2 && notIsContinuation) { 4544 c2++; 4545 } else { 4546 if(c2 > 0) { 4547 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
4548 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+1; 4549 } else { 4550 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+1; 4551 } 4552 c2 = 0; 4553 } 4554 currentSize++; 4555 } 4556 } else { 4557 fSecs[fSecsLen++] = secondary; 4558 if(fSecsLen == fSecsMaxLen) { 4559 uint8_t *fSecsTemp; 4560 if(fSecs == fSecsBuff) { 4561 fSecsTemp = (uint8_t *)uprv_malloc(2*fSecsLen); 4562 } else { 4563 fSecsTemp = (uint8_t *)uprv_realloc(fSecs, 2*fSecsLen); 4564 } 4565 if(fSecsTemp == NULL) { 4566 status = U_MEMORY_ALLOCATION_ERROR; 4567 return 0; 4568 } 4569 fSecs = fSecsTemp; 4570 fSecsMaxLen *= 2; 4571 } 4572 if(notIsContinuation) { 4573 if (frenchStartPtr != NULL) { 4574 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 4575 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4576 frenchStartPtr = NULL; 4577 } 4578 } else { 4579 if (frenchStartPtr == NULL) { 4580 frenchStartPtr = fSecs+fSecsLen-2; 4581 } 4582 frenchEndPtr = fSecs+fSecsLen-1; 4583 } 4584 } 4585 } 4586 4587 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 4588 // do the case level if we need to do it. 
We don't want to calculate 4589 // case level for primary ignorables if we have only primary strength and case level 4590 // otherwise we would break well formedness of CEs 4591 if (caseShift == 0) { 4592 currentSize++; 4593 caseShift = UCOL_CASE_SHIFT_START; 4594 } 4595 if((tertiary&0x3F) > 0 && notIsContinuation) { 4596 caseShift--; 4597 if((tertiary &0xC0) != 0) { 4598 if (caseShift == 0) { 4599 currentSize++; 4600 caseShift = UCOL_CASE_SHIFT_START; 4601 } 4602 caseShift--; 4603 } 4604 } 4605 } else { 4606 if(notIsContinuation) { 4607 tertiary ^= caseSwitch; 4608 } 4609 } 4610 4611 tertiary &= tertiaryMask; 4612 if(tertiary > compareTer) { /* I think that != 0 test should be != IGNORABLE */ 4613 if (tertiary == tertiaryCommon && notIsContinuation) { 4614 c3++; 4615 } else { 4616 if(c3 > 0) { 4617 if((tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) 4618 || (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST)) { 4619 currentSize += (c3/(uint32_t)coll->tertiaryTopCount)+1; 4620 } else { 4621 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount)+1; 4622 } 4623 c3 = 0; 4624 } 4625 currentSize++; 4626 } 4627 } 4628 4629 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 4630 if(s->flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 4631 if(c4>0) { // Close this part 4632 currentSize += (c4/UCOL_BOT_COUNT4)+1; 4633 c4 = 0; 4634 } 4635 currentSize++; // Add the Hiragana 4636 } else { // This wasn't Hiragana, so we can continue adding stuff 4637 c4++; 4638 } 4639 } 4640 } 4641 } 4642 4643 if(!isFrenchSec){ 4644 if(c2 > 0) { 4645 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4646 } 4647 } else { 4648 uint32_t i = 0; 4649 if(frenchStartPtr != NULL) { 4650 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4651 } 4652 for(i = 0; i<fSecsLen; i++) { 4653 secondary = *(fSecs+fSecsLen-i-1); 4654 /* This is compression code. 
*/ 4655 if (secondary == UCOL_COMMON2) { 4656 ++c2; 4657 } else { 4658 if(c2 > 0) { 4659 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4660 currentSize += (c2/(uint32_t)UCOL_TOP_COUNT2)+((c2%(uint32_t)UCOL_TOP_COUNT2 != 0)?1:0); 4661 } else { 4662 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4663 } 4664 c2 = 0; 4665 } 4666 currentSize++; 4667 } 4668 } 4669 if(c2 > 0) { 4670 currentSize += (c2/(uint32_t)UCOL_BOT_COUNT2)+((c2%(uint32_t)UCOL_BOT_COUNT2 != 0)?1:0); 4671 } 4672 if(fSecs != fSecsBuff) { 4673 uprv_free(fSecs); 4674 } 4675 } 4676 4677 if(c3 > 0) { 4678 currentSize += (c3/(uint32_t)coll->tertiaryBottomCount) + ((c3%(uint32_t)coll->tertiaryBottomCount != 0)?1:0); 4679 } 4680 4681 if(c4 > 0 && compareQuad == 0) { 4682 currentSize += (c4/(uint32_t)UCOL_BOT_COUNT4)+((c4%(uint32_t)UCOL_BOT_COUNT4 != 0)?1:0); 4683 } 4684 4685 if(compareIdent) { 4686 currentSize += u_lengthOfIdenticalLevelRun(s->string, len); 4687 } 4688 return currentSize; 4689} 4690 4691static 4692inline void doCaseShift(uint8_t **cases, uint32_t &caseShift) { 4693 if (caseShift == 0) { 4694 *(*cases)++ = UCOL_CASE_BYTE_START; 4695 caseShift = UCOL_CASE_SHIFT_START; 4696 } 4697} 4698 4699// Adds a value to the buffer if it's safe to add. Increments the number of added values, so that we 4700// know how many values we wanted to add, even if we didn't add them all 4701static 4702inline void addWithIncrement(uint8_t *&primaries, uint8_t *limit, uint32_t &size, const uint8_t value) { 4703 size++; 4704 if(primaries < limit) { 4705 *(primaries)++ = value; 4706 } 4707} 4708 4709// Packs the secondary buffer when processing French locale. Adds the terminator. 
4710static 4711inline uint8_t *packFrench(uint8_t *primaries, uint8_t *primEnd, uint8_t *secondaries, uint32_t *secsize, uint8_t *frenchStartPtr, uint8_t *frenchEndPtr) { 4712 uint8_t secondary; 4713 int32_t count2 = 0; 4714 uint32_t i = 0, size = 0; 4715 // we use i here since the key size already accounts for terminators, so we'll discard the increment 4716 addWithIncrement(primaries, primEnd, i, UCOL_LEVELTERMINATOR); 4717 /* If there are any unresolved continuation secondaries, reverse them here so that we can reverse the whole secondary thing */ 4718 if(frenchStartPtr != NULL) { 4719 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 4720 } 4721 for(i = 0; i<*secsize; i++) { 4722 secondary = *(secondaries-i-1); 4723 /* This is compression code. */ 4724 if (secondary == UCOL_COMMON2) { 4725 ++count2; 4726 } else { 4727 if (count2 > 0) { 4728 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 4729 while (count2 > UCOL_TOP_COUNT2) { 4730 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2)); 4731 count2 -= (uint32_t)UCOL_TOP_COUNT2; 4732 } 4733 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_TOP2 - (count2-1))); 4734 } else { 4735 while (count2 > UCOL_BOT_COUNT2) { 4736 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4737 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4738 } 4739 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4740 } 4741 count2 = 0; 4742 } 4743 addWithIncrement(primaries, primEnd, size, secondary); 4744 } 4745 } 4746 if (count2 > 0) { 4747 while (count2 > UCOL_BOT_COUNT2) { 4748 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2)); 4749 count2 -= (uint32_t)UCOL_BOT_COUNT2; 4750 } 4751 addWithIncrement(primaries, primEnd, size, (uint8_t)(UCOL_COMMON_BOT2 + (count2-1))); 4752 } 4753 *secsize = size; 4754 return primaries; 4755} 4756 4757#define 
DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY 0 4758 4759/* This is the sortkey work horse function */ 4760U_CFUNC int32_t U_CALLCONV 4761ucol_calcSortKey(const UCollator *coll, 4762 const UChar *source, 4763 int32_t sourceLength, 4764 uint8_t **result, 4765 uint32_t resultLength, 4766 UBool allocateSKBuffer, 4767 UErrorCode *status) 4768{ 4769 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 4770 4771 uint32_t i = 0; /* general purpose counter */ 4772 4773 /* Stack allocated buffers for buffers we use */ 4774 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER], caseB[UCOL_CASE_MAX_BUFFER], quad[UCOL_QUAD_MAX_BUFFER]; 4775 4776 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert, *cases = caseB, *quads = quad; 4777 4778 if(U_FAILURE(*status)) { 4779 return 0; 4780 } 4781 4782 if(primaries == NULL && allocateSKBuffer == TRUE) { 4783 primaries = *result = prim; 4784 resultLength = UCOL_PRIMARY_MAX_BUFFER; 4785 } 4786 4787 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER, 4788 caseSize = UCOL_CASE_MAX_BUFFER, quadSize = UCOL_QUAD_MAX_BUFFER; 4789 4790 uint32_t sortKeySize = 1; /* it is always \0 terminated */ 4791 4792 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER]; 4793 UChar *normSource = normBuffer; 4794 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER; 4795 4796 int32_t len = (sourceLength == -1 ? 
u_strlen(source) : sourceLength); 4797 4798 UColAttributeValue strength = coll->strength; 4799 4800 uint8_t compareSec = (uint8_t)((strength >= UCOL_SECONDARY)?0:0xFF); 4801 uint8_t compareTer = (uint8_t)((strength >= UCOL_TERTIARY)?0:0xFF); 4802 uint8_t compareQuad = (uint8_t)((strength >= UCOL_QUATERNARY)?0:0xFF); 4803 UBool compareIdent = (strength == UCOL_IDENTICAL); 4804 UBool doCase = (coll->caseLevel == UCOL_ON); 4805 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && (compareSec == 0); 4806 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 4807 //UBool qShifted = shifted && (compareQuad == 0); 4808 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && (compareQuad == 0); 4809 /*const uint8_t *scriptOrder = coll->scriptOrder;*/ 4810 4811 uint32_t variableTopValue = coll->variableTopValue; 4812 // TODO: UCOL_COMMON_BOT4 should be a function of qShifted. If we have no 4813 // qShifted, we don't need to set UCOL_COMMON_BOT4 so high. 4814 uint8_t UCOL_COMMON_BOT4 = (uint8_t)((coll->variableTopValue>>8)+1); 4815 uint8_t UCOL_HIRAGANA_QUAD = 0; 4816 if(doHiragana) { 4817 UCOL_HIRAGANA_QUAD=UCOL_COMMON_BOT4++; 4818 /* allocate one more space for hiragana, value for hiragana */ 4819 } 4820 uint8_t UCOL_BOT_COUNT4 = (uint8_t)(0xFF - UCOL_COMMON_BOT4); 4821 4822 /* support for special features like caselevel and funky secondaries */ 4823 uint8_t *frenchStartPtr = NULL; 4824 uint8_t *frenchEndPtr = NULL; 4825 uint32_t caseShift = 0; 4826 4827 sortKeySize += ((compareSec?0:1) + (compareTer?0:1) + (doCase?1:0) + /*(qShifted?1:0)*/(compareQuad?0:1) + (compareIdent?1:0)); 4828 4829 /* If we need to normalize, we'll do it all at once at the beginning! 
*/ 4830 UNormalizationMode normMode; 4831 if(compareIdent) { 4832 normMode = UNORM_NFD; 4833 } else if(coll->normalizationMode != UCOL_OFF) { 4834 normMode = UNORM_FCD; 4835 } else { 4836 normMode = UNORM_NONE; 4837 } 4838 4839 if(normMode != UNORM_NONE && UNORM_YES != unorm_quickCheck(source, len, normMode, status)) { 4840 len = unorm_internalNormalize(normSource, normSourceLen, 4841 source, len, 4842 normMode, FALSE, 4843 status); 4844 if(*status == U_BUFFER_OVERFLOW_ERROR) { 4845 normSourceLen = len; 4846 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR); 4847 if(normSource == NULL) { 4848 *status = U_MEMORY_ALLOCATION_ERROR; 4849 return 0; 4850 } 4851 *status = U_ZERO_ERROR; 4852 len = unorm_internalNormalize(normSource, normSourceLen, 4853 source, len, 4854 normMode, FALSE, 4855 status); 4856 } 4857 4858 if(U_FAILURE(*status)) { 4859 return 0; 4860 } 4861 source = normSource; 4862 } 4863 4864 collIterate s; 4865 IInit_collIterate(coll, (UChar *)source, len, &s); 4866 if(source == normSource) { 4867 s.flags &= ~UCOL_ITER_NORM; 4868 } 4869 4870 if(resultLength == 0 || primaries == NULL) { 4871 int32_t keyLen = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); 4872 if(normSource != normBuffer) { 4873 uprv_free(normSource); 4874 } 4875 return keyLen; 4876 } 4877 uint8_t *primarySafeEnd = primaries + resultLength - 1; 4878 if(strength > UCOL_PRIMARY) { 4879 primarySafeEnd--; 4880 } 4881 4882 uint32_t minBufferSize = UCOL_MAX_BUFFER; 4883 4884 uint8_t *primStart = primaries; 4885 uint8_t *secStart = secondaries; 4886 uint8_t *terStart = tertiaries; 4887 uint8_t *caseStart = cases; 4888 uint8_t *quadStart = quads; 4889 4890 uint32_t order = 0; 4891 4892 uint8_t primary1 = 0; 4893 uint8_t primary2 = 0; 4894 uint8_t secondary = 0; 4895 uint8_t tertiary = 0; 4896 uint8_t caseSwitch = coll->caseSwitch; 4897 uint8_t tertiaryMask = coll->tertiaryMask; 4898 int8_t tertiaryAddition = coll->tertiaryAddition; 4899 uint8_t tertiaryTop = coll->tertiaryTop; 4900 
uint8_t tertiaryBottom = coll->tertiaryBottom; 4901 uint8_t tertiaryCommon = coll->tertiaryCommon; 4902 uint8_t caseBits = 0; 4903 4904 UBool finished = FALSE; 4905 UBool wasShifted = FALSE; 4906 UBool notIsContinuation = FALSE; 4907 4908 uint32_t prevBuffSize = 0; 4909 4910 uint32_t count2 = 0, count3 = 0, count4 = 0; 4911 uint8_t leadPrimary = 0; 4912 4913 for(;;) { 4914 for(i=prevBuffSize; i<minBufferSize; ++i) { 4915 4916 order = ucol_IGetNextCE(coll, &s, status); 4917 if(order == UCOL_NO_MORE_CES) { 4918 finished = TRUE; 4919 break; 4920 } 4921 4922 if(order == 0) { 4923 continue; 4924 } 4925 4926 notIsContinuation = !isContinuation(order); 4927 4928 if(notIsContinuation) { 4929 tertiary = (uint8_t)(order & UCOL_BYTE_SIZE_MASK); 4930 } else { 4931 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 4932 } 4933 4934 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4935 primary2 = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 4936 primary1 = (uint8_t)(order >> 8); 4937 4938 /*if(notIsContinuation && scriptOrder != NULL) { 4939 primary1 = scriptOrder[primary1]; 4940 }*/ 4941 4942 if(shifted && ((notIsContinuation && order <= variableTopValue && primary1 > 0) 4943 || (!notIsContinuation && wasShifted)) 4944 || (wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 4945 { 4946 /* and other ignorables should be removed if following a shifted code point */ 4947 if(primary1 == 0) { /* if we were shifted and we got an ignorable code point */ 4948 /* we should just completely ignore it */ 4949 continue; 4950 } 4951 if(compareQuad == 0) { 4952 if(count4 > 0) { 4953 while (count4 > UCOL_BOT_COUNT4) { 4954 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 4955 count4 -= UCOL_BOT_COUNT4; 4956 } 4957 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 4958 count4 = 0; 4959 } 4960 /* We are dealing with a variable and we're treating them as shifted */ 4961 /* This is a shifted ignorable */ 4962 if(primary1 != 0) { /* 
we need to check this since we could be in continuation */ 4963 *quads++ = primary1; 4964 } 4965 if(primary2 != 0) { 4966 *quads++ = primary2; 4967 } 4968 } 4969 wasShifted = TRUE; 4970 } else { 4971 wasShifted = FALSE; 4972 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 4973 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ 4974 /* regular and simple sortkey calc */ 4975 if(primary1 != UCOL_IGNORABLE) { 4976 if(notIsContinuation) { 4977 if(leadPrimary == primary1) { 4978 *primaries++ = primary2; 4979 } else { 4980 if(leadPrimary != 0) { 4981 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 4982 } 4983 if(primary2 == UCOL_IGNORABLE) { 4984 /* one byter, not compressed */ 4985 *primaries++ = primary1; 4986 leadPrimary = 0; 4987 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY || 4988 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { 4989 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) { 4990 /* not compressible */ 4991 leadPrimary = 0; 4992 *primaries++ = primary1; 4993 if(primaries <= primarySafeEnd) { 4994 *primaries++ = primary2; 4995 } 4996 } else { /* compress */ 4997 *primaries++ = leadPrimary = primary1; 4998 if(primaries <= primarySafeEnd) { 4999 *primaries++ = primary2; 5000 } 5001 } 5002 } 5003 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 5004 *primaries++ = primary1; 5005 if((primary2 != UCOL_IGNORABLE) && (primaries <= primarySafeEnd)) { 5006 *primaries++ = primary2; /* second part */ 5007 } 5008 } 5009 } 5010 5011 if(secondary > compareSec) { 5012 if(!isFrenchSec) { 5013 /* This is compression code. 
*/ 5014 if (secondary == UCOL_COMMON2 && notIsContinuation) { 5015 ++count2; 5016 } else { 5017 if (count2 > 0) { 5018 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 5019 while (count2 > UCOL_TOP_COUNT2) { 5020 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 5021 count2 -= (uint32_t)UCOL_TOP_COUNT2; 5022 } 5023 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); 5024 } else { 5025 while (count2 > UCOL_BOT_COUNT2) { 5026 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5027 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5028 } 5029 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5030 } 5031 count2 = 0; 5032 } 5033 *secondaries++ = secondary; 5034 } 5035 } else { 5036 *secondaries++ = secondary; 5037 /* Do the special handling for French secondaries */ 5038 /* We need to get continuation elements and do intermediate restore */ 5039 /* abc1c2c3de with french secondaries need to be edc1c2c3ba NOT edc3c2c1ba */ 5040 if(notIsContinuation) { 5041 if (frenchStartPtr != NULL) { 5042 /* reverse secondaries from frenchStartPtr up to frenchEndPtr */ 5043 uprv_ucol_reverse_buffer(uint8_t, frenchStartPtr, frenchEndPtr); 5044 frenchStartPtr = NULL; 5045 } 5046 } else { 5047 if (frenchStartPtr == NULL) { 5048 frenchStartPtr = secondaries - 2; 5049 } 5050 frenchEndPtr = secondaries-1; 5051 } 5052 } 5053 } 5054 5055 if(doCase && (primary1 > 0 || strength >= UCOL_SECONDARY)) { 5056 // do the case level if we need to do it. 
We don't want to calculate 5057 // case level for primary ignorables if we have only primary strength and case level 5058 // otherwise we would break well formedness of CEs 5059 doCaseShift(&cases, caseShift); 5060 if(notIsContinuation) { 5061 caseBits = (uint8_t)(tertiary & 0xC0); 5062 5063 if(tertiary != 0) { 5064 if(coll->caseFirst == UCOL_UPPER_FIRST) { 5065 if((caseBits & 0xC0) == 0) { 5066 *(cases-1) |= 1 << (--caseShift); 5067 } else { 5068 *(cases-1) |= 0 << (--caseShift); 5069 /* second bit */ 5070 doCaseShift(&cases, caseShift); 5071 *(cases-1) |= ((caseBits>>6)&1) << (--caseShift); 5072 } 5073 } else { 5074 if((caseBits & 0xC0) == 0) { 5075 *(cases-1) |= 0 << (--caseShift); 5076 } else { 5077 *(cases-1) |= 1 << (--caseShift); 5078 /* second bit */ 5079 doCaseShift(&cases, caseShift); 5080 *(cases-1) |= ((caseBits>>7)&1) << (--caseShift); 5081 } 5082 } 5083 } 5084 5085 } 5086 } else { 5087 if(notIsContinuation) { 5088 tertiary ^= caseSwitch; 5089 } 5090 } 5091 5092 tertiary &= tertiaryMask; 5093 if(tertiary > compareTer) { 5094 /* This is compression code. 
*/ 5095 /* sequence size check is included in the if clause */ 5096 if (tertiary == tertiaryCommon && notIsContinuation) { 5097 ++count3; 5098 } else { 5099 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 5100 tertiary += tertiaryAddition; 5101 } else if(tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 5102 tertiary -= tertiaryAddition; 5103 } 5104 if (count3 > 0) { 5105 if ((tertiary > tertiaryCommon)) { 5106 while (count3 > coll->tertiaryTopCount) { 5107 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5108 count3 -= (uint32_t)coll->tertiaryTopCount; 5109 } 5110 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); 5111 } else { 5112 while (count3 > coll->tertiaryBottomCount) { 5113 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5114 count3 -= (uint32_t)coll->tertiaryBottomCount; 5115 } 5116 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5117 } 5118 count3 = 0; 5119 } 5120 *tertiaries++ = tertiary; 5121 } 5122 } 5123 5124 if(/*qShifted*/(compareQuad==0) && notIsContinuation) { 5125 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 5126 if(count4>0) { // Close this part 5127 while (count4 > UCOL_BOT_COUNT4) { 5128 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 5129 count4 -= UCOL_BOT_COUNT4; 5130 } 5131 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 5132 count4 = 0; 5133 } 5134 *quads++ = UCOL_HIRAGANA_QUAD; // Add the Hiragana 5135 } else { // This wasn't Hiragana, so we can continue adding stuff 5136 count4++; 5137 } 5138 } 5139 } 5140 5141 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ 5142 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ 5143 IInit_collIterate(coll, (UChar *)source, len, &s); 5144 if(source == normSource) { 5145 s.flags &= ~UCOL_ITER_NORM; 5146 } 5147 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len); 5148 *status 
= U_BUFFER_OVERFLOW_ERROR; 5149 finished = TRUE; 5150 break; 5151 } else { /* It's much nicer if we can actually reallocate */ 5152 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart)+(cases-caseStart)+(quads-quadStart); 5153 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); 5154 if(U_SUCCESS(*status)) { 5155 *result = primStart; 5156 primarySafeEnd = primStart + resultLength - 1; 5157 if(strength > UCOL_PRIMARY) { 5158 primarySafeEnd--; 5159 } 5160 } else { 5161 /* We ran out of memory!? We can't recover. */ 5162 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5163 finished = TRUE; 5164 break; 5165 } 5166 } 5167 } 5168 } 5169 if(finished) { 5170 break; 5171 } else { 5172 prevBuffSize = minBufferSize; 5173 5174 uint32_t frenchStartOffset = 0, frenchEndOffset = 0; 5175 if (frenchStartPtr != NULL) { 5176 frenchStartOffset = frenchStartPtr - secStart; 5177 frenchEndOffset = frenchEndPtr - secStart; 5178 } 5179 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); 5180 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); 5181 caseStart = reallocateBuffer(&cases, caseStart, caseB, &caseSize, 2*caseSize, status); 5182 quadStart = reallocateBuffer(&quads, quadStart, quad, &quadSize, 2*quadSize, status); 5183 if(U_FAILURE(*status)) { 5184 /* We ran out of memory!? We can't recover. 
*/ 5185 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5186 break; 5187 } 5188 if (frenchStartPtr != NULL) { 5189 frenchStartPtr = secStart + frenchStartOffset; 5190 frenchEndPtr = secStart + frenchEndOffset; 5191 } 5192 minBufferSize *= 2; 5193 } 5194 } 5195 5196 /* Here, we are generally done with processing */ 5197 /* bailing out would not be too productive */ 5198 5199 if(U_SUCCESS(*status)) { 5200 sortKeySize += (primaries - primStart); 5201 /* we have done all the CE's, now let's put them together to form a key */ 5202 if(compareSec == 0) { 5203 if (count2 > 0) { 5204 while (count2 > UCOL_BOT_COUNT2) { 5205 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5206 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5207 } 5208 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5209 } 5210 uint32_t secsize = secondaries-secStart; 5211 if(!isFrenchSec) { // Regular situation, we know the length of secondaries 5212 sortKeySize += secsize; 5213 if(sortKeySize <= resultLength) { 5214 *(primaries++) = UCOL_LEVELTERMINATOR; 5215 uprv_memcpy(primaries, secStart, secsize); 5216 primaries += secsize; 5217 } else { 5218 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ 5219 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5220 if(U_SUCCESS(*status)) { 5221 *result = primStart; 5222 *(primaries++) = UCOL_LEVELTERMINATOR; 5223 uprv_memcpy(primaries, secStart, secsize); 5224 primaries += secsize; 5225 } 5226 else { 5227 /* We ran out of memory!? We can't recover. */ 5228 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5229 goto cleanup; 5230 } 5231 } else { 5232 *status = U_BUFFER_OVERFLOW_ERROR; 5233 } 5234 } 5235 } else { // French secondary is on. We will need to pack French. 
packFrench will add the level terminator 5236 uint8_t *newPrim = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); 5237 sortKeySize += secsize; 5238 if(sortKeySize <= resultLength) { // if we managed to pack fine 5239 primaries = newPrim; // update the primary pointer 5240 } else { // overflow, need to reallocate and redo 5241 if(allocateSKBuffer == TRUE) { /* need to save our butts if we cannot reallocate */ 5242 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5243 if(U_SUCCESS(*status)) { 5244 primaries = packFrench(primaries, primStart+resultLength, secondaries, &secsize, frenchStartPtr, frenchEndPtr); 5245 } 5246 else { 5247 /* We ran out of memory!? We can't recover. */ 5248 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5249 goto cleanup; 5250 } 5251 } else { 5252 *status = U_BUFFER_OVERFLOW_ERROR; 5253 } 5254 } 5255 } 5256 } 5257 5258 if(doCase) { 5259 uint32_t casesize = cases - caseStart; 5260 sortKeySize += casesize; 5261 if(sortKeySize <= resultLength) { 5262 *(primaries++) = UCOL_LEVELTERMINATOR; 5263 uprv_memcpy(primaries, caseStart, casesize); 5264 primaries += casesize; 5265 } else { 5266 if(allocateSKBuffer == TRUE) { 5267 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5268 if(U_SUCCESS(*status)) { 5269 *result = primStart; 5270 *(primaries++) = UCOL_LEVELTERMINATOR; 5271 uprv_memcpy(primaries, caseStart, casesize); 5272 } 5273 else { 5274 /* We ran out of memory!? We can't recover. 
*/ 5275 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5276 goto cleanup; 5277 } 5278 } else { 5279 *status = U_BUFFER_OVERFLOW_ERROR; 5280 } 5281 } 5282 } 5283 5284 if(compareTer == 0) { 5285 if (count3 > 0) { 5286 if (coll->tertiaryCommon != UCOL_COMMON_BOT3) { 5287 while (count3 >= coll->tertiaryTopCount) { 5288 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5289 count3 -= (uint32_t)coll->tertiaryTopCount; 5290 } 5291 *tertiaries++ = (uint8_t)(tertiaryTop - count3); 5292 } else { 5293 while (count3 > coll->tertiaryBottomCount) { 5294 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5295 count3 -= (uint32_t)coll->tertiaryBottomCount; 5296 } 5297 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5298 } 5299 } 5300 uint32_t tersize = tertiaries - terStart; 5301 sortKeySize += tersize; 5302 if(sortKeySize <= resultLength) { 5303 *(primaries++) = UCOL_LEVELTERMINATOR; 5304 uprv_memcpy(primaries, terStart, tersize); 5305 primaries += tersize; 5306 } else { 5307 if(allocateSKBuffer == TRUE) { 5308 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5309 if(U_SUCCESS(*status)) { 5310 *result = primStart; 5311 *(primaries++) = UCOL_LEVELTERMINATOR; 5312 uprv_memcpy(primaries, terStart, tersize); 5313 } 5314 else { 5315 /* We ran out of memory!? We can't recover. 
*/ 5316 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5317 goto cleanup; 5318 } 5319 } else { 5320 *status = U_BUFFER_OVERFLOW_ERROR; 5321 } 5322 } 5323 5324 if(compareQuad == 0/*qShifted == TRUE*/) { 5325 if(count4 > 0) { 5326 while (count4 > UCOL_BOT_COUNT4) { 5327 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + UCOL_BOT_COUNT4); 5328 count4 -= UCOL_BOT_COUNT4; 5329 } 5330 *quads++ = (uint8_t)(UCOL_COMMON_BOT4 + (count4-1)); 5331 } 5332 uint32_t quadsize = quads - quadStart; 5333 sortKeySize += quadsize; 5334 if(sortKeySize <= resultLength) { 5335 *(primaries++) = UCOL_LEVELTERMINATOR; 5336 uprv_memcpy(primaries, quadStart, quadsize); 5337 primaries += quadsize; 5338 } else { 5339 if(allocateSKBuffer == TRUE) { 5340 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5341 if(U_SUCCESS(*status)) { 5342 *result = primStart; 5343 *(primaries++) = UCOL_LEVELTERMINATOR; 5344 uprv_memcpy(primaries, quadStart, quadsize); 5345 } 5346 else { 5347 /* We ran out of memory!? We can't recover. */ 5348 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5349 goto cleanup; 5350 } 5351 } else { 5352 *status = U_BUFFER_OVERFLOW_ERROR; 5353 } 5354 } 5355 } 5356 5357 if(compareIdent) { 5358 sortKeySize += u_lengthOfIdenticalLevelRun(s.string, len); 5359 if(sortKeySize <= resultLength) { 5360 *(primaries++) = UCOL_LEVELTERMINATOR; 5361 primaries += u_writeIdenticalLevelRun(s.string, len, primaries); 5362 } else { 5363 if(allocateSKBuffer == TRUE) { 5364 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, sortKeySize, status); 5365 if(U_SUCCESS(*status)) { 5366 *result = primStart; 5367 *(primaries++) = UCOL_LEVELTERMINATOR; 5368 u_writeIdenticalLevelRun(s.string, len, primaries); 5369 } 5370 else { 5371 /* We ran out of memory!? We can't recover. 
*/ 5372 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5373 goto cleanup; 5374 } 5375 } else { 5376 *status = U_BUFFER_OVERFLOW_ERROR; 5377 } 5378 } 5379 } 5380 } 5381 *(primaries++) = '\0'; 5382 } 5383 5384 if(allocateSKBuffer == TRUE) { 5385 *result = (uint8_t*)uprv_malloc(sortKeySize); 5386 /* test for NULL */ 5387 if (*result == NULL) { 5388 *status = U_MEMORY_ALLOCATION_ERROR; 5389 goto cleanup; 5390 } 5391 uprv_memcpy(*result, primStart, sortKeySize); 5392 if(primStart != prim) { 5393 uprv_free(primStart); 5394 } 5395 } 5396 5397cleanup: 5398 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { 5399 /* NULL terminate for safety */ 5400 **result = 0; 5401 } 5402 if(terStart != tert) { 5403 uprv_free(terStart); 5404 uprv_free(secStart); 5405 uprv_free(caseStart); 5406 uprv_free(quadStart); 5407 } 5408 5409 /* To avoid memory leak, free the offset buffer if necessary. */ 5410 ucol_freeOffsetBuffer(&s); 5411 5412 if(normSource != normBuffer) { 5413 uprv_free(normSource); 5414 } 5415 5416 return sortKeySize; 5417} 5418 5419 5420U_CFUNC int32_t U_CALLCONV 5421ucol_calcSortKeySimpleTertiary(const UCollator *coll, 5422 const UChar *source, 5423 int32_t sourceLength, 5424 uint8_t **result, 5425 uint32_t resultLength, 5426 UBool allocateSKBuffer, 5427 UErrorCode *status) 5428{ 5429 U_ALIGN_CODE(16); 5430 5431 //const UCAConstants *UCAconsts = (UCAConstants *)((uint8_t *)coll->UCA->image + coll->image->UCAConsts); 5432 uint32_t i = 0; /* general purpose counter */ 5433 5434 /* Stack allocated buffers for buffers we use */ 5435 uint8_t prim[UCOL_PRIMARY_MAX_BUFFER], second[UCOL_SECONDARY_MAX_BUFFER], tert[UCOL_TERTIARY_MAX_BUFFER]; 5436 5437 uint8_t *primaries = *result, *secondaries = second, *tertiaries = tert; 5438 5439 if(U_FAILURE(*status)) { 5440 return 0; 5441 } 5442 5443 if(primaries == NULL && allocateSKBuffer == TRUE) { 5444 primaries = *result = prim; 5445 resultLength = UCOL_PRIMARY_MAX_BUFFER; 
5446 } 5447 5448 uint32_t secSize = UCOL_SECONDARY_MAX_BUFFER, terSize = UCOL_TERTIARY_MAX_BUFFER; 5449 5450 uint32_t sortKeySize = 3; /* it is always \0 terminated plus separators for secondary and tertiary */ 5451 5452 UChar normBuffer[UCOL_NORMALIZATION_MAX_BUFFER]; 5453 UChar *normSource = normBuffer; 5454 int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER; 5455 5456 int32_t len = sourceLength; 5457 5458 /* If we need to normalize, we'll do it all at once at the beginning! */ 5459 if(coll->normalizationMode != UCOL_OFF && UNORM_YES != unorm_quickCheck(source, len, UNORM_FCD, status)) { 5460 len = unorm_internalNormalize(normSource, normSourceLen, 5461 source, len, 5462 UNORM_FCD, FALSE, 5463 status); 5464 if(*status == U_BUFFER_OVERFLOW_ERROR) { 5465 normSourceLen = len; 5466 normSource = (UChar *)uprv_malloc(len*U_SIZEOF_UCHAR); 5467 if(normSource == NULL) { 5468 *status = U_MEMORY_ALLOCATION_ERROR; 5469 return 0; 5470 } 5471 *status = U_ZERO_ERROR; 5472 len = unorm_internalNormalize(normSource, normSourceLen, 5473 source, len, 5474 UNORM_FCD, FALSE, 5475 status); 5476 if(U_FAILURE(*status)) { 5477 /* Should never happen. 
*/ 5478 uprv_free(normSource); 5479 normSource = normBuffer; 5480 } 5481 } 5482 5483 if(U_FAILURE(*status)) { 5484 return 0; 5485 } 5486 source = normSource; 5487 } 5488 5489 collIterate s; 5490 IInit_collIterate(coll, (UChar *)source, len, &s); 5491 if(source == normSource) { 5492 s.flags &= ~UCOL_ITER_NORM; 5493 } 5494 5495 if(resultLength == 0 || primaries == NULL) { 5496 int32_t t = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); 5497 if(normSource != normBuffer) { 5498 uprv_free(normSource); 5499 } 5500 return t; 5501 } 5502 5503 uint8_t *primarySafeEnd = primaries + resultLength - 2; 5504 5505 uint32_t minBufferSize = UCOL_MAX_BUFFER; 5506 5507 uint8_t *primStart = primaries; 5508 uint8_t *secStart = secondaries; 5509 uint8_t *terStart = tertiaries; 5510 5511 uint32_t order = 0; 5512 5513 uint8_t primary1 = 0; 5514 uint8_t primary2 = 0; 5515 uint8_t secondary = 0; 5516 uint8_t tertiary = 0; 5517 uint8_t caseSwitch = coll->caseSwitch; 5518 uint8_t tertiaryMask = coll->tertiaryMask; 5519 int8_t tertiaryAddition = coll->tertiaryAddition; 5520 uint8_t tertiaryTop = coll->tertiaryTop; 5521 uint8_t tertiaryBottom = coll->tertiaryBottom; 5522 uint8_t tertiaryCommon = coll->tertiaryCommon; 5523 5524 uint32_t prevBuffSize = 0; 5525 5526 UBool finished = FALSE; 5527 UBool notIsContinuation = FALSE; 5528 5529 uint32_t count2 = 0, count3 = 0; 5530 uint8_t leadPrimary = 0; 5531 5532 for(;;) { 5533 for(i=prevBuffSize; i<minBufferSize; ++i) { 5534 5535 order = ucol_IGetNextCE(coll, &s, status); 5536 5537 if(order == 0) { 5538 continue; 5539 } 5540 5541 if(order == UCOL_NO_MORE_CES) { 5542 finished = TRUE; 5543 break; 5544 } 5545 5546 notIsContinuation = !isContinuation(order); 5547 5548 if(notIsContinuation) { 5549 tertiary = (uint8_t)((order & tertiaryMask)); 5550 } else { 5551 tertiary = (uint8_t)((order & UCOL_REMOVE_CONTINUATION)); 5552 } 5553 secondary = (uint8_t)((order >>= 8) & UCOL_BYTE_SIZE_MASK); 5554 primary2 = (uint8_t)((order >>= 8) & 
UCOL_BYTE_SIZE_MASK); 5555 primary1 = (uint8_t)(order >> 8); 5556 5557 /* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */ 5558 /* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */ 5559 /* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */ 5560 /* regular and simple sortkey calc */ 5561 if(primary1 != UCOL_IGNORABLE) { 5562 if(notIsContinuation) { 5563 if(leadPrimary == primary1) { 5564 *primaries++ = primary2; 5565 } else { 5566 if(leadPrimary != 0) { 5567 *primaries++ = (uint8_t)((primary1 > leadPrimary) ? UCOL_BYTE_UNSHIFTED_MAX : UCOL_BYTE_UNSHIFTED_MIN); 5568 } 5569 if(primary2 == UCOL_IGNORABLE) { 5570 /* one byter, not compressed */ 5571 *primaries++ = primary1; 5572 leadPrimary = 0; 5573 } else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY || 5574 //(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) 5575 //(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) { 5576 (primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) { 5577 /* not compressible */ 5578 leadPrimary = 0; 5579 *primaries++ = primary1; 5580 *primaries++ = primary2; 5581 } else { /* compress */ 5582 *primaries++ = leadPrimary = primary1; 5583 *primaries++ = primary2; 5584 } 5585 } 5586 } else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */ 5587 *primaries++ = primary1; 5588 if(primary2 != UCOL_IGNORABLE) { 5589 *primaries++ = primary2; /* second part */ 5590 } 5591 } 5592 } 5593 5594 if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */ 5595 /* This is compression code. */ 5596 if (secondary == UCOL_COMMON2 && notIsContinuation) { 5597 ++count2; 5598 } else { 5599 if (count2 > 0) { 5600 if (secondary > UCOL_COMMON2) { // not necessary for 4th level. 
5601 while (count2 > UCOL_TOP_COUNT2) { 5602 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - UCOL_TOP_COUNT2); 5603 count2 -= (uint32_t)UCOL_TOP_COUNT2; 5604 } 5605 *secondaries++ = (uint8_t)(UCOL_COMMON_TOP2 - (count2-1)); 5606 } else { 5607 while (count2 > UCOL_BOT_COUNT2) { 5608 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5609 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5610 } 5611 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5612 } 5613 count2 = 0; 5614 } 5615 *secondaries++ = secondary; 5616 } 5617 } 5618 5619 if(notIsContinuation) { 5620 tertiary ^= caseSwitch; 5621 } 5622 5623 if(tertiary > 0) { 5624 /* This is compression code. */ 5625 /* sequence size check is included in the if clause */ 5626 if (tertiary == tertiaryCommon && notIsContinuation) { 5627 ++count3; 5628 } else { 5629 if(tertiary > tertiaryCommon && tertiaryCommon == UCOL_COMMON3_NORMAL) { 5630 tertiary += tertiaryAddition; 5631 } else if (tertiary <= tertiaryCommon && tertiaryCommon == UCOL_COMMON3_UPPERFIRST) { 5632 tertiary -= tertiaryAddition; 5633 } 5634 if (count3 > 0) { 5635 if ((tertiary > tertiaryCommon)) { 5636 while (count3 > coll->tertiaryTopCount) { 5637 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5638 count3 -= (uint32_t)coll->tertiaryTopCount; 5639 } 5640 *tertiaries++ = (uint8_t)(tertiaryTop - (count3-1)); 5641 } else { 5642 while (count3 > coll->tertiaryBottomCount) { 5643 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5644 count3 -= (uint32_t)coll->tertiaryBottomCount; 5645 } 5646 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5647 } 5648 count3 = 0; 5649 } 5650 *tertiaries++ = tertiary; 5651 } 5652 } 5653 5654 if(primaries > primarySafeEnd) { /* We have stepped over the primary buffer */ 5655 if(allocateSKBuffer == FALSE) { /* need to save our butts if we cannot reallocate */ 5656 IInit_collIterate(coll, (UChar *)source, len, &s); 5657 if(source == normSource) { 5658 s.flags &= ~UCOL_ITER_NORM; 
5659 } 5660 sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len); 5661 *status = U_BUFFER_OVERFLOW_ERROR; 5662 finished = TRUE; 5663 break; 5664 } else { /* It's much nicer if we can actually reallocate */ 5665 int32_t sks = sortKeySize+(primaries - primStart)+(secondaries - secStart)+(tertiaries - terStart); 5666 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sks, status); 5667 if(U_SUCCESS(*status)) { 5668 *result = primStart; 5669 primarySafeEnd = primStart + resultLength - 2; 5670 } else { 5671 /* We ran out of memory!? We can't recover. */ 5672 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5673 finished = TRUE; 5674 break; 5675 } 5676 } 5677 } 5678 } 5679 if(finished) { 5680 break; 5681 } else { 5682 prevBuffSize = minBufferSize; 5683 secStart = reallocateBuffer(&secondaries, secStart, second, &secSize, 2*secSize, status); 5684 terStart = reallocateBuffer(&tertiaries, terStart, tert, &terSize, 2*terSize, status); 5685 minBufferSize *= 2; 5686 if(U_FAILURE(*status)) { // if we cannot reallocate buffers, we can at least give the sortkey size 5687 /* We ran out of memory!? We can't recover. 
*/ 5688 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5689 break; 5690 } 5691 } 5692 } 5693 5694 if(U_SUCCESS(*status)) { 5695 sortKeySize += (primaries - primStart); 5696 /* we have done all the CE's, now let's put them together to form a key */ 5697 if (count2 > 0) { 5698 while (count2 > UCOL_BOT_COUNT2) { 5699 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + UCOL_BOT_COUNT2); 5700 count2 -= (uint32_t)UCOL_BOT_COUNT2; 5701 } 5702 *secondaries++ = (uint8_t)(UCOL_COMMON_BOT2 + (count2-1)); 5703 } 5704 uint32_t secsize = secondaries-secStart; 5705 sortKeySize += secsize; 5706 if(sortKeySize <= resultLength) { 5707 *(primaries++) = UCOL_LEVELTERMINATOR; 5708 uprv_memcpy(primaries, secStart, secsize); 5709 primaries += secsize; 5710 } else { 5711 if(allocateSKBuffer == TRUE) { 5712 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5713 if(U_SUCCESS(*status)) { 5714 *(primaries++) = UCOL_LEVELTERMINATOR; 5715 *result = primStart; 5716 uprv_memcpy(primaries, secStart, secsize); 5717 } 5718 else { 5719 /* We ran out of memory!? We can't recover. 
*/ 5720 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5721 goto cleanup; 5722 } 5723 } else { 5724 *status = U_BUFFER_OVERFLOW_ERROR; 5725 } 5726 } 5727 5728 if (count3 > 0) { 5729 if (coll->tertiaryCommon != UCOL_COMMON3_NORMAL) { 5730 while (count3 >= coll->tertiaryTopCount) { 5731 *tertiaries++ = (uint8_t)(tertiaryTop - coll->tertiaryTopCount); 5732 count3 -= (uint32_t)coll->tertiaryTopCount; 5733 } 5734 *tertiaries++ = (uint8_t)(tertiaryTop - count3); 5735 } else { 5736 while (count3 > coll->tertiaryBottomCount) { 5737 *tertiaries++ = (uint8_t)(tertiaryBottom + coll->tertiaryBottomCount); 5738 count3 -= (uint32_t)coll->tertiaryBottomCount; 5739 } 5740 *tertiaries++ = (uint8_t)(tertiaryBottom + (count3-1)); 5741 } 5742 } 5743 uint32_t tersize = tertiaries - terStart; 5744 sortKeySize += tersize; 5745 if(sortKeySize <= resultLength) { 5746 *(primaries++) = UCOL_LEVELTERMINATOR; 5747 uprv_memcpy(primaries, terStart, tersize); 5748 primaries += tersize; 5749 } else { 5750 if(allocateSKBuffer == TRUE) { 5751 primStart = reallocateBuffer(&primaries, *result, prim, &resultLength, 2*sortKeySize, status); 5752 if(U_SUCCESS(*status)) { 5753 *result = primStart; 5754 *(primaries++) = UCOL_LEVELTERMINATOR; 5755 uprv_memcpy(primaries, terStart, tersize); 5756 } 5757 else { 5758 /* We ran out of memory!? We can't recover. 
*/ 5759 sortKeySize = DEFAULT_ERROR_SIZE_FOR_CALCSORTKEY; 5760 goto cleanup; 5761 } 5762 } else { 5763 *status = U_MEMORY_ALLOCATION_ERROR; 5764 } 5765 } 5766 5767 *(primaries++) = '\0'; 5768 } 5769 5770 if(allocateSKBuffer == TRUE) { 5771 *result = (uint8_t*)uprv_malloc(sortKeySize); 5772 /* test for NULL */ 5773 if (*result == NULL) { 5774 *status = U_MEMORY_ALLOCATION_ERROR; 5775 goto cleanup; 5776 } 5777 uprv_memcpy(*result, primStart, sortKeySize); 5778 if(primStart != prim) { 5779 uprv_free(primStart); 5780 } 5781 } 5782 5783cleanup: 5784 if (allocateSKBuffer == FALSE && resultLength > 0 && U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) { 5785 /* NULL terminate for safety */ 5786 **result = 0; 5787 } 5788 if(terStart != tert) { 5789 uprv_free(terStart); 5790 uprv_free(secStart); 5791 } 5792 5793 /* To avoid memory leak, free the offset buffer if necessary. */ 5794 ucol_freeOffsetBuffer(&s); 5795 5796 if(normSource != normBuffer) { 5797 uprv_free(normSource); 5798 } 5799 5800 return sortKeySize; 5801} 5802 5803static inline 5804UBool isShiftedCE(uint32_t CE, uint32_t LVT, UBool *wasShifted) { 5805 UBool notIsContinuation = !isContinuation(CE); 5806 uint8_t primary1 = (uint8_t)((CE >> 24) & 0xFF); 5807 if(LVT && ((notIsContinuation && (CE & 0xFFFF0000)<= LVT && primary1 > 0) 5808 || (!notIsContinuation && *wasShifted)) 5809 || (*wasShifted && primary1 == 0)) /* amendment to the UCA says that primary ignorables */ 5810 { 5811 // The stuff below should probably be in the sortkey code... maybe not... 
5812 if(primary1 != 0) { /* if we were shifted and we got an ignorable code point */ 5813 /* we should just completely ignore it */ 5814 *wasShifted = TRUE; 5815 //continue; 5816 } 5817 //*wasShifted = TRUE; 5818 return TRUE; 5819 } else { 5820 *wasShifted = FALSE; 5821 return FALSE; 5822 } 5823} 5824static inline 5825void terminatePSKLevel(int32_t level, int32_t maxLevel, int32_t &i, uint8_t *dest) { 5826 if(level < maxLevel) { 5827 dest[i++] = UCOL_LEVELTERMINATOR; 5828 } else { 5829 dest[i++] = 0; 5830 } 5831} 5832 5833/** enumeration of level identifiers for partial sort key generation */ 5834enum { 5835 UCOL_PSK_PRIMARY = 0, 5836 UCOL_PSK_SECONDARY = 1, 5837 UCOL_PSK_CASE = 2, 5838 UCOL_PSK_TERTIARY = 3, 5839 UCOL_PSK_QUATERNARY = 4, 5840 UCOL_PSK_QUIN = 5, /** This is an extra level, not used - but we have three bits to blow */ 5841 UCOL_PSK_IDENTICAL = 6, 5842 UCOL_PSK_NULL = 7, /** level for the end of sort key. Will just produce zeros */ 5843 UCOL_PSK_LIMIT 5844}; 5845 5846/** collation state enum. *_SHIFT value is how much to shift right 5847 * to get the state piece to the right. *_MASK value should be 5848 * ANDed with the shifted state. This data is stored in state[1] 5849 * field. 5850 */ 5851enum { 5852 UCOL_PSK_LEVEL_SHIFT = 0, /** level identificator. stores an enum value from above */ 5853 UCOL_PSK_LEVEL_MASK = 7, /** three bits */ 5854 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT = 3, /** number of bytes of primary or quaternary already written */ 5855 UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK = 1, 5856 /** can be only 0 or 1, since we get up to two bytes from primary or quaternary 5857 * This field is also used to denote that the French secondary level is finished 5858 */ 5859 UCOL_PSK_WAS_SHIFTED_SHIFT = 4,/** was the last value shifted */ 5860 UCOL_PSK_WAS_SHIFTED_MASK = 1, /** can be 0 or 1 (Boolean) */ 5861 UCOL_PSK_USED_FRENCH_SHIFT = 5,/** how many French bytes have we already written */ 5862 UCOL_PSK_USED_FRENCH_MASK = 3, /** up to 4 bytes. 
See comment just below */ 5863 /** When we do French we need to reverse secondary values. However, continuations 5864 * need to stay the same. So if you had abc1c2c3de, you need to have edc1c2c3ba 5865 */ 5866 UCOL_PSK_BOCSU_BYTES_SHIFT = 7, 5867 UCOL_PSK_BOCSU_BYTES_MASK = 3, 5868 UCOL_PSK_CONSUMED_CES_SHIFT = 9, 5869 UCOL_PSK_CONSUMED_CES_MASK = 0x7FFFF 5870}; 5871 5872// macro calculating the number of expansion CEs available 5873#define uprv_numAvailableExpCEs(s) (s).CEpos - (s).toReturn 5874 5875 5876/** main sortkey part procedure. On the first call, 5877 * you should pass in a collator, an iterator, empty state 5878 * state[0] == state[1] == 0, a buffer to hold results 5879 * number of bytes you need and an error code pointer. 5880 * Make sure your buffer is big enough to hold the wanted 5881 * number of sortkey bytes. I don't check. 5882 * The only meaningful status you can get back is 5883 * U_BUFFER_OVERFLOW_ERROR, which basically means that you 5884 * have been dealt a raw deal and that you probably won't 5885 * be able to use partial sortkey generation for this 5886 * particular combination of string and collator. This 5887 * is highly unlikely, but you should still check the error code. 5888 * Any other status means that you're not in a sane situation 5889 * anymore. After the first call, preserve state values and 5890 * use them on subsequent calls to obtain more bytes of a sortkey. 5891 * Use until the number of bytes written is smaller than the requested 5892 * number of bytes. Generated sortkey is not compatible with the 5893 * one generated by ucol_getSortKey, as we don't do any compression. 5894 * However, levels are still terminated by a 1 (one) and the sortkey 5895 * is terminated by a 0 (zero). Identical level is the same as in the 5896 * regular sortkey - internal bocu-1 implementation is used. 5897 * For curious, although you cannot do much about this, here is 5898 * the structure of state words. 5899 * state[0] - iterator state. 
Depends on the iterator implementation, 5900 * but allows the iterator to continue where it stopped in 5901 * the last iteration. 5902 * state[1] - collation processing state. Here is the distribution 5903 * of the bits: 5904 * 0, 1, 2 - level of the sortkey - primary, secondary, case, tertiary 5905 * quaternary, quin (we don't use this one), identical and 5906 * null (producing only zeroes - first one to terminate the 5907 * sortkey and subsequent to fill the buffer). 5908 * 3 - byte count. Number of bytes written on the primary level. 5909 * 4 - was shifted. Whether the previous iteration finished in the 5910 * shifted state. 5911 * 5, 6 - French continuation bytes written. See the comment in the enum 5912 * 7,8 - Bocsu bytes used. Number of bytes from a bocu sequence on 5913 * the identical level. 5914 * 9..31 - CEs consumed. Number of getCE or next32 operations performed 5915 * since thes last successful update of the iterator state. 5916 */ 5917U_CAPI int32_t U_EXPORT2 5918ucol_nextSortKeyPart(const UCollator *coll, 5919 UCharIterator *iter, 5920 uint32_t state[2], 5921 uint8_t *dest, int32_t count, 5922 UErrorCode *status) 5923{ 5924 /* error checking */ 5925 if(status==NULL || U_FAILURE(*status)) { 5926 return 0; 5927 } 5928 UTRACE_ENTRY(UTRACE_UCOL_NEXTSORTKEYPART); 5929 if( coll==NULL || iter==NULL || 5930 state==NULL || 5931 count<0 || (count>0 && dest==NULL) 5932 ) { 5933 *status=U_ILLEGAL_ARGUMENT_ERROR; 5934 UTRACE_EXIT_STATUS(status); 5935 return 0; 5936 } 5937 5938 UTRACE_DATA6(UTRACE_VERBOSE, "coll=%p, iter=%p, state=%d %d, dest=%p, count=%d", 5939 coll, iter, state[0], state[1], dest, count); 5940 5941 if(count==0) { 5942 /* nothing to do */ 5943 UTRACE_EXIT_VALUE(0); 5944 return 0; 5945 } 5946 /** Setting up situation according to the state we got from the previous iteration */ 5947 // The state of the iterator from the previous invocation 5948 uint32_t iterState = state[0]; 5949 // Has the last iteration ended in the shifted state 5950 UBool 
wasShifted = ((state[1] >> UCOL_PSK_WAS_SHIFTED_SHIFT) & UCOL_PSK_WAS_SHIFTED_MASK)?TRUE:FALSE; 5951 // What is the current level of the sortkey? 5952 int32_t level= (state[1] >> UCOL_PSK_LEVEL_SHIFT) & UCOL_PSK_LEVEL_MASK; 5953 // Have we written only one byte from a two byte primary in the previous iteration? 5954 // Also on secondary level - have we finished with the French secondary? 5955 int32_t byteCountOrFrenchDone = (state[1] >> UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK; 5956 // number of bytes in the continuation buffer for French 5957 int32_t usedFrench = (state[1] >> UCOL_PSK_USED_FRENCH_SHIFT) & UCOL_PSK_USED_FRENCH_MASK; 5958 // Number of bytes already written from a bocsu sequence. Since 5959 // the longes bocsu sequence is 4 long, this can be up to 3. 5960 int32_t bocsuBytesUsed = (state[1] >> UCOL_PSK_BOCSU_BYTES_SHIFT) & UCOL_PSK_BOCSU_BYTES_MASK; 5961 // Number of elements that need to be consumed in this iteration because 5962 // the iterator returned UITER_NO_STATE at the end of the last iteration, 5963 // so we had to save the last valid state. 5964 int32_t cces = (state[1] >> UCOL_PSK_CONSUMED_CES_SHIFT) & UCOL_PSK_CONSUMED_CES_MASK; 5965 5966 /** values that depend on the collator attributes */ 5967 // strength of the collator. 5968 int32_t strength = ucol_getAttribute(coll, UCOL_STRENGTH, status); 5969 // maximal level of the partial sortkey. Need to take whether case level is done 5970 int32_t maxLevel = 0; 5971 if(strength < UCOL_TERTIARY) { 5972 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 5973 maxLevel = UCOL_PSK_CASE; 5974 } else { 5975 maxLevel = strength; 5976 } 5977 } else { 5978 if(strength == UCOL_TERTIARY) { 5979 maxLevel = UCOL_PSK_TERTIARY; 5980 } else if(strength == UCOL_QUATERNARY) { 5981 maxLevel = UCOL_PSK_QUATERNARY; 5982 } else { // identical 5983 maxLevel = UCOL_IDENTICAL; 5984 } 5985 } 5986 // value for the quaternary level if Hiragana is encountered. 
Used for JIS X 4061 collation 5987 uint8_t UCOL_HIRAGANA_QUAD = 5988 (ucol_getAttribute(coll, UCOL_HIRAGANA_QUATERNARY_MODE, status) == UCOL_ON)?0xFE:0xFF; 5989 // Boundary value that decides whether a CE is shifted or not 5990 uint32_t LVT = (coll->alternateHandling == UCOL_SHIFTED)?(coll->variableTopValue<<16):0; 5991 // Are we doing French collation? 5992 UBool doingFrench = (ucol_getAttribute(coll, UCOL_FRENCH_COLLATION, status) == UCOL_ON); 5993 5994 /** initializing the collation state */ 5995 UBool notIsContinuation = FALSE; 5996 uint32_t CE = UCOL_NO_MORE_CES; 5997 5998 collIterate s; 5999 IInit_collIterate(coll, NULL, -1, &s); 6000 s.iterator = iter; 6001 s.flags |= UCOL_USE_ITERATOR; 6002 // This variable tells us whether we have produced some other levels in this iteration 6003 // before we moved to the identical level. In that case, we need to switch the 6004 // type of the iterator. 6005 UBool doingIdenticalFromStart = FALSE; 6006 // Normalizing iterator 6007 // The division for the array length may truncate the array size to 6008 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 6009 // for all platforms anyway. 6010 UAlignedMemory stackNormIter[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 6011 UNormIterator *normIter = NULL; 6012 // If the normalization is turned on for the collator and we are below identical level 6013 // we will use a FCD normalizing iterator 6014 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON && level < UCOL_PSK_IDENTICAL) { 6015 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 6016 s.iterator = unorm_setIter(normIter, iter, UNORM_FCD, status); 6017 s.flags &= ~UCOL_ITER_NORM; 6018 if(U_FAILURE(*status)) { 6019 UTRACE_EXIT_STATUS(*status); 6020 return 0; 6021 } 6022 } else if(level == UCOL_PSK_IDENTICAL) { 6023 // for identical level, we need a NFD iterator. 
We need to instantiate it here, since we 6024 // will be updating the state - and this cannot be done on an ordinary iterator. 6025 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 6026 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 6027 s.flags &= ~UCOL_ITER_NORM; 6028 if(U_FAILURE(*status)) { 6029 UTRACE_EXIT_STATUS(*status); 6030 return 0; 6031 } 6032 doingIdenticalFromStart = TRUE; 6033 } 6034 6035 // This is the tentative new state of the iterator. The problem 6036 // is that the iterator might return an undefined state, in 6037 // which case we should save the last valid state and increase 6038 // the iterator skip value. 6039 uint32_t newState = 0; 6040 6041 // First, we set the iterator to the last valid position 6042 // from the last iteration. This was saved in state[0]. 6043 if(iterState == 0) { 6044 /* initial state */ 6045 if(level == UCOL_PSK_SECONDARY && doingFrench && !byteCountOrFrenchDone) { 6046 s.iterator->move(s.iterator, 0, UITER_LIMIT); 6047 } else { 6048 s.iterator->move(s.iterator, 0, UITER_START); 6049 } 6050 } else { 6051 /* reset to previous state */ 6052 s.iterator->setState(s.iterator, iterState, status); 6053 if(U_FAILURE(*status)) { 6054 UTRACE_EXIT_STATUS(*status); 6055 return 0; 6056 } 6057 } 6058 6059 6060 6061 // This variable tells us whether we can attempt to update the state 6062 // of iterator. Situations where we don't want to update iterator state 6063 // are the existence of expansion CEs that are not yet processed, and 6064 // finishing the case level without enough space in the buffer to insert 6065 // a level terminator. 6066 UBool canUpdateState = TRUE; 6067 6068 // Consume all the CEs that were consumed at the end of the previous 6069 // iteration without updating the iterator state. On identical level, 6070 // consume the code points. 
6071 int32_t counter = cces; 6072 if(level < UCOL_PSK_IDENTICAL) { 6073 while(counter-->0) { 6074 // If we're doing French and we are on the secondary level, 6075 // we go backwards. 6076 if(level == UCOL_PSK_SECONDARY && doingFrench) { 6077 CE = ucol_IGetPrevCE(coll, &s, status); 6078 } else { 6079 CE = ucol_IGetNextCE(coll, &s, status); 6080 } 6081 if(CE==UCOL_NO_MORE_CES) { 6082 /* should not happen */ 6083 *status=U_INTERNAL_PROGRAM_ERROR; 6084 UTRACE_EXIT_STATUS(*status); 6085 return 0; 6086 } 6087 if(uprv_numAvailableExpCEs(s)) { 6088 canUpdateState = FALSE; 6089 } 6090 } 6091 } else { 6092 while(counter-->0) { 6093 uiter_next32(s.iterator); 6094 } 6095 } 6096 6097 // French secondary needs to know whether the iterator state of zero came from previous level OR 6098 // from a new invocation... 6099 UBool wasDoingPrimary = FALSE; 6100 // destination buffer byte counter. When this guy 6101 // gets to count, we're done with the iteration 6102 int32_t i = 0; 6103 // used to count the zero bytes written after we 6104 // have finished with the sort key 6105 int32_t j = 0; 6106 6107 6108 // Hm.... I think we're ready to plunge in. Basic story is as following: 6109 // we have a fall through case based on level. This is used for initial 6110 // positioning on iteration start. Every level processor contains a 6111 // for(;;) which will be broken when we exhaust all the CEs. Other 6112 // way to exit is a goto saveState, which happens when we have filled 6113 // out our buffer. 
6114 switch(level) { 6115 case UCOL_PSK_PRIMARY: 6116 wasDoingPrimary = TRUE; 6117 for(;;) { 6118 if(i==count) { 6119 goto saveState; 6120 } 6121 // We should save the state only if we 6122 // are sure that we are done with the 6123 // previous iterator state 6124 if(canUpdateState && byteCountOrFrenchDone == 0) { 6125 newState = s.iterator->getState(s.iterator); 6126 if(newState != UITER_NO_STATE) { 6127 iterState = newState; 6128 cces = 0; 6129 } 6130 } 6131 CE = ucol_IGetNextCE(coll, &s, status); 6132 cces++; 6133 if(CE==UCOL_NO_MORE_CES) { 6134 // Add the level separator 6135 terminatePSKLevel(level, maxLevel, i, dest); 6136 byteCountOrFrenchDone=0; 6137 // Restart the iteration an move to the 6138 // second level 6139 s.iterator->move(s.iterator, 0, UITER_START); 6140 cces = 0; 6141 level = UCOL_PSK_SECONDARY; 6142 break; 6143 } 6144 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6145 CE >>= UCOL_PRIMARYORDERSHIFT; /* get primary */ 6146 if(CE != 0) { 6147 if(byteCountOrFrenchDone == 0) { 6148 // get the second byte of primary 6149 dest[i++]=(uint8_t)(CE >> 8); 6150 } else { 6151 byteCountOrFrenchDone = 0; 6152 } 6153 if((CE &=0xff)!=0) { 6154 if(i==count) { 6155 /* overflow */ 6156 byteCountOrFrenchDone = 1; 6157 cces--; 6158 goto saveState; 6159 } 6160 dest[i++]=(uint8_t)CE; 6161 } 6162 } 6163 } 6164 if(uprv_numAvailableExpCEs(s)) { 6165 canUpdateState = FALSE; 6166 } else { 6167 canUpdateState = TRUE; 6168 } 6169 } 6170 /* fall through to next level */ 6171 case UCOL_PSK_SECONDARY: 6172 if(strength >= UCOL_SECONDARY) { 6173 if(!doingFrench) { 6174 for(;;) { 6175 if(i == count) { 6176 goto saveState; 6177 } 6178 // We should save the state only if we 6179 // are sure that we are done with the 6180 // previous iterator state 6181 if(canUpdateState) { 6182 newState = s.iterator->getState(s.iterator); 6183 if(newState != UITER_NO_STATE) { 6184 iterState = newState; 6185 cces = 0; 6186 } 6187 } 6188 CE = ucol_IGetNextCE(coll, &s, status); 6189 cces++; 6190 
if(CE==UCOL_NO_MORE_CES) { 6191 // Add the level separator 6192 terminatePSKLevel(level, maxLevel, i, dest); 6193 byteCountOrFrenchDone = 0; 6194 // Restart the iteration an move to the 6195 // second level 6196 s.iterator->move(s.iterator, 0, UITER_START); 6197 cces = 0; 6198 level = UCOL_PSK_CASE; 6199 break; 6200 } 6201 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6202 CE >>= 8; /* get secondary */ 6203 if(CE != 0) { 6204 dest[i++]=(uint8_t)CE; 6205 } 6206 } 6207 if(uprv_numAvailableExpCEs(s)) { 6208 canUpdateState = FALSE; 6209 } else { 6210 canUpdateState = TRUE; 6211 } 6212 } 6213 } else { // French secondary processing 6214 uint8_t frenchBuff[UCOL_MAX_BUFFER]; 6215 int32_t frenchIndex = 0; 6216 // Here we are going backwards. 6217 // If the iterator is at the beggining, it should be 6218 // moved to end. 6219 if(wasDoingPrimary) { 6220 s.iterator->move(s.iterator, 0, UITER_LIMIT); 6221 cces = 0; 6222 } 6223 for(;;) { 6224 if(i == count) { 6225 goto saveState; 6226 } 6227 if(canUpdateState) { 6228 newState = s.iterator->getState(s.iterator); 6229 if(newState != UITER_NO_STATE) { 6230 iterState = newState; 6231 cces = 0; 6232 } 6233 } 6234 CE = ucol_IGetPrevCE(coll, &s, status); 6235 cces++; 6236 if(CE==UCOL_NO_MORE_CES) { 6237 // Add the level separator 6238 terminatePSKLevel(level, maxLevel, i, dest); 6239 byteCountOrFrenchDone = 0; 6240 // Restart the iteration an move to the next level 6241 s.iterator->move(s.iterator, 0, UITER_START); 6242 level = UCOL_PSK_CASE; 6243 break; 6244 } 6245 if(isContinuation(CE)) { // if it's a continuation, we want to save it and 6246 // reverse when we get a first non-continuation CE. 
6247 CE >>= 8; 6248 frenchBuff[frenchIndex++] = (uint8_t)CE; 6249 } else if(!isShiftedCE(CE, LVT, &wasShifted)) { 6250 CE >>= 8; /* get secondary */ 6251 if(!frenchIndex) { 6252 if(CE != 0) { 6253 dest[i++]=(uint8_t)CE; 6254 } 6255 } else { 6256 frenchBuff[frenchIndex++] = (uint8_t)CE; 6257 frenchIndex -= usedFrench; 6258 usedFrench = 0; 6259 while(i < count && frenchIndex) { 6260 dest[i++] = frenchBuff[--frenchIndex]; 6261 usedFrench++; 6262 } 6263 } 6264 } 6265 if(uprv_numAvailableExpCEs(s)) { 6266 canUpdateState = FALSE; 6267 } else { 6268 canUpdateState = TRUE; 6269 } 6270 } 6271 } 6272 } else { 6273 level = UCOL_PSK_CASE; 6274 } 6275 /* fall through to next level */ 6276 case UCOL_PSK_CASE: 6277 if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, status) == UCOL_ON) { 6278 uint32_t caseShift = UCOL_CASE_SHIFT_START; 6279 uint8_t caseByte = UCOL_CASE_BYTE_START; 6280 uint8_t caseBits = 0; 6281 6282 for(;;) { 6283 if(i == count) { 6284 goto saveState; 6285 } 6286 // We should save the state only if we 6287 // are sure that we are done with the 6288 // previous iterator state 6289 if(canUpdateState) { 6290 newState = s.iterator->getState(s.iterator); 6291 if(newState != UITER_NO_STATE) { 6292 iterState = newState; 6293 cces = 0; 6294 } 6295 } 6296 CE = ucol_IGetNextCE(coll, &s, status); 6297 cces++; 6298 if(CE==UCOL_NO_MORE_CES) { 6299 // On the case level we might have an unfinished 6300 // case byte. Add one if it's started. 6301 if(caseShift != UCOL_CASE_SHIFT_START) { 6302 dest[i++] = caseByte; 6303 } 6304 cces = 0; 6305 // We have finished processing CEs on this level. 6306 // However, we don't know if we have enough space 6307 // to add a case level terminator. 
6308 if(i < count) { 6309 // Add the level separator 6310 terminatePSKLevel(level, maxLevel, i, dest); 6311 // Restart the iteration and move to the 6312 // next level 6313 s.iterator->move(s.iterator, 0, UITER_START); 6314 level = UCOL_PSK_TERTIARY; 6315 } else { 6316 canUpdateState = FALSE; 6317 } 6318 break; 6319 } 6320 6321 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6322 if(!isContinuation(CE) && ((CE & UCOL_PRIMARYMASK) != 0 || strength > UCOL_PRIMARY)) { 6323 // do the case level if we need to do it. We don't want to calculate 6324 // case level for primary ignorables if we have only primary strength and case level 6325 // otherwise we would break well formedness of CEs 6326 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 6327 caseBits = (uint8_t)(CE & 0xC0); 6328 // this copies the case level logic from the 6329 // sort key generation code 6330 if(CE != 0) { 6331 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6332 if((caseBits & 0xC0) == 0) { 6333 caseByte |= 1 << (--caseShift); 6334 } else { 6335 caseByte |= 0 << (--caseShift); 6336 /* second bit */ 6337 if(caseShift == 0) { 6338 dest[i++] = caseByte; 6339 caseShift = UCOL_CASE_SHIFT_START; 6340 caseByte = UCOL_CASE_BYTE_START; 6341 } 6342 caseByte |= ((caseBits>>6)&1) << (--caseShift); 6343 } 6344 } else { 6345 if((caseBits & 0xC0) == 0) { 6346 caseByte |= 0 << (--caseShift); 6347 } else { 6348 caseByte |= 1 << (--caseShift); 6349 /* second bit */ 6350 if(caseShift == 0) { 6351 dest[i++] = caseByte; 6352 caseShift = UCOL_CASE_SHIFT_START; 6353 caseByte = UCOL_CASE_BYTE_START; 6354 } 6355 caseByte |= ((caseBits>>7)&1) << (--caseShift); 6356 } 6357 } 6358 } 6359 6360 } 6361 } 6362 // Not sure this is correct for the case level - revisit 6363 if(uprv_numAvailableExpCEs(s)) { 6364 canUpdateState = FALSE; 6365 } else { 6366 canUpdateState = TRUE; 6367 } 6368 } 6369 } else { 6370 level = UCOL_PSK_TERTIARY; 6371 } 6372 /* fall through to next level */ 6373 case UCOL_PSK_TERTIARY: 6374 if(strength >= UCOL_TERTIARY) { 6375 
for(;;) { 6376 if(i == count) { 6377 goto saveState; 6378 } 6379 // We should save the state only if we 6380 // are sure that we are done with the 6381 // previous iterator state 6382 if(canUpdateState) { 6383 newState = s.iterator->getState(s.iterator); 6384 if(newState != UITER_NO_STATE) { 6385 iterState = newState; 6386 cces = 0; 6387 } 6388 } 6389 CE = ucol_IGetNextCE(coll, &s, status); 6390 cces++; 6391 if(CE==UCOL_NO_MORE_CES) { 6392 // Add the level separator 6393 terminatePSKLevel(level, maxLevel, i, dest); 6394 byteCountOrFrenchDone = 0; 6395 // Restart the iteration an move to the 6396 // second level 6397 s.iterator->move(s.iterator, 0, UITER_START); 6398 cces = 0; 6399 level = UCOL_PSK_QUATERNARY; 6400 break; 6401 } 6402 if(!isShiftedCE(CE, LVT, &wasShifted)) { 6403 notIsContinuation = !isContinuation(CE); 6404 6405 if(notIsContinuation) { 6406 CE = (uint8_t)(CE & UCOL_BYTE_SIZE_MASK); 6407 CE ^= coll->caseSwitch; 6408 CE &= coll->tertiaryMask; 6409 } else { 6410 CE = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6411 } 6412 6413 if(CE != 0) { 6414 dest[i++]=(uint8_t)CE; 6415 } 6416 } 6417 if(uprv_numAvailableExpCEs(s)) { 6418 canUpdateState = FALSE; 6419 } else { 6420 canUpdateState = TRUE; 6421 } 6422 } 6423 } else { 6424 // if we're not doing tertiary 6425 // skip to the end 6426 level = UCOL_PSK_NULL; 6427 } 6428 /* fall through to next level */ 6429 case UCOL_PSK_QUATERNARY: 6430 if(strength >= UCOL_QUATERNARY) { 6431 for(;;) { 6432 if(i == count) { 6433 goto saveState; 6434 } 6435 // We should save the state only if we 6436 // are sure that we are done with the 6437 // previous iterator state 6438 if(canUpdateState) { 6439 newState = s.iterator->getState(s.iterator); 6440 if(newState != UITER_NO_STATE) { 6441 iterState = newState; 6442 cces = 0; 6443 } 6444 } 6445 CE = ucol_IGetNextCE(coll, &s, status); 6446 cces++; 6447 if(CE==UCOL_NO_MORE_CES) { 6448 // Add the level separator 6449 terminatePSKLevel(level, maxLevel, i, dest); 6450 //dest[i++] = 
UCOL_LEVELTERMINATOR; 6451 byteCountOrFrenchDone = 0; 6452 // Restart the iteration an move to the 6453 // second level 6454 s.iterator->move(s.iterator, 0, UITER_START); 6455 cces = 0; 6456 level = UCOL_PSK_QUIN; 6457 break; 6458 } 6459 if(CE==0) 6460 continue; 6461 if(isShiftedCE(CE, LVT, &wasShifted)) { 6462 CE >>= 16; /* get primary */ 6463 if(CE != 0) { 6464 if(byteCountOrFrenchDone == 0) { 6465 dest[i++]=(uint8_t)(CE >> 8); 6466 } else { 6467 byteCountOrFrenchDone = 0; 6468 } 6469 if((CE &=0xff)!=0) { 6470 if(i==count) { 6471 /* overflow */ 6472 byteCountOrFrenchDone = 1; 6473 goto saveState; 6474 } 6475 dest[i++]=(uint8_t)CE; 6476 } 6477 } 6478 } else { 6479 notIsContinuation = !isContinuation(CE); 6480 if(notIsContinuation) { 6481 if(s.flags & UCOL_WAS_HIRAGANA) { // This was Hiragana and we need to note it 6482 dest[i++] = UCOL_HIRAGANA_QUAD; 6483 } else { 6484 dest[i++] = 0xFF; 6485 } 6486 } 6487 } 6488 if(uprv_numAvailableExpCEs(s)) { 6489 canUpdateState = FALSE; 6490 } else { 6491 canUpdateState = TRUE; 6492 } 6493 } 6494 } else { 6495 // if we're not doing quaternary 6496 // skip to the end 6497 level = UCOL_PSK_NULL; 6498 } 6499 /* fall through to next level */ 6500 case UCOL_PSK_QUIN: 6501 level = UCOL_PSK_IDENTICAL; 6502 /* fall through to next level */ 6503 case UCOL_PSK_IDENTICAL: 6504 if(strength >= UCOL_IDENTICAL) { 6505 UChar32 first, second; 6506 int32_t bocsuBytesWritten = 0; 6507 // We always need to do identical on 6508 // the NFD form of the string. 6509 if(normIter == NULL) { 6510 // we arrived from the level below and 6511 // normalization was not turned on. 6512 // therefore, we need to make a fresh NFD iterator 6513 normIter = unorm_openIter(stackNormIter, sizeof(stackNormIter), status); 6514 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 6515 } else if(!doingIdenticalFromStart) { 6516 // there is an iterator, but we did some other levels. 6517 // therefore, we have a FCD iterator - need to make 6518 // a NFD one. 
6519 // normIter being at the beginning does not guarantee 6520 // that the underlying iterator is at the beginning 6521 iter->move(iter, 0, UITER_START); 6522 s.iterator = unorm_setIter(normIter, iter, UNORM_NFD, status); 6523 } 6524 // At this point we have a NFD iterator that is positioned 6525 // in the right place 6526 if(U_FAILURE(*status)) { 6527 UTRACE_EXIT_STATUS(*status); 6528 return 0; 6529 } 6530 first = uiter_previous32(s.iterator); 6531 // maybe we're at the start of the string 6532 if(first == U_SENTINEL) { 6533 first = 0; 6534 } else { 6535 uiter_next32(s.iterator); 6536 } 6537 6538 j = 0; 6539 for(;;) { 6540 if(i == count) { 6541 if(j+1 < bocsuBytesWritten) { 6542 bocsuBytesUsed = j+1; 6543 } 6544 goto saveState; 6545 } 6546 6547 // On identical level, we will always save 6548 // the state if we reach this point, since 6549 // we don't depend on getNextCE for content 6550 // all the content is in our buffer and we 6551 // already either stored the full buffer OR 6552 // otherwise we won't arrive here. 
6553 newState = s.iterator->getState(s.iterator); 6554 if(newState != UITER_NO_STATE) { 6555 iterState = newState; 6556 cces = 0; 6557 } 6558 6559 uint8_t buff[4]; 6560 second = uiter_next32(s.iterator); 6561 cces++; 6562 6563 // end condition for identical level 6564 if(second == U_SENTINEL) { 6565 terminatePSKLevel(level, maxLevel, i, dest); 6566 level = UCOL_PSK_NULL; 6567 break; 6568 } 6569 bocsuBytesWritten = u_writeIdenticalLevelRunTwoChars(first, second, buff); 6570 first = second; 6571 6572 j = 0; 6573 if(bocsuBytesUsed != 0) { 6574 while(bocsuBytesUsed-->0) { 6575 j++; 6576 } 6577 } 6578 6579 while(i < count && j < bocsuBytesWritten) { 6580 dest[i++] = buff[j++]; 6581 } 6582 } 6583 6584 } else { 6585 level = UCOL_PSK_NULL; 6586 } 6587 /* fall through to next level */ 6588 case UCOL_PSK_NULL: 6589 j = i; 6590 while(j<count) { 6591 dest[j++]=0; 6592 } 6593 break; 6594 default: 6595 *status = U_INTERNAL_PROGRAM_ERROR; 6596 UTRACE_EXIT_STATUS(*status); 6597 return 0; 6598 } 6599 6600saveState: 6601 // Now we need to return stuff. First we want to see whether we have 6602 // done everything for the current state of iterator. 6603 if(byteCountOrFrenchDone 6604 || canUpdateState == FALSE 6605 || (newState = s.iterator->getState(s.iterator)) == UITER_NO_STATE) 6606 { 6607 // Any of above mean that the previous transaction 6608 // wasn't finished and that we should store the 6609 // previous iterator state. 6610 state[0] = iterState; 6611 } else { 6612 // The transaction is complete. We will continue in the next iteration. 6613 state[0] = s.iterator->getState(s.iterator); 6614 cces = 0; 6615 } 6616 // Store the number of bocsu bytes written. 
6617 if((bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) != bocsuBytesUsed) { 6618 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6619 } 6620 state[1] = (bocsuBytesUsed & UCOL_PSK_BOCSU_BYTES_MASK) << UCOL_PSK_BOCSU_BYTES_SHIFT; 6621 6622 // Next we put in the level of comparison 6623 state[1] |= ((level & UCOL_PSK_LEVEL_MASK) << UCOL_PSK_LEVEL_SHIFT); 6624 6625 // If we are doing French, we need to store whether we have just finished the French level 6626 if(level == UCOL_PSK_SECONDARY && doingFrench) { 6627 state[1] |= (((state[0] == 0) & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6628 } else { 6629 state[1] |= ((byteCountOrFrenchDone & UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_MASK) << UCOL_PSK_BYTE_COUNT_OR_FRENCH_DONE_SHIFT); 6630 } 6631 6632 // Was the latest CE shifted 6633 if(wasShifted) { 6634 state[1] |= 1 << UCOL_PSK_WAS_SHIFTED_SHIFT; 6635 } 6636 // Check for cces overflow 6637 if((cces & UCOL_PSK_CONSUMED_CES_MASK) != cces) { 6638 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6639 } 6640 // Store cces 6641 state[1] |= ((cces & UCOL_PSK_CONSUMED_CES_MASK) << UCOL_PSK_CONSUMED_CES_SHIFT); 6642 6643 // Check for French overflow 6644 if((usedFrench & UCOL_PSK_USED_FRENCH_MASK) != usedFrench) { 6645 *status = U_INDEX_OUTOFBOUNDS_ERROR; 6646 } 6647 // Store number of bytes written in the French secondary continuation sequence 6648 state[1] |= ((usedFrench & UCOL_PSK_USED_FRENCH_MASK) << UCOL_PSK_USED_FRENCH_SHIFT); 6649 6650 6651 // If we have used normalizing iterator, get rid of it 6652 if(normIter != NULL) { 6653 unorm_closeIter(normIter); 6654 } 6655 6656 /* To avoid memory leak, free the offset buffer if necessary. */ 6657 ucol_freeOffsetBuffer(&s); 6658 6659 // Return number of meaningful sortkey bytes. 6660 UTRACE_DATA4(UTRACE_VERBOSE, "dest = %vb, state=%d %d", 6661 dest,i, state[0], state[1]); 6662 UTRACE_EXIT_VALUE(i); 6663 return i; 6664} 6665 6666/** 6667 * Produce a bound for a given sortkey and a number of levels. 
6668 */ 6669U_CAPI int32_t U_EXPORT2 6670ucol_getBound(const uint8_t *source, 6671 int32_t sourceLength, 6672 UColBoundMode boundType, 6673 uint32_t noOfLevels, 6674 uint8_t *result, 6675 int32_t resultLength, 6676 UErrorCode *status) 6677{ 6678 // consistency checks 6679 if(status == NULL || U_FAILURE(*status)) { 6680 return 0; 6681 } 6682 if(source == NULL) { 6683 *status = U_ILLEGAL_ARGUMENT_ERROR; 6684 return 0; 6685 } 6686 6687 int32_t sourceIndex = 0; 6688 // Scan the string until we skip enough of the key OR reach the end of the key 6689 do { 6690 sourceIndex++; 6691 if(source[sourceIndex] == UCOL_LEVELTERMINATOR) { 6692 noOfLevels--; 6693 } 6694 } while (noOfLevels > 0 6695 && (source[sourceIndex] != 0 || sourceIndex < sourceLength)); 6696 6697 if((source[sourceIndex] == 0 || sourceIndex == sourceLength) 6698 && noOfLevels > 0) { 6699 *status = U_SORT_KEY_TOO_SHORT_WARNING; 6700 } 6701 6702 6703 // READ ME: this code assumes that the values for boundType 6704 // enum will not changes. They are set so that the enum value 6705 // corresponds to the number of extra bytes each bound type 6706 // needs. 6707 if(result != NULL && resultLength >= sourceIndex+boundType) { 6708 uprv_memcpy(result, source, sourceIndex); 6709 switch(boundType) { 6710 // Lower bound just gets terminated. 
No extra bytes 6711 case UCOL_BOUND_LOWER: // = 0 6712 break; 6713 // Upper bound needs one extra byte 6714 case UCOL_BOUND_UPPER: // = 1 6715 result[sourceIndex++] = 2; 6716 break; 6717 // Upper long bound needs two extra bytes 6718 case UCOL_BOUND_UPPER_LONG: // = 2 6719 result[sourceIndex++] = 0xFF; 6720 result[sourceIndex++] = 0xFF; 6721 break; 6722 default: 6723 *status = U_ILLEGAL_ARGUMENT_ERROR; 6724 return 0; 6725 } 6726 result[sourceIndex++] = 0; 6727 6728 return sourceIndex; 6729 } else { 6730 return sourceIndex+boundType+1; 6731 } 6732} 6733 6734/****************************************************************************/ 6735/* Following are the functions that deal with the properties of a collator */ 6736/* there are new APIs and some compatibility APIs */ 6737/****************************************************************************/ 6738 6739static inline void 6740ucol_addLatinOneEntry(UCollator *coll, UChar ch, uint32_t CE, 6741 int32_t *primShift, int32_t *secShift, int32_t *terShift) 6742{ 6743 uint8_t primary1 = 0, primary2 = 0, secondary = 0, tertiary = 0; 6744 UBool reverseSecondary = FALSE; 6745 if(!isContinuation(CE)) { 6746 tertiary = (uint8_t)((CE & coll->tertiaryMask)); 6747 tertiary ^= coll->caseSwitch; 6748 reverseSecondary = TRUE; 6749 } else { 6750 tertiary = (uint8_t)((CE & UCOL_REMOVE_CONTINUATION)); 6751 tertiary &= UCOL_REMOVE_CASE; 6752 reverseSecondary = FALSE; 6753 } 6754 6755 secondary = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6756 primary2 = (uint8_t)((CE >>= 8) & UCOL_BYTE_SIZE_MASK); 6757 primary1 = (uint8_t)(CE >> 8); 6758 6759 if(primary1 != 0) { 6760 coll->latinOneCEs[ch] |= (primary1 << *primShift); 6761 *primShift -= 8; 6762 } 6763 if(primary2 != 0) { 6764 if(*primShift < 0) { 6765 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6766 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6767 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6768 return; 6769 } 6770 coll->latinOneCEs[ch] |= 
(primary2 << *primShift); 6771 *primShift -= 8; 6772 } 6773 if(secondary != 0) { 6774 if(reverseSecondary && coll->frenchCollation == UCOL_ON) { // reverse secondary 6775 coll->latinOneCEs[coll->latinOneTableLen+ch] >>= 8; // make space for secondary 6776 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << 24); 6777 } else { // normal case 6778 coll->latinOneCEs[coll->latinOneTableLen+ch] |= (secondary << *secShift); 6779 } 6780 *secShift -= 8; 6781 } 6782 if(tertiary != 0) { 6783 coll->latinOneCEs[2*coll->latinOneTableLen+ch] |= (tertiary << *terShift); 6784 *terShift -= 8; 6785 } 6786} 6787 6788static inline UBool 6789ucol_resizeLatinOneTable(UCollator *coll, int32_t size, UErrorCode *status) { 6790 uint32_t *newTable = (uint32_t *)uprv_malloc(size*sizeof(uint32_t)*3); 6791 if(newTable == NULL) { 6792 *status = U_MEMORY_ALLOCATION_ERROR; 6793 coll->latinOneFailed = TRUE; 6794 return FALSE; 6795 } 6796 int32_t sizeToCopy = ((size<coll->latinOneTableLen)?size:coll->latinOneTableLen)*sizeof(uint32_t); 6797 uprv_memset(newTable, 0, size*sizeof(uint32_t)*3); 6798 uprv_memcpy(newTable, coll->latinOneCEs, sizeToCopy); 6799 uprv_memcpy(newTable+size, coll->latinOneCEs+coll->latinOneTableLen, sizeToCopy); 6800 uprv_memcpy(newTable+2*size, coll->latinOneCEs+2*coll->latinOneTableLen, sizeToCopy); 6801 coll->latinOneTableLen = size; 6802 uprv_free(coll->latinOneCEs); 6803 coll->latinOneCEs = newTable; 6804 return TRUE; 6805} 6806 6807static UBool 6808ucol_setUpLatinOne(UCollator *coll, UErrorCode *status) { 6809 UBool result = TRUE; 6810 if(coll->latinOneCEs == NULL) { 6811 coll->latinOneCEs = (uint32_t *)uprv_malloc(sizeof(uint32_t)*UCOL_LATINONETABLELEN*3); 6812 if(coll->latinOneCEs == NULL) { 6813 *status = U_MEMORY_ALLOCATION_ERROR; 6814 return FALSE; 6815 } 6816 coll->latinOneTableLen = UCOL_LATINONETABLELEN; 6817 } 6818 UChar ch = 0; 6819 UCollationElements *it = ucol_openElements(coll, &ch, 1, status); 6820 // Check for null pointer 6821 if 
(U_FAILURE(*status)) { 6822 return FALSE; 6823 } 6824 uprv_memset(coll->latinOneCEs, 0, sizeof(uint32_t)*coll->latinOneTableLen*3); 6825 6826 int32_t primShift = 24, secShift = 24, terShift = 24; 6827 uint32_t CE = 0; 6828 int32_t contractionOffset = UCOL_ENDOFLATINONERANGE+1; 6829 6830 // TODO: make safe if you get more than you wanted... 6831 for(ch = 0; ch <= UCOL_ENDOFLATINONERANGE; ch++) { 6832 primShift = 24; secShift = 24; terShift = 24; 6833 if(ch < 0x100) { 6834 CE = coll->latinOneMapping[ch]; 6835 } else { 6836 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, ch); 6837 if(CE == UCOL_NOT_FOUND && coll->UCA) { 6838 CE = UTRIE_GET32_FROM_LEAD(&coll->UCA->mapping, ch); 6839 } 6840 } 6841 if(CE < UCOL_NOT_FOUND) { 6842 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6843 } else { 6844 switch (getCETag(CE)) { 6845 case EXPANSION_TAG: 6846 case DIGIT_TAG: 6847 ucol_setText(it, &ch, 1, status); 6848 while((int32_t)(CE = ucol_next(it, status)) != UCOL_NULLORDER) { 6849 if(primShift < 0 || secShift < 0 || terShift < 0) { 6850 coll->latinOneCEs[ch] = UCOL_BAIL_OUT_CE; 6851 coll->latinOneCEs[coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6852 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = UCOL_BAIL_OUT_CE; 6853 break; 6854 } 6855 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6856 } 6857 break; 6858 case CONTRACTION_TAG: 6859 // here is the trick 6860 // F2 is contraction. We do something very similar to contractions 6861 // but have two indices, one in the real contraction table and the 6862 // other to where we stuffed things. This hopes that we don't have 6863 // many contractions (this should work for latin-1 tables). 
6864 { 6865 if((CE & 0x00FFF000) != 0) { 6866 *status = U_UNSUPPORTED_ERROR; 6867 goto cleanup_after_failure; 6868 } 6869 6870 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 6871 6872 CE |= (contractionOffset & 0xFFF) << 12; // insert the offset in latin-1 table 6873 6874 coll->latinOneCEs[ch] = CE; 6875 coll->latinOneCEs[coll->latinOneTableLen+ch] = CE; 6876 coll->latinOneCEs[2*coll->latinOneTableLen+ch] = CE; 6877 6878 // We're going to jump into contraction table, pick the elements 6879 // and use them 6880 do { 6881 CE = *(coll->contractionCEs + 6882 (UCharOffset - coll->contractionIndex)); 6883 if(CE > UCOL_NOT_FOUND && getCETag(CE) == EXPANSION_TAG) { 6884 uint32_t size; 6885 uint32_t i; /* general counter */ 6886 uint32_t *CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */ 6887 size = getExpansionCount(CE); 6888 //CE = *CEOffset++; 6889 if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ 6890 for(i = 0; i<size; i++) { 6891 if(primShift < 0 || secShift < 0 || terShift < 0) { 6892 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6893 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6894 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6895 break; 6896 } 6897 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6898 } 6899 } else { /* else, we do */ 6900 while(*CEOffset != 0) { 6901 if(primShift < 0 || secShift < 0 || terShift < 0) { 6902 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6903 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6904 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6905 break; 6906 } 6907 ucol_addLatinOneEntry(coll, (UChar)contractionOffset, *CEOffset++, &primShift, &secShift, &terShift); 6908 
} 6909 } 6910 contractionOffset++; 6911 } else if(CE < UCOL_NOT_FOUND) { 6912 ucol_addLatinOneEntry(coll, (UChar)contractionOffset++, CE, &primShift, &secShift, &terShift); 6913 } else { 6914 coll->latinOneCEs[(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6915 coll->latinOneCEs[coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6916 coll->latinOneCEs[2*coll->latinOneTableLen+(UChar)contractionOffset] = UCOL_BAIL_OUT_CE; 6917 contractionOffset++; 6918 } 6919 UCharOffset++; 6920 primShift = 24; secShift = 24; terShift = 24; 6921 if(contractionOffset == coll->latinOneTableLen) { // we need to reallocate 6922 if(!ucol_resizeLatinOneTable(coll, 2*coll->latinOneTableLen, status)) { 6923 goto cleanup_after_failure; 6924 } 6925 } 6926 } while(*UCharOffset != 0xFFFF); 6927 } 6928 break;; 6929 case SPEC_PROC_TAG: 6930 { 6931 // 0xB7 is a precontext character defined in UCA5.1, a special 6932 // handle is implemeted in order to save LatinOne table for 6933 // most locales. 6934 if (ch==0xb7) { 6935 ucol_addLatinOneEntry(coll, ch, CE, &primShift, &secShift, &terShift); 6936 } 6937 else { 6938 goto cleanup_after_failure; 6939 } 6940 } 6941 break; 6942 default: 6943 goto cleanup_after_failure; 6944 } 6945 } 6946 } 6947 // compact table 6948 if(contractionOffset < coll->latinOneTableLen) { 6949 if(!ucol_resizeLatinOneTable(coll, contractionOffset, status)) { 6950 goto cleanup_after_failure; 6951 } 6952 } 6953 ucol_closeElements(it); 6954 return result; 6955 6956cleanup_after_failure: 6957 // status should already be set before arriving here. 
6958 coll->latinOneFailed = TRUE; 6959 ucol_closeElements(it); 6960 return FALSE; 6961} 6962 6963void ucol_updateInternalState(UCollator *coll, UErrorCode *status) { 6964 if(U_SUCCESS(*status)) { 6965 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6966 coll->caseSwitch = UCOL_CASE_SWITCH; 6967 } else { 6968 coll->caseSwitch = UCOL_NO_CASE_SWITCH; 6969 } 6970 6971 if(coll->caseLevel == UCOL_ON || coll->caseFirst == UCOL_OFF) { 6972 coll->tertiaryMask = UCOL_REMOVE_CASE; 6973 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6974 coll->tertiaryAddition = (int8_t)UCOL_FLAG_BIT_MASK_CASE_SW_OFF; /* Should be 0x80 */ 6975 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_OFF; 6976 coll->tertiaryBottom = UCOL_COMMON_BOT3; 6977 } else { 6978 coll->tertiaryMask = UCOL_KEEP_CASE; 6979 coll->tertiaryAddition = UCOL_FLAG_BIT_MASK_CASE_SW_ON; 6980 if(coll->caseFirst == UCOL_UPPER_FIRST) { 6981 coll->tertiaryCommon = UCOL_COMMON3_UPPERFIRST; 6982 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_UPPER; 6983 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_UPPER; 6984 } else { 6985 coll->tertiaryCommon = UCOL_COMMON3_NORMAL; 6986 coll->tertiaryTop = UCOL_COMMON_TOP3_CASE_SW_LOWER; 6987 coll->tertiaryBottom = UCOL_COMMON_BOTTOM3_CASE_SW_LOWER; 6988 } 6989 } 6990 6991 /* Set the compression values */ 6992 uint8_t tertiaryTotal = (uint8_t)(coll->tertiaryTop - UCOL_COMMON_BOT3-1); 6993 coll->tertiaryTopCount = (uint8_t)(UCOL_PROPORTION3*tertiaryTotal); /* we multilply double with int, but need only int */ 6994 coll->tertiaryBottomCount = (uint8_t)(tertiaryTotal - coll->tertiaryTopCount); 6995 6996 if(coll->caseLevel == UCOL_OFF && coll->strength == UCOL_TERTIARY 6997 && coll->frenchCollation == UCOL_OFF && coll->alternateHandling == UCOL_NON_IGNORABLE) 6998 { 6999 coll->sortKeyGen = ucol_calcSortKeySimpleTertiary; 7000 } else { 7001 coll->sortKeyGen = ucol_calcSortKey; 7002 } 7003 if(coll->caseLevel == UCOL_OFF && coll->strength <= UCOL_TERTIARY && coll->numericCollation == UCOL_OFF 7004 && 
coll->alternateHandling == UCOL_NON_IGNORABLE && !coll->latinOneFailed) 7005 { 7006 if(coll->latinOneCEs == NULL || coll->latinOneRegenTable) { 7007 if(ucol_setUpLatinOne(coll, status)) { // if we succeed in building latin1 table, we'll use it 7008 //fprintf(stderr, "F"); 7009 coll->latinOneUse = TRUE; 7010 } else { 7011 coll->latinOneUse = FALSE; 7012 } 7013 if(*status == U_UNSUPPORTED_ERROR) { 7014 *status = U_ZERO_ERROR; 7015 } 7016 } else { // latin1Table exists and it doesn't need to be regenerated, just use it 7017 coll->latinOneUse = TRUE; 7018 } 7019 } else { 7020 coll->latinOneUse = FALSE; 7021 } 7022 } 7023} 7024 7025U_CAPI uint32_t U_EXPORT2 7026ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCode *status) { 7027 if(U_FAILURE(*status) || coll == NULL) { 7028 return 0; 7029 } 7030 if(len == -1) { 7031 len = u_strlen(varTop); 7032 } 7033 if(len == 0) { 7034 *status = U_ILLEGAL_ARGUMENT_ERROR; 7035 return 0; 7036 } 7037 7038 collIterate s; 7039 IInit_collIterate(coll, varTop, len, &s); 7040 7041 uint32_t CE = ucol_IGetNextCE(coll, &s, status); 7042 7043 /* here we check if we have consumed all characters */ 7044 /* you can put in either one character or a contraction */ 7045 /* you shouldn't put more... */ 7046 if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) { 7047 *status = U_CE_NOT_FOUND_ERROR; 7048 return 0; 7049 } 7050 7051 uint32_t nextCE = ucol_IGetNextCE(coll, &s, status); 7052 7053 if(isContinuation(nextCE) && (nextCE & UCOL_PRIMARYMASK) != 0) { 7054 *status = U_PRIMARY_TOO_LONG_ERROR; 7055 return 0; 7056 } 7057 if(coll->variableTopValue != (CE & UCOL_PRIMARYMASK)>>16) { 7058 coll->variableTopValueisDefault = FALSE; 7059 coll->variableTopValue = (CE & UCOL_PRIMARYMASK)>>16; 7060 } 7061 7062 /* To avoid memory leak, free the offset buffer if necessary. 
*/ 7063 ucol_freeOffsetBuffer(&s); 7064 7065 return CE & UCOL_PRIMARYMASK; 7066} 7067 7068U_CAPI uint32_t U_EXPORT2 ucol_getVariableTop(const UCollator *coll, UErrorCode *status) { 7069 if(U_FAILURE(*status) || coll == NULL) { 7070 return 0; 7071 } 7072 return coll->variableTopValue<<16; 7073} 7074 7075U_CAPI void U_EXPORT2 7076ucol_restoreVariableTop(UCollator *coll, const uint32_t varTop, UErrorCode *status) { 7077 if(U_FAILURE(*status) || coll == NULL) { 7078 return; 7079 } 7080 7081 if(coll->variableTopValue != (varTop & UCOL_PRIMARYMASK)>>16) { 7082 coll->variableTopValueisDefault = FALSE; 7083 coll->variableTopValue = (varTop & UCOL_PRIMARYMASK)>>16; 7084 } 7085} 7086/* Attribute setter API */ 7087U_CAPI void U_EXPORT2 7088ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value, UErrorCode *status) { 7089 if(U_FAILURE(*status) || coll == NULL) { 7090 return; 7091 } 7092 UColAttributeValue oldFrench = coll->frenchCollation; 7093 UColAttributeValue oldCaseFirst = coll->caseFirst; 7094 switch(attr) { 7095 case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */ 7096 if(value == UCOL_ON) { 7097 coll->numericCollation = UCOL_ON; 7098 coll->numericCollationisDefault = FALSE; 7099 } else if (value == UCOL_OFF) { 7100 coll->numericCollation = UCOL_OFF; 7101 coll->numericCollationisDefault = FALSE; 7102 } else if (value == UCOL_DEFAULT) { 7103 coll->numericCollationisDefault = TRUE; 7104 coll->numericCollation = (UColAttributeValue)coll->options->numericCollation; 7105 } else { 7106 *status = U_ILLEGAL_ARGUMENT_ERROR; 7107 } 7108 break; 7109 case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */ 7110 if(value == UCOL_ON) { 7111 coll->hiraganaQ = UCOL_ON; 7112 coll->hiraganaQisDefault = FALSE; 7113 } else if (value == UCOL_OFF) { 7114 coll->hiraganaQ = UCOL_OFF; 7115 coll->hiraganaQisDefault = FALSE; 7116 } else if (value == UCOL_DEFAULT) { 7117 coll->hiraganaQisDefault = TRUE; 7118 coll->hiraganaQ = 
(UColAttributeValue)coll->options->hiraganaQ; 7119 } else { 7120 *status = U_ILLEGAL_ARGUMENT_ERROR; 7121 } 7122 break; 7123 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 7124 if(value == UCOL_ON) { 7125 coll->frenchCollation = UCOL_ON; 7126 coll->frenchCollationisDefault = FALSE; 7127 } else if (value == UCOL_OFF) { 7128 coll->frenchCollation = UCOL_OFF; 7129 coll->frenchCollationisDefault = FALSE; 7130 } else if (value == UCOL_DEFAULT) { 7131 coll->frenchCollationisDefault = TRUE; 7132 coll->frenchCollation = (UColAttributeValue)coll->options->frenchCollation; 7133 } else { 7134 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7135 } 7136 break; 7137 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 7138 if(value == UCOL_SHIFTED) { 7139 coll->alternateHandling = UCOL_SHIFTED; 7140 coll->alternateHandlingisDefault = FALSE; 7141 } else if (value == UCOL_NON_IGNORABLE) { 7142 coll->alternateHandling = UCOL_NON_IGNORABLE; 7143 coll->alternateHandlingisDefault = FALSE; 7144 } else if (value == UCOL_DEFAULT) { 7145 coll->alternateHandlingisDefault = TRUE; 7146 coll->alternateHandling = (UColAttributeValue)coll->options->alternateHandling ; 7147 } else { 7148 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7149 } 7150 break; 7151 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 7152 if(value == UCOL_LOWER_FIRST) { 7153 coll->caseFirst = UCOL_LOWER_FIRST; 7154 coll->caseFirstisDefault = FALSE; 7155 } else if (value == UCOL_UPPER_FIRST) { 7156 coll->caseFirst = UCOL_UPPER_FIRST; 7157 coll->caseFirstisDefault = FALSE; 7158 } else if (value == UCOL_OFF) { 7159 coll->caseFirst = UCOL_OFF; 7160 coll->caseFirstisDefault = FALSE; 7161 } else if (value == UCOL_DEFAULT) { 7162 coll->caseFirst = (UColAttributeValue)coll->options->caseFirst; 7163 coll->caseFirstisDefault = TRUE; 7164 } else { 7165 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7166 } 7167 break; 7168 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 7169 if(value 
== UCOL_ON) { 7170 coll->caseLevel = UCOL_ON; 7171 coll->caseLevelisDefault = FALSE; 7172 } else if (value == UCOL_OFF) { 7173 coll->caseLevel = UCOL_OFF; 7174 coll->caseLevelisDefault = FALSE; 7175 } else if (value == UCOL_DEFAULT) { 7176 coll->caseLevel = (UColAttributeValue)coll->options->caseLevel; 7177 coll->caseLevelisDefault = TRUE; 7178 } else { 7179 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7180 } 7181 break; 7182 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 7183 if(value == UCOL_ON) { 7184 coll->normalizationMode = UCOL_ON; 7185 coll->normalizationModeisDefault = FALSE; 7186 } else if (value == UCOL_OFF) { 7187 coll->normalizationMode = UCOL_OFF; 7188 coll->normalizationModeisDefault = FALSE; 7189 } else if (value == UCOL_DEFAULT) { 7190 coll->normalizationModeisDefault = TRUE; 7191 coll->normalizationMode = (UColAttributeValue)coll->options->normalizationMode; 7192 } else { 7193 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7194 } 7195 break; 7196 case UCOL_STRENGTH: /* attribute for strength */ 7197 if (value == UCOL_DEFAULT) { 7198 coll->strengthisDefault = TRUE; 7199 coll->strength = (UColAttributeValue)coll->options->strength; 7200 } else if (value <= UCOL_IDENTICAL) { 7201 coll->strengthisDefault = FALSE; 7202 coll->strength = value; 7203 } else { 7204 *status = U_ILLEGAL_ARGUMENT_ERROR ; 7205 } 7206 break; 7207 case UCOL_ATTRIBUTE_COUNT: 7208 default: 7209 *status = U_ILLEGAL_ARGUMENT_ERROR; 7210 break; 7211 } 7212 if(oldFrench != coll->frenchCollation || oldCaseFirst != coll->caseFirst) { 7213 coll->latinOneRegenTable = TRUE; 7214 } else { 7215 coll->latinOneRegenTable = FALSE; 7216 } 7217 ucol_updateInternalState(coll, status); 7218} 7219 7220U_CAPI UColAttributeValue U_EXPORT2 7221ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status) { 7222 if(U_FAILURE(*status) || coll == NULL) { 7223 return UCOL_DEFAULT; 7224 } 7225 switch(attr) { 7226 case UCOL_NUMERIC_COLLATION: 7227 return coll->numericCollation; 7228 case 
UCOL_HIRAGANA_QUATERNARY_MODE: 7229 return coll->hiraganaQ; 7230 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 7231 return coll->frenchCollation; 7232 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 7233 return coll->alternateHandling; 7234 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 7235 return coll->caseFirst; 7236 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 7237 return coll->caseLevel; 7238 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 7239 return coll->normalizationMode; 7240 case UCOL_STRENGTH: /* attribute for strength */ 7241 return coll->strength; 7242 case UCOL_ATTRIBUTE_COUNT: 7243 default: 7244 *status = U_ILLEGAL_ARGUMENT_ERROR; 7245 break; 7246 } 7247 return UCOL_DEFAULT; 7248} 7249 7250U_CAPI void U_EXPORT2 7251ucol_setStrength( UCollator *coll, 7252 UCollationStrength strength) 7253{ 7254 UErrorCode status = U_ZERO_ERROR; 7255 ucol_setAttribute(coll, UCOL_STRENGTH, strength, &status); 7256} 7257 7258U_CAPI UCollationStrength U_EXPORT2 7259ucol_getStrength(const UCollator *coll) 7260{ 7261 UErrorCode status = U_ZERO_ERROR; 7262 return ucol_getAttribute(coll, UCOL_STRENGTH, &status); 7263} 7264 7265/****************************************************************************/ 7266/* Following are misc functions */ 7267/* there are new APIs and some compatibility APIs */ 7268/****************************************************************************/ 7269 7270U_CAPI void U_EXPORT2 7271ucol_getVersion(const UCollator* coll, 7272 UVersionInfo versionInfo) 7273{ 7274 /* RunTime version */ 7275 uint8_t rtVersion = UCOL_RUNTIME_VERSION; 7276 /* Builder version*/ 7277 uint8_t bdVersion = coll->image->version[0]; 7278 7279 /* Charset Version. 
Need to get the version from cnv files 7280 * makeconv should populate cnv files with version and 7281 * an api has to be provided in ucnv.h to obtain this version 7282 */ 7283 uint8_t csVersion = 0; 7284 7285 /* combine the version info */ 7286 uint16_t cmbVersion = (uint16_t)((rtVersion<<11) | (bdVersion<<6) | (csVersion)); 7287 7288 /* Tailoring rules */ 7289 versionInfo[0] = (uint8_t)(cmbVersion>>8); 7290 versionInfo[1] = (uint8_t)cmbVersion; 7291 versionInfo[2] = coll->image->version[1]; 7292 if(coll->UCA) { 7293 /* Include the minor number when getting the UCA version. (major & 1f) << 3 | (minor & 7) */ 7294 versionInfo[3] = (coll->UCA->image->UCAVersion[0] & 0x1f) << 3 | (coll->UCA->image->UCAVersion[1] & 0x07); 7295 } else { 7296 versionInfo[3] = 0; 7297 } 7298} 7299 7300 7301/* This internal API checks whether a character is tailored or not */ 7302U_CAPI UBool U_EXPORT2 7303ucol_isTailored(const UCollator *coll, const UChar u, UErrorCode *status) { 7304 if(U_FAILURE(*status) || coll == NULL || coll == coll->UCA) { 7305 return FALSE; 7306 } 7307 7308 uint32_t CE = UCOL_NOT_FOUND; 7309 const UChar *ContractionStart = NULL; 7310 if(u < 0x100) { /* latin-1 */ 7311 CE = coll->latinOneMapping[u]; 7312 if(coll->UCA && CE == coll->UCA->latinOneMapping[u]) { 7313 return FALSE; 7314 } 7315 } else { /* regular */ 7316 CE = UTRIE_GET32_FROM_LEAD(&coll->mapping, u); 7317 } 7318 7319 if(isContraction(CE)) { 7320 ContractionStart = (UChar *)coll->image+getContractOffset(CE); 7321 CE = *(coll->contractionCEs + (ContractionStart- coll->contractionIndex)); 7322 } 7323 7324 return (UBool)(CE != UCOL_NOT_FOUND); 7325} 7326 7327 7328/****************************************************************************/ 7329/* Following are the string compare functions */ 7330/* */ 7331/****************************************************************************/ 7332 7333 7334/* ucol_checkIdent internal function. Does byte level string compare. 
*/ 7335/* Used by strcoll if strength == identical and strings */ 7336/* are otherwise equal. Moved out-of-line because this */ 7337/* is a rare case. */ 7338/* */ 7339/* Comparison must be done on NFD normalized strings. */ 7340/* FCD is not good enough. */ 7341/* */ 7342/* TODO: make an incremental NFD Comparison function, which could */ 7343/* be of general use */ 7344 7345static 7346UCollationResult ucol_checkIdent(collIterate *sColl, collIterate *tColl, UBool normalize, UErrorCode *status) 7347{ 7348 7349 // TODO: When we have an UChar iterator, we need to access the whole string. One 7350 // useful modification would be a UChar iterator extract API, since reset next next... 7351 // is not optimal. 7352 // TODO: Handle long strings. Do the same in compareUsingSortKeys. 7353 7354 // When we arrive here, we can have normal strings or UCharIterators. Currently they are both 7355 // of same type, but that doesn't really mean that it will stay that way. 7356 7357 // The division for the array length may truncate the array size to 7358 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 7359 // for all platforms anyway. 
7360 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 7361 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 7362 //UChar sStackBuf[256], tStackBuf[256]; 7363 //int32_t sBufSize = 256, tBufSize = 256; 7364 int32_t comparison; 7365 int32_t sLen = 0; 7366 UChar *sBuf = NULL; 7367 int32_t tLen = 0; 7368 UChar *tBuf = NULL; 7369 UBool freeSBuf = FALSE, freeTBuf = FALSE; 7370 7371 if (sColl->flags & UCOL_USE_ITERATOR) { 7372 UNormIterator *sNIt = NULL, *tNIt = NULL; 7373 sNIt = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 7374 tNIt = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 7375 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7376 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7377 UCharIterator *sIt = unorm_setIter(sNIt, sColl->iterator, UNORM_NFD, status); 7378 UCharIterator *tIt = unorm_setIter(tNIt, tColl->iterator, UNORM_NFD, status); 7379 comparison = u_strCompareIter(sIt, tIt, TRUE); 7380 unorm_closeIter(sNIt); 7381 unorm_closeIter(tNIt); 7382 } else { 7383 sLen = (sColl->flags & UCOL_ITER_HASLEN) ? sColl->endp - sColl->string : -1; 7384 sBuf = sColl->string; 7385 tLen = (tColl->flags & UCOL_ITER_HASLEN) ? 
tColl->endp - tColl->string : -1; 7386 tBuf = tColl->string; 7387 7388 if (normalize) { 7389 *status = U_ZERO_ERROR; 7390 if (unorm_quickCheck(sBuf, sLen, UNORM_NFD, status) != UNORM_YES) { 7391 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize, 7392 sBuf, sLen, 7393 FALSE, 0, 7394 status); 7395 if(*status == U_BUFFER_OVERFLOW_ERROR) { 7396 if(!u_growBufferFromStatic(sColl->stackWritableBuffer, 7397 &sColl->writableBuffer, 7398 (int32_t *)&sColl->writableBufSize, sLen, 7399 0) 7400 ) 7401 { 7402 *status = U_MEMORY_ALLOCATION_ERROR; 7403 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */ 7404 } 7405 *status = U_ZERO_ERROR; 7406 sLen = unorm_decompose(sColl->writableBuffer, (int32_t)sColl->writableBufSize, 7407 sBuf, sLen, 7408 FALSE, 0, 7409 status); 7410 } 7411 if(freeSBuf) { 7412 uprv_free(sBuf); 7413 freeSBuf = FALSE; 7414 } 7415 sBuf = sColl->writableBuffer; 7416 if (sBuf != sColl->stackWritableBuffer) { 7417 sColl->flags |= UCOL_ITER_ALLOCATED; 7418 } 7419 } 7420 7421 *status = U_ZERO_ERROR; 7422 if (unorm_quickCheck(tBuf, tLen, UNORM_NFD, status) != UNORM_YES) { 7423 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize, 7424 tBuf, tLen, 7425 FALSE, 0, 7426 status); 7427 if(*status == U_BUFFER_OVERFLOW_ERROR) { 7428 if(!u_growBufferFromStatic(tColl->stackWritableBuffer, 7429 &tColl->writableBuffer, 7430 (int32_t *)&tColl->writableBufSize, tLen, 7431 0) 7432 ) 7433 { 7434 *status = U_MEMORY_ALLOCATION_ERROR; 7435 return UCOL_LESS; /* TODO set *status = U_MEMORY_ALLOCATION_ERROR; */ 7436 } 7437 *status = U_ZERO_ERROR; 7438 tLen = unorm_decompose(tColl->writableBuffer, (int32_t)tColl->writableBufSize, 7439 tBuf, tLen, 7440 FALSE, 0, 7441 status); 7442 } 7443 if(freeTBuf) { 7444 uprv_free(tBuf); 7445 freeTBuf = FALSE; 7446 } 7447 tBuf = tColl->writableBuffer; 7448 if (tBuf != tColl->stackWritableBuffer) { 7449 tColl->flags |= UCOL_ITER_ALLOCATED; 7450 } 7451 } 7452 } 7453 7454 if (sLen == -1 && 
tLen == -1) { 7455 comparison = u_strcmpCodePointOrder(sBuf, tBuf); 7456 } else { 7457 if (sLen == -1) { 7458 sLen = u_strlen(sBuf); 7459 } 7460 if (tLen == -1) { 7461 tLen = u_strlen(tBuf); 7462 } 7463 comparison = u_memcmpCodePointOrder(sBuf, tBuf, uprv_min(sLen, tLen)); 7464 if (comparison == 0) { 7465 comparison = sLen - tLen; 7466 } 7467 } 7468 } 7469 7470 if (comparison < 0) { 7471 return UCOL_LESS; 7472 } else if (comparison == 0) { 7473 return UCOL_EQUAL; 7474 } else /* comparison > 0 */ { 7475 return UCOL_GREATER; 7476 } 7477} 7478 7479/* CEBuf - A struct and some inline functions to handle the saving */ 7480/* of CEs in a buffer within ucol_strcoll */ 7481 7482#define UCOL_CEBUF_SIZE 512 7483typedef struct ucol_CEBuf { 7484 uint32_t *buf; 7485 uint32_t *endp; 7486 uint32_t *pos; 7487 uint32_t localArray[UCOL_CEBUF_SIZE]; 7488} ucol_CEBuf; 7489 7490 7491static 7492inline void UCOL_INIT_CEBUF(ucol_CEBuf *b) { 7493 (b)->buf = (b)->pos = (b)->localArray; 7494 (b)->endp = (b)->buf + UCOL_CEBUF_SIZE; 7495} 7496 7497static 7498void ucol_CEBuf_Expand(ucol_CEBuf *b, collIterate *ci, UErrorCode *status) { 7499 uint32_t oldSize; 7500 uint32_t newSize; 7501 uint32_t *newBuf; 7502 7503 ci->flags |= UCOL_ITER_ALLOCATED; 7504 oldSize = b->pos - b->buf; 7505 newSize = oldSize * 2; 7506 newBuf = (uint32_t *)uprv_malloc(newSize * sizeof(uint32_t)); 7507 if(newBuf == NULL) { 7508 *status = U_MEMORY_ALLOCATION_ERROR; 7509 } 7510 else { 7511 uprv_memcpy(newBuf, b->buf, oldSize * sizeof(uint32_t)); 7512 if (b->buf != b->localArray) { 7513 uprv_free(b->buf); 7514 } 7515 b->buf = newBuf; 7516 b->endp = b->buf + newSize; 7517 b->pos = b->buf + oldSize; 7518 } 7519} 7520 7521static 7522inline void UCOL_CEBUF_PUT(ucol_CEBuf *b, uint32_t ce, collIterate *ci, UErrorCode *status) { 7523 if (b->pos == b->endp) { 7524 ucol_CEBuf_Expand(b, ci, status); 7525 } 7526 if (U_SUCCESS(*status)) { 7527 *(b)->pos++ = ce; 7528 } 7529} 7530 7531/* This is a trick string compare function that goes 
in and uses sortkeys to compare */ 7532/* It is used when compare gets in trouble and needs to bail out */ 7533static UCollationResult ucol_compareUsingSortKeys(collIterate *sColl, 7534 collIterate *tColl, 7535 UErrorCode *status) 7536{ 7537 uint8_t sourceKey[UCOL_MAX_BUFFER], targetKey[UCOL_MAX_BUFFER]; 7538 uint8_t *sourceKeyP = sourceKey; 7539 uint8_t *targetKeyP = targetKey; 7540 int32_t sourceKeyLen = UCOL_MAX_BUFFER, targetKeyLen = UCOL_MAX_BUFFER; 7541 const UCollator *coll = sColl->coll; 7542 UChar *source = NULL; 7543 UChar *target = NULL; 7544 int32_t result = UCOL_EQUAL; 7545 UChar sStackBuf[256], tStackBuf[256]; 7546 int32_t sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1; 7547 int32_t targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1; 7548 7549 // TODO: Handle long strings. Do the same in ucol_checkIdent. 7550 if(sColl->flags & UCOL_USE_ITERATOR) { 7551 sColl->iterator->move(sColl->iterator, 0, UITER_START); 7552 tColl->iterator->move(tColl->iterator, 0, UITER_START); 7553 source = sStackBuf; 7554 UChar *sBufp = source; 7555 target = tStackBuf; 7556 UChar *tBufp = target; 7557 while(sColl->iterator->hasNext(sColl->iterator)) { 7558 *sBufp++ = (UChar)sColl->iterator->next(sColl->iterator); 7559 } 7560 while(tColl->iterator->hasNext(tColl->iterator)) { 7561 *tBufp++ = (UChar)tColl->iterator->next(tColl->iterator); 7562 } 7563 sourceLength = sBufp - source; 7564 targetLength = tBufp - target; 7565 } else { // no iterators 7566 sourceLength = (sColl->flags&UCOL_ITER_HASLEN)?(sColl->endp-sColl->string):-1; 7567 targetLength = (tColl->flags&UCOL_ITER_HASLEN)?(tColl->endp-tColl->string):-1; 7568 source = sColl->string; 7569 target = tColl->string; 7570 } 7571 7572 7573 7574 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7575 if(sourceKeyLen > UCOL_MAX_BUFFER) { 7576 sourceKeyP = (uint8_t*)uprv_malloc(sourceKeyLen*sizeof(uint8_t)); 7577 if(sourceKeyP == NULL) { 
7578 *status = U_MEMORY_ALLOCATION_ERROR; 7579 goto cleanup_and_do_compare; 7580 } 7581 sourceKeyLen = ucol_getSortKey(coll, source, sourceLength, sourceKeyP, sourceKeyLen); 7582 } 7583 7584 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7585 if(targetKeyLen > UCOL_MAX_BUFFER) { 7586 targetKeyP = (uint8_t*)uprv_malloc(targetKeyLen*sizeof(uint8_t)); 7587 if(targetKeyP == NULL) { 7588 *status = U_MEMORY_ALLOCATION_ERROR; 7589 goto cleanup_and_do_compare; 7590 } 7591 targetKeyLen = ucol_getSortKey(coll, target, targetLength, targetKeyP, targetKeyLen); 7592 } 7593 7594 result = uprv_strcmp((const char*)sourceKeyP, (const char*)targetKeyP); 7595 7596cleanup_and_do_compare: 7597 if(sourceKeyP != NULL && sourceKeyP != sourceKey) { 7598 uprv_free(sourceKeyP); 7599 } 7600 7601 if(targetKeyP != NULL && targetKeyP != targetKey) { 7602 uprv_free(targetKeyP); 7603 } 7604 7605 if(result<0) { 7606 return UCOL_LESS; 7607 } else if(result>0) { 7608 return UCOL_GREATER; 7609 } else { 7610 return UCOL_EQUAL; 7611 } 7612} 7613 7614 7615static inline UCollationResult 7616ucol_strcollRegular( collIterate *sColl, collIterate *tColl, 7617// const UCollator *coll, 7618// const UChar *source, 7619// int32_t sourceLength, 7620// const UChar *target, 7621// int32_t targetLength, 7622 UErrorCode *status) 7623{ 7624 U_ALIGN_CODE(16); 7625 7626 const UCollator *coll = sColl->coll; 7627 7628 7629 // setting up the collator parameters 7630 UColAttributeValue strength = coll->strength; 7631 UBool initialCheckSecTer = (strength >= UCOL_SECONDARY); 7632 7633 UBool checkSecTer = initialCheckSecTer; 7634 UBool checkTertiary = (strength >= UCOL_TERTIARY); 7635 UBool checkQuad = (strength >= UCOL_QUATERNARY); 7636 UBool checkIdent = (strength == UCOL_IDENTICAL); 7637 UBool checkCase = (coll->caseLevel == UCOL_ON); 7638 UBool isFrenchSec = (coll->frenchCollation == UCOL_ON) && checkSecTer; 7639 UBool shifted = (coll->alternateHandling == UCOL_SHIFTED); 7640 UBool 
qShifted = shifted && checkQuad; 7641 UBool doHiragana = (coll->hiraganaQ == UCOL_ON) && checkQuad; 7642 7643 if(doHiragana && shifted) { 7644 return (ucol_compareUsingSortKeys(sColl, tColl, status)); 7645 } 7646 uint8_t caseSwitch = coll->caseSwitch; 7647 uint8_t tertiaryMask = coll->tertiaryMask; 7648 7649 // This is the lowest primary value that will not be ignored if shifted 7650 uint32_t LVT = (shifted)?(coll->variableTopValue<<16):0; 7651 7652 UCollationResult result = UCOL_EQUAL; 7653 UCollationResult hirResult = UCOL_EQUAL; 7654 7655 // Preparing the CE buffers. They will be filled during the primary phase 7656 ucol_CEBuf sCEs; 7657 ucol_CEBuf tCEs; 7658 UCOL_INIT_CEBUF(&sCEs); 7659 UCOL_INIT_CEBUF(&tCEs); 7660 7661 uint32_t secS = 0, secT = 0; 7662 uint32_t sOrder=0, tOrder=0; 7663 7664 // Non shifted primary processing is quite simple 7665 if(!shifted) { 7666 for(;;) { 7667 7668 // We fetch CEs until we hit a non ignorable primary or end. 7669 do { 7670 // We get the next CE 7671 sOrder = ucol_IGetNextCE(coll, sColl, status); 7672 // Stuff it in the buffer 7673 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7674 // And keep just the primary part. 7675 sOrder &= UCOL_PRIMARYMASK; 7676 } while(sOrder == 0); 7677 7678 // see the comments on the above block 7679 do { 7680 tOrder = ucol_IGetNextCE(coll, tColl, status); 7681 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7682 tOrder &= UCOL_PRIMARYMASK; 7683 } while(tOrder == 0); 7684 7685 // if both primaries are the same 7686 if(sOrder == tOrder) { 7687 // and there are no more CEs, we advance to the next level 7688 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7689 break; 7690 } 7691 if(doHiragana && hirResult == UCOL_EQUAL) { 7692 if((sColl->flags & UCOL_WAS_HIRAGANA) != (tColl->flags & UCOL_WAS_HIRAGANA)) { 7693 hirResult = ((sColl->flags & UCOL_WAS_HIRAGANA) > (tColl->flags & UCOL_WAS_HIRAGANA)) 7694 ? 
UCOL_LESS:UCOL_GREATER; 7695 } 7696 } 7697 } else { 7698 // if two primaries are different, we are done 7699 result = (sOrder < tOrder) ? UCOL_LESS: UCOL_GREATER; 7700 goto commonReturn; 7701 } 7702 } // no primary difference... do the rest from the buffers 7703 } else { // shifted - do a slightly more complicated processing :) 7704 for(;;) { 7705 UBool sInShifted = FALSE; 7706 UBool tInShifted = FALSE; 7707 // This version of code can be refactored. However, it seems easier to understand this way. 7708 // Source loop. Sam as the target loop. 7709 for(;;) { 7710 sOrder = ucol_IGetNextCE(coll, sColl, status); 7711 if(sOrder == UCOL_NO_MORE_CES) { 7712 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7713 break; 7714 } else if(sOrder == 0 || (sInShifted && (sOrder & UCOL_PRIMARYMASK) == 0)) { 7715 /* UCA amendment - ignore ignorables that follow shifted code points */ 7716 continue; 7717 } else if(isContinuation(sOrder)) { 7718 if((sOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 7719 if(sInShifted) { 7720 sOrder = (sOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7721 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7722 continue; 7723 } else { 7724 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7725 break; 7726 } 7727 } else { /* Just lower level values */ 7728 if(sInShifted) { 7729 continue; 7730 } else { 7731 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7732 continue; 7733 } 7734 } 7735 } else { /* regular */ 7736 if((sOrder & UCOL_PRIMARYMASK) > LVT) { 7737 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7738 break; 7739 } else { 7740 if((sOrder & UCOL_PRIMARYMASK) > 0) { 7741 sInShifted = TRUE; 7742 sOrder &= UCOL_PRIMARYMASK; 7743 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7744 continue; 7745 } else { 7746 UCOL_CEBUF_PUT(&sCEs, sOrder, sColl, status); 7747 sInShifted = FALSE; 7748 continue; 7749 } 7750 } 7751 } 7752 } 7753 sOrder &= UCOL_PRIMARYMASK; 7754 sInShifted = FALSE; 7755 7756 for(;;) { 7757 tOrder = 
ucol_IGetNextCE(coll, tColl, status); 7758 if(tOrder == UCOL_NO_MORE_CES) { 7759 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7760 break; 7761 } else if(tOrder == 0 || (tInShifted && (tOrder & UCOL_PRIMARYMASK) == 0)) { 7762 /* UCA amendment - ignore ignorables that follow shifted code points */ 7763 continue; 7764 } else if(isContinuation(tOrder)) { 7765 if((tOrder & UCOL_PRIMARYMASK) > 0) { /* There is primary value */ 7766 if(tInShifted) { 7767 tOrder = (tOrder & UCOL_PRIMARYMASK) | 0xC0; /* preserve interesting continuation */ 7768 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7769 continue; 7770 } else { 7771 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7772 break; 7773 } 7774 } else { /* Just lower level values */ 7775 if(tInShifted) { 7776 continue; 7777 } else { 7778 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7779 continue; 7780 } 7781 } 7782 } else { /* regular */ 7783 if((tOrder & UCOL_PRIMARYMASK) > LVT) { 7784 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7785 break; 7786 } else { 7787 if((tOrder & UCOL_PRIMARYMASK) > 0) { 7788 tInShifted = TRUE; 7789 tOrder &= UCOL_PRIMARYMASK; 7790 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7791 continue; 7792 } else { 7793 UCOL_CEBUF_PUT(&tCEs, tOrder, tColl, status); 7794 tInShifted = FALSE; 7795 continue; 7796 } 7797 } 7798 } 7799 } 7800 tOrder &= UCOL_PRIMARYMASK; 7801 tInShifted = FALSE; 7802 7803 if(sOrder == tOrder) { 7804 /* 7805 if(doHiragana && hirResult == UCOL_EQUAL) { 7806 if((sColl.flags & UCOL_WAS_HIRAGANA) != (tColl.flags & UCOL_WAS_HIRAGANA)) { 7807 hirResult = ((sColl.flags & UCOL_WAS_HIRAGANA) > (tColl.flags & UCOL_WAS_HIRAGANA)) 7808 ? UCOL_LESS:UCOL_GREATER; 7809 } 7810 } 7811 */ 7812 if(sOrder == UCOL_NO_MORE_CES_PRIMARY) { 7813 break; 7814 } else { 7815 sOrder = 0; 7816 tOrder = 0; 7817 continue; 7818 } 7819 } else { 7820 result = (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER; 7821 goto commonReturn; 7822 } 7823 } /* no primary difference... 
do the rest from the buffers */ 7824 } 7825 7826 /* now, we're gonna reexamine collected CEs */ 7827 uint32_t *sCE; 7828 uint32_t *tCE; 7829 7830 /* This is the secondary level of comparison */ 7831 if(checkSecTer) { 7832 if(!isFrenchSec) { /* normal */ 7833 sCE = sCEs.buf; 7834 tCE = tCEs.buf; 7835 for(;;) { 7836 while (secS == 0) { 7837 secS = *(sCE++) & UCOL_SECONDARYMASK; 7838 } 7839 7840 while(secT == 0) { 7841 secT = *(tCE++) & UCOL_SECONDARYMASK; 7842 } 7843 7844 if(secS == secT) { 7845 if(secS == UCOL_NO_MORE_CES_SECONDARY) { 7846 break; 7847 } else { 7848 secS = 0; secT = 0; 7849 continue; 7850 } 7851 } else { 7852 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7853 goto commonReturn; 7854 } 7855 } 7856 } else { /* do the French */ 7857 uint32_t *sCESave = NULL; 7858 uint32_t *tCESave = NULL; 7859 sCE = sCEs.pos-2; /* this could also be sCEs-- if needs to be optimized */ 7860 tCE = tCEs.pos-2; 7861 for(;;) { 7862 while (secS == 0 && sCE >= sCEs.buf) { 7863 if(sCESave == 0) { 7864 secS = *(sCE--); 7865 if(isContinuation(secS)) { 7866 while(isContinuation(secS = *(sCE--))) 7867 ; 7868 /* after this, secS has the start of continuation, and sCEs points before that */ 7869 sCESave = sCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7870 sCE+=2; /* need to point to the first continuation CP */ 7871 /* However, now you can just continue doing stuff */ 7872 } 7873 } else { 7874 secS = *(sCE++); 7875 if(!isContinuation(secS)) { /* This means we have finished with this cont */ 7876 sCE = sCESave; /* reset the pointer to before continuation */ 7877 sCESave = 0; 7878 continue; 7879 } 7880 } 7881 secS &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7882 } 7883 7884 while(secT == 0 && tCE >= tCEs.buf) { 7885 if(tCESave == 0) { 7886 secT = *(tCE--); 7887 if(isContinuation(secT)) { 7888 while(isContinuation(secT = *(tCE--))) 7889 ; 7890 /* after this, secS has the start of continuation, and sCEs points before that */ 
7891 tCESave = tCE; /* we save it, so that we know where to come back AND that we need to go forward */ 7892 tCE+=2; /* need to point to the first continuation CP */ 7893 /* However, now you can just continue doing stuff */ 7894 } 7895 } else { 7896 secT = *(tCE++); 7897 if(!isContinuation(secT)) { /* This means we have finished with this cont */ 7898 tCE = tCESave; /* reset the pointer to before continuation */ 7899 tCESave = 0; 7900 continue; 7901 } 7902 } 7903 secT &= UCOL_SECONDARYMASK; /* remove the continuation bit */ 7904 } 7905 7906 if(secS == secT) { 7907 if(secS == UCOL_NO_MORE_CES_SECONDARY || (sCE < sCEs.buf && tCE < tCEs.buf)) { 7908 break; 7909 } else { 7910 secS = 0; secT = 0; 7911 continue; 7912 } 7913 } else { 7914 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 7915 goto commonReturn; 7916 } 7917 } 7918 } 7919 } 7920 7921 /* doing the case bit */ 7922 if(checkCase) { 7923 sCE = sCEs.buf; 7924 tCE = tCEs.buf; 7925 for(;;) { 7926 while((secS & UCOL_REMOVE_CASE) == 0) { 7927 if(!isContinuation(*sCE++)) { 7928 secS =*(sCE-1); 7929 if(((secS & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7930 // primary ignorables should not be considered on the case level when the strength is primary 7931 // otherwise, the CEs stop being well-formed 7932 secS &= UCOL_TERT_CASE_MASK; 7933 secS ^= caseSwitch; 7934 } else { 7935 secS = 0; 7936 } 7937 } else { 7938 secS = 0; 7939 } 7940 } 7941 7942 while((secT & UCOL_REMOVE_CASE) == 0) { 7943 if(!isContinuation(*tCE++)) { 7944 secT = *(tCE-1); 7945 if(((secT & UCOL_PRIMARYMASK) != 0) || strength > UCOL_PRIMARY) { 7946 // primary ignorables should not be considered on the case level when the strength is primary 7947 // otherwise, the CEs stop being well-formed 7948 secT &= UCOL_TERT_CASE_MASK; 7949 secT ^= caseSwitch; 7950 } else { 7951 secT = 0; 7952 } 7953 } else { 7954 secT = 0; 7955 } 7956 } 7957 7958 if((secS & UCOL_CASE_BIT_MASK) < (secT & UCOL_CASE_BIT_MASK)) { 7959 result = UCOL_LESS; 7960 goto 
commonReturn; 7961 } else if((secS & UCOL_CASE_BIT_MASK) > (secT & UCOL_CASE_BIT_MASK)) { 7962 result = UCOL_GREATER; 7963 goto commonReturn; 7964 } 7965 7966 if((secS & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY || (secT & UCOL_REMOVE_CASE) == UCOL_NO_MORE_CES_TERTIARY ) { 7967 break; 7968 } else { 7969 secS = 0; 7970 secT = 0; 7971 } 7972 } 7973 } 7974 7975 /* Tertiary level */ 7976 if(checkTertiary) { 7977 secS = 0; 7978 secT = 0; 7979 sCE = sCEs.buf; 7980 tCE = tCEs.buf; 7981 for(;;) { 7982 while((secS & UCOL_REMOVE_CASE) == 0) { 7983 secS = *(sCE++) & tertiaryMask; 7984 if(!isContinuation(secS)) { 7985 secS ^= caseSwitch; 7986 } else { 7987 secS &= UCOL_REMOVE_CASE; 7988 } 7989 } 7990 7991 while((secT & UCOL_REMOVE_CASE) == 0) { 7992 secT = *(tCE++) & tertiaryMask; 7993 if(!isContinuation(secT)) { 7994 secT ^= caseSwitch; 7995 } else { 7996 secT &= UCOL_REMOVE_CASE; 7997 } 7998 } 7999 8000 if(secS == secT) { 8001 if((secS & UCOL_REMOVE_CASE) == 1) { 8002 break; 8003 } else { 8004 secS = 0; secT = 0; 8005 continue; 8006 } 8007 } else { 8008 result = (secS < secT) ? 
UCOL_LESS : UCOL_GREATER; 8009 goto commonReturn; 8010 } 8011 } 8012 } 8013 8014 8015 if(qShifted /*checkQuad*/) { 8016 UBool sInShifted = TRUE; 8017 UBool tInShifted = TRUE; 8018 secS = 0; 8019 secT = 0; 8020 sCE = sCEs.buf; 8021 tCE = tCEs.buf; 8022 for(;;) { 8023 while(secS == 0 && secS != UCOL_NO_MORE_CES || (isContinuation(secS) && !sInShifted)) { 8024 secS = *(sCE++); 8025 if(isContinuation(secS)) { 8026 if(!sInShifted) { 8027 continue; 8028 } 8029 } else if(secS > LVT || (secS & UCOL_PRIMARYMASK) == 0) { /* non continuation */ 8030 secS = UCOL_PRIMARYMASK; 8031 sInShifted = FALSE; 8032 } else { 8033 sInShifted = TRUE; 8034 } 8035 } 8036 secS &= UCOL_PRIMARYMASK; 8037 8038 8039 while(secT == 0 && secT != UCOL_NO_MORE_CES || (isContinuation(secT) && !tInShifted)) { 8040 secT = *(tCE++); 8041 if(isContinuation(secT)) { 8042 if(!tInShifted) { 8043 continue; 8044 } 8045 } else if(secT > LVT || (secT & UCOL_PRIMARYMASK) == 0) { 8046 secT = UCOL_PRIMARYMASK; 8047 tInShifted = FALSE; 8048 } else { 8049 tInShifted = TRUE; 8050 } 8051 } 8052 secT &= UCOL_PRIMARYMASK; 8053 8054 if(secS == secT) { 8055 if(secS == UCOL_NO_MORE_CES_PRIMARY) { 8056 break; 8057 } else { 8058 secS = 0; secT = 0; 8059 continue; 8060 } 8061 } else { 8062 result = (secS < secT) ? UCOL_LESS : UCOL_GREATER; 8063 goto commonReturn; 8064 } 8065 } 8066 } else if(doHiragana && hirResult != UCOL_EQUAL) { 8067 // If we're fine on quaternaries, we might be different 8068 // on Hiragana. This, however, might fail us in shifted. 8069 result = hirResult; 8070 goto commonReturn; 8071 } 8072 8073 /* For IDENTICAL comparisons, we use a bitwise character comparison */ 8074 /* as a tiebreaker if all else is equal. */ 8075 /* Getting here should be quite rare - strings are not identical - */ 8076 /* that is checked first, but compared == through all other checks. 
*/ 8077 if(checkIdent) 8078 { 8079 //result = ucol_checkIdent(&sColl, &tColl, coll->normalizationMode == UCOL_ON); 8080 result = ucol_checkIdent(sColl, tColl, TRUE, status); 8081 } 8082 8083commonReturn: 8084 if ((sColl->flags | tColl->flags) & UCOL_ITER_ALLOCATED) { 8085 freeHeapWritableBuffer(sColl); 8086 freeHeapWritableBuffer(tColl); 8087 8088 if (sCEs.buf != sCEs.localArray ) { 8089 uprv_free(sCEs.buf); 8090 } 8091 if (tCEs.buf != tCEs.localArray ) { 8092 uprv_free(tCEs.buf); 8093 } 8094 } 8095 8096 return result; 8097} 8098 8099 8100static inline uint32_t 8101ucol_getLatinOneContraction(const UCollator *coll, int32_t strength, 8102 uint32_t CE, const UChar *s, int32_t *index, int32_t len) 8103{ 8104 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE&0xFFF); 8105 int32_t latinOneOffset = (CE & 0x00FFF000) >> 12; 8106 int32_t offset = 1; 8107 UChar schar = 0, tchar = 0; 8108 8109 for(;;) { 8110 if(len == -1) { 8111 if(s[*index] == 0) { // end of string 8112 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8113 } else { 8114 schar = s[*index]; 8115 } 8116 } else { 8117 if(*index == len) { 8118 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8119 } else { 8120 schar = s[*index]; 8121 } 8122 } 8123 8124 while(schar > (tchar = *(UCharOffset+offset))) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ 8125 offset++; 8126 } 8127 8128 if (schar == tchar) { 8129 (*index)++; 8130 return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset+offset]); 8131 } 8132 else 8133 { 8134 if(schar & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) { 8135 return UCOL_BAIL_OUT_CE; 8136 } 8137 // skip completely ignorables 8138 uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, schar); 8139 if(isZeroCE == 0) { // we have to ignore completely ignorables 8140 (*index)++; 8141 continue; 8142 } 8143 8144 
return(coll->latinOneCEs[strength*coll->latinOneTableLen+latinOneOffset]); 8145 } 8146 } 8147} 8148 8149 8150/** 8151 * This is a fast strcoll, geared towards text in Latin-1. 8152 * It supports contractions of size two, French secondaries 8153 * and case switching. You can use it with strengths primary 8154 * to tertiary. It does not support shifted and case level. 8155 * It relies on the table build by setupLatin1Table. If it 8156 * doesn't understand something, it will go to the regular 8157 * strcoll. 8158 */ 8159static inline UCollationResult 8160ucol_strcollUseLatin1( const UCollator *coll, 8161 const UChar *source, 8162 int32_t sLen, 8163 const UChar *target, 8164 int32_t tLen, 8165 UErrorCode *status) 8166{ 8167 U_ALIGN_CODE(16); 8168 int32_t strength = coll->strength; 8169 8170 int32_t sIndex = 0, tIndex = 0; 8171 UChar sChar = 0, tChar = 0; 8172 uint32_t sOrder=0, tOrder=0; 8173 8174 UBool endOfSource = FALSE; 8175 8176 uint32_t *elements = coll->latinOneCEs; 8177 8178 UBool haveContractions = FALSE; // if we have contractions in our string 8179 // we cannot do French secondary 8180 8181 // Do the primary level 8182 for(;;) { 8183 while(sOrder==0) { // this loop skips primary ignorables 8184 // sOrder=getNextlatinOneCE(source); 8185 if(sLen==-1) { // handling zero terminated strings 8186 sChar=source[sIndex++]; 8187 if(sChar==0) { 8188 endOfSource = TRUE; 8189 break; 8190 } 8191 } else { // handling strings with known length 8192 if(sIndex==sLen) { 8193 endOfSource = TRUE; 8194 break; 8195 } 8196 sChar=source[sIndex++]; 8197 } 8198 if(sChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 8199 //fprintf(stderr, "R"); 8200 goto returnRegular; 8201 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8202 } 8203 sOrder = elements[sChar]; 8204 if(sOrder >= UCOL_NOT_FOUND) { // if we got a special 8205 // specials can basically be either contractions or bail-out signs. 
If we get anything 8206 // else, we'll bail out anywasy 8207 if(getCETag(sOrder) == CONTRACTION_TAG) { 8208 sOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen); 8209 haveContractions = TRUE; // if there are contractions, we cannot do French secondary 8210 // However, if there are contractions in the table, but we always use just one char, 8211 // we might be able to do French. This should be checked out. 8212 } 8213 if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 8214 //fprintf(stderr, "S"); 8215 goto returnRegular; 8216 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8217 } 8218 } 8219 } 8220 8221 while(tOrder==0) { // this loop skips primary ignorables 8222 // tOrder=getNextlatinOneCE(target); 8223 if(tLen==-1) { // handling zero terminated strings 8224 tChar=target[tIndex++]; 8225 if(tChar==0) { 8226 if(endOfSource) { // this is different than source loop, 8227 // as we already know that source loop is done here, 8228 // so we can either finish the primary loop if both 8229 // strings are done or anounce the result if only 8230 // target is done. Same below. 
8231 goto endOfPrimLoop; 8232 } else { 8233 return UCOL_GREATER; 8234 } 8235 } 8236 } else { // handling strings with known length 8237 if(tIndex==tLen) { 8238 if(endOfSource) { 8239 goto endOfPrimLoop; 8240 } else { 8241 return UCOL_GREATER; 8242 } 8243 } 8244 tChar=target[tIndex++]; 8245 } 8246 if(tChar&0xFF00) { // if we encounter non-latin-1, we bail out (sChar > 0xFF, but this is faster on win32) 8247 //fprintf(stderr, "R"); 8248 goto returnRegular; 8249 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8250 } 8251 tOrder = elements[tChar]; 8252 if(tOrder >= UCOL_NOT_FOUND) { 8253 // Handling specials, see the comments for source 8254 if(getCETag(tOrder) == CONTRACTION_TAG) { 8255 tOrder = ucol_getLatinOneContraction(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen); 8256 haveContractions = TRUE; 8257 } 8258 if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) { 8259 //fprintf(stderr, "S"); 8260 goto returnRegular; 8261 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8262 } 8263 } 8264 } 8265 if(endOfSource) { // source is finished, but target is not, say the result. 
8266 return UCOL_LESS; 8267 } 8268 8269 if(sOrder == tOrder) { // if we have same CEs, we continue the loop 8270 sOrder = 0; tOrder = 0; 8271 continue; 8272 } else { 8273 // compare current top bytes 8274 if(((sOrder^tOrder)&0xFF000000)!=0) { 8275 // top bytes differ, return difference 8276 if(sOrder < tOrder) { 8277 return UCOL_LESS; 8278 } else if(sOrder > tOrder) { 8279 return UCOL_GREATER; 8280 } 8281 // instead of return (int32_t)(sOrder>>24)-(int32_t)(tOrder>>24); 8282 // since we must return enum value 8283 } 8284 8285 // top bytes match, continue with following bytes 8286 sOrder<<=8; 8287 tOrder<<=8; 8288 } 8289 } 8290 8291endOfPrimLoop: 8292 // after primary loop, we definitely know the sizes of strings, 8293 // so we set it and use simpler loop for secondaries and tertiaries 8294 sLen = sIndex; tLen = tIndex; 8295 if(strength >= UCOL_SECONDARY) { 8296 // adjust the table beggining 8297 elements += coll->latinOneTableLen; 8298 endOfSource = FALSE; 8299 8300 if(coll->frenchCollation == UCOL_OFF) { // non French 8301 // This loop is a simplified copy of primary loop 8302 // at this point we know that whole strings are latin-1, so we don't 8303 // check for that. We also know that we only have contractions as 8304 // specials. 
8305 sIndex = 0; tIndex = 0; 8306 for(;;) { 8307 while(sOrder==0) { 8308 if(sIndex==sLen) { 8309 endOfSource = TRUE; 8310 break; 8311 } 8312 sChar=source[sIndex++]; 8313 sOrder = elements[sChar]; 8314 if(sOrder > UCOL_NOT_FOUND) { 8315 sOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen); 8316 } 8317 } 8318 8319 while(tOrder==0) { 8320 if(tIndex==tLen) { 8321 if(endOfSource) { 8322 goto endOfSecLoop; 8323 } else { 8324 return UCOL_GREATER; 8325 } 8326 } 8327 tChar=target[tIndex++]; 8328 tOrder = elements[tChar]; 8329 if(tOrder > UCOL_NOT_FOUND) { 8330 tOrder = ucol_getLatinOneContraction(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen); 8331 } 8332 } 8333 if(endOfSource) { 8334 return UCOL_LESS; 8335 } 8336 8337 if(sOrder == tOrder) { 8338 sOrder = 0; tOrder = 0; 8339 continue; 8340 } else { 8341 // see primary loop for comments on this 8342 if(((sOrder^tOrder)&0xFF000000)!=0) { 8343 if(sOrder < tOrder) { 8344 return UCOL_LESS; 8345 } else if(sOrder > tOrder) { 8346 return UCOL_GREATER; 8347 } 8348 } 8349 sOrder<<=8; 8350 tOrder<<=8; 8351 } 8352 } 8353 } else { // French 8354 if(haveContractions) { // if we have contractions, we have to bail out 8355 // since we don't really know how to handle them here 8356 goto returnRegular; 8357 //return ucol_strcollRegular(coll, source, sLen, target, tLen, status); 8358 } 8359 // For French, we go backwards 8360 sIndex = sLen; tIndex = tLen; 8361 for(;;) { 8362 while(sOrder==0) { 8363 if(sIndex==0) { 8364 endOfSource = TRUE; 8365 break; 8366 } 8367 sChar=source[--sIndex]; 8368 sOrder = elements[sChar]; 8369 // don't even look for contractions 8370 } 8371 8372 while(tOrder==0) { 8373 if(tIndex==0) { 8374 if(endOfSource) { 8375 goto endOfSecLoop; 8376 } else { 8377 return UCOL_GREATER; 8378 } 8379 } 8380 tChar=target[--tIndex]; 8381 tOrder = elements[tChar]; 8382 // don't even look for contractions 8383 } 8384 if(endOfSource) { 8385 return UCOL_LESS; 8386 } 8387 8388 if(sOrder == tOrder) 
{ 8389 sOrder = 0; tOrder = 0; 8390 continue; 8391 } else { 8392 // see the primary loop for comments 8393 if(((sOrder^tOrder)&0xFF000000)!=0) { 8394 if(sOrder < tOrder) { 8395 return UCOL_LESS; 8396 } else if(sOrder > tOrder) { 8397 return UCOL_GREATER; 8398 } 8399 } 8400 sOrder<<=8; 8401 tOrder<<=8; 8402 } 8403 } 8404 } 8405 } 8406 8407endOfSecLoop: 8408 if(strength >= UCOL_TERTIARY) { 8409 // tertiary loop is the same as secondary (except no French) 8410 elements += coll->latinOneTableLen; 8411 sIndex = 0; tIndex = 0; 8412 endOfSource = FALSE; 8413 for(;;) { 8414 while(sOrder==0) { 8415 if(sIndex==sLen) { 8416 endOfSource = TRUE; 8417 break; 8418 } 8419 sChar=source[sIndex++]; 8420 sOrder = elements[sChar]; 8421 if(sOrder > UCOL_NOT_FOUND) { 8422 sOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen); 8423 } 8424 } 8425 while(tOrder==0) { 8426 if(tIndex==tLen) { 8427 if(endOfSource) { 8428 return UCOL_EQUAL; // if both strings are at the end, they are equal 8429 } else { 8430 return UCOL_GREATER; 8431 } 8432 } 8433 tChar=target[tIndex++]; 8434 tOrder = elements[tChar]; 8435 if(tOrder > UCOL_NOT_FOUND) { 8436 tOrder = ucol_getLatinOneContraction(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen); 8437 } 8438 } 8439 if(endOfSource) { 8440 return UCOL_LESS; 8441 } 8442 if(sOrder == tOrder) { 8443 sOrder = 0; tOrder = 0; 8444 continue; 8445 } else { 8446 if(((sOrder^tOrder)&0xff000000)!=0) { 8447 if(sOrder < tOrder) { 8448 return UCOL_LESS; 8449 } else if(sOrder > tOrder) { 8450 return UCOL_GREATER; 8451 } 8452 } 8453 sOrder<<=8; 8454 tOrder<<=8; 8455 } 8456 } 8457 } 8458 return UCOL_EQUAL; 8459 8460returnRegular: 8461 // Preparing the context objects for iterating over strings 8462 collIterate sColl, tColl; 8463 8464 IInit_collIterate(coll, source, sLen, &sColl); 8465 IInit_collIterate(coll, target, tLen, &tColl); 8466 return ucol_strcollRegular(&sColl, &tColl, status); 8467} 8468 8469 8470U_CAPI UCollationResult U_EXPORT2 
8471ucol_strcollIter( const UCollator *coll, 8472 UCharIterator *sIter, 8473 UCharIterator *tIter, 8474 UErrorCode *status) 8475{ 8476 if(!status || U_FAILURE(*status)) { 8477 return UCOL_EQUAL; 8478 } 8479 8480 UTRACE_ENTRY(UTRACE_UCOL_STRCOLLITER); 8481 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, sIter=%p, tIter=%p", coll, sIter, tIter); 8482 8483 if (sIter == tIter) { 8484 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8485 return UCOL_EQUAL; 8486 } 8487 if(sIter == NULL || tIter == NULL || coll == NULL) { 8488 *status = U_ILLEGAL_ARGUMENT_ERROR; 8489 UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status) 8490 return UCOL_EQUAL; 8491 } 8492 8493 UCollationResult result = UCOL_EQUAL; 8494 8495 // Preparing the context objects for iterating over strings 8496 collIterate sColl, tColl; 8497 // The division for the array length may truncate the array size to 8498 // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high 8499 // for all platforms anyway. 8500 UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8501 UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)]; 8502 UNormIterator *sNormIter = NULL, *tNormIter = NULL; 8503 8504 IInit_collIterate(coll, NULL, -1, &sColl); 8505 sColl.iterator = sIter; 8506 sColl.flags |= UCOL_USE_ITERATOR; 8507 IInit_collIterate(coll, NULL, -1, &tColl); 8508 tColl.flags |= UCOL_USE_ITERATOR; 8509 tColl.iterator = tIter; 8510 8511 if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) { 8512 sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status); 8513 sColl.iterator = unorm_setIter(sNormIter, sIter, UNORM_FCD, status); 8514 sColl.flags &= ~UCOL_ITER_NORM; 8515 8516 tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status); 8517 tColl.iterator = unorm_setIter(tNormIter, tIter, UNORM_FCD, status); 8518 tColl.flags &= ~UCOL_ITER_NORM; 8519 } 8520 8521 UChar32 sChar = U_SENTINEL, tChar = U_SENTINEL; 8522 8523 while((sChar = 
sColl.iterator->next(sColl.iterator)) == 8524 (tChar = tColl.iterator->next(tColl.iterator))) { 8525 if(sChar == U_SENTINEL) { 8526 result = UCOL_EQUAL; 8527 goto end_compare; 8528 } 8529 } 8530 8531 if(sChar == U_SENTINEL) { 8532 tChar = tColl.iterator->previous(tColl.iterator); 8533 } 8534 8535 if(tChar == U_SENTINEL) { 8536 sChar = sColl.iterator->previous(sColl.iterator); 8537 } 8538 8539 sChar = sColl.iterator->previous(sColl.iterator); 8540 tChar = tColl.iterator->previous(tColl.iterator); 8541 8542 if (ucol_unsafeCP((UChar)sChar, coll) || ucol_unsafeCP((UChar)tChar, coll)) 8543 { 8544 // We are stopped in the middle of a contraction. 8545 // Scan backwards through the == part of the string looking for the start of the contraction. 8546 // It doesn't matter which string we scan, since they are the same in this region. 8547 do 8548 { 8549 sChar = sColl.iterator->previous(sColl.iterator); 8550 tChar = tColl.iterator->previous(tColl.iterator); 8551 } 8552 while (sChar != U_SENTINEL && ucol_unsafeCP((UChar)sChar, coll)); 8553 } 8554 8555 8556 if(U_SUCCESS(*status)) { 8557 result = ucol_strcollRegular(&sColl, &tColl, status); 8558 } 8559 8560end_compare: 8561 if(sNormIter || tNormIter) { 8562 unorm_closeIter(sNormIter); 8563 unorm_closeIter(tNormIter); 8564 } 8565 8566 UTRACE_EXIT_VALUE_STATUS(result, *status) 8567 return result; 8568} 8569 8570 8571/* */ 8572/* ucol_strcoll Main public API string comparison function */ 8573/* */ 8574U_CAPI UCollationResult U_EXPORT2 8575ucol_strcoll( const UCollator *coll, 8576 const UChar *source, 8577 int32_t sourceLength, 8578 const UChar *target, 8579 int32_t targetLength) 8580{ 8581 U_ALIGN_CODE(16); 8582 8583 UTRACE_ENTRY(UTRACE_UCOL_STRCOLL); 8584 if (UTRACE_LEVEL(UTRACE_VERBOSE)) { 8585 UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target); 8586 UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vh ", source, sourceLength); 8587 UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vh ", target, 
targetLength); 8588 } 8589 8590 if(source == NULL || target == NULL) { 8591 // do not crash, but return. Should have 8592 // status argument to return error. 8593 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8594 return UCOL_EQUAL; 8595 } 8596 8597 /* Quick check if source and target are same strings. */ 8598 /* They should either both be NULL terminated or the explicit length should be set on both. */ 8599 if (source==target && sourceLength==targetLength) { 8600 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8601 return UCOL_EQUAL; 8602 } 8603 8604 /* Scan the strings. Find: */ 8605 /* The length of any leading portion that is equal */ 8606 /* Whether they are exactly equal. (in which case we just return) */ 8607 const UChar *pSrc = source; 8608 const UChar *pTarg = target; 8609 int32_t equalLength; 8610 8611 if (sourceLength == -1 && targetLength == -1) { 8612 // Both strings are null terminated. 8613 // Scan through any leading equal portion. 8614 while (*pSrc == *pTarg && *pSrc != 0) { 8615 pSrc++; 8616 pTarg++; 8617 } 8618 if (*pSrc == 0 && *pTarg == 0) { 8619 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8620 return UCOL_EQUAL; 8621 } 8622 equalLength = pSrc - source; 8623 } 8624 else 8625 { 8626 // One or both strings has an explicit length. 8627 const UChar *pSrcEnd = source + sourceLength; 8628 const UChar *pTargEnd = target + targetLength; 8629 8630 // Scan while the strings are bitwise ==, or until one is exhausted. 8631 for (;;) { 8632 if (pSrc == pSrcEnd || pTarg == pTargEnd) { 8633 break; 8634 } 8635 if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) { 8636 break; 8637 } 8638 if (*pSrc != *pTarg) { 8639 break; 8640 } 8641 pSrc++; 8642 pTarg++; 8643 } 8644 equalLength = pSrc - source; 8645 8646 // If we made it all the way through both strings, we are done. They are == 8647 if ((pSrc ==pSrcEnd || (pSrcEnd <pSrc && *pSrc==0)) && /* At end of src string, however it was specified. 
*/ 8648 (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0))) /* and also at end of dest string */ 8649 { 8650 UTRACE_EXIT_VALUE(UCOL_EQUAL); 8651 return UCOL_EQUAL; 8652 } 8653 } 8654 if (equalLength > 0) { 8655 /* There is an identical portion at the beginning of the two strings. */ 8656 /* If the identical portion ends within a contraction or a comibining */ 8657 /* character sequence, back up to the start of that sequence. */ 8658 8659 // These values should already be set by the code above. 8660 //pSrc = source + equalLength; /* point to the first differing chars */ 8661 //pTarg = target + equalLength; 8662 if (pSrc != source+sourceLength && ucol_unsafeCP(*pSrc, coll) || 8663 pTarg != target+targetLength && ucol_unsafeCP(*pTarg, coll)) 8664 { 8665 // We are stopped in the middle of a contraction. 8666 // Scan backwards through the == part of the string looking for the start of the contraction. 8667 // It doesn't matter which string we scan, since they are the same in this region. 8668 do 8669 { 8670 equalLength--; 8671 pSrc--; 8672 } 8673 while (equalLength>0 && ucol_unsafeCP(*pSrc, coll)); 8674 } 8675 8676 source += equalLength; 8677 target += equalLength; 8678 if (sourceLength > 0) { 8679 sourceLength -= equalLength; 8680 } 8681 if (targetLength > 0) { 8682 targetLength -= equalLength; 8683 } 8684 } 8685 8686 UErrorCode status = U_ZERO_ERROR; 8687 UCollationResult returnVal; 8688 if(!coll->latinOneUse || (sourceLength > 0 && *source&0xff00) || (targetLength > 0 && *target&0xff00)) { 8689 collIterate sColl, tColl; 8690 // Preparing the context objects for iterating over strings 8691 IInit_collIterate(coll, source, sourceLength, &sColl); 8692 IInit_collIterate(coll, target, targetLength, &tColl); 8693 returnVal = ucol_strcollRegular(&sColl, &tColl, &status); 8694 } else { 8695 returnVal = ucol_strcollUseLatin1(coll, source, sourceLength, target, targetLength, &status); 8696 } 8697 UTRACE_EXIT_VALUE(returnVal); 8698 return returnVal; 8699} 8700 8701/* convenience 
function for comparing strings */ 8702U_CAPI UBool U_EXPORT2 8703ucol_greater( const UCollator *coll, 8704 const UChar *source, 8705 int32_t sourceLength, 8706 const UChar *target, 8707 int32_t targetLength) 8708{ 8709 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8710 == UCOL_GREATER); 8711} 8712 8713/* convenience function for comparing strings */ 8714U_CAPI UBool U_EXPORT2 8715ucol_greaterOrEqual( const UCollator *coll, 8716 const UChar *source, 8717 int32_t sourceLength, 8718 const UChar *target, 8719 int32_t targetLength) 8720{ 8721 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8722 != UCOL_LESS); 8723} 8724 8725/* convenience function for comparing strings */ 8726U_CAPI UBool U_EXPORT2 8727ucol_equal( const UCollator *coll, 8728 const UChar *source, 8729 int32_t sourceLength, 8730 const UChar *target, 8731 int32_t targetLength) 8732{ 8733 return (ucol_strcoll(coll, source, sourceLength, target, targetLength) 8734 == UCOL_EQUAL); 8735} 8736 8737U_CAPI void U_EXPORT2 8738ucol_getUCAVersion(const UCollator* coll, UVersionInfo info) { 8739 if(coll && coll->UCA) { 8740 uprv_memcpy(info, coll->UCA->image->UCAVersion, sizeof(UVersionInfo)); 8741 } 8742} 8743 8744#endif /* #if !UCONFIG_NO_COLLATION */ 8745