1/* 2******************************************************************************* 3* 4* Copyright (C) 2001-2011, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: ucol_bld.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created 02/22/2001 14* created by: Vladimir Weinstein 15* 16* This module builds a collator based on the rule set. 17* 18*/ 19 20#include "unicode/utypes.h" 21 22#if !UCONFIG_NO_COLLATION 23 24#include "unicode/ucoleitr.h" 25#include "unicode/udata.h" 26#include "unicode/uchar.h" 27#include "unicode/uniset.h" 28#include "unicode/uscript.h" 29#include "unicode/ustring.h" 30#include "normalizer2impl.h" 31#include "ucol_bld.h" 32#include "ucol_elm.h" 33#include "ucol_cnt.h" 34#include "ucln_in.h" 35#include "umutex.h" 36#include "cmemory.h" 37#include "cstring.h" 38 39static const InverseUCATableHeader* _staticInvUCA = NULL; 40static UDataMemory* invUCA_DATA_MEM = NULL; 41 42U_CDECL_BEGIN 43static UBool U_CALLCONV 44isAcceptableInvUCA(void * /*context*/, 45 const char * /*type*/, const char * /*name*/, 46 const UDataInfo *pInfo) 47{ 48 /* context, type & name are intentionally not used */ 49 if( pInfo->size>=20 && 50 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 51 pInfo->charsetFamily==U_CHARSET_FAMILY && 52 pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 && /* dataFormat="InvC" */ 53 pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 && 54 pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 && 55 pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 && 56 pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 && 57 pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&& 58 //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 && 59 //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 && 60 //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 && 61 ) 62 { 63 UVersionInfo UCDVersion; 64 u_getUnicodeVersion(UCDVersion); 65 return (pInfo->dataVersion[0]==UCDVersion[0] && 66 pInfo->dataVersion[1]==UCDVersion[1]); 67 //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] && 68 //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] && 69 //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) { 70 } else { 71 return FALSE; 72 } 73} 74U_CDECL_END 75 76/* 77* Takes two CEs (lead and continuation) and 78* compares them as CEs should be compared: 79* primary vs. primary, secondary vs. secondary 80* tertiary vs. tertiary 81*/ 82static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) { 83 uint32_t s1 = source0, s2, t1 = target0, t2; 84 if(isContinuation(source1)) { 85 s2 = source1; 86 } else { 87 s2 = 0; 88 } 89 if(isContinuation(target1)) { 90 t2 = target1; 91 } else { 92 t2 = 0; 93 } 94 95 uint32_t s = 0, t = 0; 96 if(s1 == t1 && s2 == t2) { 97 return 0; 98 } 99 s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16); 100 t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16); 101 if(s < t) { 102 return -1; 103 } else if(s > t) { 104 return 1; 105 } else { 106 s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8; 107 t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8; 108 if(s < t) { 109 return -1; 110 } else if(s > t) { 111 return 1; 112 } else { 113 s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF); 114 t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF); 115 if(s < t) { 116 return -1; 117 } else { 118 return 1; 119 } 120 } 121 } 122} 123 124static 125int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) { 126 uint32_t bottom = 0, top = src->invUCA->tableSize; 127 uint32_t i = 0; 128 uint32_t first = 0, second = 0; 129 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 130 int32_t res = 0; 131 132 while(bottom < top-1) { 133 i = (top+bottom)/2; 134 first = *(CETable+3*i); 135 second = *(CETable+3*i+1); 136 res = compareCEs(first, second, CE, SecondCE); 137 if(res > 0) { 138 top = i; 139 } else if(res < 0) { 140 bottom = i; 141 } else { 142 break; 143 } 144 } 145 146 /* weiv: */ 147 /* in searching for elements, I have removed the failure */ 148 /* The reason for this is that the builder does not rely */ 149 /* on search mechanism telling it that it didn't find an */ 150 /* element. However, indirect positioning relies on being */ 151 /* able to find the elements around any CE, even if it is */ 152 /* not defined in the UCA. */ 153 return i; 154 /* 155 if((first == CE && second == SecondCE)) { 156 return i; 157 } else { 158 return -1; 159 } 160 */ 161} 162 163static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { 164 0xFFFF0000, 165 0xFFFFFF00, 166 0xFFFFFFFF 167}; 168 169U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src, 170 uint32_t CE, uint32_t contCE, 171 uint32_t *nextCE, uint32_t *nextContCE, 172 uint32_t strength) 173{ 174 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 175 int32_t iCE; 176 177 iCE = ucol_inv_findCE(src, CE, contCE); 178 179 if(iCE<0) { 180 *nextCE = UCOL_NOT_FOUND; 181 return -1; 182 } 183 184 CE &= strengthMask[strength]; 185 contCE &= strengthMask[strength]; 186 187 *nextCE = CE; 188 *nextContCE = contCE; 189 190 while((*nextCE & strengthMask[strength]) == CE 191 && (*nextContCE & strengthMask[strength]) == contCE) 192 { 193 *nextCE = (*(CETable+3*(++iCE))); 194 *nextContCE = (*(CETable+3*(iCE)+1)); 195 } 196 197 return iCE; 198} 199 200U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src, 201 uint32_t CE, uint32_t contCE, 202 uint32_t *prevCE, uint32_t *prevContCE, 203 uint32_t strength) 204{ 205 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 206 int32_t iCE; 207 208 iCE = ucol_inv_findCE(src, CE, contCE); 209 210 if(iCE<0) { 211 *prevCE = UCOL_NOT_FOUND; 212 return -1; 213 } 214 215 CE &= strengthMask[strength]; 216 contCE &= strengthMask[strength]; 217 218 *prevCE = CE; 219 *prevContCE = contCE; 220 221 while((*prevCE & strengthMask[strength]) == CE 222 && (*prevContCE & strengthMask[strength])== contCE 223 && iCE > 0) /* this condition should prevent falling off the edge of the world */ 224 { 225 /* here, we end up in a singularity - zero */ 226 *prevCE = (*(CETable+3*(--iCE))); 227 *prevContCE = (*(CETable+3*(iCE)+1)); 228 } 229 230 return iCE; 231} 232 233U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE, 234 uint32_t prevCE, uint32_t prevContCE) 235{ 236 if(prevCE == CE && prevContCE == contCE) { 237 return UCOL_IDENTICAL; 238 } 239 if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY]) 240 || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY])) 241 { 242 return UCOL_PRIMARY; 243 } 244 if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY]) 245 || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY])) 246 { 247 return UCOL_SECONDARY; 248 } 249 return UCOL_TERTIARY; 250} 251 252 253/*static 254inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 255 256 uint32_t CE = lh->baseCE; 257 uint32_t SecondCE = lh->baseContCE; 258 259 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 260 uint32_t previousCE, previousContCE; 261 int32_t iCE; 262 263 iCE = ucol_inv_findCE(src, CE, SecondCE); 264 265 if(iCE<0) { 266 return -1; 267 } 268 269 CE &= strengthMask[strength]; 270 SecondCE &= strengthMask[strength]; 271 272 previousCE = CE; 273 previousContCE = SecondCE; 274 275 while((previousCE & strengthMask[strength]) == CE && (previousContCE & strengthMask[strength])== SecondCE) { 276 previousCE = (*(CETable+3*(--iCE))); 277 previousContCE = (*(CETable+3*(iCE)+1)); 278 } 279 lh->previousCE = previousCE; 280 lh->previousContCE = previousContCE; 281 282 return iCE; 283}*/ 284 285static 286inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) { 287 uint32_t CE = lh->baseCE; 288 uint32_t SecondCE = lh->baseContCE; 289 290 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 291 uint32_t nextCE, nextContCE; 292 int32_t iCE; 293 294 iCE = ucol_inv_findCE(src, CE, SecondCE); 295 296 if(iCE<0) { 297 return -1; 298 } 299 300 CE &= strengthMask[strength]; 301 SecondCE &= strengthMask[strength]; 302 303 nextCE = CE; 304 nextContCE = SecondCE; 305 306 while((nextCE & strengthMask[strength]) == CE 307 && (nextContCE & strengthMask[strength]) == SecondCE) 308 { 309 nextCE = (*(CETable+3*(++iCE))); 310 nextContCE = (*(CETable+3*(iCE)+1)); 311 } 312 313 lh->nextCE = nextCE; 314 lh->nextContCE = nextContCE; 315 316 return iCE; 317} 318 319static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 320 /* reset all the gaps */ 321 int32_t i = 0; 322 uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table); 323 uint32_t st = 0; 324 uint32_t t1, t2; 325 int32_t pos; 326 327 UColToken *tok = lh->first; 328 uint32_t tokStrength = tok->strength; 329 330 for(i = 0; i<3; i++) { 331 lh->gapsHi[3*i] = 0; 332 lh->gapsHi[3*i+1] = 0; 333 lh->gapsHi[3*i+2] = 0; 334 lh->gapsLo[3*i] = 0; 335 lh->gapsLo[3*i+1] = 0; 336 lh->gapsLo[3*i+2] = 0; 337 lh->numStr[i] = 0; 338 lh->fStrToken[i] = NULL; 339 lh->lStrToken[i] = NULL; 340 lh->pos[i] = -1; 341 } 342 343 UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts); 344 345 if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */ 346 //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */ 347 lh->pos[0] = 0; 348 t1 = lh->baseCE; 349 t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION; 350 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 351 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 352 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 353 uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16); 354 primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1); 355 356 t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505; 357 t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER; 358 359 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 360 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 361 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 362 } else if(lh->indirect == TRUE && lh->nextCE != 0) { 363 //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) { 364 lh->pos[0] = 0; 365 t1 = lh->baseCE; 366 t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION; 367 lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 368 lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 369 lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 370 t1 = lh->nextCE; 371 t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION; 372 lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 373 lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 374 lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 375 } else { 376 for(;;) { 377 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 378 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) { 379 lh->fStrToken[tokStrength] = tok; 380 } else { /* The CE must be implicit, since it's not in the table */ 381 /* Error */ 382 *status = U_INTERNAL_PROGRAM_ERROR; 383 } 384 } 385 386 while(tok != NULL && tok->strength >= tokStrength) { 387 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) { 388 lh->lStrToken[tokStrength] = tok; 389 } 390 tok = tok->next; 391 } 392 if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) { 393 /* check if previous interval is the same and merge the intervals if it is so */ 394 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) { 395 lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1]; 396 lh->fStrToken[tokStrength+1] = NULL; 397 lh->lStrToken[tokStrength+1] = NULL; 398 lh->pos[tokStrength+1] = -1; 399 } 400 } 401 if(tok != NULL) { 402 tokStrength = tok->strength; 403 } else { 404 break; 405 } 406 } 407 for(st = 0; st < 3; st++) { 408 if((pos = lh->pos[st]) >= 0) { 409 t1 = *(CETable+3*(pos)); 410 t2 = *(CETable+3*(pos)+1); 411 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 412 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 413 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16; 414 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 415 //pos--; 416 //t1 = *(CETable+3*(pos)); 417 //t2 = *(CETable+3*(pos)+1); 418 t1 = lh->baseCE; 419 t2 = lh->baseContCE; 420 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16; 421 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8; 422 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16; 423 } 424 } 425 } 426} 427 428 429#define ucol_countBytes(value, noOfBytes) \ 430{ \ 431 uint32_t mask = 0xFFFFFFFF; \ 432 (noOfBytes) = 0; \ 433 while(mask != 0) { \ 434 if(((value) & mask) != 0) { \ 435 (noOfBytes)++; \ 436 } \ 437 mask >>= 8; \ 438 } \ 439} 440 441static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) { 442 if(U_SUCCESS(*status)) { 443 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 444 } 445 return g->current; 446} 447 448static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) { 449 /* TODO: rename to enum names */ 450 uint32_t high, low, count=1; 451 uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF; 452 453 if(strength == UCOL_SECONDARY) { 454 low = UCOL_COMMON_TOP2<<24; 455 high = 0xFFFFFFFF; 456 count = 0xFF - UCOL_COMMON_TOP2; 457 } else { 458 low = UCOL_BYTE_COMMON << 24; //0x05000000; 459 high = 0x40000000; 460 count = 0x40 - UCOL_BYTE_COMMON; 461 } 462 463 if(tok->next != NULL && tok->next->strength == strength) { 464 count = tok->next->toInsert; 465 } 466 467 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 468 g->current = UCOL_BYTE_COMMON<<24; 469 470 if(g->noOfRanges == 0) { 471 *status = U_INTERNAL_PROGRAM_ERROR; 472 } 473 return g->current; 474} 475 476static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) { 477 uint32_t strength = tok->strength; 478 uint32_t low = lows[fStrength*3+strength]; 479 uint32_t high = highs[fStrength*3+strength]; 480 uint32_t maxByte = 0; 481 if(strength == UCOL_TERTIARY) { 482 maxByte = 0x3F; 483 } else if(strength == UCOL_PRIMARY) { 484 maxByte = 0xFE; 485 } else { 486 maxByte = 0xFF; 487 } 488 489 uint32_t count = tok->toInsert; 490 491 if(low >= high && strength > UCOL_PRIMARY) { 492 int32_t s = strength; 493 for(;;) { 494 s--; 495 if(lows[fStrength*3+s] != highs[fStrength*3+s]) { 496 if(strength == UCOL_SECONDARY) { 497 if (low < UCOL_COMMON_TOP2<<24 ) { 498 // Override if low range is less than UCOL_COMMON_TOP2. 499 low = UCOL_COMMON_TOP2<<24; 500 } 501 high = 0xFFFFFFFF; 502 } else { 503 // Override if low range is less than UCOL_COMMON_BOT3. 504 if ( low < UCOL_COMMON_BOT3<<24 ) { 505 low = UCOL_COMMON_BOT3<<24; 506 } 507 high = 0x40000000; 508 } 509 break; 510 } 511 if(s<0) { 512 *status = U_INTERNAL_PROGRAM_ERROR; 513 return 0; 514 } 515 } 516 } 517 518 if(low < 0x02000000) { 519 // We must not use CE weight byte 02, so we set it as the minimum lower bound. 520 // See http://site.icu-project.org/design/collation/bytes 521 low = 0x02000000; 522 } 523 524 if(strength == UCOL_SECONDARY) { /* similar as simple */ 525 if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 526 low = UCOL_COMMON_TOP2<<24; 527 } 528 if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) { 529 high = UCOL_COMMON_TOP2<<24; 530 } 531 if(low < (UCOL_COMMON_BOT2<<24)) { 532 g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges); 533 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 534 //g->current = UCOL_COMMON_BOT2<<24; 535 return g->current; 536 } 537 } 538 539 g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges); 540 if(g->noOfRanges == 0) { 541 *status = U_INTERNAL_PROGRAM_ERROR; 542 } 543 g->current = ucol_nextWeight(g->ranges, &g->noOfRanges); 544 return g->current; 545} 546 547static 548uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 549 uint32_t i = 0; 550 UChar c; 551 552 if(U_FAILURE(*status)) { 553 return 0; 554 } 555 556 if(sourceLen > resLen) { 557 *status = U_MEMORY_ALLOCATION_ERROR; 558 return 0; 559 } 560 561 for(i = 0; i < sourceLen; i++) { 562 c = source[i]; 563 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 564 switch(c - 0x3000) { 565 case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E: 566 case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE: 567 c++; 568 break; 569 case 0xF5: 570 c = 0x30AB; 571 break; 572 case 0xF6: 573 c = 0x30B1; 574 break; 575 } 576 } 577 resBuf[i] = c; 578 } 579 return sourceLen; 580} 581 582static 583uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) { 584 uint32_t i = 0; 585 UChar c; 586 587 if(U_FAILURE(*status)) { 588 return 0; 589 } 590 591 if(sourceLen > resLen) { 592 *status = U_MEMORY_ALLOCATION_ERROR; 593 return 0; 594 } 595 596 for(i = 0; i < sourceLen; i++) { 597 c = source[i]; 598 if(0x3041 <= c && c <= 0x30FA) { /* Kana range */ 599 switch(c - 0x3000) { 600 case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F: 601 case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF: 602 c--; 603 break; 604 case 0xAB: 605 c = 0x30F5; 606 break; 607 case 0xB1: 608 c = 0x30F6; 609 break; 610 } 611 } 612 resBuf[i] = c; 613 } 614 return sourceLen; 615} 616 617U_NAMESPACE_BEGIN 618 619static 620uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) { 621 uint32_t i = 0; 622 UChar n[128]; 623 uint32_t nLen = 0; 624 uint32_t uCount = 0, lCount = 0; 625 626 collIterate s; 627 uint32_t order = 0; 628 629 if(U_FAILURE(*status)) { 630 return UCOL_LOWER_CASE; 631 } 632 633 nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status); 634 if(U_SUCCESS(*status)) { 635 for(i = 0; i < nLen; i++) { 636 uprv_init_collIterate(UCA, &n[i], 1, &s, status); 637 order = ucol_getNextCE(UCA, &s, status); 638 if(isContinuation(order)) { 639 *status = U_INTERNAL_PROGRAM_ERROR; 640 return UCOL_LOWER_CASE; 641 } 642 if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) { 643 uCount++; 644 } else { 645 if(u_islower(n[i])) { 646 lCount++; 647 } else if(U_SUCCESS(*status)) { 648 UChar sk[1], lk[1]; 649 u_toSmallKana(&n[i], 1, sk, 1, status); 650 u_toLargeKana(&n[i], 1, lk, 1, status); 651 if(sk[0] == n[i] && lk[0] != n[i]) { 652 lCount++; 653 } 654 } 655 } 656 } 657 } 658 659 if(uCount != 0 && lCount != 0) { 660 return UCOL_MIXED_CASE; 661 } else if(uCount != 0) { 662 return UCOL_UPPER_CASE; 663 } else { 664 return UCOL_LOWER_CASE; 665 } 666} 667 668 669U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) { 670 /* this one makes the table and stuff */ 671 uint32_t noOfBytes[3]; 672 uint32_t i; 673 674 for(i = 0; i<3; i++) { 675 ucol_countBytes(CEparts[i], noOfBytes[i]); 676 } 677 678 /* Here we have to pack CEs from parts */ 679 680 uint32_t CEi = 0; 681 uint32_t value = 0; 682 683 while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) { 684 if(CEi > 0) { 685 value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ 686 } else { 687 value = 0; 688 } 689 690 if(2*CEi<noOfBytes[0]) { 691 value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16; 692 } 693 if(CEi<noOfBytes[1]) { 694 value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8; 695 } 696 if(CEi<noOfBytes[2]) { 697 value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F); 698 } 699 tok->CEs[CEi] = value; 700 CEi++; 701 } 702 if(CEi == 0) { /* totally ignorable */ 703 tok->noOfCEs = 1; 704 tok->CEs[0] = 0; 705 } else { /* there is at least something */ 706 tok->noOfCEs = CEi; 707 } 708 709 710 // we want to set case bits here and now, not later. 711 // Case bits handling 712 if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables 713 tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field 714 int32_t cSize = (tok->source & 0xFF000000) >> 24; 715 UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source; 716 717 if(cSize > 1) { 718 // Do it manually 719 tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status); 720 } else { 721 // Copy it from the UCA 722 uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status); 723 tok->CEs[0] |= (caseCE & 0xC0); 724 } 725 } 726 727#if UCOL_DEBUG==2 728 fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2])); 729 for(i = 0; i<tok->noOfCEs; i++) { 730 fprintf(stderr, "%08X ", tok->CEs[i]); 731 } 732 fprintf(stderr, "\n"); 733#endif 734} 735 736U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) { 737 ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT]; 738 uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT]; 739 740 UColToken *tok = lh->last; 741 uint32_t t[UCOL_STRENGTH_LIMIT]; 742 743 uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t)); 744 745 tok->toInsert = 1; 746 t[tok->strength] = 1; 747 748 while(tok->previous != NULL) { 749 if(tok->previous->strength < tok->strength) { /* going up */ 750 t[tok->strength] = 0; 751 t[tok->previous->strength]++; 752 } else if(tok->previous->strength > tok->strength) { /* going down */ 753 t[tok->previous->strength] = 1; 754 } else { 755 t[tok->strength]++; 756 } 757 tok=tok->previous; 758 tok->toInsert = t[tok->strength]; 759 } 760 761 tok->toInsert = t[tok->strength]; 762 ucol_inv_getGapPositions(src, lh, status); 763 764#if UCOL_DEBUG 765 fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE); 766 int32_t j = 2; 767 for(j = 2; j >= 0; j--) { 768 fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]); 769 fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]); 770 } 771 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; 772 773 do { 774 fprintf(stderr,"%i", tok->strength); 775 tok = tok->next; 776 } while(tok != NULL); 777 fprintf(stderr, "\n"); 778 779 tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE]; 780 781 do { 782 fprintf(stderr,"%i", tok->toInsert); 783 tok = tok->next; 784 } while(tok != NULL); 785#endif 786 787 tok = lh->first; 788 uint32_t fStrength = UCOL_IDENTICAL; 789 uint32_t initStrength = UCOL_IDENTICAL; 790 791 792 CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16; 793 CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8; 794 CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16; 795 796 while (tok != NULL && U_SUCCESS(*status)) { 797 fStrength = tok->strength; 798 if(fStrength < initStrength) { 799 initStrength = fStrength; 800 if(lh->pos[fStrength] == -1) { 801 while(lh->pos[fStrength] == -1 && fStrength > 0) { 802 fStrength--; 803 } 804 if(lh->pos[fStrength] == -1) { 805 *status = U_INTERNAL_PROGRAM_ERROR; 806 return; 807 } 808 } 809 if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */ 810 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 811 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1]; 812 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */ 813 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 814 } else if(initStrength == UCOL_SECONDARY) { /* secondaries */ 815 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3]; 816 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/ 817 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 818 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 819 } else { /* primaries */ 820 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/ 821 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status); 822 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 823 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 824 } 825 } else { 826 if(tok->strength == UCOL_TERTIARY) { 827 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status); 828 } else if(tok->strength == UCOL_SECONDARY) { 829 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status); 830 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 831 } else if(tok->strength == UCOL_PRIMARY) { 832 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status); 833 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status); 834 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status); 835 } 836 } 837 ucol_doCE(src, CEparts, tok, status); 838 tok = tok->next; 839 } 840} 841 842U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) { 843 UCAElements el; 844 UColToken *tok = lh->first; 845 UColToken *expt = NULL; 846 uint32_t i = 0, j = 0; 847 UChar32 fcdHighStart; 848 const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status); 849 850 while(tok != NULL && U_SUCCESS(*status)) { 851 /* first, check if there are any expansions */ 852 /* if there are expansions, we need to do a little bit more processing */ 853 /* since parts of expansion can be tailored, while others are not */ 854 if(tok->expansion != 0) { 855 uint32_t len = tok->expansion >> 24; 856 uint32_t currentSequenceLen = len; 857 uint32_t expOffset = tok->expansion & 0x00FFFFFF; 858 //uint32_t exp = currentSequenceLen | expOffset; 859 UColToken exp; 860 exp.source = currentSequenceLen | expOffset; 861 exp.rulesToParseHdl = &(src->source); 862 863 while(len > 0) { 864 currentSequenceLen = len; 865 while(currentSequenceLen > 0) { 866 exp.source = (currentSequenceLen << 24) | expOffset; 867 if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */ 868 uint32_t noOfCEsToCopy = expt->noOfCEs; 869 for(j = 0; j<noOfCEsToCopy; j++) { 870 tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j]; 871 } 872 tok->noOfExpCEs += noOfCEsToCopy; 873 // Smart people never try to add codepoints and CEs. 874 // For some odd reason, it won't work. 875 expOffset += currentSequenceLen; //noOfCEsToCopy; 876 len -= currentSequenceLen; //noOfCEsToCopy; 877 break; 878 } else { 879 currentSequenceLen--; 880 } 881 } 882 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */ 883 /* will have to get one from UCA */ 884 /* first, get the UChars from the rules */ 885 /* then pick CEs out until there is no more and stuff them into expansion */ 886 collIterate s; 887 uint32_t order = 0; 888 uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status); 889 890 for(;;) { 891 order = ucol_getNextCE(src->UCA, &s, status); 892 if(order == UCOL_NO_MORE_CES) { 893 break; 894 } 895 tok->expCEs[tok->noOfExpCEs++] = order; 896 } 897 expOffset++; 898 len--; 899 } 900 } 901 } else { 902 tok->noOfExpCEs = 0; 903 } 904 905 /* set the ucaelement with obtained values */ 906 el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs; 907 /* copy CEs */ 908 for(i = 0; i<tok->noOfCEs; i++) { 909 el.CEs[i] = tok->CEs[i]; 910 } 911 for(i = 0; i<tok->noOfExpCEs; i++) { 912 el.CEs[i+tok->noOfCEs] = tok->expCEs[i]; 913 } 914 915 /* copy UChars */ 916 // We kept prefix and source kind of together, as it is a kind of a contraction. 917 // However, now we have to slice the prefix off the main thing - 918 el.prefix = el.prefixChars; 919 el.cPoints = el.uchars; 920 if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the 921 // addPrefix function in ucol_elm. The reason is that we need to add both composed AND 922 // decomposed elements to the unsaf table. 923 el.prefixSize = tok->prefix>>24; 924 uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar)); 925 926 el.cSize = (tok->source >> 24)-(tok->prefix>>24); 927 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar)); 928 } else { 929 el.prefixSize = 0; 930 *el.prefix = 0; 931 932 el.cSize = (tok->source >> 24); 933 uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar)); 934 } 935 if(src->UCA != NULL) { 936 for(i = 0; i<el.cSize; i++) { 937 if(UCOL_ISJAMO(el.cPoints[i])) { 938 t->image->jamoSpecial = TRUE; 939 } 940 } 941 if (!src->buildCCTabFlag && el.cSize > 0) { 942 // Check the trailing canonical combining class (tccc) of the last character. 943 const UChar *s = el.cPoints + el.cSize; 944 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s); 945 if ((fcd & 0xff) != 0) { 946 src->buildCCTabFlag = TRUE; 947 } 948 } 949 } 950 951 /* and then, add it */ 952#if UCOL_DEBUG==2 953 fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]); 954#endif 955 uprv_uca_addAnElement(t, &el, status); 956 957#if UCOL_DEBUG_DUPLICATES 958 if(*status != U_ZERO_ERROR) { 959 fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource); 960 *status = U_ZERO_ERROR; 961 } 962#endif 963 964 tok = tok->next; 965 } 966} 967 968U_CDECL_BEGIN 969static UBool U_CALLCONV 970_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) { 971 UErrorCode status = U_ZERO_ERROR; 972 tempUCATable *t = (tempUCATable *)context; 973 if(value == 0) { 974 while(start < limit) { 975 uint32_t CE = utrie_get32(t->mapping, start, NULL); 976 if(CE == UCOL_NOT_FOUND) { 977 UCAElements el; 978 el.isThai = FALSE; 979 el.prefixSize = 0; 980 el.prefixChars[0] = 0; 981 el.prefix = el.prefixChars; 982 el.cPoints = el.uchars; 983 984 el.cSize = 0; 985 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start); 986 987 el.noOfCEs = 1; 988 el.CEs[0] = 0; 989 uprv_uca_addAnElement(t, &el, &status); 990 991 } 992 start++; 993 } 994 } 995 if(U_FAILURE(status)) { 996 return FALSE; 997 } else { 998 return TRUE; 999 } 1000} 1001U_CDECL_END 1002 1003static void 1004ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t, 1005 UChar32 start, UChar32 end, 1006 UErrorCode *status) 1007{ 1008 //UChar decomp[256]; 1009 uint32_t CE = UCOL_NOT_FOUND; 1010 UChar32 u = 0; 1011 UCAElements el; 1012 el.isThai = FALSE; 1013 el.prefixSize = 0; 1014 el.prefixChars[0] = 0; 1015 collIterate colIt; 1016 1017 if(U_SUCCESS(*status)) { 1018 for(u = start; u<=end; u++) { 1019 if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND 1020 /* this test is for contractions that are missing the starting element. */ 1021 || ((isCntTableElement(CE)) && 1022 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND)) 1023 ) 1024 { 1025 el.cSize = 0; 1026 U16_APPEND_UNSAFE(el.uchars, el.cSize, u); 1027 //decomp[0] = (UChar)u; 1028 //el.uchars[0] = (UChar)u; 1029 el.cPoints = el.uchars; 1030 //el.cSize = 1; 1031 el.noOfCEs = 0; 1032 el.prefix = el.prefixChars; 1033 el.prefixSize = 0; 1034 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt); 1035 // We actually want to check whether this element is a special 1036 // If it is an implicit element (hangul, CJK - we want to copy the 1037 // special, not the resolved CEs) - for hangul, copying resolved 1038 // would just make things the same (there is an expansion and it 1039 // takes approximately the same amount of time to resolve as 1040 // falling back to the UCA). 1041 /* 1042 UTRIE_GET32(src->UCA->mapping, u, CE); 1043 tag = getCETag(CE); 1044 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG 1045 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG 1046 || tag == LEAD_SURROGATE_TAG) { 1047 el.CEs[el.noOfCEs++] = CE; 1048 } else { 1049 */ 1050 // It turns out that it does not make sense to keep implicits 1051 // unresolved. The cost of resolving them is big enough so that 1052 // it doesn't make any difference whether we have to go to the UCA 1053 // or not. 1054 { 1055 uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status); 1056 while(CE != UCOL_NO_MORE_CES) { 1057 CE = ucol_getNextCE(src->UCA, &colIt, status); 1058 if(CE != UCOL_NO_MORE_CES) { 1059 el.CEs[el.noOfCEs++] = CE; 1060 } 1061 } 1062 } 1063 uprv_uca_addAnElement(t, &el, status); 1064 } 1065 } 1066 } 1067} 1068 1069U_NAMESPACE_END 1070 1071U_CFUNC UCATableHeader * 1072ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) { 1073 U_NAMESPACE_USE 1074 1075 uint32_t i = 0; 1076 if(U_FAILURE(*status)) { 1077 return NULL; 1078 } 1079 /* 1080 2. Eliminate the negative lists by doing the following for each non-null negative list: 1081 o if previousCE(baseCE, strongestN) != some ListHeader X's baseCE, 1082 create new ListHeader X 1083 o reverse the list, add to the end of X's positive list. Reset the strength of the 1084 first item you add, based on the stronger strength levels of the two lists. 1085 */ 1086 /* 1087 3. For each ListHeader with a non-null positive list: 1088 */ 1089 /* 1090 o Find all character strings with CEs between the baseCE and the 1091 next/previous CE, at the strength of the first token. Add these to the 1092 tailoring. 1093 ? That is, if UCA has ... x <<< X << x' <<< X' < y ..., and the 1094 tailoring has & x < z... 1095 ? Then we change the tailoring to & x <<< X << x' <<< X' < z ... 1096 */ 1097 /* It is possible that this part should be done even while constructing list */ 1098 /* The problem is that it is unknown what is going to be the strongest weight */ 1099 /* So we might as well do it here */ 1100 1101 /* 1102 o Allocate CEs for each token in the list, based on the total number N of the 1103 largest level difference, and the gap G between baseCE and nextCE at that 1104 level. The relation * between the last item and nextCE is the same as the 1105 strongest strength. 1106 o Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1) 1107 ? There are 3 primary items: a, d, e. Fit them into the primary gap. 1108 Then fit b and c into the secondary gap between a and d, then fit q 1109 into the tertiary gap between b and c. 1110 1111 o Example: baseCE << b <<< q << c * nextCE(X,2) 1112 ? There are 2 secondary items: b, c. Fit them into the secondary gap. 1113 Then fit q into the tertiary gap between b and c. 1114 o When incrementing primary values, we will not cross high byte 1115 boundaries except where there is only a single-byte primary. That is to 1116 ensure that the script reordering will continue to work. 1117 */ 1118 UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader)); 1119 /* test for NULL */ 1120 if (image == NULL) { 1121 *status = U_MEMORY_ALLOCATION_ERROR; 1122 return NULL; 1123 } 1124 uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader)); 1125 1126 for(i = 0; i<src->resultLen; i++) { 1127 /* now we need to generate the CEs */ 1128 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1129 /* According to strength */ 1130 if(U_SUCCESS(*status)) { 1131 if(src->lh[i].first) { // if there are any elements 1132 // due to the way parser works, subsequent tailorings 1133 // may remove all the elements from a sequence, therefore 1134 // leaving an empty tailoring sequence. 1135 ucol_initBuffers(src, &src->lh[i], status); 1136 } 1137 } 1138 if(U_FAILURE(*status)) { 1139 uprv_free(image); 1140 return NULL; 1141 } 1142 } 1143 1144 if(src->varTop != NULL) { /* stuff the variable top value */ 1145 src->opts->variableTopValue = (*(src->varTop->CEs))>>16; 1146 /* remove it from the list */ 1147 if(src->varTop->listHeader->first == src->varTop) { /* first in list */ 1148 src->varTop->listHeader->first = src->varTop->next; 1149 } 1150 if(src->varTop->listHeader->last == src->varTop) { /* first in list */ 1151 src->varTop->listHeader->last = src->varTop->previous; 1152 } 1153 if(src->varTop->next != NULL) { 1154 src->varTop->next->previous = src->varTop->previous; 1155 } 1156 if(src->varTop->previous != NULL) { 1157 src->varTop->previous->next = src->varTop->next; 1158 } 1159 } 1160 1161 1162 tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status); 1163 if(U_FAILURE(*status)) { 1164 uprv_free(image); 1165 return NULL; 1166 } 1167 1168 1169 /* After this, we have assigned CE values to all regular CEs */ 1170 /* now we will go through list once more and resolve expansions, */ 1171 /* make UCAElements structs and add them to table */ 1172 for(i = 0; i<src->resultLen; i++) { 1173 /* now we need to generate the CEs */ 1174 /* We stuff the initial value in the buffers, and increase the appropriate buffer */ 1175 /* According to strength */ 1176 if(U_SUCCESS(*status)) { 1177 ucol_createElements(src, t, &src->lh[i], status); 1178 } 1179 } 1180 1181 UCAElements el; 1182 el.isThai = FALSE; 1183 el.prefixSize = 0; 1184 el.prefixChars[0] = 0; 1185 1186 /* add latin-1 stuff */ 1187 ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status); 1188 1189 /* add stuff for copying */ 1190 if(src->copySet != NULL) { 1191 int32_t i = 0; 1192 UnicodeSet *set = (UnicodeSet *)src->copySet; 1193 for(i = 0; i < set->getRangeCount(); i++) { 1194 ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status); 1195 } 1196 } 1197 1198 if(U_SUCCESS(*status)) { 1199 /* copy contractions from the UCA - this is felt mostly for cyrillic*/ 1200 1201 uint32_t tailoredCE = UCOL_NOT_FOUND; 1202 //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants)); 1203 UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); 1204 UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); 1205 // Check for null pointer 1206 if (ucaEl == NULL) { 1207 *status = U_MEMORY_ALLOCATION_ERROR; 1208 return NULL; 1209 } 1210 while(*conts != 0) { 1211 /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/ 1212 tailoredCE = utrie_get32(t->mapping, *conts, NULL); 1213 if(tailoredCE != UCOL_NOT_FOUND) { 1214 UBool needToAdd = TRUE; 1215 if(isCntTableElement(tailoredCE)) { 1216 if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) { 1217 needToAdd = FALSE; 1218 } 1219 } 1220 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) { 1221 UCAElements elm; 1222 elm.cPoints = el.uchars; 1223 elm.noOfCEs = 0; 1224 elm.uchars[0] = *conts; 1225 elm.uchars[1] = 0; 1226 elm.cSize = 1; 1227 elm.prefixChars[0] = *(conts+2); 1228 elm.isThai = FALSE; 1229 elm.prefix = elm.prefixChars; 1230 elm.prefixSize = 1; 1231 UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm); 1232 if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) { 1233 needToAdd = TRUE; 1234 } 1235 } 1236 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) { 1237 needToAdd = FALSE; 1238 } 1239 1240 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored. 1241 if (*(conts+1) != 0) { // contractions 1242 el.prefix = el.prefixChars; 1243 el.prefixSize = 0; 1244 el.cPoints = el.uchars; 1245 el.noOfCEs = 0; 1246 el.uchars[0] = *conts; 1247 el.uchars[1] = *(conts+1); 1248 if(*(conts+2)!=0) { 1249 el.uchars[2] = *(conts+2); 1250 el.cSize = 3; 1251 } else { 1252 el.cSize = 2; 1253 } 1254 ucol_setText(ucaEl, el.uchars, el.cSize, status); 1255 } 1256 else { // pre-context character 1257 UChar str[4] = { 0 }; 1258 int32_t len=0; 1259 int32_t preKeyLen=0; 1260 1261 el.cPoints = el.uchars; 1262 el.noOfCEs = 0; 1263 el.uchars[0] = *conts; 1264 el.uchars[1] = 0; 1265 el.cSize = 1; 1266 el.prefixChars[0] = *(conts+2); 1267 el.prefix = el.prefixChars; 1268 el.prefixSize = 1; 1269 if (el.prefixChars[0]!=0) { 1270 // get CE of prefix character first 1271 str[0]=el.prefixChars[0]; 1272 str[1]=0; 1273 ucol_setText(ucaEl, str, 1, status); 1274 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) 1275 != UCOL_NULLORDER) { 1276 preKeyLen++; // count number of keys for prefix character 1277 } 1278 str[len++] = el.prefixChars[0]; 1279 } 1280 1281 str[len++] = el.uchars[0]; 1282 str[len]=0; 1283 ucol_setText(ucaEl, str, len, status); 1284 // Skip the keys for prefix character, then copy the rest to el. 1285 while ((preKeyLen-->0) && 1286 (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1287 continue; 1288 } 1289 1290 } 1291 while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) { 1292 el.noOfCEs++; 1293 } 1294 uprv_uca_addAnElement(t, &el, status); 1295 } 1296 1297 } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) { 1298 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status); 1299 } 1300 conts+=3; 1301 } 1302 ucol_closeElements(ucaEl); 1303 } 1304 1305 // Add completely ignorable elements 1306 utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t); 1307 1308 // add tailoring characters related canonical closures 1309 uprv_uca_canonicalClosure(t, src, NULL, status); 1310 1311 /* still need to produce compatibility closure */ 1312 1313 UCATableHeader *myData = uprv_uca_assembleTable(t, status); 1314 1315 uprv_uca_closeTempTable(t); 1316 uprv_free(image); 1317 1318 return myData; 1319} 1320 1321U_CDECL_BEGIN 1322static UBool U_CALLCONV 1323ucol_bld_cleanup(void) 1324{ 1325 udata_close(invUCA_DATA_MEM); 1326 invUCA_DATA_MEM = NULL; 1327 _staticInvUCA = NULL; 1328 return TRUE; 1329} 1330U_CDECL_END 1331 1332U_CAPI const InverseUCATableHeader * U_EXPORT2 1333ucol_initInverseUCA(UErrorCode *status) 1334{ 1335 if(U_FAILURE(*status)) return NULL; 1336 1337 UBool needsInit; 1338 UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit); 1339 1340 if(needsInit) { 1341 InverseUCATableHeader *newInvUCA = NULL; 1342 UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status); 1343 1344 if(U_FAILURE(*status)) { 1345 if (result) { 1346 udata_close(result); 1347 } 1348 // This is not needed, as we are talking about 1349 // memory we got from UData 1350 //uprv_free(newInvUCA); 1351 } 1352 1353 if(result != NULL) { /* It looks like sometimes we can fail to find the data file */ 1354 newInvUCA = (InverseUCATableHeader *)udata_getMemory(result); 1355 UCollator *UCA = ucol_initUCA(status); 1356 // UCA versions of UCA and inverse UCA should match 1357 if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) { 1358 *status = U_INVALID_FORMAT_ERROR; 1359 udata_close(result); 1360 return NULL; 1361 } 1362 1363 umtx_lock(NULL); 1364 if(_staticInvUCA == NULL) { 1365 invUCA_DATA_MEM = result; 1366 _staticInvUCA = newInvUCA; 1367 result = NULL; 1368 newInvUCA = NULL; 1369 } 1370 umtx_unlock(NULL); 1371 1372 if(newInvUCA != NULL) { 1373 udata_close(result); 1374 // This is not needed, as we are talking about 1375 // memory we got from UData 1376 //uprv_free(newInvUCA); 1377 } 1378 else { 1379 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup); 1380 } 1381 } 1382 } 1383 return _staticInvUCA; 1384} 1385 1386/* This is the data that is used for non-script reordering codes. These _must_ be kept 1387 * in order that they are to be applied as defaults and in synch with the UColReorderCode enum. 1388 */ 1389static const char* ReorderingTokenNames[] = { 1390 "SPACE", 1391 "PUNCT", 1392 "SYMBOL", 1393 "CURRENCY", 1394 "DIGIT", 1395 NULL 1396}; 1397 1398static void toUpper(const char* src, char* dst, uint32_t length) { 1399 for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) { 1400 *dst = toupper(*src); 1401 } 1402 *dst = '\0'; 1403} 1404 1405U_INTERNAL int32_t U_EXPORT2 1406ucol_findReorderingEntry(const char* name) { 1407 char buffer[32]; 1408 toUpper(name, buffer, 32); 1409 for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) { 1410 if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) { 1411 return entry + UCOL_REORDER_CODE_FIRST; 1412 } 1413 } 1414 return USCRIPT_INVALID_CODE; 1415} 1416 1417#endif /* #if !UCONFIG_NO_COLLATION */ 1418