1/* 2******************************************************************************* 3* Copyright (C) 2004-2009, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* file name: ucol_sit.cpp 7* encoding: US-ASCII 8* tab size: 8 (not used) 9* indentation:4 10* 11* Modification history 12* Date Name Comments 13* 03/12/2004 weiv Creation 14*/ 15 16#include "unicode/ustring.h" 17#include "unicode/udata.h" 18 19#include "utracimp.h" 20#include "ucol_imp.h" 21#include "ucol_tok.h" 22#include "unormimp.h" 23#include "cmemory.h" 24#include "cstring.h" 25#include "uresimp.h" 26 27#if !UCONFIG_NO_COLLATION 28 29enum OptionsList { 30 UCOL_SIT_LANGUAGE = 0, 31 UCOL_SIT_SCRIPT, 32 UCOL_SIT_REGION, 33 UCOL_SIT_VARIANT, 34 UCOL_SIT_KEYWORD, 35 UCOL_SIT_BCP47, 36 UCOL_SIT_STRENGTH, 37 UCOL_SIT_CASE_LEVEL, 38 UCOL_SIT_CASE_FIRST, 39 UCOL_SIT_NUMERIC_COLLATION, 40 UCOL_SIT_ALTERNATE_HANDLING, 41 UCOL_SIT_NORMALIZATION_MODE, 42 UCOL_SIT_FRENCH_COLLATION, 43 UCOL_SIT_HIRAGANA_QUATERNARY, 44 UCOL_SIT_VARIABLE_TOP, 45 UCOL_SIT_VARIABLE_TOP_VALUE, 46 UCOL_SIT_ITEMS_COUNT 47}; 48 49/* option starters chars. */ 50static const char alternateHArg = 'A'; 51static const char variableTopValArg = 'B'; 52static const char caseFirstArg = 'C'; 53static const char numericCollArg = 'D'; 54static const char caseLevelArg = 'E'; 55static const char frenchCollArg = 'F'; 56static const char hiraganaQArg = 'H'; 57static const char keywordArg = 'K'; 58static const char languageArg = 'L'; 59static const char normArg = 'N'; 60static const char regionArg = 'R'; 61static const char strengthArg = 'S'; 62static const char variableTopArg = 'T'; 63static const char variantArg = 'V'; 64static const char RFC3066Arg = 'X'; 65static const char scriptArg = 'Z'; 66 67static const char collationKeyword[] = "@collation="; 68 69static const int32_t locElementCount = 5; 70static const int32_t locElementCapacity = 32; 71static const int32_t loc3066Capacity = 256; 72static const int32_t internalBufferSize = 512; 73 74/* structure containing specification of a collator. Initialized 75 * from a short string. Also used to construct a short string from a 76 * collator instance 77 */ 78struct CollatorSpec { 79 char locElements[locElementCount][locElementCapacity]; 80 char locale[loc3066Capacity]; 81 UColAttributeValue options[UCOL_ATTRIBUTE_COUNT]; 82 uint32_t variableTopValue; 83 UChar variableTopString[locElementCapacity]; 84 int32_t variableTopStringLen; 85 UBool variableTopSet; 86 struct { 87 const char *start; 88 int32_t len; 89 } entries[UCOL_SIT_ITEMS_COUNT]; 90}; 91 92 93/* structure for converting between character attribute 94 * representation and real collation attribute value. 95 */ 96struct AttributeConversion { 97 char letter; 98 UColAttributeValue value; 99}; 100 101static const AttributeConversion conversions[12] = { 102 { '1', UCOL_PRIMARY }, 103 { '2', UCOL_SECONDARY }, 104 { '3', UCOL_TERTIARY }, 105 { '4', UCOL_QUATERNARY }, 106 { 'D', UCOL_DEFAULT }, 107 { 'I', UCOL_IDENTICAL }, 108 { 'L', UCOL_LOWER_FIRST }, 109 { 'N', UCOL_NON_IGNORABLE }, 110 { 'O', UCOL_ON }, 111 { 'S', UCOL_SHIFTED }, 112 { 'U', UCOL_UPPER_FIRST }, 113 { 'X', UCOL_OFF } 114}; 115 116 117static char 118ucol_sit_attributeValueToLetter(UColAttributeValue value, UErrorCode *status) { 119 uint32_t i = 0; 120 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { 121 if(conversions[i].value == value) { 122 return conversions[i].letter; 123 } 124 } 125 *status = U_ILLEGAL_ARGUMENT_ERROR; 126 return 0; 127} 128 129static UColAttributeValue 130ucol_sit_letterToAttributeValue(char letter, UErrorCode *status) { 131 uint32_t i = 0; 132 for(i = 0; i < sizeof(conversions)/sizeof(conversions[0]); i++) { 133 if(conversions[i].letter == letter) { 134 return conversions[i].value; 135 } 136 } 137 *status = U_ILLEGAL_ARGUMENT_ERROR; 138 return UCOL_DEFAULT; 139} 140 141/* function prototype for functions used to parse a short string */ 142U_CDECL_BEGIN 143typedef const char* U_CALLCONV 144ActionFunction(CollatorSpec *spec, uint32_t value1, const char* string, 145 UErrorCode *status); 146U_CDECL_END 147 148U_CDECL_BEGIN 149static const char* U_CALLCONV 150_processLocaleElement(CollatorSpec *spec, uint32_t value, const char* string, 151 UErrorCode *status) 152{ 153 int32_t len = 0; 154 do { 155 if(value == 0 || value == 4) { 156 spec->locElements[value][len++] = uprv_tolower(*string); 157 } else { 158 spec->locElements[value][len++] = *string; 159 } 160 } while(*(++string) != '_' && *string && len < locElementCapacity); 161 if(len >= locElementCapacity) { 162 *status = U_BUFFER_OVERFLOW_ERROR; 163 return string; 164 } 165 // don't skip the underscore at the end 166 return string; 167} 168U_CDECL_END 169 170U_CDECL_BEGIN 171static const char* U_CALLCONV 172_processRFC3066Locale(CollatorSpec *spec, uint32_t, const char* string, 173 UErrorCode *status) 174{ 175 char terminator = *string; 176 string++; 177 const char *end = uprv_strchr(string+1, terminator); 178 if(end == NULL || end - string >= loc3066Capacity) { 179 *status = U_BUFFER_OVERFLOW_ERROR; 180 return string; 181 } else { 182 uprv_strncpy(spec->locale, string, end-string); 183 return end+1; 184 } 185} 186 187U_CDECL_END 188 189U_CDECL_BEGIN 190static const char* U_CALLCONV 191_processCollatorOption(CollatorSpec *spec, uint32_t option, const char* string, 192 UErrorCode *status) 193{ 194 spec->options[option] = ucol_sit_letterToAttributeValue(*string, status); 195 if((*(++string) != '_' && *string) || U_FAILURE(*status)) { 196 *status = U_ILLEGAL_ARGUMENT_ERROR; 197 } 198 return string; 199} 200U_CDECL_END 201 202 203static UChar 204readHexCodeUnit(const char **string, UErrorCode *status) 205{ 206 UChar result = 0; 207 int32_t value = 0; 208 char c; 209 int32_t noDigits = 0; 210 while((c = **string) != 0 && noDigits < 4) { 211 if( c >= '0' && c <= '9') { 212 value = c - '0'; 213 } else if ( c >= 'a' && c <= 'f') { 214 value = c - 'a' + 10; 215 } else if ( c >= 'A' && c <= 'F') { 216 value = c - 'A' + 10; 217 } else { 218 *status = U_ILLEGAL_ARGUMENT_ERROR; 219 return 0; 220 } 221 result = (result << 4) | (UChar)value; 222 noDigits++; 223 (*string)++; 224 } 225 // if the string was terminated before we read 4 digits, set an error 226 if(noDigits < 4) { 227 *status = U_ILLEGAL_ARGUMENT_ERROR; 228 } 229 return result; 230} 231 232U_CDECL_BEGIN 233static const char* U_CALLCONV 234_processVariableTop(CollatorSpec *spec, uint32_t value1, const char* string, UErrorCode *status) 235{ 236 // get four digits 237 int32_t i = 0; 238 if(!value1) { 239 while(U_SUCCESS(*status) && i < locElementCapacity && *string != 0 && *string != '_') { 240 spec->variableTopString[i++] = readHexCodeUnit(&string, status); 241 } 242 spec->variableTopStringLen = i; 243 if(i == locElementCapacity && (*string != 0 || *string != '_')) { 244 *status = U_BUFFER_OVERFLOW_ERROR; 245 } 246 } else { 247 spec->variableTopValue = readHexCodeUnit(&string, status); 248 } 249 if(U_SUCCESS(*status)) { 250 spec->variableTopSet = TRUE; 251 } 252 return string; 253} 254U_CDECL_END 255 256 257/* Table for parsing short strings */ 258struct ShortStringOptions { 259 char optionStart; 260 ActionFunction *action; 261 uint32_t attr; 262}; 263 264static const ShortStringOptions options[UCOL_SIT_ITEMS_COUNT] = 265{ 266/* 10 ALTERNATE_HANDLING */ {alternateHArg, _processCollatorOption, UCOL_ALTERNATE_HANDLING }, // alternate N, S, D 267/* 15 VARIABLE_TOP_VALUE */ {variableTopValArg, _processVariableTop, 1 }, 268/* 08 CASE_FIRST */ {caseFirstArg, _processCollatorOption, UCOL_CASE_FIRST }, // case first L, U, X, D 269/* 09 NUMERIC_COLLATION */ {numericCollArg, _processCollatorOption, UCOL_NUMERIC_COLLATION }, // codan O, X, D 270/* 07 CASE_LEVEL */ {caseLevelArg, _processCollatorOption, UCOL_CASE_LEVEL }, // case level O, X, D 271/* 12 FRENCH_COLLATION */ {frenchCollArg, _processCollatorOption, UCOL_FRENCH_COLLATION }, // french O, X, D 272/* 13 HIRAGANA_QUATERNARY] */ {hiraganaQArg, _processCollatorOption, UCOL_HIRAGANA_QUATERNARY_MODE }, // hiragana O, X, D 273/* 04 KEYWORD */ {keywordArg, _processLocaleElement, 4 }, // keyword 274/* 00 LANGUAGE */ {languageArg, _processLocaleElement, 0 }, // language 275/* 11 NORMALIZATION_MODE */ {normArg, _processCollatorOption, UCOL_NORMALIZATION_MODE }, // norm O, X, D 276/* 02 REGION */ {regionArg, _processLocaleElement, 2 }, // region 277/* 06 STRENGTH */ {strengthArg, _processCollatorOption, UCOL_STRENGTH }, // strength 1, 2, 3, 4, I, D 278/* 14 VARIABLE_TOP */ {variableTopArg, _processVariableTop, 0 }, 279/* 03 VARIANT */ {variantArg, _processLocaleElement, 3 }, // variant 280/* 05 RFC3066BIS */ {RFC3066Arg, _processRFC3066Locale, 0 }, // rfc3066bis locale name 281/* 01 SCRIPT */ {scriptArg, _processLocaleElement, 1 } // script 282}; 283 284 285static 286const char* ucol_sit_readOption(const char *start, CollatorSpec *spec, 287 UErrorCode *status) 288{ 289 int32_t i = 0; 290 291 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 292 if(*start == options[i].optionStart) { 293 spec->entries[i].start = start; 294 const char* end = options[i].action(spec, options[i].attr, start+1, status); 295 spec->entries[i].len = end - start; 296 return end; 297 } 298 } 299 *status = U_ILLEGAL_ARGUMENT_ERROR; 300 return start; 301} 302 303static 304void ucol_sit_initCollatorSpecs(CollatorSpec *spec) 305{ 306 // reset everything 307 uprv_memset(spec, 0, sizeof(CollatorSpec)); 308 // set collation options to default 309 int32_t i = 0; 310 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 311 spec->options[i] = UCOL_DEFAULT; 312 } 313} 314 315static const char* 316ucol_sit_readSpecs(CollatorSpec *s, const char *string, 317 UParseError *parseError, UErrorCode *status) 318{ 319 const char *definition = string; 320 while(U_SUCCESS(*status) && *string) { 321 string = ucol_sit_readOption(string, s, status); 322 // advance over '_' 323 while(*string && *string == '_') { 324 string++; 325 } 326 } 327 if(U_FAILURE(*status)) { 328 parseError->offset = string - definition; 329 } 330 return string; 331} 332 333static 334int32_t ucol_sit_dumpSpecs(CollatorSpec *s, char *destination, int32_t capacity, UErrorCode *status) 335{ 336 int32_t i = 0, j = 0; 337 int32_t len = 0; 338 char optName; 339 if(U_SUCCESS(*status)) { 340 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 341 if(s->entries[i].start) { 342 if(len) { 343 if(len < capacity) { 344 uprv_strcat(destination, "_"); 345 } 346 len++; 347 } 348 optName = *(s->entries[i].start); 349 if(optName == languageArg || optName == regionArg || optName == variantArg || optName == keywordArg) { 350 for(j = 0; j < s->entries[i].len; j++) { 351 if(len + j < capacity) { 352 destination[len+j] = uprv_toupper(*(s->entries[i].start+j)); 353 } 354 } 355 len += s->entries[i].len; 356 } else { 357 len += s->entries[i].len; 358 if(len < capacity) { 359 uprv_strncat(destination,s->entries[i].start, s->entries[i].len); 360 } 361 } 362 } 363 } 364 return len; 365 } else { 366 return 0; 367 } 368} 369 370static void 371ucol_sit_calculateWholeLocale(CollatorSpec *s) { 372 // put the locale together, unless we have a done 373 // locale 374 if(s->locale[0] == 0) { 375 // first the language 376 uprv_strcat(s->locale, s->locElements[0]); 377 // then the script, if present 378 if(*(s->locElements[1])) { 379 uprv_strcat(s->locale, "_"); 380 uprv_strcat(s->locale, s->locElements[1]); 381 } 382 // then the region, if present 383 if(*(s->locElements[2])) { 384 uprv_strcat(s->locale, "_"); 385 uprv_strcat(s->locale, s->locElements[2]); 386 } else if(*(s->locElements[3])) { // if there is a variant, we need an underscore 387 uprv_strcat(s->locale, "_"); 388 } 389 // add variant, if there 390 if(*(s->locElements[3])) { 391 uprv_strcat(s->locale, "_"); 392 uprv_strcat(s->locale, s->locElements[3]); 393 } 394 395 // if there is a collation keyword, add that too 396 if(*(s->locElements[4])) { 397 uprv_strcat(s->locale, collationKeyword); 398 uprv_strcat(s->locale, s->locElements[4]); 399 } 400 } 401} 402 403 404U_CAPI void U_EXPORT2 405ucol_prepareShortStringOpen( const char *definition, 406 UBool, 407 UParseError *parseError, 408 UErrorCode *status) 409{ 410 if(U_FAILURE(*status)) return; 411 412 UParseError internalParseError; 413 414 if(!parseError) { 415 parseError = &internalParseError; 416 } 417 parseError->line = 0; 418 parseError->offset = 0; 419 parseError->preContext[0] = 0; 420 parseError->postContext[0] = 0; 421 422 423 // first we want to pick stuff out of short string. 424 // we'll end up with an UCA version, locale and a bunch of 425 // settings 426 427 // analyse the string in order to get everything we need. 428 CollatorSpec s; 429 ucol_sit_initCollatorSpecs(&s); 430 ucol_sit_readSpecs(&s, definition, parseError, status); 431 ucol_sit_calculateWholeLocale(&s); 432 433 char buffer[internalBufferSize]; 434 uprv_memset(buffer, 0, internalBufferSize); 435 uloc_canonicalize(s.locale, buffer, internalBufferSize, status); 436 437 UResourceBundle *b = ures_open(U_ICUDATA_COLL, buffer, status); 438 /* we try to find stuff from keyword */ 439 UResourceBundle *collations = ures_getByKey(b, "collations", NULL, status); 440 UResourceBundle *collElem = NULL; 441 char keyBuffer[256]; 442 // if there is a keyword, we pick it up and try to get elements 443 if(!uloc_getKeywordValue(buffer, "collation", keyBuffer, 256, status)) { 444 // no keyword. we try to find the default setting, which will give us the keyword value 445 UResourceBundle *defaultColl = ures_getByKeyWithFallback(collations, "default", NULL, status); 446 if(U_SUCCESS(*status)) { 447 int32_t defaultKeyLen = 0; 448 const UChar *defaultKey = ures_getString(defaultColl, &defaultKeyLen, status); 449 u_UCharsToChars(defaultKey, keyBuffer, defaultKeyLen); 450 keyBuffer[defaultKeyLen] = 0; 451 } else { 452 *status = U_INTERNAL_PROGRAM_ERROR; 453 return; 454 } 455 ures_close(defaultColl); 456 } 457 collElem = ures_getByKeyWithFallback(collations, keyBuffer, collElem, status); 458 ures_close(collElem); 459 ures_close(collations); 460 ures_close(b); 461} 462 463 464U_CAPI UCollator* U_EXPORT2 465ucol_openFromShortString( const char *definition, 466 UBool forceDefaults, 467 UParseError *parseError, 468 UErrorCode *status) 469{ 470 UTRACE_ENTRY_OC(UTRACE_UCOL_OPEN_FROM_SHORT_STRING); 471 UTRACE_DATA1(UTRACE_INFO, "short string = \"%s\"", definition); 472 473 if(U_FAILURE(*status)) return 0; 474 475 UParseError internalParseError; 476 477 if(!parseError) { 478 parseError = &internalParseError; 479 } 480 parseError->line = 0; 481 parseError->offset = 0; 482 parseError->preContext[0] = 0; 483 parseError->postContext[0] = 0; 484 485 486 // first we want to pick stuff out of short string. 487 // we'll end up with an UCA version, locale and a bunch of 488 // settings 489 490 // analyse the string in order to get everything we need. 491 const char *string = definition; 492 CollatorSpec s; 493 ucol_sit_initCollatorSpecs(&s); 494 string = ucol_sit_readSpecs(&s, definition, parseError, status); 495 ucol_sit_calculateWholeLocale(&s); 496 497 char buffer[internalBufferSize]; 498 uprv_memset(buffer, 0, internalBufferSize); 499 uloc_canonicalize(s.locale, buffer, internalBufferSize, status); 500 501 UCollator *result = ucol_open(buffer, status); 502 int32_t i = 0; 503 504 for(i = 0; i < UCOL_ATTRIBUTE_COUNT; i++) { 505 if(s.options[i] != UCOL_DEFAULT) { 506 if(forceDefaults || ucol_getAttribute(result, (UColAttribute)i, status) != s.options[i]) { 507 ucol_setAttribute(result, (UColAttribute)i, s.options[i], status); 508 } 509 510 if(U_FAILURE(*status)) { 511 parseError->offset = string - definition; 512 ucol_close(result); 513 return NULL; 514 } 515 516 } 517 } 518 if(s.variableTopSet) { 519 if(s.variableTopString[0]) { 520 ucol_setVariableTop(result, s.variableTopString, s.variableTopStringLen, status); 521 } else { // we set by value, using 'B' 522 ucol_restoreVariableTop(result, s.variableTopValue, status); 523 } 524 } 525 526 527 if(U_FAILURE(*status)) { // here it can only be a bogus value 528 ucol_close(result); 529 result = NULL; 530 } 531 532 UTRACE_EXIT_PTR_STATUS(result, *status); 533 return result; 534} 535 536 537static void appendShortStringElement(const char *src, int32_t len, char *result, int32_t *resultSize, int32_t capacity, char arg) 538{ 539 if(len) { 540 if(*resultSize) { 541 if(*resultSize < capacity) { 542 uprv_strcat(result, "_"); 543 } 544 (*resultSize)++; 545 } 546 *resultSize += len + 1; 547 if(*resultSize < capacity) { 548 uprv_strncat(result, &arg, 1); 549 uprv_strncat(result, src, len); 550 } 551 } 552} 553 554U_CAPI int32_t U_EXPORT2 555ucol_getShortDefinitionString(const UCollator *coll, 556 const char *locale, 557 char *dst, 558 int32_t capacity, 559 UErrorCode *status) 560{ 561 if(U_FAILURE(*status)) return 0; 562 char buffer[internalBufferSize]; 563 uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); 564 int32_t resultSize = 0; 565 char tempbuff[internalBufferSize]; 566 char locBuff[internalBufferSize]; 567 uprv_memset(buffer, 0, internalBufferSize*sizeof(char)); 568 int32_t elementSize = 0; 569 UBool isAvailable = 0; 570 CollatorSpec s; 571 ucol_sit_initCollatorSpecs(&s); 572 573 if(!locale) { 574 locale = ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, status); 575 } 576 elementSize = ucol_getFunctionalEquivalent(locBuff, internalBufferSize, "collation", locale, &isAvailable, status); 577 578 if(elementSize) { 579 // we should probably canonicalize here... 580 elementSize = uloc_getLanguage(locBuff, tempbuff, internalBufferSize, status); 581 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, languageArg); 582 elementSize = uloc_getCountry(locBuff, tempbuff, internalBufferSize, status); 583 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, regionArg); 584 elementSize = uloc_getScript(locBuff, tempbuff, internalBufferSize, status); 585 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, scriptArg); 586 elementSize = uloc_getVariant(locBuff, tempbuff, internalBufferSize, status); 587 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, variantArg); 588 elementSize = uloc_getKeywordValue(locBuff, "collation", tempbuff, internalBufferSize, status); 589 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, /*capacity*/internalBufferSize, keywordArg); 590 } 591 592 int32_t i = 0; 593 UColAttributeValue attribute = UCOL_DEFAULT; 594 for(i = 0; i < UCOL_SIT_ITEMS_COUNT; i++) { 595 if(options[i].action == _processCollatorOption) { 596 attribute = ucol_getAttributeOrDefault(coll, (UColAttribute)options[i].attr, status); 597 if(attribute != UCOL_DEFAULT) { 598 char letter = ucol_sit_attributeValueToLetter(attribute, status); 599 appendShortStringElement(&letter, 1, 600 buffer, &resultSize, /*capacity*/internalBufferSize, options[i].optionStart); 601 } 602 } 603 } 604 if(coll->variableTopValueisDefault == FALSE) { 605 //s.variableTopValue = ucol_getVariableTop(coll, status); 606 elementSize = T_CString_integerToString(tempbuff, coll->variableTopValue, 16); 607 appendShortStringElement(tempbuff, elementSize, buffer, &resultSize, capacity, variableTopValArg); 608 } 609 610 UParseError parseError; 611 return ucol_normalizeShortDefinitionString(buffer, dst, capacity, &parseError, status); 612} 613 614U_CAPI int32_t U_EXPORT2 615ucol_normalizeShortDefinitionString(const char *definition, 616 char *destination, 617 int32_t capacity, 618 UParseError *parseError, 619 UErrorCode *status) 620{ 621 622 if(U_FAILURE(*status)) { 623 return 0; 624 } 625 626 if(destination) { 627 uprv_memset(destination, 0, capacity*sizeof(char)); 628 } 629 630 UParseError pe; 631 if(!parseError) { 632 parseError = &pe; 633 } 634 635 // validate 636 CollatorSpec s; 637 ucol_sit_initCollatorSpecs(&s); 638 ucol_sit_readSpecs(&s, definition, parseError, status); 639 return ucol_sit_dumpSpecs(&s, destination, capacity, status); 640} 641 642U_CAPI UColAttributeValue U_EXPORT2 643ucol_getAttributeOrDefault(const UCollator *coll, UColAttribute attr, UErrorCode *status) 644{ 645 if(U_FAILURE(*status) || coll == NULL) { 646 return UCOL_DEFAULT; 647 } 648 switch(attr) { 649 case UCOL_NUMERIC_COLLATION: 650 return coll->numericCollationisDefault?UCOL_DEFAULT:coll->numericCollation; 651 case UCOL_HIRAGANA_QUATERNARY_MODE: 652 return coll->hiraganaQisDefault?UCOL_DEFAULT:coll->hiraganaQ; 653 case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/ 654 return coll->frenchCollationisDefault?UCOL_DEFAULT:coll->frenchCollation; 655 case UCOL_ALTERNATE_HANDLING: /* attribute for handling variable elements*/ 656 return coll->alternateHandlingisDefault?UCOL_DEFAULT:coll->alternateHandling; 657 case UCOL_CASE_FIRST: /* who goes first, lower case or uppercase */ 658 return coll->caseFirstisDefault?UCOL_DEFAULT:coll->caseFirst; 659 case UCOL_CASE_LEVEL: /* do we have an extra case level */ 660 return coll->caseLevelisDefault?UCOL_DEFAULT:coll->caseLevel; 661 case UCOL_NORMALIZATION_MODE: /* attribute for normalization */ 662 return coll->normalizationModeisDefault?UCOL_DEFAULT:coll->normalizationMode; 663 case UCOL_STRENGTH: /* attribute for strength */ 664 return coll->strengthisDefault?UCOL_DEFAULT:coll->strength; 665 case UCOL_ATTRIBUTE_COUNT: 666 default: 667 *status = U_ILLEGAL_ARGUMENT_ERROR; 668 break; 669 } 670 return UCOL_DEFAULT; 671} 672 673 674struct contContext { 675 const UCollator *coll; 676 USet *conts; 677 USet *expansions; 678 USet *removedContractions; 679 UBool addPrefixes; 680 UErrorCode *status; 681}; 682 683 684 685static void 686addSpecial(contContext *context, UChar *buffer, int32_t bufLen, 687 uint32_t CE, int32_t leftIndex, int32_t rightIndex, UErrorCode *status) 688{ 689 const UCollator *coll = context->coll; 690 USet *contractions = context->conts; 691 USet *expansions = context->expansions; 692 UBool addPrefixes = context->addPrefixes; 693 694 const UChar *UCharOffset = (UChar *)coll->image+getContractOffset(CE); 695 uint32_t newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 696 // we might have a contraction that ends from previous level 697 if(newCE != UCOL_NOT_FOUND) { 698 if(isSpecial(CE) && getCETag(CE) == CONTRACTION_TAG && isSpecial(newCE) && getCETag(newCE) == SPEC_PROC_TAG && addPrefixes) { 699 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); 700 } 701 if(contractions && rightIndex-leftIndex > 1) { 702 uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); 703 if(expansions && isSpecial(CE) && getCETag(CE) == EXPANSION_TAG) { 704 uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); 705 } 706 } 707 } 708 709 UCharOffset++; 710 // check whether we're doing contraction or prefix 711 if(getCETag(CE) == SPEC_PROC_TAG && addPrefixes) { 712 if(leftIndex == 0) { 713 *status = U_INTERNAL_PROGRAM_ERROR; 714 return; 715 } 716 --leftIndex; 717 while(*UCharOffset != 0xFFFF) { 718 newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 719 buffer[leftIndex] = *UCharOffset; 720 if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { 721 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex, status); 722 } else { 723 if(contractions) { 724 uset_addString(contractions, buffer+leftIndex, rightIndex-leftIndex); 725 } 726 if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { 727 uset_addString(expansions, buffer+leftIndex, rightIndex-leftIndex); 728 } 729 } 730 UCharOffset++; 731 } 732 } else if(getCETag(CE) == CONTRACTION_TAG) { 733 if(rightIndex == bufLen-1) { 734 *status = U_INTERNAL_PROGRAM_ERROR; 735 return; 736 } 737 while(*UCharOffset != 0xFFFF) { 738 newCE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); 739 buffer[rightIndex] = *UCharOffset; 740 if(isSpecial(newCE) && (getCETag(newCE) == CONTRACTION_TAG || getCETag(newCE) == SPEC_PROC_TAG)) { 741 addSpecial(context, buffer, bufLen, newCE, leftIndex, rightIndex+1, status); 742 } else { 743 if(contractions) { 744 uset_addString(contractions, buffer+leftIndex, rightIndex+1-leftIndex); 745 } 746 if(expansions && isSpecial(newCE) && getCETag(newCE) == EXPANSION_TAG) { 747 uset_addString(expansions, buffer+leftIndex, rightIndex+1-leftIndex); 748 } 749 } 750 UCharOffset++; 751 } 752 } 753 754} 755 756U_CDECL_BEGIN 757static UBool U_CALLCONV 758_processSpecials(const void *context, UChar32 start, UChar32 limit, uint32_t CE) 759{ 760 UErrorCode *status = ((contContext *)context)->status; 761 USet *expansions = ((contContext *)context)->expansions; 762 USet *removed = ((contContext *)context)->removedContractions; 763 UBool addPrefixes = ((contContext *)context)->addPrefixes; 764 UChar contraction[internalBufferSize]; 765 if(isSpecial(CE)) { 766 if(((getCETag(CE) == SPEC_PROC_TAG && addPrefixes) || getCETag(CE) == CONTRACTION_TAG)) { 767 while(start < limit && U_SUCCESS(*status)) { 768 // if there are suppressed contractions, we don't 769 // want to add them. 770 if(removed && uset_contains(removed, start)) { 771 start++; 772 continue; 773 } 774 // we start our contraction from middle, since we don't know if it 775 // will grow toward right or left 776 contraction[internalBufferSize/2] = (UChar)start; 777 addSpecial(((contContext *)context), contraction, internalBufferSize, CE, internalBufferSize/2, internalBufferSize/2+1, status); 778 start++; 779 } 780 } else if(expansions && getCETag(CE) == EXPANSION_TAG) { 781 while(start < limit && U_SUCCESS(*status)) { 782 uset_add(expansions, start++); 783 } 784 } 785 } 786 if(U_FAILURE(*status)) { 787 return FALSE; 788 } else { 789 return TRUE; 790 } 791} 792 793U_CDECL_END 794 795 796 797/** 798 * Get a set containing the contractions defined by the collator. The set includes 799 * both the UCA contractions and the contractions defined by the collator 800 * @param coll collator 801 * @param conts the set to hold the result 802 * @param status to hold the error code 803 * @return the size of the contraction set 804 */ 805U_CAPI int32_t U_EXPORT2 806ucol_getContractions( const UCollator *coll, 807 USet *contractions, 808 UErrorCode *status) 809{ 810 ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); 811 return uset_getItemCount(contractions); 812} 813 814/** 815 * Get a set containing the expansions defined by the collator. The set includes 816 * both the UCA expansions and the expansions defined by the tailoring 817 * @param coll collator 818 * @param conts the set to hold the result 819 * @param addPrefixes add the prefix contextual elements to contractions 820 * @param status to hold the error code 821 * 822 * @draft ICU 3.4 823 */ 824U_CAPI void U_EXPORT2 825ucol_getContractionsAndExpansions( const UCollator *coll, 826 USet *contractions, 827 USet *expansions, 828 UBool addPrefixes, 829 UErrorCode *status) 830{ 831 if(U_FAILURE(*status)) { 832 return; 833 } 834 if(coll == NULL) { 835 *status = U_ILLEGAL_ARGUMENT_ERROR; 836 return; 837 } 838 839 if(contractions) { 840 uset_clear(contractions); 841 } 842 if(expansions) { 843 uset_clear(expansions); 844 } 845 int32_t rulesLen = 0; 846 const UChar* rules = ucol_getRules(coll, &rulesLen); 847 UColTokenParser src; 848 ucol_tok_initTokenList(&src, rules, rulesLen, coll->UCA, status); 849 850 contContext c = { NULL, contractions, expansions, src.removeSet, addPrefixes, status }; 851 852 // Add the UCA contractions 853 c.coll = coll->UCA; 854 utrie_enum(&coll->UCA->mapping, NULL, _processSpecials, &c); 855 856 // This is collator specific. Add contractions from a collator 857 c.coll = coll; 858 c.removedContractions = NULL; 859 utrie_enum(&coll->mapping, NULL, _processSpecials, &c); 860 ucol_tok_closeTokenList(&src); 861} 862 863U_CAPI int32_t U_EXPORT2 864ucol_getUnsafeSet( const UCollator *coll, 865 USet *unsafe, 866 UErrorCode *status) 867{ 868 UChar buffer[internalBufferSize]; 869 int32_t len = 0; 870 871 uset_clear(unsafe); 872 873 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant 874 static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 875 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; 876 877 // add chars that fail the fcd check 878 uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); 879 880 // add Thai/Lao prevowels 881 uset_addRange(unsafe, 0xe40, 0xe44); 882 uset_addRange(unsafe, 0xec0, 0xec4); 883 // add lead/trail surrogates 884 uset_addRange(unsafe, 0xd800, 0xdfff); 885 886 USet *contractions = uset_open(0,0); 887 888 int32_t i = 0, j = 0; 889 int32_t contsSize = ucol_getContractions(coll, contractions, status); 890 UChar32 c = 0; 891 // Contraction set consists only of strings 892 // to get unsafe code points, we need to 893 // break the strings apart and add them to the unsafe set 894 for(i = 0; i < contsSize; i++) { 895 len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); 896 if(len > 0) { 897 j = 0; 898 while(j < len) { 899 U16_NEXT(buffer, j, len, c); 900 if(j < len) { 901 uset_add(unsafe, c); 902 } 903 } 904 } 905 } 906 907 uset_close(contractions); 908 909 return uset_size(unsafe); 910} 911#endif 912