1/* 2******************************************************************************* 3* 4* Copyright (C) 2004-2014, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: ucase.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2004aug30 14* created by: Markus W. Scherer 15* 16* Low-level Unicode character/string case mapping code. 17* Much code moved here (and modified) from uchar.c. 18*/ 19 20#include "unicode/utypes.h" 21#include "unicode/unistr.h" 22#include "unicode/uset.h" 23#include "unicode/udata.h" /* UDataInfo */ 24#include "unicode/utf16.h" 25#include "ucmndata.h" /* DataHeader */ 26#include "udatamem.h" 27#include "umutex.h" 28#include "uassert.h" 29#include "cmemory.h" 30#include "utrie2.h" 31#include "ucase.h" 32 33struct UCaseProps { 34 UDataMemory *mem; 35 const int32_t *indexes; 36 const uint16_t *exceptions; 37 const uint16_t *unfold; 38 39 UTrie2 trie; 40 uint8_t formatVersion[4]; 41}; 42 43/* ucase_props_data.h is machine-generated by gencase --csource */ 44#define INCLUDED_FROM_UCASE_CPP 45#include "ucase_props_data.h" 46 47/* UCaseProps singleton ----------------------------------------------------- */ 48 49U_CAPI const UCaseProps * U_EXPORT2 50ucase_getSingleton() { 51 return &ucase_props_singleton; 52} 53 54/* set of property starts for UnicodeSet ------------------------------------ */ 55 56static UBool U_CALLCONV 57_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 58 /* add the start code point to the USet */ 59 const USetAdder *sa=(const USetAdder *)context; 60 sa->add(sa->set, start); 61 return TRUE; 62} 63 64U_CFUNC void U_EXPORT2 65ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) { 66 if(U_FAILURE(*pErrorCode)) { 67 return; 68 } 69 70 /* add the start code point of each same-value range of the trie */ 71 utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa); 72 73 /* add code points with hardcoded properties, plus the ones following them */ 74 75 /* (none right now, see comment below) */ 76 77 /* 78 * Omit code points with hardcoded specialcasing properties 79 * because we do not build property UnicodeSets for them right now. 80 */ 81} 82 83/* data access primitives --------------------------------------------------- */ 84 85#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT)) 86 87#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) 88 89/* number of bits in an 8-bit integer value */ 90static const uint8_t flagsOffset[256]={ 91 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 92 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 93 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 94 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 95 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 96 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 97 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 98 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 99 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 100 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 101 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 102 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 103 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 104 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 105 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 106 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 107}; 108 109#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx))) 110#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)] 111 112/* 113 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx). 114 * 115 * @param excWord (in) initial exceptions word 116 * @param idx (in) desired slot index 117 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++; 118 * moved to the last uint16_t of the value, use +1 for beginning of next slot 119 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified 120 */ 121#define GET_SLOT_VALUE(excWord, idx, pExc16, value) \ 122 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \ 123 (pExc16)+=SLOT_OFFSET(excWord, idx); \ 124 (value)=*pExc16; \ 125 } else { \ 126 (pExc16)+=2*SLOT_OFFSET(excWord, idx); \ 127 (value)=*pExc16++; \ 128 (value)=((value)<<16)|*pExc16; \ 129 } 130 131/* simple case mappings ----------------------------------------------------- */ 132 133U_CAPI UChar32 U_EXPORT2 134ucase_tolower(const UCaseProps *csp, UChar32 c) { 135 uint16_t props=UTRIE2_GET16(&csp->trie, c); 136 if(!PROPS_HAS_EXCEPTION(props)) { 137 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 138 c+=UCASE_GET_DELTA(props); 139 } 140 } else { 141 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 142 uint16_t excWord=*pe++; 143 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 144 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); 145 } 146 } 147 return c; 148} 149 150U_CAPI UChar32 U_EXPORT2 151ucase_toupper(const UCaseProps *csp, UChar32 c) { 152 uint16_t props=UTRIE2_GET16(&csp->trie, c); 153 if(!PROPS_HAS_EXCEPTION(props)) { 154 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 155 c+=UCASE_GET_DELTA(props); 156 } 157 } else { 158 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 159 uint16_t excWord=*pe++; 160 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 161 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); 162 } 163 } 164 return c; 165} 166 167U_CAPI UChar32 U_EXPORT2 168ucase_totitle(const UCaseProps *csp, UChar32 c) { 169 uint16_t props=UTRIE2_GET16(&csp->trie, c); 170 if(!PROPS_HAS_EXCEPTION(props)) { 171 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 172 c+=UCASE_GET_DELTA(props); 173 } 174 } else { 175 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 176 uint16_t excWord=*pe++; 177 int32_t idx; 178 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) { 179 idx=UCASE_EXC_TITLE; 180 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 181 idx=UCASE_EXC_UPPER; 182 } else { 183 return c; 184 } 185 GET_SLOT_VALUE(excWord, idx, pe, c); 186 } 187 return c; 188} 189 190static const UChar iDot[2] = { 0x69, 0x307 }; 191static const UChar jDot[2] = { 0x6a, 0x307 }; 192static const UChar iOgonekDot[3] = { 0x12f, 0x307 }; 193static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 }; 194static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 }; 195static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 }; 196 197 198U_CFUNC void U_EXPORT2 199ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) { 200 uint16_t props; 201 202 /* 203 * Hardcode the case closure of i and its relatives and ignore the 204 * data file data for these characters. 205 * The Turkic dotless i and dotted I with their case mapping conditions 206 * and case folding option make the related characters behave specially. 207 * This code matches their closure behavior to their case folding behavior. 208 */ 209 210 switch(c) { 211 case 0x49: 212 /* regular i and I are in one equivalence class */ 213 sa->add(sa->set, 0x69); 214 return; 215 case 0x69: 216 sa->add(sa->set, 0x49); 217 return; 218 case 0x130: 219 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ 220 sa->addString(sa->set, iDot, 2); 221 return; 222 case 0x131: 223 /* dotless i is in a class by itself */ 224 return; 225 default: 226 /* otherwise use the data file data */ 227 break; 228 } 229 230 props=UTRIE2_GET16(&csp->trie, c); 231 if(!PROPS_HAS_EXCEPTION(props)) { 232 if(UCASE_GET_TYPE(props)!=UCASE_NONE) { 233 /* add the one simple case mapping, no matter what type it is */ 234 int32_t delta=UCASE_GET_DELTA(props); 235 if(delta!=0) { 236 sa->add(sa->set, c+delta); 237 } 238 } 239 } else { 240 /* 241 * c has exceptions, so there may be multiple simple and/or 242 * full case mappings. Add them all. 243 */ 244 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props); 245 const UChar *closure; 246 uint16_t excWord=*pe++; 247 int32_t idx, closureLength, fullLength, length; 248 249 pe0=pe; 250 251 /* add all simple case mappings */ 252 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) { 253 if(HAS_SLOT(excWord, idx)) { 254 pe=pe0; 255 GET_SLOT_VALUE(excWord, idx, pe, c); 256 sa->add(sa->set, c); 257 } 258 } 259 260 /* get the closure string pointer & length */ 261 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { 262 pe=pe0; 263 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength); 264 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */ 265 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */ 266 } else { 267 closureLength=0; 268 closure=NULL; 269 } 270 271 /* add the full case folding */ 272 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 273 pe=pe0; 274 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength); 275 276 /* start of full case mapping strings */ 277 ++pe; 278 279 fullLength&=0xffff; /* bits 16 and higher are reserved */ 280 281 /* skip the lowercase result string */ 282 pe+=fullLength&UCASE_FULL_LOWER; 283 fullLength>>=4; 284 285 /* add the full case folding string */ 286 length=fullLength&0xf; 287 if(length!=0) { 288 sa->addString(sa->set, (const UChar *)pe, length); 289 pe+=length; 290 } 291 292 /* skip the uppercase and titlecase strings */ 293 fullLength>>=4; 294 pe+=fullLength&0xf; 295 fullLength>>=4; 296 pe+=fullLength; 297 298 closure=(const UChar *)pe; /* behind full case mappings */ 299 } 300 301 /* add each code point in the closure string */ 302 for(idx=0; idx<closureLength;) { 303 U16_NEXT_UNSAFE(closure, idx, c); 304 sa->add(sa->set, c); 305 } 306 } 307} 308 309/* 310 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated 311 * must be length>0 and max>0 and length<=max 312 */ 313static inline int32_t 314strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) { 315 int32_t c1, c2; 316 317 max-=length; /* we require length<=max, so no need to decrement max in the loop */ 318 do { 319 c1=*s++; 320 c2=*t++; 321 if(c2==0) { 322 return 1; /* reached the end of t but not of s */ 323 } 324 c1-=c2; 325 if(c1!=0) { 326 return c1; /* return difference result */ 327 } 328 } while(--length>0); 329 /* ends with length==0 */ 330 331 if(max==0 || *t==0) { 332 return 0; /* equal to length of both strings */ 333 } else { 334 return -max; /* return lengh difference */ 335 } 336} 337 338U_CFUNC UBool U_EXPORT2 339ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) { 340 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth; 341 342 if(csp->unfold==NULL || s==NULL) { 343 return FALSE; /* no reverse case folding data, or no string */ 344 } 345 if(length<=1) { 346 /* the string is too short to find any match */ 347 /* 348 * more precise would be: 349 * if(!u_strHasMoreChar32Than(s, length, 1)) 350 * but this does not make much practical difference because 351 * a single supplementary code point would just not be found 352 */ 353 return FALSE; 354 } 355 356 const uint16_t *unfold=csp->unfold; 357 unfoldRows=unfold[UCASE_UNFOLD_ROWS]; 358 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH]; 359 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH]; 360 unfold+=unfoldRowWidth; 361 362 if(length>unfoldStringWidth) { 363 /* the string is too long to find any match */ 364 return FALSE; 365 } 366 367 /* do a binary search for the string */ 368 start=0; 369 limit=unfoldRows; 370 while(start<limit) { 371 i=(start+limit)/2; 372 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth)); 373 result=strcmpMax(s, length, p, unfoldStringWidth); 374 375 if(result==0) { 376 /* found the string: add each code point, and its case closure */ 377 UChar32 c; 378 379 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) { 380 U16_NEXT_UNSAFE(p, i, c); 381 sa->add(sa->set, c); 382 ucase_addCaseClosure(csp, c, sa); 383 } 384 return TRUE; 385 } else if(result<0) { 386 limit=i; 387 } else /* result>0 */ { 388 start=i+1; 389 } 390 } 391 392 return FALSE; /* string not found */ 393} 394 395U_NAMESPACE_BEGIN 396 397FullCaseFoldingIterator::FullCaseFoldingIterator() 398 : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)), 399 unfoldRows(unfold[UCASE_UNFOLD_ROWS]), 400 unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]), 401 unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]), 402 currentRow(0), 403 rowCpIndex(unfoldStringWidth) { 404 unfold+=unfoldRowWidth; 405} 406 407UChar32 408FullCaseFoldingIterator::next(UnicodeString &full) { 409 // Advance past the last-delivered code point. 410 const UChar *p=unfold+(currentRow*unfoldRowWidth); 411 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) { 412 ++currentRow; 413 p+=unfoldRowWidth; 414 rowCpIndex=unfoldStringWidth; 415 } 416 if(currentRow>=unfoldRows) { return U_SENTINEL; } 417 // Set "full" to the NUL-terminated string in the first unfold column. 418 int32_t length=unfoldStringWidth; 419 while(length>0 && p[length-1]==0) { --length; } 420 full.setTo(FALSE, p, length); 421 // Return the code point. 422 UChar32 c; 423 U16_NEXT_UNSAFE(p, rowCpIndex, c); 424 return c; 425} 426 427U_NAMESPACE_END 428 429/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ 430U_CAPI int32_t U_EXPORT2 431ucase_getType(const UCaseProps *csp, UChar32 c) { 432 uint16_t props=UTRIE2_GET16(&csp->trie, c); 433 return UCASE_GET_TYPE(props); 434} 435 436/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */ 437U_CAPI int32_t U_EXPORT2 438ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) { 439 uint16_t props=UTRIE2_GET16(&csp->trie, c); 440 return UCASE_GET_TYPE_AND_IGNORABLE(props); 441} 442 443/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */ 444static inline int32_t 445getDotType(const UCaseProps *csp, UChar32 c) { 446 uint16_t props=UTRIE2_GET16(&csp->trie, c); 447 if(!PROPS_HAS_EXCEPTION(props)) { 448 return props&UCASE_DOT_MASK; 449 } else { 450 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 451 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK; 452 } 453} 454 455U_CAPI UBool U_EXPORT2 456ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) { 457 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED); 458} 459 460U_CAPI UBool U_EXPORT2 461ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) { 462 uint16_t props=UTRIE2_GET16(&csp->trie, c); 463 return (UBool)((props&UCASE_SENSITIVE)!=0); 464} 465 466/* string casing ------------------------------------------------------------ */ 467 468/* 469 * These internal functions form the core of string case mappings. 470 * They map single code points to result code points or strings and take 471 * all necessary conditions (context, locale ID, options) into account. 472 * 473 * They do not iterate over the source or write to the destination 474 * so that the same functions are useful for non-standard string storage, 475 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. 476 * For the same reason, the "surrounding text" context is passed in as a 477 * UCaseContextIterator which does not make any assumptions about 478 * the underlying storage. 479 * 480 * This section contains helper functions that check for conditions 481 * in the input text surrounding the current code point 482 * according to SpecialCasing.txt. 483 * 484 * Each helper function gets the index 485 * - after the current code point if it looks at following text 486 * - before the current code point if it looks at preceding text 487 * 488 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows: 489 * 490 * Final_Sigma 491 * C is preceded by a sequence consisting of 492 * a cased letter and a case-ignorable sequence, 493 * and C is not followed by a sequence consisting of 494 * an ignorable sequence and then a cased letter. 495 * 496 * More_Above 497 * C is followed by one or more characters of combining class 230 (ABOVE) 498 * in the combining character sequence. 499 * 500 * After_Soft_Dotted 501 * The last preceding character with combining class of zero before C 502 * was Soft_Dotted, 503 * and there is no intervening combining character class 230 (ABOVE). 504 * 505 * Before_Dot 506 * C is followed by combining dot above (U+0307). 507 * Any sequence of characters with a combining class that is neither 0 nor 230 508 * may intervene between the current character and the combining dot above. 509 * 510 * The erratum from 2002-10-31 adds the condition 511 * 512 * After_I 513 * The last preceding base character was an uppercase I, and there is no 514 * intervening combining character class 230 (ABOVE). 515 * 516 * (See Jitterbug 2344 and the comments on After_I below.) 517 * 518 * Helper definitions in Unicode 3.2 UAX 21: 519 * 520 * D1. A character C is defined to be cased 521 * if it meets any of the following criteria: 522 * 523 * - The general category of C is Titlecase Letter (Lt) 524 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase 525 * - Given D = NFD(C), then it is not the case that: 526 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D) 527 * (This third criterium does not add any characters to the list 528 * for Unicode 3.2. Ignored.) 529 * 530 * D2. A character C is defined to be case-ignorable 531 * if it meets either of the following criteria: 532 * 533 * - The general category of C is 534 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or 535 * Letter Modifier (Lm), or Symbol Modifier (Sk) 536 * - C is one of the following characters 537 * U+0027 APOSTROPHE 538 * U+00AD SOFT HYPHEN (SHY) 539 * U+2019 RIGHT SINGLE QUOTATION MARK 540 * (the preferred character for apostrophe) 541 * 542 * D3. A case-ignorable sequence is a sequence of 543 * zero or more case-ignorable characters. 544 */ 545 546#define is_a(c) ((c)=='a' || (c)=='A') 547#define is_d(c) ((c)=='d' || (c)=='D') 548#define is_e(c) ((c)=='e' || (c)=='E') 549#define is_i(c) ((c)=='i' || (c)=='I') 550#define is_l(c) ((c)=='l' || (c)=='L') 551#define is_n(c) ((c)=='n' || (c)=='N') 552#define is_r(c) ((c)=='r' || (c)=='R') 553#define is_t(c) ((c)=='t' || (c)=='T') 554#define is_u(c) ((c)=='u' || (c)=='U') 555#define is_z(c) ((c)=='z' || (c)=='Z') 556 557/* separator? */ 558#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0) 559 560/** 561 * Requires non-NULL locale ID but otherwise does the equivalent of 562 * checking for language codes as if uloc_getLanguage() were called: 563 * Accepts both 2- and 3-letter codes and accepts case variants. 564 */ 565U_CFUNC int32_t 566ucase_getCaseLocale(const char *locale, int32_t *locCache) { 567 int32_t result; 568 char c; 569 570 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) { 571 return result; 572 } 573 574 result=UCASE_LOC_ROOT; 575 576 /* 577 * This function used to use uloc_getLanguage(), but the current code 578 * removes the dependency of this low-level code on uloc implementation code 579 * and is faster because not the whole locale ID has to be 580 * examined and copied/transformed. 581 * 582 * Because this code does not want to depend on uloc, the caller must 583 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault(). 584 */ 585 c=*locale++; 586 if(is_t(c)) { 587 /* tr or tur? */ 588 c=*locale++; 589 if(is_u(c)) { 590 c=*locale++; 591 } 592 if(is_r(c)) { 593 c=*locale; 594 if(is_sep(c)) { 595 result=UCASE_LOC_TURKISH; 596 } 597 } 598 } else if(is_a(c)) { 599 /* az or aze? */ 600 c=*locale++; 601 if(is_z(c)) { 602 c=*locale++; 603 if(is_e(c)) { 604 c=*locale; 605 } 606 if(is_sep(c)) { 607 result=UCASE_LOC_TURKISH; 608 } 609 } 610 } else if(is_l(c)) { 611 /* lt or lit? */ 612 c=*locale++; 613 if(is_i(c)) { 614 c=*locale++; 615 } 616 if(is_t(c)) { 617 c=*locale; 618 if(is_sep(c)) { 619 result=UCASE_LOC_LITHUANIAN; 620 } 621 } 622 } else if(is_n(c)) { 623 /* nl or nld? */ 624 c=*locale++; 625 if(is_l(c)) { 626 c=*locale++; 627 if(is_d(c)) { 628 c=*locale; 629 } 630 if(is_sep(c)) { 631 result=UCASE_LOC_DUTCH; 632 } 633 } 634 } 635 636 if(locCache!=NULL) { 637 *locCache=result; 638 } 639 return result; 640} 641 642/* 643 * Is followed by 644 * {case-ignorable}* cased 645 * ? 646 * (dir determines looking forward/backward) 647 * If a character is case-ignorable, it is skipped regardless of whether 648 * it is also cased or not. 649 */ 650static UBool 651isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) { 652 UChar32 c; 653 654 if(iter==NULL) { 655 return FALSE; 656 } 657 658 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) { 659 int32_t type=ucase_getTypeOrIgnorable(csp, c); 660 if(type&4) { 661 /* case-ignorable, continue with the loop */ 662 } else if(type!=UCASE_NONE) { 663 return TRUE; /* followed by cased letter */ 664 } else { 665 return FALSE; /* uncased and not case-ignorable */ 666 } 667 } 668 669 return FALSE; /* not followed by cased letter */ 670} 671 672/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */ 673static UBool 674isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 675 UChar32 c; 676 int32_t dotType; 677 int8_t dir; 678 679 if(iter==NULL) { 680 return FALSE; 681 } 682 683 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { 684 dotType=getDotType(csp, c); 685 if(dotType==UCASE_SOFT_DOTTED) { 686 return TRUE; /* preceded by TYPE_i */ 687 } else if(dotType!=UCASE_OTHER_ACCENT) { 688 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */ 689 } 690 } 691 692 return FALSE; /* not preceded by TYPE_i */ 693} 694 695/* 696 * See Jitterbug 2344: 697 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above 698 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because 699 * we made those releases compatible with Unicode 3.2 which had not fixed 700 * a related bug in SpecialCasing.txt. 701 * 702 * From the Jitterbug 2344 text: 703 * ... this bug is listed as a Unicode erratum 704 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html 705 * <quote> 706 * There are two errors in SpecialCasing.txt. 707 * 1. Missing semicolons on two lines. ... [irrelevant for ICU] 708 * 2. An incorrect context definition. Correct as follows: 709 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE 710 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE 711 * --- 712 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 713 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 714 * where the context After_I is defined as: 715 * The last preceding base character was an uppercase I, and there is no 716 * intervening combining character class 230 (ABOVE). 717 * </quote> 718 * 719 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as: 720 * 721 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 722 * # This matches the behavior of the canonically equivalent I-dot_above 723 * 724 * See also the description in this place in older versions of uchar.c (revision 1.100). 725 * 726 * Markus W. Scherer 2003-feb-15 727 */ 728 729/* Is preceded by base character 'I' with no intervening cc=230 ? */ 730static UBool 731isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 732 UChar32 c; 733 int32_t dotType; 734 int8_t dir; 735 736 if(iter==NULL) { 737 return FALSE; 738 } 739 740 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { 741 if(c==0x49) { 742 return TRUE; /* preceded by I */ 743 } 744 dotType=getDotType(csp, c); 745 if(dotType!=UCASE_OTHER_ACCENT) { 746 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */ 747 } 748 } 749 750 return FALSE; /* not preceded by I */ 751} 752 753/* Is followed by one or more cc==230 ? */ 754static UBool 755isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 756 UChar32 c; 757 int32_t dotType; 758 int8_t dir; 759 760 if(iter==NULL) { 761 return FALSE; 762 } 763 764 for(dir=1; (c=iter(context, dir))>=0; dir=0) { 765 dotType=getDotType(csp, c); 766 if(dotType==UCASE_ABOVE) { 767 return TRUE; /* at least one cc==230 following */ 768 } else if(dotType!=UCASE_OTHER_ACCENT) { 769 return FALSE; /* next base character, no more cc==230 following */ 770 } 771 } 772 773 return FALSE; /* no more cc==230 following */ 774} 775 776/* Is followed by a dot above (without cc==230 in between) ? */ 777static UBool 778isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 779 UChar32 c; 780 int32_t dotType; 781 int8_t dir; 782 783 if(iter==NULL) { 784 return FALSE; 785 } 786 787 for(dir=1; (c=iter(context, dir))>=0; dir=0) { 788 if(c==0x307) { 789 return TRUE; 790 } 791 dotType=getDotType(csp, c); 792 if(dotType!=UCASE_OTHER_ACCENT) { 793 return FALSE; /* next base character or cc==230 in between */ 794 } 795 } 796 797 return FALSE; /* no dot above following */ 798} 799 800U_CAPI int32_t U_EXPORT2 801ucase_toFullLower(const UCaseProps *csp, UChar32 c, 802 UCaseContextIterator *iter, void *context, 803 const UChar **pString, 804 const char *locale, int32_t *locCache) 805{ 806 UChar32 result=c; 807 uint16_t props=UTRIE2_GET16(&csp->trie, c); 808 if(!PROPS_HAS_EXCEPTION(props)) { 809 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 810 result=c+UCASE_GET_DELTA(props); 811 } 812 } else { 813 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; 814 uint16_t excWord=*pe++; 815 int32_t full; 816 817 pe2=pe; 818 819 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { 820 /* use hardcoded conditions and mappings */ 821 int32_t loc=ucase_getCaseLocale(locale, locCache); 822 823 /* 824 * Test for conditional mappings first 825 * (otherwise the unconditional default mappings are always taken), 826 * then test for characters that have unconditional mappings in SpecialCasing.txt, 827 * then get the UnicodeData.txt mappings. 828 */ 829 if( loc==UCASE_LOC_LITHUANIAN && 830 /* base characters, find accents above */ 831 (((c==0x49 || c==0x4a || c==0x12e) && 832 isFollowedByMoreAbove(csp, iter, context)) || 833 /* precomposed with accent above, no need to find one */ 834 (c==0xcc || c==0xcd || c==0x128)) 835 ) { 836 /* 837 # Lithuanian 838 839 # Lithuanian retains the dot in a lowercase i when followed by accents. 840 841 # Introduce an explicit dot above when lowercasing capital I's and J's 842 # whenever there are more accents above. 843 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 844 845 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 846 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 847 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 848 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 849 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 850 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE 851 */ 852 switch(c) { 853 case 0x49: /* LATIN CAPITAL LETTER I */ 854 *pString=iDot; 855 return 2; 856 case 0x4a: /* LATIN CAPITAL LETTER J */ 857 *pString=jDot; 858 return 2; 859 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ 860 *pString=iOgonekDot; 861 return 2; 862 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ 863 *pString=iDotGrave; 864 return 3; 865 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ 866 *pString=iDotAcute; 867 return 3; 868 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ 869 *pString=iDotTilde; 870 return 3; 871 default: 872 return 0; /* will not occur */ 873 } 874 /* # Turkish and Azeri */ 875 } else if(loc==UCASE_LOC_TURKISH && c==0x130) { 876 /* 877 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 878 # The following rules handle those cases. 879 880 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE 881 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE 882 */ 883 return 0x69; 884 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) { 885 /* 886 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 887 # This matches the behavior of the canonically equivalent I-dot_above 888 889 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 890 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 891 */ 892 return 0; /* remove the dot (continue without output) */ 893 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) { 894 /* 895 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 896 897 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 898 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I 899 */ 900 return 0x131; 901 } else if(c==0x130) { 902 /* 903 # Preserve canonical equivalence for I with dot. Turkic is handled below. 904 905 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE 906 */ 907 *pString=iDot; 908 return 2; 909 } else if( c==0x3a3 && 910 !isFollowedByCasedLetter(csp, iter, context, 1) && 911 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */ 912 ) { 913 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */ 914 /* 915 # Special case for final form of sigma 916 917 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 918 */ 919 return 0x3c2; /* greek small final sigma */ 920 } else { 921 /* no known conditional special case mapping, use a normal mapping */ 922 } 923 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 924 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 925 full&=UCASE_FULL_LOWER; 926 if(full!=0) { 927 /* set the output pointer to the lowercase mapping */ 928 *pString=reinterpret_cast<const UChar *>(pe+1); 929 930 /* return the string length */ 931 return full; 932 } 933 } 934 935 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 936 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); 937 } 938 } 939 940 return (result==c) ? ~result : result; 941} 942 943/* internal */ 944static int32_t 945toUpperOrTitle(const UCaseProps *csp, UChar32 c, 946 UCaseContextIterator *iter, void *context, 947 const UChar **pString, 948 const char *locale, int32_t *locCache, 949 UBool upperNotTitle) { 950 UChar32 result=c; 951 uint16_t props=UTRIE2_GET16(&csp->trie, c); 952 if(!PROPS_HAS_EXCEPTION(props)) { 953 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 954 result=c+UCASE_GET_DELTA(props); 955 } 956 } else { 957 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; 958 uint16_t excWord=*pe++; 959 int32_t full, idx; 960 961 pe2=pe; 962 963 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { 964 /* use hardcoded conditions and mappings */ 965 int32_t loc=ucase_getCaseLocale(locale, locCache); 966 967 if(loc==UCASE_LOC_TURKISH && c==0x69) { 968 /* 969 # Turkish and Azeri 970 971 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 972 # The following rules handle those cases. 973 974 # When uppercasing, i turns into a dotted capital I 975 976 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I 977 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I 978 */ 979 return 0x130; 980 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) { 981 /* 982 # Lithuanian 983 984 # Lithuanian retains the dot in a lowercase i when followed by accents. 985 986 # Remove DOT ABOVE after "i" with upper or titlecase 987 988 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE 989 */ 990 return 0; /* remove the dot (continue without output) */ 991 } else { 992 /* no known conditional special case mapping, use a normal mapping */ 993 } 994 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 995 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 996 997 /* start of full case mapping strings */ 998 ++pe; 999 1000 /* skip the lowercase and case-folding result strings */ 1001 pe+=full&UCASE_FULL_LOWER; 1002 full>>=4; 1003 pe+=full&0xf; 1004 full>>=4; 1005 1006 if(upperNotTitle) { 1007 full&=0xf; 1008 } else { 1009 /* skip the uppercase result string */ 1010 pe+=full&0xf; 1011 full=(full>>4)&0xf; 1012 } 1013 1014 if(full!=0) { 1015 /* set the output pointer to the result string */ 1016 *pString=reinterpret_cast<const UChar *>(pe); 1017 1018 /* return the string length */ 1019 return full; 1020 } 1021 } 1022 1023 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { 1024 idx=UCASE_EXC_TITLE; 1025 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 1026 /* here, titlecase is same as uppercase */ 1027 idx=UCASE_EXC_UPPER; 1028 } else { 1029 return ~c; 1030 } 1031 GET_SLOT_VALUE(excWord, idx, pe2, result); 1032 } 1033 1034 return (result==c) ? ~result : result; 1035} 1036 1037U_CAPI int32_t U_EXPORT2 1038ucase_toFullUpper(const UCaseProps *csp, UChar32 c, 1039 UCaseContextIterator *iter, void *context, 1040 const UChar **pString, 1041 const char *locale, int32_t *locCache) { 1042 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE); 1043} 1044 1045U_CAPI int32_t U_EXPORT2 1046ucase_toFullTitle(const UCaseProps *csp, UChar32 c, 1047 UCaseContextIterator *iter, void *context, 1048 const UChar **pString, 1049 const char *locale, int32_t *locCache) { 1050 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE); 1051} 1052 1053/* case folding ------------------------------------------------------------- */ 1054 1055/* 1056 * Case folding is similar to lowercasing. 1057 * The result may be a simple mapping, i.e., a single code point, or 1058 * a full mapping, i.e., a string. 1059 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping, 1060 * then only the lowercase mapping is stored. 1061 * 1062 * Some special cases are hardcoded because their conditions cannot be 1063 * parsed and processed from CaseFolding.txt. 1064 * 1065 * Unicode 3.2 CaseFolding.txt specifies for its status field: 1066 1067# C: common case folding, common mappings shared by both simple and full mappings. 1068# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. 1069# S: simple case folding, mappings to single characters where different from F. 1070# T: special case for uppercase I and dotted uppercase I 1071# - For non-Turkic languages, this mapping is normally not used. 1072# - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. 1073# 1074# Usage: 1075# A. To do a simple case folding, use the mappings with status C + S. 1076# B. To do a full case folding, use the mappings with status C + F. 1077# 1078# The mappings with status T can be used or omitted depending on the desired case-folding 1079# behavior. (The default option is to exclude them.) 1080 1081 * Unicode 3.2 has 'T' mappings as follows: 1082 10830049; T; 0131; # LATIN CAPITAL LETTER I 10840130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1085 1086 * while the default mappings for these code points are: 1087 10880049; C; 0069; # LATIN CAPITAL LETTER I 10890130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1090 1091 * U+0130 has no simple case folding (simple-case-folds to itself). 1092 */ 1093 1094/* return the simple case folding mapping for c */ 1095U_CAPI UChar32 U_EXPORT2 1096ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) { 1097 uint16_t props=UTRIE2_GET16(&csp->trie, c); 1098 if(!PROPS_HAS_EXCEPTION(props)) { 1099 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 1100 c+=UCASE_GET_DELTA(props); 1101 } 1102 } else { 1103 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 1104 uint16_t excWord=*pe++; 1105 int32_t idx; 1106 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { 1107 /* special case folding mappings, hardcoded */ 1108 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { 1109 /* default mappings */ 1110 if(c==0x49) { 1111 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1112 return 0x69; 1113 } else if(c==0x130) { 1114 /* no simple case folding for U+0130 */ 1115 return c; 1116 } 1117 } else { 1118 /* Turkic mappings */ 1119 if(c==0x49) { 1120 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1121 return 0x131; 1122 } else if(c==0x130) { 1123 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1124 return 0x69; 1125 } 1126 } 1127 } 1128 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { 1129 idx=UCASE_EXC_FOLD; 1130 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1131 idx=UCASE_EXC_LOWER; 1132 } else { 1133 return c; 1134 } 1135 GET_SLOT_VALUE(excWord, idx, pe, c); 1136 } 1137 return c; 1138} 1139 1140/* 1141 * Issue for canonical caseless match (UAX #21): 1142 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve 1143 * canonical equivalence, unlike default-option casefolding. 1144 * For example, I-grave and I + grave fold to strings that are not canonically 1145 * equivalent. 1146 * For more details, see the comment in unorm_compare() in unorm.cpp 1147 * and the intermediate prototype changes for Jitterbug 2021. 1148 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.) 1149 * 1150 * This did not get fixed because it appears that it is not possible to fix 1151 * it for uppercase and lowercase characters (I-grave vs. i-grave) 1152 * together in a way that they still fold to common result strings. 1153 */ 1154 1155U_CAPI int32_t U_EXPORT2 1156ucase_toFullFolding(const UCaseProps *csp, UChar32 c, 1157 const UChar **pString, 1158 uint32_t options) 1159{ 1160 UChar32 result=c; 1161 uint16_t props=UTRIE2_GET16(&csp->trie, c); 1162 if(!PROPS_HAS_EXCEPTION(props)) { 1163 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 1164 result=c+UCASE_GET_DELTA(props); 1165 } 1166 } else { 1167 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; 1168 uint16_t excWord=*pe++; 1169 int32_t full, idx; 1170 1171 pe2=pe; 1172 1173 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { 1174 /* use hardcoded conditions and mappings */ 1175 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { 1176 /* default mappings */ 1177 if(c==0x49) { 1178 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1179 return 0x69; 1180 } else if(c==0x130) { 1181 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1182 *pString=iDot; 1183 return 2; 1184 } 1185 } else { 1186 /* Turkic mappings */ 1187 if(c==0x49) { 1188 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1189 return 0x131; 1190 } else if(c==0x130) { 1191 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1192 return 0x69; 1193 } 1194 } 1195 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 1196 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 1197 1198 /* start of full case mapping strings */ 1199 ++pe; 1200 1201 /* skip the lowercase result string */ 1202 pe+=full&UCASE_FULL_LOWER; 1203 full=(full>>4)&0xf; 1204 1205 if(full!=0) { 1206 /* set the output pointer to the result string */ 1207 *pString=reinterpret_cast<const UChar *>(pe); 1208 1209 /* return the string length */ 1210 return full; 1211 } 1212 } 1213 1214 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { 1215 idx=UCASE_EXC_FOLD; 1216 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1217 idx=UCASE_EXC_LOWER; 1218 } else { 1219 return ~c; 1220 } 1221 GET_SLOT_VALUE(excWord, idx, pe2, result); 1222 } 1223 1224 return (result==c) ? ~result : result; 1225} 1226 1227/* case mapping properties API ---------------------------------------------- */ 1228 1229#define GET_CASE_PROPS() &ucase_props_singleton 1230 1231/* public API (see uchar.h) */ 1232 1233U_CAPI UBool U_EXPORT2 1234u_isULowercase(UChar32 c) { 1235 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c)); 1236} 1237 1238U_CAPI UBool U_EXPORT2 1239u_isUUppercase(UChar32 c) { 1240 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c)); 1241} 1242 1243/* Transforms the Unicode character to its lower case equivalent.*/ 1244U_CAPI UChar32 U_EXPORT2 1245u_tolower(UChar32 c) { 1246 return ucase_tolower(GET_CASE_PROPS(), c); 1247} 1248 1249/* Transforms the Unicode character to its upper case equivalent.*/ 1250U_CAPI UChar32 U_EXPORT2 1251u_toupper(UChar32 c) { 1252 return ucase_toupper(GET_CASE_PROPS(), c); 1253} 1254 1255/* Transforms the Unicode character to its title case equivalent.*/ 1256U_CAPI UChar32 U_EXPORT2 1257u_totitle(UChar32 c) { 1258 return ucase_totitle(GET_CASE_PROPS(), c); 1259} 1260 1261/* return the simple case folding mapping for c */ 1262U_CAPI UChar32 U_EXPORT2 1263u_foldCase(UChar32 c, uint32_t options) { 1264 return ucase_fold(GET_CASE_PROPS(), c, options); 1265} 1266 1267U_CFUNC int32_t U_EXPORT2 1268ucase_hasBinaryProperty(UChar32 c, UProperty which) { 1269 /* case mapping properties */ 1270 const UChar *resultString; 1271 int32_t locCache; 1272 const UCaseProps *csp=GET_CASE_PROPS(); 1273 if(csp==NULL) { 1274 return FALSE; 1275 } 1276 switch(which) { 1277 case UCHAR_LOWERCASE: 1278 return (UBool)(UCASE_LOWER==ucase_getType(csp, c)); 1279 case UCHAR_UPPERCASE: 1280 return (UBool)(UCASE_UPPER==ucase_getType(csp, c)); 1281 case UCHAR_SOFT_DOTTED: 1282 return ucase_isSoftDotted(csp, c); 1283 case UCHAR_CASE_SENSITIVE: 1284 return ucase_isCaseSensitive(csp, c); 1285 case UCHAR_CASED: 1286 return (UBool)(UCASE_NONE!=ucase_getType(csp, c)); 1287 case UCHAR_CASE_IGNORABLE: 1288 return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2); 1289 /* 1290 * Note: The following Changes_When_Xyz are defined as testing whether 1291 * the NFD form of the input changes when Xyz-case-mapped. 1292 * However, this simpler implementation of these properties, 1293 * ignoring NFD, passes the tests. 1294 * The implementation needs to be changed if the tests start failing. 1295 * When that happens, optimizations should be used to work with the 1296 * per-single-code point ucase_toFullXyz() functions unless 1297 * the NFD form has more than one code point, 1298 * and the property starts set needs to be the union of the 1299 * start sets for normalization and case mappings. 1300 */ 1301 case UCHAR_CHANGES_WHEN_LOWERCASED: 1302 locCache=UCASE_LOC_ROOT; 1303 return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1304 case UCHAR_CHANGES_WHEN_UPPERCASED: 1305 locCache=UCASE_LOC_ROOT; 1306 return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1307 case UCHAR_CHANGES_WHEN_TITLECASED: 1308 locCache=UCASE_LOC_ROOT; 1309 return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1310 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */ 1311 case UCHAR_CHANGES_WHEN_CASEMAPPED: 1312 locCache=UCASE_LOC_ROOT; 1313 return (UBool)( 1314 ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 || 1315 ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 || 1316 ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1317 default: 1318 return FALSE; 1319 } 1320} 1321