1/* 2******************************************************************************** 3* Copyright (C) 1996-2010, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************** 6* 7* File UCHAR.C 8* 9* Modification History: 10* 11* Date Name Description 12* 04/02/97 aliu Creation. 13* 4/15/99 Madhu Updated all the function definitions for C Implementation 14* 5/20/99 Madhu Added the function u_getVersion() 15* 8/19/1999 srl Upgraded scripts to Unicode3.0 16* 11/11/1999 weiv added u_isalnum(), cleaned comments 17* 01/11/2000 helena Renamed u_getVersion to u_getUnicodeVersion. 18* 06/20/2000 helena OS/400 port changes; mostly typecast. 19****************************************************************************** 20*/ 21 22#include "unicode/utypes.h" 23#include "unicode/uchar.h" 24#include "unicode/uscript.h" 25#include "unicode/udata.h" 26#include "umutex.h" 27#include "cmemory.h" 28#include "ucln_cmn.h" 29#include "utrie2.h" 30#include "udataswp.h" 31#include "uprops.h" 32 33#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 34 35/* dynamically loaded Unicode character properties -------------------------- */ 36 37#define UCHAR_HARDCODE_DATA 1 38 39#if UCHAR_HARDCODE_DATA 40 41/* uchar_props_data.c is machine-generated by genprops --csource */ 42#include "uchar_props_data.c" 43 44#else 45 46/* 47 * loaded uprops.dat - 48 * for a description of the file format, see icu/source/tools/genprops/store.c 49 */ 50static const char DATA_NAME[] = "uprops"; 51static const char DATA_TYPE[] = "icu"; 52 53static UDataMemory *propsData=NULL; 54static UErrorCode dataErrorCode=U_ZERO_ERROR; 55 56static uint8_t formatVersion[4]={ 0, 0, 0, 0 }; 57static UVersionInfo dataVersion={ 0, 0, 0, 0 }; 58 59static UTrie propsTrie={ 0 }, propsVectorsTrie={ 0 }; 60static const uint32_t *pData32=NULL, *propsVectors=NULL; 61static int32_t countPropsVectors=0, propsVectorsColumns=0; 62 63static int8_t havePropsData=0; /* == 0 -> Data has not been loaded. 64 * < 0 -> Error occured attempting to load data. 65 * > 0 -> Data has been successfully loaded. 66 */ 67 68/* index values loaded from uprops.dat */ 69static int32_t indexes[UPROPS_INDEX_COUNT]; 70 71static UBool U_CALLCONV 72isAcceptable(void *context, 73 const char *type, const char *name, 74 const UDataInfo *pInfo) { 75 if( 76 pInfo->size>=20 && 77 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 78 pInfo->charsetFamily==U_CHARSET_FAMILY && 79 pInfo->dataFormat[0]==0x55 && /* dataFormat="UPro" */ 80 pInfo->dataFormat[1]==0x50 && 81 pInfo->dataFormat[2]==0x72 && 82 pInfo->dataFormat[3]==0x6f && 83 pInfo->formatVersion[0]==4 && 84 pInfo->formatVersion[2]==UTRIE_SHIFT && 85 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT 86 ) { 87 uprv_memcpy(formatVersion, pInfo->formatVersion, 4); 88 uprv_memcpy(dataVersion, pInfo->dataVersion, 4); 89 return TRUE; 90 } else { 91 return FALSE; 92 } 93} 94 95static UBool U_CALLCONV uchar_cleanup(void) 96{ 97 if (propsData) { 98 udata_close(propsData); 99 propsData=NULL; 100 } 101 pData32=NULL; 102 propsVectors=NULL; 103 countPropsVectors=0; 104 uprv_memset(dataVersion, 0, U_MAX_VERSION_LENGTH); 105 dataErrorCode=U_ZERO_ERROR; 106 havePropsData=0; 107 108 return TRUE; 109} 110 111struct UCharProps { 112 UDataMemory *propsData; 113 UTrie propsTrie, propsVectorsTrie; 114 const uint32_t *pData32; 115}; 116typedef struct UCharProps UCharProps; 117 118/* open uprops.icu */ 119static void 120_openProps(UCharProps *ucp, UErrorCode *pErrorCode) { 121 const uint32_t *p; 122 int32_t length; 123 124 ucp->propsData=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); 125 if(U_FAILURE(*pErrorCode)) { 126 return; 127 } 128 129 ucp->pData32=p=(const uint32_t *)udata_getMemory(ucp->propsData); 130 131 /* unserialize the trie; it is directly after the int32_t indexes[UPROPS_INDEX_COUNT] */ 132 length=(int32_t)p[UPROPS_PROPS32_INDEX]*4; 133 length=utrie_unserialize(&ucp->propsTrie, (const uint8_t *)(p+UPROPS_INDEX_COUNT), length-64, pErrorCode); 134 if(U_FAILURE(*pErrorCode)) { 135 return; 136 } 137 138 /* unserialize the properties vectors trie */ 139 length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4; 140 if(length>0) { 141 length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode); 142 } 143 if(length<=0 || U_FAILURE(*pErrorCode)) { 144 /* 145 * length==0: 146 * Allow the properties vectors trie to be missing - 147 * also requires propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX] 148 * to be zero so that this trie is never accessed. 149 */ 150 uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie)); 151 } 152} 153 154#endif 155 156#if !UCHAR_HARDCODE_DATA 157static int8_t 158uprv_loadPropsData(UErrorCode *pErrorCode) { 159 /* load Unicode character properties data from file if necessary */ 160 161 /* 162 * This lazy intialization with double-checked locking (without mutex protection for 163 * haveNormData==0) is transiently unsafe under certain circumstances. 164 * Check the readme and use u_init() if necessary. 165 */ 166 if(havePropsData==0) { 167 UCharProps ucp={ NULL }; 168 169 if(U_FAILURE(*pErrorCode)) { 170 return havePropsData; 171 } 172 173 /* open the data outside the mutex block */ 174 _openProps(&ucp, pErrorCode); 175 176 if(U_SUCCESS(*pErrorCode)) { 177 /* in the mutex block, set the data for this process */ 178 umtx_lock(NULL); 179 if(propsData==NULL) { 180 propsData=ucp.propsData; 181 ucp.propsData=NULL; 182 pData32=ucp.pData32; 183 ucp.pData32=NULL; 184 uprv_memcpy(&propsTrie, &ucp.propsTrie, sizeof(propsTrie)); 185 uprv_memcpy(&propsVectorsTrie, &ucp.propsVectorsTrie, sizeof(propsVectorsTrie)); 186 } 187 188 /* initialize some variables */ 189 uprv_memcpy(indexes, pData32, sizeof(indexes)); 190 191 /* additional properties */ 192 if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) { 193 propsVectors=pData32+indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]; 194 countPropsVectors=indexes[UPROPS_RESERVED_INDEX]-indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]; 195 propsVectorsColumns=indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]; 196 } 197 198 havePropsData=1; 199 umtx_unlock(NULL); 200 } else { 201 dataErrorCode=*pErrorCode; 202 havePropsData=-1; 203 } 204 ucln_common_registerCleanup(UCLN_COMMON_UCHAR, uchar_cleanup); 205 206 /* if a different thread set it first, then close the extra data */ 207 udata_close(ucp.propsData); /* NULL if it was set correctly */ 208 } 209 210 return havePropsData; 211} 212 213static int8_t 214loadPropsData(void) { 215 UErrorCode errorCode = U_ZERO_ERROR; 216 int8_t retVal = uprv_loadPropsData(&errorCode); 217 return retVal; 218} 219 220#endif 221 222/* constants and macros for access to the data ------------------------------ */ 223 224/* getting a uint32_t properties word from the data */ 225#if UCHAR_HARDCODE_DATA 226 227#define GET_PROPS(c, result) ((result)=UTRIE2_GET16(&propsTrie, c)); 228 229#else 230 231#define HAVE_DATA (havePropsData>0 || loadPropsData()>0) 232#define GET_PROPS_UNSAFE(c, result) \ 233 UTRIE_GET16(&propsTrie, c, result); 234#define GET_PROPS(c, result) \ 235 if(HAVE_DATA) { \ 236 GET_PROPS_UNSAFE(c, result); \ 237 } else { \ 238 (result)=0; \ 239 } 240 241#endif 242 243U_CFUNC UBool 244uprv_haveProperties(UErrorCode *pErrorCode) { 245 if(U_FAILURE(*pErrorCode)) { 246 return FALSE; 247 } 248#if !UCHAR_HARDCODE_DATA 249 if(havePropsData==0) { 250 uprv_loadPropsData(pErrorCode); 251 } 252 if(havePropsData<0) { 253 *pErrorCode=dataErrorCode; 254 return FALSE; 255 } 256#endif 257 return TRUE; 258} 259 260/* API functions ------------------------------------------------------------ */ 261 262/* Gets the Unicode character's general category.*/ 263U_CAPI int8_t U_EXPORT2 264u_charType(UChar32 c) { 265 uint32_t props; 266 GET_PROPS(c, props); 267 return (int8_t)GET_CATEGORY(props); 268} 269 270/* Enumerate all code points with their general categories. */ 271struct _EnumTypeCallback { 272 UCharEnumTypeRange *enumRange; 273 const void *context; 274}; 275 276static uint32_t U_CALLCONV 277_enumTypeValue(const void *context, uint32_t value) { 278 return GET_CATEGORY(value); 279} 280 281static UBool U_CALLCONV 282_enumTypeRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 283 /* just cast the value to UCharCategory */ 284 return ((struct _EnumTypeCallback *)context)-> 285 enumRange(((struct _EnumTypeCallback *)context)->context, 286 start, end+1, (UCharCategory)value); 287} 288 289U_CAPI void U_EXPORT2 290u_enumCharTypes(UCharEnumTypeRange *enumRange, const void *context) { 291 struct _EnumTypeCallback callback; 292 293 if(enumRange==NULL 294#if !UCHAR_HARDCODE_DATA 295 || !HAVE_DATA 296#endif 297 ) { 298 return; 299 } 300 301 callback.enumRange=enumRange; 302 callback.context=context; 303 utrie2_enum(&propsTrie, _enumTypeValue, _enumTypeRange, &callback); 304} 305 306/* Checks if ch is a lower case letter.*/ 307U_CAPI UBool U_EXPORT2 308u_islower(UChar32 c) { 309 uint32_t props; 310 GET_PROPS(c, props); 311 return (UBool)(GET_CATEGORY(props)==U_LOWERCASE_LETTER); 312} 313 314/* Checks if ch is an upper case letter.*/ 315U_CAPI UBool U_EXPORT2 316u_isupper(UChar32 c) { 317 uint32_t props; 318 GET_PROPS(c, props); 319 return (UBool)(GET_CATEGORY(props)==U_UPPERCASE_LETTER); 320} 321 322/* Checks if ch is a title case letter; usually upper case letters.*/ 323U_CAPI UBool U_EXPORT2 324u_istitle(UChar32 c) { 325 uint32_t props; 326 GET_PROPS(c, props); 327 return (UBool)(GET_CATEGORY(props)==U_TITLECASE_LETTER); 328} 329 330/* Checks if ch is a decimal digit. */ 331U_CAPI UBool U_EXPORT2 332u_isdigit(UChar32 c) { 333 uint32_t props; 334 GET_PROPS(c, props); 335 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER); 336} 337 338U_CAPI UBool U_EXPORT2 339u_isxdigit(UChar32 c) { 340 uint32_t props; 341 342 /* check ASCII and Fullwidth ASCII a-fA-F */ 343 if( 344 (c<=0x66 && c>=0x41 && (c<=0x46 || c>=0x61)) || 345 (c>=0xff21 && c<=0xff46 && (c<=0xff26 || c>=0xff41)) 346 ) { 347 return TRUE; 348 } 349 350 GET_PROPS(c, props); 351 return (UBool)(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER); 352} 353 354/* Checks if the Unicode character is a letter.*/ 355U_CAPI UBool U_EXPORT2 356u_isalpha(UChar32 c) { 357 uint32_t props; 358 GET_PROPS(c, props); 359 return (UBool)((CAT_MASK(props)&U_GC_L_MASK)!=0); 360} 361 362U_CAPI UBool U_EXPORT2 363u_isUAlphabetic(UChar32 c) { 364 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_ALPHABETIC))!=0; 365} 366 367/* Checks if c is a letter or a decimal digit */ 368U_CAPI UBool U_EXPORT2 369u_isalnum(UChar32 c) { 370 uint32_t props; 371 GET_PROPS(c, props); 372 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_ND_MASK))!=0); 373} 374 375/** 376 * Checks if c is alphabetic, or a decimal digit; implements UCHAR_POSIX_ALNUM. 377 * @internal 378 */ 379U_CFUNC UBool 380u_isalnumPOSIX(UChar32 c) { 381 return (UBool)(u_isUAlphabetic(c) || u_isdigit(c)); 382} 383 384/* Checks if ch is a unicode character with assigned character type.*/ 385U_CAPI UBool U_EXPORT2 386u_isdefined(UChar32 c) { 387 uint32_t props; 388 GET_PROPS(c, props); 389 return (UBool)(GET_CATEGORY(props)!=0); 390} 391 392/* Checks if the Unicode character is a base form character that can take a diacritic.*/ 393U_CAPI UBool U_EXPORT2 394u_isbase(UChar32 c) { 395 uint32_t props; 396 GET_PROPS(c, props); 397 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_N_MASK|U_GC_MC_MASK|U_GC_ME_MASK))!=0); 398} 399 400/* Checks if the Unicode character is a control character.*/ 401U_CAPI UBool U_EXPORT2 402u_iscntrl(UChar32 c) { 403 uint32_t props; 404 GET_PROPS(c, props); 405 return (UBool)((CAT_MASK(props)&(U_GC_CC_MASK|U_GC_CF_MASK|U_GC_ZL_MASK|U_GC_ZP_MASK))!=0); 406} 407 408U_CAPI UBool U_EXPORT2 409u_isISOControl(UChar32 c) { 410 return (uint32_t)c<=0x9f && (c<=0x1f || c>=0x7f); 411} 412 413/* Some control characters that are used as space. */ 414#define IS_THAT_CONTROL_SPACE(c) \ 415 (c<=0x9f && ((c>=TAB && c<=CR) || (c>=0x1c && c <=0x1f) || c==NL)) 416 417/* Java has decided that U+0085 New Line is not whitespace any more. */ 418#define IS_THAT_ASCII_CONTROL_SPACE(c) \ 419 (c<=0x1f && c>=TAB && (c<=CR || c>=0x1c)) 420 421/* Checks if the Unicode character is a space character.*/ 422U_CAPI UBool U_EXPORT2 423u_isspace(UChar32 c) { 424 uint32_t props; 425 GET_PROPS(c, props); 426 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0 || IS_THAT_CONTROL_SPACE(c)); 427} 428 429U_CAPI UBool U_EXPORT2 430u_isJavaSpaceChar(UChar32 c) { 431 uint32_t props; 432 GET_PROPS(c, props); 433 return (UBool)((CAT_MASK(props)&U_GC_Z_MASK)!=0); 434} 435 436/* Checks if the Unicode character is a whitespace character.*/ 437U_CAPI UBool U_EXPORT2 438u_isWhitespace(UChar32 c) { 439 uint32_t props; 440 GET_PROPS(c, props); 441 return (UBool)( 442 ((CAT_MASK(props)&U_GC_Z_MASK)!=0 && 443 c!=NBSP && c!=FIGURESP && c!=NNBSP) || /* exclude no-break spaces */ 444 IS_THAT_ASCII_CONTROL_SPACE(c) 445 ); 446} 447 448U_CAPI UBool U_EXPORT2 449u_isblank(UChar32 c) { 450 if((uint32_t)c<=0x9f) { 451 return c==9 || c==0x20; /* TAB or SPACE */ 452 } else { 453 /* Zs */ 454 uint32_t props; 455 GET_PROPS(c, props); 456 return (UBool)(GET_CATEGORY(props)==U_SPACE_SEPARATOR); 457 } 458} 459 460U_CAPI UBool U_EXPORT2 461u_isUWhiteSpace(UChar32 c) { 462 return (u_getUnicodeProperties(c, 1)&U_MASK(UPROPS_WHITE_SPACE))!=0; 463} 464 465/* Checks if the Unicode character is printable.*/ 466U_CAPI UBool U_EXPORT2 467u_isprint(UChar32 c) { 468 uint32_t props; 469 GET_PROPS(c, props); 470 /* comparing ==0 returns FALSE for the categories mentioned */ 471 return (UBool)((CAT_MASK(props)&U_GC_C_MASK)==0); 472} 473 474/** 475 * Checks if c is in \p{graph}\p{blank} - \p{cntrl}. 476 * Implements UCHAR_POSIX_PRINT. 477 * @internal 478 */ 479U_CFUNC UBool 480u_isprintPOSIX(UChar32 c) { 481 uint32_t props; 482 GET_PROPS(c, props); 483 /* 484 * The only cntrl character in graph+blank is TAB (in blank). 485 * Here we implement (blank-TAB)=Zs instead of calling u_isblank(). 486 */ 487 return (UBool)((GET_CATEGORY(props)==U_SPACE_SEPARATOR) || u_isgraphPOSIX(c)); 488} 489 490U_CAPI UBool U_EXPORT2 491u_isgraph(UChar32 c) { 492 uint32_t props; 493 GET_PROPS(c, props); 494 /* comparing ==0 returns FALSE for the categories mentioned */ 495 return (UBool)((CAT_MASK(props)& 496 (U_GC_CC_MASK|U_GC_CF_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK)) 497 ==0); 498} 499 500/** 501 * Checks if c is in 502 * [^\p{space}\p{gc=Control}\p{gc=Surrogate}\p{gc=Unassigned}] 503 * with space=\p{Whitespace} and Control=Cc. 504 * Implements UCHAR_POSIX_GRAPH. 505 * @internal 506 */ 507U_CFUNC UBool 508u_isgraphPOSIX(UChar32 c) { 509 uint32_t props; 510 GET_PROPS(c, props); 511 /* \p{space}\p{gc=Control} == \p{gc=Z}\p{Control} */ 512 /* comparing ==0 returns FALSE for the categories mentioned */ 513 return (UBool)((CAT_MASK(props)& 514 (U_GC_CC_MASK|U_GC_CS_MASK|U_GC_CN_MASK|U_GC_Z_MASK)) 515 ==0); 516} 517 518U_CAPI UBool U_EXPORT2 519u_ispunct(UChar32 c) { 520 uint32_t props; 521 GET_PROPS(c, props); 522 return (UBool)((CAT_MASK(props)&U_GC_P_MASK)!=0); 523} 524 525/* Checks if the Unicode character can start a Unicode identifier.*/ 526U_CAPI UBool U_EXPORT2 527u_isIDStart(UChar32 c) { 528 /* same as u_isalpha() */ 529 uint32_t props; 530 GET_PROPS(c, props); 531 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_NL_MASK))!=0); 532} 533 534/* Checks if the Unicode character can be a Unicode identifier part other than starting the 535 identifier.*/ 536U_CAPI UBool U_EXPORT2 537u_isIDPart(UChar32 c) { 538 uint32_t props; 539 GET_PROPS(c, props); 540 return (UBool)( 541 (CAT_MASK(props)& 542 (U_GC_ND_MASK|U_GC_NL_MASK| 543 U_GC_L_MASK| 544 U_GC_PC_MASK|U_GC_MC_MASK|U_GC_MN_MASK) 545 )!=0 || 546 u_isIDIgnorable(c)); 547} 548 549/*Checks if the Unicode character can be ignorable in a Java or Unicode identifier.*/ 550U_CAPI UBool U_EXPORT2 551u_isIDIgnorable(UChar32 c) { 552 if(c<=0x9f) { 553 return u_isISOControl(c) && !IS_THAT_ASCII_CONTROL_SPACE(c); 554 } else { 555 uint32_t props; 556 GET_PROPS(c, props); 557 return (UBool)(GET_CATEGORY(props)==U_FORMAT_CHAR); 558 } 559} 560 561/*Checks if the Unicode character can start a Java identifier.*/ 562U_CAPI UBool U_EXPORT2 563u_isJavaIDStart(UChar32 c) { 564 uint32_t props; 565 GET_PROPS(c, props); 566 return (UBool)((CAT_MASK(props)&(U_GC_L_MASK|U_GC_SC_MASK|U_GC_PC_MASK))!=0); 567} 568 569/*Checks if the Unicode character can be a Java identifier part other than starting the 570 * identifier. 571 */ 572U_CAPI UBool U_EXPORT2 573u_isJavaIDPart(UChar32 c) { 574 uint32_t props; 575 GET_PROPS(c, props); 576 return (UBool)( 577 (CAT_MASK(props)& 578 (U_GC_ND_MASK|U_GC_NL_MASK| 579 U_GC_L_MASK| 580 U_GC_SC_MASK|U_GC_PC_MASK| 581 U_GC_MC_MASK|U_GC_MN_MASK) 582 )!=0 || 583 u_isIDIgnorable(c)); 584} 585 586U_CAPI int32_t U_EXPORT2 587u_charDigitValue(UChar32 c) { 588 uint32_t props; 589 int32_t value; 590 GET_PROPS(c, props); 591 value=(int32_t)GET_NUMERIC_TYPE_VALUE(props)-UPROPS_NTV_DECIMAL_START; 592 if(value<=9) { 593 return value; 594 } else { 595 return -1; 596 } 597} 598 599U_CAPI double U_EXPORT2 600u_getNumericValue(UChar32 c) { 601 uint32_t props; 602 int32_t ntv; 603 GET_PROPS(c, props); 604 ntv=(int32_t)GET_NUMERIC_TYPE_VALUE(props); 605 606 if(ntv==UPROPS_NTV_NONE) { 607 return U_NO_NUMERIC_VALUE; 608 } else if(ntv<UPROPS_NTV_DIGIT_START) { 609 /* decimal digit */ 610 return ntv-UPROPS_NTV_DECIMAL_START; 611 } else if(ntv<UPROPS_NTV_NUMERIC_START) { 612 /* other digit */ 613 return ntv-UPROPS_NTV_DIGIT_START; 614 } else if(ntv<UPROPS_NTV_FRACTION_START) { 615 /* small integer */ 616 return ntv-UPROPS_NTV_NUMERIC_START; 617 } else if(ntv<UPROPS_NTV_LARGE_START) { 618 /* fraction */ 619 int32_t numerator=(ntv>>4)-12; 620 int32_t denominator=(ntv&0xf)+1; 621 return (double)numerator/denominator; 622 } else if(ntv<UPROPS_NTV_RESERVED_START) { 623 /* large, single-significant-digit integer */ 624 double numValue; 625 int32_t mant=(ntv>>5)-14; 626 int32_t exp=(ntv&0x1f)+2; 627 numValue=mant; 628 629 /* multiply by 10^exp without math.h */ 630 while(exp>=4) { 631 numValue*=10000.; 632 exp-=4; 633 } 634 switch(exp) { 635 case 3: 636 numValue*=1000.; 637 break; 638 case 2: 639 numValue*=100.; 640 break; 641 case 1: 642 numValue*=10.; 643 break; 644 case 0: 645 default: 646 break; 647 } 648 649 return numValue; 650 } else { 651 /* reserved */ 652 return U_NO_NUMERIC_VALUE; 653 } 654} 655 656U_CAPI int32_t U_EXPORT2 657u_digit(UChar32 ch, int8_t radix) { 658 int8_t value; 659 if((uint8_t)(radix-2)<=(36-2)) { 660 value=(int8_t)u_charDigitValue(ch); 661 if(value<0) { 662 /* ch is not a decimal digit, try latin letters */ 663 if(ch>=0x61 && ch<=0x7A) { 664 value=(int8_t)(ch-0x57); /* ch - 'a' + 10 */ 665 } else if(ch>=0x41 && ch<=0x5A) { 666 value=(int8_t)(ch-0x37); /* ch - 'A' + 10 */ 667 } else if(ch>=0xFF41 && ch<=0xFF5A) { 668 value=(int8_t)(ch-0xFF37); /* fullwidth ASCII a-z */ 669 } else if(ch>=0xFF21 && ch<=0xFF3A) { 670 value=(int8_t)(ch-0xFF17); /* fullwidth ASCII A-Z */ 671 } 672 } 673 } else { 674 value=-1; /* invalid radix */ 675 } 676 return (int8_t)((value<radix) ? value : -1); 677} 678 679U_CAPI UChar32 U_EXPORT2 680u_forDigit(int32_t digit, int8_t radix) { 681 if((uint8_t)(radix-2)>(36-2) || (uint32_t)digit>=(uint32_t)radix) { 682 return 0; 683 } else if(digit<10) { 684 return (UChar32)(0x30+digit); 685 } else { 686 return (UChar32)((0x61-10)+digit); 687 } 688} 689 690/* miscellaneous, and support for uprops.c ---------------------------------- */ 691 692U_CAPI void U_EXPORT2 693u_getUnicodeVersion(UVersionInfo versionArray) { 694 if(versionArray!=NULL) { 695 uprv_memcpy(versionArray, dataVersion, U_MAX_VERSION_LENGTH); 696 } 697} 698 699U_CFUNC uint32_t 700u_getUnicodeProperties(UChar32 c, int32_t column) { 701 uint16_t vecIndex; 702 703 if(column==-1) { 704 uint32_t props; 705 GET_PROPS(c, props); 706 return props; 707 } else if( 708#if !UCHAR_HARDCODE_DATA 709 !HAVE_DATA || countPropsVectors==0 || 710#endif 711 column<0 || column>=propsVectorsColumns 712 ) { 713 return 0; 714 } else { 715 vecIndex=UTRIE2_GET16(&propsVectorsTrie, c); 716 return propsVectors[vecIndex+column]; 717 } 718} 719 720U_CFUNC int32_t 721uprv_getMaxValues(int32_t column) { 722#if !UCHAR_HARDCODE_DATA 723 if(HAVE_DATA) { 724#endif 725 switch(column) { 726 case 0: 727 return indexes[UPROPS_MAX_VALUES_INDEX]; 728 case 2: 729 return indexes[UPROPS_MAX_VALUES_2_INDEX]; 730 default: 731 return 0; 732 } 733#if !UCHAR_HARDCODE_DATA 734 } else { 735 return 0; 736 } 737#endif 738} 739 740U_CAPI void U_EXPORT2 741u_charAge(UChar32 c, UVersionInfo versionArray) { 742 if(versionArray!=NULL) { 743 uint32_t version=u_getUnicodeProperties(c, 0)>>UPROPS_AGE_SHIFT; 744 versionArray[0]=(uint8_t)(version>>4); 745 versionArray[1]=(uint8_t)(version&0xf); 746 versionArray[2]=versionArray[3]=0; 747 } 748} 749 750U_CAPI UScriptCode U_EXPORT2 751uscript_getScript(UChar32 c, UErrorCode *pErrorCode) { 752 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 753 return USCRIPT_INVALID_CODE; 754 } 755 if((uint32_t)c>0x10ffff) { 756 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 757 return USCRIPT_INVALID_CODE; 758 } 759 760 return (UScriptCode)(u_getUnicodeProperties(c, 0)&UPROPS_SCRIPT_MASK); 761} 762 763U_CAPI UBlockCode U_EXPORT2 764ublock_getCode(UChar32 c) { 765 return (UBlockCode)((u_getUnicodeProperties(c, 0)&UPROPS_BLOCK_MASK)>>UPROPS_BLOCK_SHIFT); 766} 767 768/* property starts for UnicodeSet ------------------------------------------- */ 769 770static UBool U_CALLCONV 771_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 772 /* add the start code point to the USet */ 773 const USetAdder *sa=(const USetAdder *)context; 774 sa->add(sa->set, start); 775 return TRUE; 776} 777 778#define USET_ADD_CP_AND_NEXT(sa, cp) sa->add(sa->set, cp); sa->add(sa->set, cp+1) 779 780U_CFUNC void U_EXPORT2 781uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 782 if(U_FAILURE(*pErrorCode)) { 783 return; 784 } 785 786#if !UCHAR_HARDCODE_DATA 787 if(!HAVE_DATA) { 788 *pErrorCode=dataErrorCode; 789 return; 790 } 791#endif 792 793 /* add the start code point of each same-value range of the main trie */ 794 utrie2_enum(&propsTrie, NULL, _enumPropertyStartsRange, sa); 795 796 /* add code points with hardcoded properties, plus the ones following them */ 797 798 /* add for u_isblank() */ 799 USET_ADD_CP_AND_NEXT(sa, TAB); 800 801 /* add for IS_THAT_CONTROL_SPACE() */ 802 sa->add(sa->set, CR+1); /* range TAB..CR */ 803 sa->add(sa->set, 0x1c); 804 sa->add(sa->set, 0x1f+1); 805 USET_ADD_CP_AND_NEXT(sa, NL); 806 807 /* add for u_isIDIgnorable() what was not added above */ 808 sa->add(sa->set, DEL); /* range DEL..NBSP-1, NBSP added below */ 809 sa->add(sa->set, HAIRSP); 810 sa->add(sa->set, RLM+1); 811 sa->add(sa->set, INHSWAP); 812 sa->add(sa->set, NOMDIG+1); 813 USET_ADD_CP_AND_NEXT(sa, ZWNBSP); 814 815 /* add no-break spaces for u_isWhitespace() what was not added above */ 816 USET_ADD_CP_AND_NEXT(sa, NBSP); 817 USET_ADD_CP_AND_NEXT(sa, FIGURESP); 818 USET_ADD_CP_AND_NEXT(sa, NNBSP); 819 820 /* add for u_digit() */ 821 sa->add(sa->set, U_a); 822 sa->add(sa->set, U_z+1); 823 sa->add(sa->set, U_A); 824 sa->add(sa->set, U_Z+1); 825 sa->add(sa->set, U_FW_a); 826 sa->add(sa->set, U_FW_z+1); 827 sa->add(sa->set, U_FW_A); 828 sa->add(sa->set, U_FW_Z+1); 829 830 /* add for u_isxdigit() */ 831 sa->add(sa->set, U_f+1); 832 sa->add(sa->set, U_F+1); 833 sa->add(sa->set, U_FW_f+1); 834 sa->add(sa->set, U_FW_F+1); 835 836 /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ 837 sa->add(sa->set, WJ); /* range WJ..NOMDIG */ 838 sa->add(sa->set, 0xfff0); 839 sa->add(sa->set, 0xfffb+1); 840 sa->add(sa->set, 0xe0000); 841 sa->add(sa->set, 0xe0fff+1); 842 843 /* add for UCHAR_GRAPHEME_BASE and others */ 844 USET_ADD_CP_AND_NEXT(sa, CGJ); 845} 846 847U_CFUNC void U_EXPORT2 848upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 849 if(U_FAILURE(*pErrorCode)) { 850 return; 851 } 852 853#if !UCHAR_HARDCODE_DATA 854 if(!HAVE_DATA) { 855 *pErrorCode=dataErrorCode; 856 return; 857 } 858#endif 859 860 /* add the start code point of each same-value range of the properties vectors trie */ 861 if(propsVectorsColumns>0) { 862 /* if propsVectorsColumns==0 then the properties vectors trie may not be there at all */ 863 utrie2_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, sa); 864 } 865} 866