1/******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2013, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6/******************************************************************************* 7* 8* File CUCDTST.C 9* 10* Modification History: 11* Name Description 12* Madhu Katragadda Ported for C API, added tests for string functions 13******************************************************************************** 14*/ 15 16#include <string.h> 17#include <math.h> 18#include <stdlib.h> 19 20#include "unicode/utypes.h" 21#include "unicode/uchar.h" 22#include "unicode/putil.h" 23#include "unicode/ustring.h" 24#include "unicode/uloc.h" 25#include "unicode/unorm2.h" 26 27#include "cintltst.h" 28#include "putilimp.h" 29#include "uparse.h" 30#include "ucase.h" 31#include "ubidi_props.h" 32#include "uprops.h" 33#include "uset_imp.h" 34#include "usc_impl.h" 35#include "udatamem.h" /* for testing ucase_openBinary() */ 36#include "cucdapi.h" 37 38#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 39 40/* prototypes --------------------------------------------------------------- */ 41 42static void TestUpperLower(void); 43static void TestLetterNumber(void); 44static void TestMisc(void); 45static void TestPOSIX(void); 46static void TestControlPrint(void); 47static void TestIdentifier(void); 48static void TestUnicodeData(void); 49static void TestCodeUnit(void); 50static void TestCodePoint(void); 51static void TestCharLength(void); 52static void TestCharNames(void); 53static void TestMirroring(void); 54static void TestUScriptRunAPI(void); 55static void TestAdditionalProperties(void); 56static void TestNumericProperties(void); 57static void TestPropertyNames(void); 58static void TestPropertyValues(void); 59static void TestConsistency(void); 60static void TestUCase(void); 61static void TestUBiDiProps(void); 62static void TestCaseFolding(void); 63 64/* internal methods used */ 65static int32_t MakeProp(char* str); 66static int32_t MakeDir(char* str); 67 68/* helpers ------------------------------------------------------------------ */ 69 70static void 71parseUCDFile(const char *filename, 72 char *fields[][2], int32_t fieldCount, 73 UParseLineFn *lineFn, void *context, 74 UErrorCode *pErrorCode) { 75 char path[256]; 76 char backupPath[256]; 77 78 if(U_FAILURE(*pErrorCode)) { 79 return; 80 } 81 82 /* Look inside ICU_DATA first */ 83 strcpy(path, u_getDataDirectory()); 84 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING); 85 strcat(path, filename); 86 87 /* As a fallback, try to guess where the source data was located 88 * at the time ICU was built, and look there. 89 */ 90 strcpy(backupPath, ctest_dataSrcDir()); 91 strcat(backupPath, U_FILE_SEP_STRING); 92 strcat(backupPath, "unidata" U_FILE_SEP_STRING); 93 strcat(backupPath, filename); 94 95 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode); 96 if(*pErrorCode==U_FILE_ACCESS_ERROR) { 97 *pErrorCode=U_ZERO_ERROR; 98 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode); 99 } 100 if(U_FAILURE(*pErrorCode)) { 101 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode)); 102 } 103} 104 105/* test data ---------------------------------------------------------------- */ 106 107static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD; 108static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf"; 109static const int32_t tagValues[] = 110 { 111 /* Mn */ U_NON_SPACING_MARK, 112 /* Mc */ U_COMBINING_SPACING_MARK, 113 /* Me */ U_ENCLOSING_MARK, 114 /* Nd */ U_DECIMAL_DIGIT_NUMBER, 115 /* Nl */ U_LETTER_NUMBER, 116 /* No */ U_OTHER_NUMBER, 117 /* Zs */ U_SPACE_SEPARATOR, 118 /* Zl */ U_LINE_SEPARATOR, 119 /* Zp */ U_PARAGRAPH_SEPARATOR, 120 /* Cc */ U_CONTROL_CHAR, 121 /* Cf */ U_FORMAT_CHAR, 122 /* Cs */ U_SURROGATE, 123 /* Co */ U_PRIVATE_USE_CHAR, 124 /* Cn */ U_UNASSIGNED, 125 /* Lu */ U_UPPERCASE_LETTER, 126 /* Ll */ U_LOWERCASE_LETTER, 127 /* Lt */ U_TITLECASE_LETTER, 128 /* Lm */ U_MODIFIER_LETTER, 129 /* Lo */ U_OTHER_LETTER, 130 /* Pc */ U_CONNECTOR_PUNCTUATION, 131 /* Pd */ U_DASH_PUNCTUATION, 132 /* Ps */ U_START_PUNCTUATION, 133 /* Pe */ U_END_PUNCTUATION, 134 /* Po */ U_OTHER_PUNCTUATION, 135 /* Sm */ U_MATH_SYMBOL, 136 /* Sc */ U_CURRENCY_SYMBOL, 137 /* Sk */ U_MODIFIER_SYMBOL, 138 /* So */ U_OTHER_SYMBOL, 139 /* Pi */ U_INITIAL_PUNCTUATION, 140 /* Pf */ U_FINAL_PUNCTUATION 141 }; 142 143static const char dirStrings[][5] = { 144 "L", 145 "R", 146 "EN", 147 "ES", 148 "ET", 149 "AN", 150 "CS", 151 "B", 152 "S", 153 "WS", 154 "ON", 155 "LRE", 156 "LRO", 157 "AL", 158 "RLE", 159 "RLO", 160 "PDF", 161 "NSM", 162 "BN", 163 /* new in Unicode 6.3/ICU 52 */ 164 "FSI", 165 "LRI", 166 "RLI", 167 "PDI" 168}; 169 170void addUnicodeTest(TestNode** root); 171 172void addUnicodeTest(TestNode** root) 173{ 174 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit"); 175 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint"); 176 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength"); 177 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues"); 178 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData"); 179 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties"); 180 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties"); 181 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower"); 182 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber"); 183 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc"); 184 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX"); 185 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint"); 186 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier"); 187 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames"); 188 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring"); 189 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI"); 190 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript"); 191 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions"); 192 addTest(root, &TestScriptMetadataAPI, "tsutil/cucdtst/TestScriptMetadataAPI"); 193 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI"); 194 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames"); 195 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues"); 196 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency"); 197 addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase"); 198 addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps"); 199 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding"); 200} 201 202/*==================================================== */ 203/* test u_toupper() and u_tolower() */ 204/*==================================================== */ 205static void TestUpperLower() 206{ 207 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000}; 208 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000}; 209 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21); 210 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21); 211 int32_t i; 212 213 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21); 214 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21); 215 216/* 217Checks LetterLike Symbols which were previously a source of confusion 218[Bertrand A. D. 02/04/98] 219*/ 220 for (i=0x2100;i<0x2138;i++) 221 { 222 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */ 223 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132) 224 { 225 if (i != (int)u_tolower(i)) /* itself */ 226 log_err("Failed case conversion with itself: U+%04x\n", i); 227 if (i != (int)u_toupper(i)) 228 log_err("Failed case conversion with itself: U+%04x\n", i); 229 } 230 } 231 232 for(i=0; i < u_strlen(upper); i++){ 233 if(u_tolower(upper[i]) != lower[i]){ 234 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i])); 235 } 236 } 237 238 log_verbose("testing upper lower\n"); 239 for (i = 0; i < 21; i++) { 240 241 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i])) 242 { 243 log_err("Failed isLowerCase test at %c\n", upperTest[i]); 244 } 245 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i])) 246 { 247 log_err("Failed isUpperCase test at %c\n", lowerTest[i]); 248 } 249 else if (upperTest[i] != u_tolower(lowerTest[i])) 250 { 251 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]); 252 } 253 else if (lowerTest[i] != u_toupper(upperTest[i])) 254 { 255 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]); 256 } 257 else if (upperTest[i] != u_tolower(upperTest[i])) 258 { 259 log_err("Failed case conversion with itself: %c\n", upperTest[i]); 260 } 261 else if (lowerTest[i] != u_toupper(lowerTest[i])) 262 { 263 log_err("Failed case conversion with itself: %c\n", lowerTest[i]); 264 } 265 } 266 log_verbose("done testing upper lower\n"); 267 268 log_verbose("testing u_istitle\n"); 269 { 270 static const UChar expected[] = { 271 0x1F88, 272 0x1F89, 273 0x1F8A, 274 0x1F8B, 275 0x1F8C, 276 0x1F8D, 277 0x1F8E, 278 0x1F8F, 279 0x1F88, 280 0x1F89, 281 0x1F8A, 282 0x1F8B, 283 0x1F8C, 284 0x1F8D, 285 0x1F8E, 286 0x1F8F, 287 0x1F98, 288 0x1F99, 289 0x1F9A, 290 0x1F9B, 291 0x1F9C, 292 0x1F9D, 293 0x1F9E, 294 0x1F9F, 295 0x1F98, 296 0x1F99, 297 0x1F9A, 298 0x1F9B, 299 0x1F9C, 300 0x1F9D, 301 0x1F9E, 302 0x1F9F, 303 0x1FA8, 304 0x1FA9, 305 0x1FAA, 306 0x1FAB, 307 0x1FAC, 308 0x1FAD, 309 0x1FAE, 310 0x1FAF, 311 0x1FA8, 312 0x1FA9, 313 0x1FAA, 314 0x1FAB, 315 0x1FAC, 316 0x1FAD, 317 0x1FAE, 318 0x1FAF, 319 0x1FBC, 320 0x1FBC, 321 0x1FCC, 322 0x1FCC, 323 0x1FFC, 324 0x1FFC, 325 }; 326 int32_t num = sizeof(expected)/sizeof(expected[0]); 327 for(i=0; i<num; i++){ 328 if(!u_istitle(expected[i])){ 329 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]); 330 } 331 } 332 333 } 334} 335 336/* compare two sets and verify that their difference or intersection is empty */ 337static UBool 338showADiffB(const USet *a, const USet *b, 339 const char *a_name, const char *b_name, 340 UBool expect, UBool diffIsError) { 341 USet *aa; 342 int32_t i, start, end, length; 343 UErrorCode errorCode; 344 345 /* 346 * expect: 347 * TRUE -> a-b should be empty, that is, b should contain all of a 348 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa) 349 */ 350 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) { 351 return TRUE; 352 } 353 354 /* clone a to aa because a is const */ 355 aa=uset_open(1, 0); 356 if(aa==NULL) { 357 /* unusual problem - out of memory? */ 358 return FALSE; 359 } 360 uset_addAll(aa, a); 361 362 /* compute the set in question */ 363 if(expect) { 364 /* a-b */ 365 uset_removeAll(aa, b); 366 } else { 367 /* a&b */ 368 uset_retainAll(aa, b); 369 } 370 371 /* aa is not empty because of the initial tests above; show its contents */ 372 errorCode=U_ZERO_ERROR; 373 i=0; 374 for(;;) { 375 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode); 376 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 377 break; /* done */ 378 } 379 if(U_FAILURE(errorCode)) { 380 log_err("error comparing %s with %s at difference item %d: %s\n", 381 a_name, b_name, i, u_errorName(errorCode)); 382 break; 383 } 384 if(length!=0) { 385 break; /* done with code points, got a string or -1 */ 386 } 387 388 if(diffIsError) { 389 if(expect) { 390 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name); 391 } else { 392 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end); 393 } 394 } else { 395 if(expect) { 396 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name); 397 } else { 398 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end); 399 } 400 } 401 402 ++i; 403 } 404 405 uset_close(aa); 406 return FALSE; 407} 408 409static UBool 410showAMinusB(const USet *a, const USet *b, 411 const char *a_name, const char *b_name, 412 UBool diffIsError) { 413 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError); 414} 415 416static UBool 417showAIntersectB(const USet *a, const USet *b, 418 const char *a_name, const char *b_name, 419 UBool diffIsError) { 420 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError); 421} 422 423static UBool 424compareUSets(const USet *a, const USet *b, 425 const char *a_name, const char *b_name, 426 UBool diffIsError) { 427 /* 428 * Use an arithmetic & not a logical && so that both branches 429 * are always taken and all differences are shown. 430 */ 431 return 432 showAMinusB(a, b, a_name, b_name, diffIsError) & 433 showAMinusB(b, a, b_name, a_name, diffIsError); 434} 435 436/* test isLetter(u_isapha()) and isDigit(u_isdigit()) */ 437static void TestLetterNumber() 438{ 439 UChar i = 0x0000; 440 441 log_verbose("Testing for isalpha\n"); 442 for (i = 0x0041; i < 0x005B; i++) { 443 if (!u_isalpha(i)) 444 { 445 log_err("Failed isLetter test at %.4X\n", i); 446 } 447 } 448 for (i = 0x0660; i < 0x066A; i++) { 449 if (u_isalpha(i)) 450 { 451 log_err("Failed isLetter test with numbers at %.4X\n", i); 452 } 453 } 454 455 log_verbose("Testing for isdigit\n"); 456 for (i = 0x0660; i < 0x066A; i++) { 457 if (!u_isdigit(i)) 458 { 459 log_verbose("Failed isNumber test at %.4X\n", i); 460 } 461 } 462 463 log_verbose("Testing for isalnum\n"); 464 for (i = 0x0041; i < 0x005B; i++) { 465 if (!u_isalnum(i)) 466 { 467 log_err("Failed isAlNum test at %.4X\n", i); 468 } 469 } 470 for (i = 0x0660; i < 0x066A; i++) { 471 if (!u_isalnum(i)) 472 { 473 log_err("Failed isAlNum test at %.4X\n", i); 474 } 475 } 476 477 { 478 /* 479 * The following checks work only starting from Unicode 4.0. 480 * Check the version number here. 481 */ 482 static UVersionInfo u401={ 4, 0, 1, 0 }; 483 UVersionInfo version; 484 u_getUnicodeVersion(version); 485 if(version[0]<4 || 0==memcmp(version, u401, 4)) { 486 return; 487 } 488 } 489 490 { 491 /* 492 * Sanity check: 493 * Verify that exactly the digit characters have decimal digit values. 494 * This assumption is used in the implementation of u_digit() 495 * (which checks nt=de) 496 * compared with the parallel java.lang.Character.digit() 497 * (which checks Nd). 498 * 499 * This was not true in Unicode 3.2 and earlier. 500 * Unicode 4.0 fixed discrepancies. 501 * Unicode 4.0.1 re-introduced problems in this area due to an 502 * unintentionally incomplete last-minute change. 503 */ 504 U_STRING_DECL(digitsPattern, "[:Nd:]", 6); 505 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24); 506 507 USet *digits, *decimalValues; 508 UErrorCode errorCode; 509 510 U_STRING_INIT(digitsPattern, "[:Nd:]", 6); 511 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24); 512 errorCode=U_ZERO_ERROR; 513 digits=uset_openPattern(digitsPattern, 6, &errorCode); 514 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode); 515 516 if(U_SUCCESS(errorCode)) { 517 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE); 518 } 519 520 uset_close(digits); 521 uset_close(decimalValues); 522 } 523} 524 525static void testSampleCharProps(UBool propFn(UChar32), const char *propName, 526 const UChar32 *sampleChars, int32_t sampleCharsLength, 527 UBool expected) { 528 int32_t i; 529 for (i = 0; i < sampleCharsLength; ++i) { 530 UBool result = propFn(sampleChars[i]); 531 if (result != expected) { 532 log_err("error: character property function %s(U+%04x)=%d is wrong\n", 533 propName, sampleChars[i], result); 534 } 535 } 536} 537 538/* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */ 539static void TestMisc() 540{ 541 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005}; 542 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74}; 543 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e}; 544 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd}; 545 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2}; 546 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B}; 547/* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/ 548 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5}; 549 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE}; 550 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c}; 551 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef}; 552 553 static const int32_t sampleDigitValues[] = {0, 2, 3, 5}; 554 555 uint32_t mask; 556 557 int32_t i; 558 char icuVersion[U_MAX_VERSION_STRING_LENGTH]; 559 UVersionInfo realVersion; 560 561 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH); 562 563 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, LENGTHOF(sampleSpaces), TRUE); 564 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE); 565 566 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar", 567 sampleSpaces, LENGTHOF(sampleSpaces), TRUE); 568 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar", 569 sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE); 570 571 testSampleCharProps(u_isWhitespace, "u_isWhitespace", 572 sampleWhiteSpaces, LENGTHOF(sampleWhiteSpaces), TRUE); 573 testSampleCharProps(u_isWhitespace, "u_isWhitespace", 574 sampleNonWhiteSpaces, LENGTHOF(sampleNonWhiteSpaces), FALSE); 575 576 testSampleCharProps(u_isdefined, "u_isdefined", 577 sampleDefined, LENGTHOF(sampleDefined), TRUE); 578 testSampleCharProps(u_isdefined, "u_isdefined", 579 sampleUndefined, LENGTHOF(sampleUndefined), FALSE); 580 581 testSampleCharProps(u_isbase, "u_isbase", sampleBase, LENGTHOF(sampleBase), TRUE); 582 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, LENGTHOF(sampleNonBase), FALSE); 583 584 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, LENGTHOF(sampleDigits), TRUE); 585 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, LENGTHOF(sampleNonDigits), FALSE); 586 587 for (i = 0; i < LENGTHOF(sampleDigits); i++) { 588 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) { 589 log_err("error: u_charDigitValue(U+04x)=%d != %d\n", 590 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]); 591 } 592 } 593 594 /* Tests the ICU version #*/ 595 u_getVersion(realVersion); 596 u_versionToString(realVersion, icuVersion); 597 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0) 598 { 599 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion); 600 } 601#if defined(ICU_VERSION) 602 /* test only happens where we have configure.in with VERSION - sanity check. */ 603 if(strcmp(U_ICU_VERSION, ICU_VERSION)) 604 { 605 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION); 606 } 607#endif 608 609 /* test U_GC_... */ 610 if( 611 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK || 612 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK || 613 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK || 614 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK || 615 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK || 616 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK 617 ) { 618 log_err("error: U_GET_GC_MASK does not work properly\n"); 619 } 620 621 mask=0; 622 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK; 623 624 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK; 625 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK; 626 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK; 627 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK; 628 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK; 629 630 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK; 631 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK; 632 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK; 633 634 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK; 635 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK; 636 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK; 637 638 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK; 639 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK; 640 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK; 641 642 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK; 643 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK; 644 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK; 645 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK; 646 647 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK; 648 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK; 649 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK; 650 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK; 651 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK; 652 653 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK; 654 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK; 655 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK; 656 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK; 657 658 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK; 659 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK; 660 661 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) { 662 log_err("error: problems with U_GC_XX_MASK constants\n"); 663 } 664 665 mask=0; 666 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK; 667 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK; 668 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK; 669 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK; 670 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK; 671 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK; 672 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK; 673 674 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) { 675 log_err("error: problems with U_GC_Y_MASK constants\n"); 676 } 677 { 678 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 }; 679 for(i=0; i<10; i++){ 680 if(digit[i]!=u_forDigit(i,10)){ 681 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10)); 682 } 683 } 684 } 685 686 /* test u_digit() */ 687 { 688 static const struct { 689 UChar32 c; 690 int8_t radix, value; 691 } data[]={ 692 /* base 16 */ 693 { 0x0031, 16, 1 }, 694 { 0x0038, 16, 8 }, 695 { 0x0043, 16, 12 }, 696 { 0x0066, 16, 15 }, 697 { 0x00e4, 16, -1 }, 698 { 0x0662, 16, 2 }, 699 { 0x06f5, 16, 5 }, 700 { 0xff13, 16, 3 }, 701 { 0xff41, 16, 10 }, 702 703 /* base 8 */ 704 { 0x0031, 8, 1 }, 705 { 0x0038, 8, -1 }, 706 { 0x0043, 8, -1 }, 707 { 0x0066, 8, -1 }, 708 { 0x00e4, 8, -1 }, 709 { 0x0662, 8, 2 }, 710 { 0x06f5, 8, 5 }, 711 { 0xff13, 8, 3 }, 712 { 0xff41, 8, -1 }, 713 714 /* base 36 */ 715 { 0x5a, 36, 35 }, 716 { 0x7a, 36, 35 }, 717 { 0xff3a, 36, 35 }, 718 { 0xff5a, 36, 35 }, 719 720 /* wrong radix values */ 721 { 0x0031, 1, -1 }, 722 { 0xff3a, 37, -1 } 723 }; 724 725 for(i=0; i<LENGTHOF(data); ++i) { 726 if(u_digit(data[i].c, data[i].radix)!=data[i].value) { 727 log_err("u_digit(U+%04x, %d)=%d expected %d\n", 728 data[i].c, 729 data[i].radix, 730 u_digit(data[i].c, data[i].radix), 731 data[i].value); 732 } 733 } 734 } 735} 736 737/* test C/POSIX-style functions --------------------------------------------- */ 738 739/* bit flags */ 740#define ISAL 1 741#define ISLO 2 742#define ISUP 4 743 744#define ISDI 8 745#define ISXD 0x10 746 747#define ISAN 0x20 748 749#define ISPU 0x40 750#define ISGR 0x80 751#define ISPR 0x100 752 753#define ISSP 0x200 754#define ISBL 0x400 755#define ISCN 0x800 756 757/* C/POSIX-style functions, in the same order as the bit flags */ 758typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c); 759 760static const struct { 761 IsPOSIXClass *fn; 762 const char *name; 763} posixClasses[]={ 764 { u_isalpha, "isalpha" }, 765 { u_islower, "islower" }, 766 { u_isupper, "isupper" }, 767 { u_isdigit, "isdigit" }, 768 { u_isxdigit, "isxdigit" }, 769 { u_isalnum, "isalnum" }, 770 { u_ispunct, "ispunct" }, 771 { u_isgraph, "isgraph" }, 772 { u_isprint, "isprint" }, 773 { u_isspace, "isspace" }, 774 { u_isblank, "isblank" }, 775 { u_iscntrl, "iscntrl" } 776}; 777 778static const struct { 779 UChar32 c; 780 uint32_t posixResults; 781} posixData[]={ 782 { 0x0008, ISCN }, /* backspace */ 783 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */ 784 { 0x000a, ISSP| ISCN }, /* LF */ 785 { 0x000c, ISSP| ISCN }, /* FF */ 786 { 0x000d, ISSP| ISCN }, /* CR */ 787 { 0x0020, ISPR|ISSP|ISBL }, /* space */ 788 { 0x0021, ISPU|ISGR|ISPR }, /* ! */ 789 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */ 790 { 0x0040, ISPU|ISGR|ISPR }, /* @ */ 791 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */ 792 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */ 793 { 0x007b, ISPU|ISGR|ISPR }, /* { */ 794 { 0x0085, ISSP| ISCN }, /* NEL */ 795 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */ 796 { 0x00a4, ISGR|ISPR }, /* currency sign */ 797 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */ 798 { 0x0300, ISGR|ISPR }, /* combining grave */ 799 { 0x0600, ISCN }, /* arabic number sign */ 800 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */ 801 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */ 802 { 0x2002, ISPR|ISSP|ISBL }, /* en space */ 803 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */ 804 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */ 805 { 0x200b, ISCN }, /* ZWSP */ 806 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/ 807 { 0x200e, ISCN }, /* LRM */ 808 { 0x2028, ISPR|ISSP| ISCN }, /* LS */ 809 { 0x2029, ISPR|ISSP| ISCN }, /* PS */ 810 { 0x20ac, ISGR|ISPR }, /* Euro */ 811 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */ 812 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */ 813 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */ 814 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */ 815 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */ 816}; 817 818static void 819TestPOSIX() { 820 uint32_t mask; 821 int32_t cl, i; 822 UBool expect; 823 824 mask=1; 825 for(cl=0; cl<12; ++cl) { 826 for(i=0; i<LENGTHOF(posixData); ++i) { 827 expect=(UBool)((posixData[i].posixResults&mask)!=0); 828 if(posixClasses[cl].fn(posixData[i].c)!=expect) { 829 log_err("u_%s(U+%04x)=%s is wrong\n", 830 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE"); 831 } 832 } 833 mask<<=1; 834 } 835} 836 837/* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */ 838static void TestControlPrint() 839{ 840 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b}; 841 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2}; 842 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014}; 843 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b}; 844 UChar32 c; 845 846 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, LENGTHOF(sampleControl), TRUE); 847 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, LENGTHOF(sampleNonControl), FALSE); 848 849 testSampleCharProps(u_isprint, "u_isprint", 850 samplePrintable, LENGTHOF(samplePrintable), TRUE); 851 testSampleCharProps(u_isprint, "u_isprint", 852 sampleNonPrintable, LENGTHOF(sampleNonPrintable), FALSE); 853 854 /* test all ISO 8 controls */ 855 for(c=0; c<=0x9f; ++c) { 856 if(c==0x20) { 857 /* skip ASCII graphic characters and continue with DEL */ 858 c=0x7f; 859 } 860 if(!u_iscntrl(c)) { 861 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c); 862 } 863 if(!u_isISOControl(c)) { 864 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c); 865 } 866 if(u_isprint(c)) { 867 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c); 868 } 869 } 870 871 /* test all Latin-1 graphic characters */ 872 for(c=0x20; c<=0xff; ++c) { 873 if(c==0x7f) { 874 c=0xa0; 875 } else if(c==0xad) { 876 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */ 877 ++c; 878 } 879 if(!u_isprint(c)) { 880 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c); 881 } 882 } 883} 884 885/* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/ 886static void TestIdentifier() 887{ 888 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f}; 889 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082}; 890 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045}; 891 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020}; 892 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061}; 893 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019}; 894 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045}; 895 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020}; 896 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85}; 897 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061}; 898 899 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart", 900 sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE); 901 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart", 902 sampleNonJavaIDStart, LENGTHOF(sampleNonJavaIDStart), FALSE); 903 904 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart", 905 sampleJavaIDPart, LENGTHOF(sampleJavaIDPart), TRUE); 906 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart", 907 sampleNonJavaIDPart, LENGTHOF(sampleNonJavaIDPart), FALSE); 908 909 /* IDPart should imply IDStart */ 910 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart", 911 sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE); 912 913 testSampleCharProps(u_isIDStart, "u_isIDStart", 914 sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE); 915 testSampleCharProps(u_isIDStart, "u_isIDStart", 916 sampleNonUnicodeIDStart, LENGTHOF(sampleNonUnicodeIDStart), FALSE); 917 918 testSampleCharProps(u_isIDPart, "u_isIDPart", 919 sampleUnicodeIDPart, LENGTHOF(sampleUnicodeIDPart), TRUE); 920 testSampleCharProps(u_isIDPart, "u_isIDPart", 921 sampleNonUnicodeIDPart, LENGTHOF(sampleNonUnicodeIDPart), FALSE); 922 923 /* IDPart should imply IDStart */ 924 testSampleCharProps(u_isIDPart, "u_isIDPart", 925 sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE); 926 927 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable", 928 sampleIDIgnore, LENGTHOF(sampleIDIgnore), TRUE); 929 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable", 930 sampleNonIDIgnore, LENGTHOF(sampleNonIDIgnore), FALSE); 931} 932 933/* for each line of UnicodeData.txt, check some of the properties */ 934typedef struct UnicodeDataContext { 935#if UCONFIG_NO_NORMALIZATION 936 const void *dummy; 937#else 938 const UNormalizer2 *nfc; 939 const UNormalizer2 *nfkc; 940#endif 941} UnicodeDataContext; 942 943/* 944 * ### TODO 945 * This test fails incorrectly if the First or Last code point of a repetitive area 946 * is overridden, which is allowed and is encouraged for the PUAs. 947 * Currently, this means that both area First/Last and override lines are 948 * tested against the properties from the API, 949 * and the area boundary will not match and cause an error. 950 * 951 * This function should detect area boundaries and skip them for the test of individual 952 * code points' properties. 953 * Then it should check that the areas contain all the same properties except where overridden. 954 * For this, it would have had to set a flag for which code points were listed explicitly. 955 */ 956static void U_CALLCONV 957unicodeDataLineFn(void *context, 958 char *fields[][2], int32_t fieldCount, 959 UErrorCode *pErrorCode) 960{ 961 char buffer[100]; 962 const char *d; 963 char *end; 964 uint32_t value; 965 UChar32 c; 966 int32_t i; 967 int8_t type; 968 int32_t dt; 969 UChar dm[32], s[32]; 970 int32_t dmLength, length; 971 972#if !UCONFIG_NO_NORMALIZATION 973 const UNormalizer2 *nfc, *nfkc; 974#endif 975 976 /* get the character code, field 0 */ 977 c=strtoul(fields[0][0], &end, 16); 978 if(end<=fields[0][0] || end!=fields[0][1]) { 979 log_err("error: syntax error in field 0 at %s\n", fields[0][0]); 980 return; 981 } 982 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) { 983 log_err("error in UnicodeData.txt: code point %lu out of range\n", c); 984 return; 985 } 986 987 /* get general category, field 2 */ 988 *fields[2][1]=0; 989 type = (int8_t)tagValues[MakeProp(fields[2][0])]; 990 if(u_charType(c)!=type) { 991 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type); 992 } 993 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 994 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 995 } 996 997 /* get canonical combining class, field 3 */ 998 value=strtoul(fields[3][0], &end, 10); 999 if(end<=fields[3][0] || end!=fields[3][1]) { 1000 log_err("error: syntax error in field 3 at code 0x%lx\n", c); 1001 return; 1002 } 1003 if(value>255) { 1004 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value); 1005 return; 1006 } 1007#if !UCONFIG_NO_NORMALIZATION 1008 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) { 1009 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value); 1010 } 1011 nfkc=((UnicodeDataContext *)context)->nfkc; 1012 if(value!=unorm2_getCombiningClass(nfkc, c)) { 1013 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value); 1014 } 1015#endif 1016 1017 /* get BiDi category, field 4 */ 1018 *fields[4][1]=0; 1019 i=MakeDir(fields[4][0]); 1020 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) { 1021 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]); 1022 } 1023 1024 /* get Decomposition_Type & Decomposition_Mapping, field 5 */ 1025 d=NULL; 1026 if(fields[5][0]==fields[5][1]) { 1027 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */ 1028 if(c==0xac00 || c==0xd7a3) { 1029 dt=U_DT_CANONICAL; 1030 } else { 1031 dt=U_DT_NONE; 1032 } 1033 } else { 1034 d=fields[5][0]; 1035 *fields[5][1]=0; 1036 dt=UCHAR_INVALID_CODE; 1037 if(*d=='<') { 1038 end=strchr(++d, '>'); 1039 if(end!=NULL) { 1040 *end=0; 1041 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d); 1042 d=u_skipWhitespace(end+1); 1043 } 1044 } else { 1045 dt=U_DT_CANONICAL; 1046 } 1047 } 1048 if(dt>U_DT_NONE) { 1049 if(c==0xac00) { 1050 dm[0]=0x1100; 1051 dm[1]=0x1161; 1052 dm[2]=0; 1053 dmLength=2; 1054 } else if(c==0xd7a3) { 1055 dm[0]=0xd788; 1056 dm[1]=0x11c2; 1057 dm[2]=0; 1058 dmLength=2; 1059 } else { 1060 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode); 1061 } 1062 } else { 1063 dmLength=-1; 1064 } 1065 if(dt<0 || U_FAILURE(*pErrorCode)) { 1066 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c); 1067 return; 1068 } 1069#if !UCONFIG_NO_NORMALIZATION 1070 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE); 1071 if(i!=dt) { 1072 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt); 1073 } 1074 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */ 1075 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode); 1076 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) { 1077 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d " 1078 "or the Decomposition_Mapping is different (%s)\n", 1079 c, length, dmLength, u_errorName(*pErrorCode)); 1080 return; 1081 } 1082 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */ 1083 if(dt!=U_DT_CANONICAL) { 1084 dmLength=-1; 1085 } 1086 nfc=((UnicodeDataContext *)context)->nfc; 1087 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode); 1088 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) { 1089 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d " 1090 "or the Decomposition_Mapping is different (%s)\n", 1091 c, length, dmLength, u_errorName(*pErrorCode)); 1092 return; 1093 } 1094 /* recompose */ 1095 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) { 1096 UChar32 a, b, composite; 1097 i=0; 1098 U16_NEXT(dm, i, dmLength, a); 1099 U16_NEXT(dm, i, dmLength, b); 1100 /* i==dmLength */ 1101 composite=unorm2_composePair(nfc, a, b); 1102 if(composite!=c) { 1103 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n", 1104 (long)c, (long)a, (long)b, (long)composite); 1105 } 1106 /* 1107 * Note: NFKC has fewer round-trip mappings than NFC, 1108 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data. 1109 */ 1110 } 1111#endif 1112 1113 /* get ISO Comment, field 11 */ 1114 *fields[11][1]=0; 1115 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode); 1116 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) { 1117 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n", 1118 c, u_errorName(*pErrorCode), 1119 U_FAILURE(*pErrorCode) ? buffer : "[error]", 1120 fields[11][0]); 1121 } 1122 1123 /* get uppercase mapping, field 12 */ 1124 if(fields[12][0]!=fields[12][1]) { 1125 value=strtoul(fields[12][0], &end, 16); 1126 if(end!=fields[12][1]) { 1127 log_err("error: syntax error in field 12 at code 0x%lx\n", c); 1128 return; 1129 } 1130 if((UChar32)value!=u_toupper(c)) { 1131 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value); 1132 } 1133 } else { 1134 /* no case mapping: the API must map the code point to itself */ 1135 if(c!=u_toupper(c)) { 1136 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c)); 1137 } 1138 } 1139 1140 /* get lowercase mapping, field 13 */ 1141 if(fields[13][0]!=fields[13][1]) { 1142 value=strtoul(fields[13][0], &end, 16); 1143 if(end!=fields[13][1]) { 1144 log_err("error: syntax error in field 13 at code 0x%lx\n", c); 1145 return; 1146 } 1147 if((UChar32)value!=u_tolower(c)) { 1148 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value); 1149 } 1150 } else { 1151 /* no case mapping: the API must map the code point to itself */ 1152 if(c!=u_tolower(c)) { 1153 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c)); 1154 } 1155 } 1156 1157 /* get titlecase mapping, field 14 */ 1158 if(fields[14][0]!=fields[14][1]) { 1159 value=strtoul(fields[14][0], &end, 16); 1160 if(end!=fields[14][1]) { 1161 log_err("error: syntax error in field 14 at code 0x%lx\n", c); 1162 return; 1163 } 1164 if((UChar32)value!=u_totitle(c)) { 1165 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value); 1166 } 1167 } else { 1168 /* no case mapping: the API must map the code point to itself */ 1169 if(c!=u_totitle(c)) { 1170 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c)); 1171 } 1172 } 1173} 1174 1175static UBool U_CALLCONV 1176enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1177 static const UChar32 test[][2]={ 1178 {0x41, U_UPPERCASE_LETTER}, 1179 {0x308, U_NON_SPACING_MARK}, 1180 {0xfffe, U_GENERAL_OTHER_TYPES}, 1181 {0xe0041, U_FORMAT_CHAR}, 1182 {0xeffff, U_UNASSIGNED} 1183 }; 1184 1185 int32_t i, count; 1186 1187 if(0!=strcmp((const char *)context, "a1")) { 1188 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n"); 1189 return FALSE; 1190 } 1191 1192 count=LENGTHOF(test); 1193 for(i=0; i<count; ++i) { 1194 if(start<=test[i][0] && test[i][0]<limit) { 1195 if(type!=(UCharCategory)test[i][1]) { 1196 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n", 1197 start, limit, (long)type, test[i][0], test[i][1]); 1198 } 1199 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */ 1200 return i==(count-1) ? FALSE : TRUE; 1201 } 1202 } 1203 1204 if(start>test[count-1][0]) { 1205 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n", 1206 start, limit, (long)type); 1207 return FALSE; 1208 } 1209 1210 return TRUE; 1211} 1212 1213static UBool U_CALLCONV 1214enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1215 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */ 1216 static const int32_t defaultBidi[][2]={ /* { limit, class } */ 1217 { 0x0590, U_LEFT_TO_RIGHT }, 1218 { 0x0600, U_RIGHT_TO_LEFT }, 1219 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC }, 1220 { 0x08A0, U_RIGHT_TO_LEFT }, 1221 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */ 1222 { 0x20A0, U_LEFT_TO_RIGHT }, 1223 { 0x20D0, U_EUROPEAN_NUMBER_TERMINATOR }, /* Unicode 6.3 changes the currency symbols block U+20A0..U+20CF to default to ET not L */ 1224 { 0xFB1D, U_LEFT_TO_RIGHT }, 1225 { 0xFB50, U_RIGHT_TO_LEFT }, 1226 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC }, 1227 { 0xFE70, U_LEFT_TO_RIGHT }, 1228 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC }, 1229 { 0x10800, U_LEFT_TO_RIGHT }, 1230 { 0x11000, U_RIGHT_TO_LEFT }, 1231 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */ 1232 { 0x1EE00, U_RIGHT_TO_LEFT }, 1233 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */ 1234 { 0x1F000, U_RIGHT_TO_LEFT }, 1235 { 0x110000, U_LEFT_TO_RIGHT } 1236 }; 1237 1238 UChar32 c; 1239 int32_t i; 1240 UCharDirection shouldBeDir; 1241 1242 /* 1243 * LineBreak.txt specifies: 1244 * # - Assigned characters that are not listed explicitly are given the value 1245 * # "AL". 1246 * # - Unassigned characters are given the value "XX". 1247 * 1248 * PUA characters are listed explicitly with "XX". 1249 * Verify that no assigned character has "XX". 1250 */ 1251 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) { 1252 c=start; 1253 while(c<limit) { 1254 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) { 1255 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c); 1256 } 1257 ++c; 1258 } 1259 } 1260 1261 /* 1262 * Verify default Bidi classes. 1263 * For recent Unicode versions, see UCD.html. 1264 * 1265 * For older Unicode versions: 1266 * See table 3-7 "Bidirectional Character Types" in UAX #9. 1267 * http://www.unicode.org/reports/tr9/ 1268 * 1269 * See also DerivedBidiClass.txt for Cn code points! 1270 * 1271 * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html) 1272 * changed some default values. 1273 * In particular, non-characters and unassigned Default Ignorable Code Points 1274 * change from L to BN. 1275 * 1276 * UCD.html version 4.0.1 does not yet reflect these changes. 1277 */ 1278 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) { 1279 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */ 1280 c=start; 1281 for(i=0; i<LENGTHOF(defaultBidi) && c<limit; ++i) { 1282 if((int32_t)c<defaultBidi[i][0]) { 1283 while(c<limit && (int32_t)c<defaultBidi[i][0]) { 1284 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) { 1285 shouldBeDir=U_BOUNDARY_NEUTRAL; 1286 } else { 1287 shouldBeDir=(UCharDirection)defaultBidi[i][1]; 1288 } 1289 1290 if( u_charDirection(c)!=shouldBeDir || 1291 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir 1292 ) { 1293 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n", 1294 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]); 1295 } 1296 ++c; 1297 } 1298 } 1299 } 1300 } 1301 1302 return TRUE; 1303} 1304 1305/* tests for several properties */ 1306static void TestUnicodeData() 1307{ 1308 UVersionInfo expectVersionArray; 1309 UVersionInfo versionArray; 1310 char *fields[15][2]; 1311 UErrorCode errorCode; 1312 UChar32 c; 1313 int8_t type; 1314 1315 UnicodeDataContext context; 1316 1317 u_versionFromString(expectVersionArray, U_UNICODE_VERSION); 1318 u_getUnicodeVersion(versionArray); 1319 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0) 1320 { 1321 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n", 1322 versionArray[0], versionArray[1], versionArray[2], versionArray[3]); 1323 } 1324 1325#if defined(ICU_UNICODE_VERSION) 1326 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */ 1327 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION)) 1328 { 1329 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n"); 1330 } 1331#endif 1332 1333 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) { 1334 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041)); 1335 } 1336 1337 errorCode=U_ZERO_ERROR; 1338#if !UCONFIG_NO_NORMALIZATION 1339 context.nfc=unorm2_getNFCInstance(&errorCode); 1340 context.nfkc=unorm2_getNFKCInstance(&errorCode); 1341 if(U_FAILURE(errorCode)) { 1342 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode)); 1343 return; 1344 } 1345#endif 1346 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode); 1347 if(U_FAILURE(errorCode)) { 1348 return; /* if we couldn't parse UnicodeData.txt, we should return */ 1349 } 1350 1351 /* sanity check on repeated properties */ 1352 for(c=0xfffe; c<=0x10ffff;) { 1353 type=u_charType(c); 1354 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 1355 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 1356 } 1357 if(type!=U_UNASSIGNED) { 1358 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c)); 1359 } 1360 if((c&0xffff)==0xfffe) { 1361 ++c; 1362 } else { 1363 c+=0xffff; 1364 } 1365 } 1366 1367 /* test that PUA is not "unassigned" */ 1368 for(c=0xe000; c<=0x10fffd;) { 1369 type=u_charType(c); 1370 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 1371 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 1372 } 1373 if(type==U_UNASSIGNED) { 1374 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c); 1375 } else if(type!=U_PRIVATE_USE_CHAR) { 1376 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type); 1377 } 1378 if(c==0xf8ff) { 1379 c=0xf0000; 1380 } else if(c==0xffffd) { 1381 c=0x100000; 1382 } else { 1383 ++c; 1384 } 1385 } 1386 1387 /* test u_enumCharTypes() */ 1388 u_enumCharTypes(enumTypeRange, "a1"); 1389 1390 /* check default properties */ 1391 u_enumCharTypes(enumDefaultsRange, NULL); 1392} 1393 1394static void TestCodeUnit(){ 1395 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0}; 1396 1397 int32_t i; 1398 1399 for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){ 1400 UChar c=codeunit[i]; 1401 if(i<4){ 1402 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){ 1403 log_err("ERROR: U+%04x is a single", c); 1404 } 1405 1406 } 1407 if(i >= 4 && i< 8){ 1408 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){ 1409 log_err("ERROR: U+%04x is a first surrogate", c); 1410 } 1411 } 1412 if(i >= 8 && i< 12){ 1413 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){ 1414 log_err("ERROR: U+%04x is a second surrogate", c); 1415 } 1416 } 1417 } 1418 1419} 1420 1421static void TestCodePoint(){ 1422 const UChar32 codePoint[]={ 1423 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */ 1424 0xd800, 1425 0xdbff, 1426 0xdc00, 1427 0xdfff, 1428 0xdc04, 1429 0xd821, 1430 /*not a surrogate, valid, isUnicodeChar , not Error*/ 1431 0x20ac, 1432 0xd7ff, 1433 0xe000, 1434 0xe123, 1435 0x0061, 1436 0xe065, 1437 0x20402, 1438 0x24506, 1439 0x23456, 1440 0x20402, 1441 0x10402, 1442 0x23456, 1443 /*not a surrogate, not valid, isUnicodeChar, isError */ 1444 0x0015, 1445 0x009f, 1446 /*not a surrogate, not valid, not isUnicodeChar, isError */ 1447 0xffff, 1448 0xfffe, 1449 }; 1450 int32_t i; 1451 for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){ 1452 UChar32 c=codePoint[i]; 1453 if(i<6){ 1454 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){ 1455 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1456 } 1457 if(UTF_IS_VALID(c)){ 1458 log_err("ERROR: isValid() failed for U+%04x\n", c); 1459 } 1460 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ 1461 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1462 } 1463 if(UTF_IS_ERROR(c)){ 1464 log_err("ERROR: isError() failed for U+%04x\n", c); 1465 } 1466 }else if(i >=6 && i<18){ 1467 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1468 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1469 } 1470 if(!UTF_IS_VALID(c)){ 1471 log_err("ERROR: isValid() failed for U+%04x\n", c); 1472 } 1473 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ 1474 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1475 } 1476 if(UTF_IS_ERROR(c)){ 1477 log_err("ERROR: isError() failed for U+%04x\n", c); 1478 } 1479 }else if(i >=18 && i<20){ 1480 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1481 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1482 } 1483 if(UTF_IS_VALID(c)){ 1484 log_err("ERROR: isValid() failed for U+%04x\n", c); 1485 } 1486 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ 1487 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1488 } 1489 if(!UTF_IS_ERROR(c)){ 1490 log_err("ERROR: isError() failed for U+%04x\n", c); 1491 } 1492 } 1493 else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){ 1494 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1495 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1496 } 1497 if(UTF_IS_VALID(c)){ 1498 log_err("ERROR: isValid() failed for U+%04x\n", c); 1499 } 1500 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ 1501 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1502 } 1503 if(!UTF_IS_ERROR(c)){ 1504 log_err("ERROR: isError() failed for U+%04x\n", c); 1505 } 1506 } 1507 } 1508 1509 if( 1510 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) || 1511 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) || 1512 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) || 1513 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff) 1514 ) { 1515 log_err("error with U_IS_BMP()\n"); 1516 } 1517 1518 if( 1519 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) || 1520 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) || 1521 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) || 1522 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff) 1523 ) { 1524 log_err("error with U_IS_SUPPLEMENTARY()\n"); 1525 } 1526} 1527 1528static void TestCharLength() 1529{ 1530 const int32_t codepoint[]={ 1531 1, 0x0061, 1532 1, 0xe065, 1533 1, 0x20ac, 1534 2, 0x20402, 1535 2, 0x23456, 1536 2, 0x24506, 1537 2, 0x20402, 1538 2, 0x10402, 1539 1, 0xd7ff, 1540 1, 0xe000 1541 }; 1542 1543 int32_t i; 1544 UBool multiple; 1545 for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){ 1546 UChar32 c=codepoint[i+1]; 1547 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){ 1548 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c)); 1549 } 1550 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE); 1551 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){ 1552 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c); 1553 } 1554 } 1555} 1556 1557/*internal functions ----*/ 1558static int32_t MakeProp(char* str) 1559{ 1560 int32_t result = 0; 1561 char* matchPosition =0; 1562 1563 matchPosition = strstr(tagStrings, str); 1564 if (matchPosition == 0) 1565 { 1566 log_err("unrecognized type letter "); 1567 log_err(str); 1568 } 1569 else 1570 result = (int32_t)((matchPosition - tagStrings) / 2); 1571 return result; 1572} 1573 1574static int32_t MakeDir(char* str) 1575{ 1576 int32_t pos = 0; 1577 for (pos = 0; pos < U_CHAR_DIRECTION_COUNT; pos++) { 1578 if (strcmp(str, dirStrings[pos]) == 0) { 1579 return pos; 1580 } 1581 } 1582 return -1; 1583} 1584 1585/* test u_charName() -------------------------------------------------------- */ 1586 1587static const struct { 1588 uint32_t code; 1589 const char *name, *oldName, *extName, *alias; 1590} names[]={ 1591 {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"}, 1592 {0x01a2, "LATIN CAPITAL LETTER OI", "", 1593 "LATIN CAPITAL LETTER OI", 1594 "LATIN CAPITAL LETTER GHA"}, 1595 {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "", 1596 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" }, 1597 {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "", 1598 "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", 1599 "TIBETAN MARK BKA- SHOG GI MGO RGYAN"}, 1600 {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401" }, 1601 {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED" }, 1602 {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA" }, 1603 {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH" }, 1604 {0xd800, "", "", "<lead surrogate-D800>" }, 1605 {0xdc00, "", "", "<trail surrogate-DC00>" }, 1606 {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS" }, 1607 {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN" }, 1608 {0xffff, "", "", "<noncharacter-FFFF>" }, 1609 {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "", 1610 "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 1611 "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"}, 1612 {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456" } 1613}; 1614 1615static UBool 1616enumCharNamesFn(void *context, 1617 UChar32 code, UCharNameChoice nameChoice, 1618 const char *name, int32_t length) { 1619 int32_t *pCount=(int32_t *)context; 1620 const char *expected; 1621 int i; 1622 1623 if(length<=0 || length!=(int32_t)strlen(name)) { 1624 /* should not be called with an empty string or invalid length */ 1625 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length); 1626 return TRUE; 1627 } 1628 1629 ++*pCount; 1630 for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) { 1631 if(code==(UChar32)names[i].code) { 1632 switch (nameChoice) { 1633 case U_EXTENDED_CHAR_NAME: 1634 if(0!=strcmp(name, names[i].extName)) { 1635 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName); 1636 } 1637 break; 1638 case U_UNICODE_CHAR_NAME: 1639 if(0!=strcmp(name, names[i].name)) { 1640 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name); 1641 } 1642 break; 1643 case U_UNICODE_10_CHAR_NAME: 1644 expected=names[i].oldName; 1645 if(expected[0]==0 || 0!=strcmp(name, expected)) { 1646 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected); 1647 } 1648 break; 1649 case U_CHAR_NAME_ALIAS: 1650 expected=names[i].alias; 1651 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) { 1652 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected); 1653 } 1654 break; 1655 case U_CHAR_NAME_CHOICE_COUNT: 1656 break; 1657 } 1658 break; 1659 } 1660 } 1661 return TRUE; 1662} 1663 1664struct enumExtCharNamesContext { 1665 uint32_t length; 1666 int32_t last; 1667}; 1668 1669static UBool 1670enumExtCharNamesFn(void *context, 1671 UChar32 code, UCharNameChoice nameChoice, 1672 const char *name, int32_t length) { 1673 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context; 1674 1675 if (ecncp->last != (int32_t) code - 1) { 1676 if (ecncp->last < 0) { 1677 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1); 1678 } else { 1679 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code); 1680 } 1681 } 1682 ecncp->last = (int32_t) code; 1683 1684 if (!*name) { 1685 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code); 1686 } 1687 1688 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length); 1689} 1690 1691/** 1692 * This can be made more efficient by moving it into putil.c and having 1693 * it directly access the ebcdic translation tables. 1694 * TODO: If we get this method in putil.c, then delete it from here. 1695 */ 1696static UChar 1697u_charToUChar(char c) { 1698 UChar uc; 1699 u_charsToUChars(&c, &uc, 1); 1700 return uc; 1701} 1702 1703static void 1704TestCharNames() { 1705 static char name[80]; 1706 UErrorCode errorCode=U_ZERO_ERROR; 1707 struct enumExtCharNamesContext extContext; 1708 const char *expected; 1709 int32_t length; 1710 UChar32 c; 1711 int32_t i; 1712 1713 log_verbose("Testing uprv_getMaxCharNameLength()\n"); 1714 length=uprv_getMaxCharNameLength(); 1715 if(length==0) { 1716 /* no names data available */ 1717 return; 1718 } 1719 if(length<83) { /* Unicode 3.2 max char name length */ 1720 log_err("uprv_getMaxCharNameLength()=%d is too short"); 1721 } 1722 /* ### TODO same tests for max ISO comment length as for max name length */ 1723 1724 log_verbose("Testing u_charName()\n"); 1725 for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) { 1726 /* modern Unicode character name */ 1727 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode); 1728 if(U_FAILURE(errorCode)) { 1729 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode)); 1730 return; 1731 } 1732 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) { 1733 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name); 1734 } 1735 1736 /* find the modern name */ 1737 if (*names[i].name) { 1738 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode); 1739 if(U_FAILURE(errorCode)) { 1740 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode)); 1741 return; 1742 } 1743 if(c!=(UChar32)names[i].code) { 1744 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code); 1745 } 1746 } 1747 1748 /* Unicode 1.0 character name */ 1749 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode); 1750 if(U_FAILURE(errorCode)) { 1751 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode)); 1752 return; 1753 } 1754 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) { 1755 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName); 1756 } 1757 1758 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */ 1759 if(names[i].oldName[0]!=0 /* && length>0 */) { 1760 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode); 1761 if(U_FAILURE(errorCode)) { 1762 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode)); 1763 return; 1764 } 1765 if(c!=(UChar32)names[i].code) { 1766 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code); 1767 } 1768 } 1769 1770 /* Unicode character name alias */ 1771 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode); 1772 if(U_FAILURE(errorCode)) { 1773 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode)); 1774 return; 1775 } 1776 expected=names[i].alias; 1777 if(expected==NULL) { 1778 expected=""; 1779 } 1780 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) { 1781 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n", 1782 names[i].code, name, length, expected); 1783 } 1784 1785 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */ 1786 if(expected[0]!=0 /* && length>0 */) { 1787 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode); 1788 if(U_FAILURE(errorCode)) { 1789 log_err("u_charFromName(%s - alias) error %s\n", 1790 expected, u_errorName(errorCode)); 1791 return; 1792 } 1793 if(c!=(UChar32)names[i].code) { 1794 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n", 1795 expected, c, names[i].code); 1796 } 1797 } 1798 } 1799 1800 /* test u_enumCharNames() */ 1801 length=0; 1802 errorCode=U_ZERO_ERROR; 1803 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode); 1804 if(U_FAILURE(errorCode) || length<94140) { 1805 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length); 1806 } 1807 1808 extContext.length = 0; 1809 extContext.last = -1; 1810 errorCode=U_ZERO_ERROR; 1811 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode); 1812 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) { 1813 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length); 1814 } 1815 1816 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */ 1817 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) { 1818 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode)); 1819 } 1820 1821 /* Test getCharNameCharacters */ 1822 if(!getTestOption(QUICK_OPTION)) { 1823 enum { BUFSIZE = 256 }; 1824 UErrorCode ec = U_ZERO_ERROR; 1825 char buf[BUFSIZE]; 1826 int32_t maxLength; 1827 UChar32 cp; 1828 UChar pat[BUFSIZE], dumbPat[BUFSIZE]; 1829 int32_t l1, l2; 1830 UBool map[256]; 1831 UBool ok; 1832 1833 USet* set = uset_open(1, 0); /* empty set */ 1834 USet* dumb = uset_open(1, 0); /* empty set */ 1835 1836 /* 1837 * uprv_getCharNameCharacters() will likely return more lowercase 1838 * letters than actual character names contain because 1839 * it includes all the characters in lowercased names of 1840 * general categories, for the full possible set of extended names. 1841 */ 1842 { 1843 USetAdder sa={ 1844 NULL, 1845 uset_add, 1846 uset_addRange, 1847 uset_addString, 1848 NULL /* don't need remove() */ 1849 }; 1850 sa.set=set; 1851 uprv_getCharNameCharacters(&sa); 1852 } 1853 1854 /* build set the dumb (but sure-fire) way */ 1855 for (i=0; i<256; ++i) { 1856 map[i] = FALSE; 1857 } 1858 1859 maxLength=0; 1860 for (cp=0; cp<0x110000; ++cp) { 1861 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME, 1862 buf, BUFSIZE, &ec); 1863 if (U_FAILURE(ec)) { 1864 log_err("FAIL: u_charName failed when it shouldn't\n"); 1865 uset_close(set); 1866 uset_close(dumb); 1867 return; 1868 } 1869 if(len>maxLength) { 1870 maxLength=len; 1871 } 1872 1873 for (i=0; i<len; ++i) { 1874 if (!map[(uint8_t) buf[i]]) { 1875 uset_add(dumb, (UChar32)u_charToUChar(buf[i])); 1876 map[(uint8_t) buf[i]] = TRUE; 1877 } 1878 } 1879 1880 /* test for leading/trailing whitespace */ 1881 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') { 1882 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp); 1883 } 1884 } 1885 1886 if(map[(uint8_t)'\t']) { 1887 log_err("u_charName() returned a name with a TAB for some code point\n", cp); 1888 } 1889 1890 length=uprv_getMaxCharNameLength(); 1891 if(length!=maxLength) { 1892 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n", 1893 length, maxLength); 1894 } 1895 1896 /* compare the sets. Where is my uset_equals?!! */ 1897 ok=TRUE; 1898 for(i=0; i<256; ++i) { 1899 if(uset_contains(set, i)!=uset_contains(dumb, i)) { 1900 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) { 1901 /* ignore lowercase a-z that are in set but not in dumb */ 1902 ok=TRUE; 1903 } else { 1904 ok=FALSE; 1905 break; 1906 } 1907 } 1908 } 1909 1910 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec); 1911 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec); 1912 if (U_FAILURE(ec)) { 1913 log_err("FAIL: uset_toPattern failed when it shouldn't\n"); 1914 uset_close(set); 1915 uset_close(dumb); 1916 return; 1917 } 1918 1919 if (l1 >= BUFSIZE) { 1920 l1 = BUFSIZE-1; 1921 pat[l1] = 0; 1922 } 1923 if (l2 >= BUFSIZE) { 1924 l2 = BUFSIZE-1; 1925 dumbPat[l2] = 0; 1926 } 1927 1928 if (!ok) { 1929 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n", 1930 aescstrdup(pat, l1), aescstrdup(dumbPat, l2)); 1931 } else if(getTestOption(VERBOSITY_OPTION)) { 1932 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1)); 1933 } 1934 1935 uset_close(set); 1936 uset_close(dumb); 1937 } 1938 1939 /* ### TODO: test error cases and other interesting things */ 1940} 1941 1942/* test u_isMirrored() and u_charMirror() ----------------------------------- */ 1943 1944static void 1945TestMirroring() { 1946 USet *set; 1947 UErrorCode errorCode; 1948 1949 UChar32 start, end, c2, c3; 1950 int32_t i; 1951 1952 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17); 1953 1954 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17); 1955 1956 log_verbose("Testing u_isMirrored()\n"); 1957 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) && 1958 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400) 1959 ) 1960 ) { 1961 log_err("u_isMirrored() does not work correctly\n"); 1962 } 1963 1964 log_verbose("Testing u_charMirror()\n"); 1965 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 && 1966 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */ 1967 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab && 1968 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */ 1969 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d 1970 ) 1971 ) { 1972 log_err("u_charMirror() does not work correctly\n"); 1973 } 1974 1975 /* verify that Bidi_Mirroring_Glyph roundtrips */ 1976 errorCode=U_ZERO_ERROR; 1977 set=uset_openPattern(mirroredPattern, 17, &errorCode); 1978 1979 if (U_FAILURE(errorCode)) { 1980 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n"); 1981 } else { 1982 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) { 1983 do { 1984 c2=u_charMirror(start); 1985 c3=u_charMirror(c2); 1986 if(c3!=start) { 1987 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3); 1988 } 1989 c3=u_getBidiPairedBracket(start); 1990 if(u_getIntPropertyValue(start, UCHAR_BIDI_PAIRED_BRACKET_TYPE)==U_BPT_NONE) { 1991 if(c3!=start) { 1992 log_err("u_getBidiPairedBracket(U+%04lx) != self for bpt(c)==None\n", 1993 (long)start); 1994 } 1995 } else { 1996 if(c3!=c2) { 1997 log_err("u_getBidiPairedBracket(U+%04lx) != U+%04lx = bmg(c)'\n", 1998 (long)start, (long)c2); 1999 } 2000 } 2001 } while(++start<=end); 2002 } 2003 } 2004 2005 uset_close(set); 2006} 2007 2008 2009struct RunTestData 2010{ 2011 const char *runText; 2012 UScriptCode runCode; 2013}; 2014 2015typedef struct RunTestData RunTestData; 2016 2017static void 2018CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns, 2019 const char *prefix) 2020{ 2021 int32_t run, runStart, runLimit; 2022 UScriptCode runCode; 2023 2024 /* iterate over all the runs */ 2025 run = 0; 2026 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) { 2027 if (runStart != runStarts[run]) { 2028 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n", 2029 prefix, run, runStarts[run], runStart); 2030 } 2031 2032 if (runLimit != runStarts[run + 1]) { 2033 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n", 2034 prefix, run, runStarts[run + 1], runLimit); 2035 } 2036 2037 if (runCode != testData[run].runCode) { 2038 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n", 2039 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode)); 2040 } 2041 2042 run += 1; 2043 2044 /* stop when we've seen all the runs we expect to see */ 2045 if (run >= nRuns) { 2046 break; 2047 } 2048 } 2049 2050 /* Complain if we didn't see then number of runs we expected */ 2051 if (run != nRuns) { 2052 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns); 2053 } 2054} 2055 2056static void 2057TestUScriptRunAPI() 2058{ 2059 static const RunTestData testData1[] = { 2060 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI}, 2061 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC}, 2062 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC}, 2063 {"English (", USCRIPT_LATIN}, 2064 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI}, 2065 {") ", USCRIPT_LATIN}, 2066 {"\\u6F22\\u5B75", USCRIPT_HAN}, 2067 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA}, 2068 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA}, 2069 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET} 2070 }; 2071 2072 static const RunTestData testData2[] = { 2073 {"((((((((((abc))))))))))", USCRIPT_LATIN} 2074 }; 2075 2076 static const struct { 2077 const RunTestData *testData; 2078 int32_t nRuns; 2079 } testDataEntries[] = { 2080 {testData1, LENGTHOF(testData1)}, 2081 {testData2, LENGTHOF(testData2)} 2082 }; 2083 2084 static const int32_t nTestEntries = LENGTHOF(testDataEntries); 2085 int32_t testEntry; 2086 2087 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) { 2088 UChar testString[1024]; 2089 int32_t runStarts[256]; 2090 int32_t nTestRuns = testDataEntries[testEntry].nRuns; 2091 const RunTestData *testData = testDataEntries[testEntry].testData; 2092 2093 int32_t run, stringLimit; 2094 UScriptRun *scriptRun = NULL; 2095 UErrorCode err; 2096 2097 /* 2098 * Fill in the test string and the runStarts array. 2099 */ 2100 stringLimit = 0; 2101 for (run = 0; run < nTestRuns; run += 1) { 2102 runStarts[run] = stringLimit; 2103 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit); 2104 /*stringLimit -= 1;*/ 2105 } 2106 2107 /* The limit of the last run */ 2108 runStarts[nTestRuns] = stringLimit; 2109 2110 /* 2111 * Make sure that calling uscript_OpenRun with a NULL text pointer 2112 * and a non-zero text length returns the correct error. 2113 */ 2114 err = U_ZERO_ERROR; 2115 scriptRun = uscript_openRun(NULL, stringLimit, &err); 2116 2117 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2118 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2119 } 2120 2121 if (scriptRun != NULL) { 2122 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n"); 2123 uscript_closeRun(scriptRun); 2124 } 2125 2126 /* 2127 * Make sure that calling uscript_OpenRun with a non-NULL text pointer 2128 * and a zero text length returns the correct error. 2129 */ 2130 err = U_ZERO_ERROR; 2131 scriptRun = uscript_openRun(testString, 0, &err); 2132 2133 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2134 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2135 } 2136 2137 if (scriptRun != NULL) { 2138 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n"); 2139 uscript_closeRun(scriptRun); 2140 } 2141 2142 /* 2143 * Make sure that calling uscript_openRun with a NULL text pointer 2144 * and a zero text length doesn't return an error. 2145 */ 2146 err = U_ZERO_ERROR; 2147 scriptRun = uscript_openRun(NULL, 0, &err); 2148 2149 if (U_FAILURE(err)) { 2150 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err)); 2151 } 2152 2153 /* Make sure that the empty iterator doesn't find any runs */ 2154 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) { 2155 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n"); 2156 } 2157 2158 /* 2159 * Make sure that calling uscript_setRunText with a NULL text pointer 2160 * and a non-zero text length returns the correct error. 2161 */ 2162 err = U_ZERO_ERROR; 2163 uscript_setRunText(scriptRun, NULL, stringLimit, &err); 2164 2165 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2166 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2167 } 2168 2169 /* 2170 * Make sure that calling uscript_OpenRun with a non-NULL text pointer 2171 * and a zero text length returns the correct error. 2172 */ 2173 err = U_ZERO_ERROR; 2174 uscript_setRunText(scriptRun, testString, 0, &err); 2175 2176 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2177 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2178 } 2179 2180 /* 2181 * Now call uscript_setRunText on the empty iterator 2182 * and make sure that it works. 2183 */ 2184 err = U_ZERO_ERROR; 2185 uscript_setRunText(scriptRun, testString, stringLimit, &err); 2186 2187 if (U_FAILURE(err)) { 2188 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err)); 2189 } else { 2190 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText"); 2191 } 2192 2193 uscript_closeRun(scriptRun); 2194 2195 /* 2196 * Now open an interator over the testString 2197 * using uscript_openRun and make sure that it works 2198 */ 2199 scriptRun = uscript_openRun(testString, stringLimit, &err); 2200 2201 if (U_FAILURE(err)) { 2202 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err)); 2203 } else { 2204 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun"); 2205 } 2206 2207 /* Now reset the iterator, and make sure 2208 * that it still works. 2209 */ 2210 uscript_resetRun(scriptRun); 2211 2212 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun"); 2213 2214 /* Close the iterator */ 2215 uscript_closeRun(scriptRun); 2216 } 2217} 2218 2219/* test additional, non-core properties */ 2220static void 2221TestAdditionalProperties() { 2222 /* test data for u_charAge() */ 2223 static const struct { 2224 UChar32 c; 2225 UVersionInfo version; 2226 } charAges[]={ 2227 {0x41, { 1, 1, 0, 0 }}, 2228 {0xffff, { 1, 1, 0, 0 }}, 2229 {0x20ab, { 2, 0, 0, 0 }}, 2230 {0x2fffe, { 2, 0, 0, 0 }}, 2231 {0x20ac, { 2, 1, 0, 0 }}, 2232 {0xfb1d, { 3, 0, 0, 0 }}, 2233 {0x3f4, { 3, 1, 0, 0 }}, 2234 {0x10300, { 3, 1, 0, 0 }}, 2235 {0x220, { 3, 2, 0, 0 }}, 2236 {0xff60, { 3, 2, 0, 0 }} 2237 }; 2238 2239 /* test data for u_hasBinaryProperty() */ 2240 static const int32_t 2241 props[][3]={ /* code point, property, value */ 2242 { 0x0627, UCHAR_ALPHABETIC, TRUE }, 2243 { 0x1034a, UCHAR_ALPHABETIC, TRUE }, 2244 { 0x2028, UCHAR_ALPHABETIC, FALSE }, 2245 2246 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE }, 2247 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE }, 2248 2249 { 0x202c, UCHAR_BIDI_CONTROL, TRUE }, 2250 { 0x202f, UCHAR_BIDI_CONTROL, FALSE }, 2251 2252 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE }, 2253 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE }, 2254 2255 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */ 2256 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE }, 2257 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE }, 2258 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE }, 2259 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE }, 2260 2261 { 0x058a, UCHAR_DASH, TRUE }, 2262 { 0x007e, UCHAR_DASH, FALSE }, 2263 2264 { 0x0c4d, UCHAR_DIACRITIC, TRUE }, 2265 { 0x3000, UCHAR_DIACRITIC, FALSE }, 2266 2267 { 0x0e46, UCHAR_EXTENDER, TRUE }, 2268 { 0x0020, UCHAR_EXTENDER, FALSE }, 2269 2270#if !UCONFIG_NO_NORMALIZATION 2271 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE }, 2272 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE }, 2273 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE }, 2274 2275 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */ 2276 { 0x0308, UCHAR_NFD_INERT, FALSE }, 2277 2278 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */ 2279 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */ 2280 2281 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */ 2282 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */ 2283 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */ 2284 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */ 2285 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */ 2286 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */ 2287 2288 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */ 2289 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */ 2290 2291 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE }, 2292 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE }, 2293 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */ 2294 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */ 2295 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */ 2296 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */ 2297#endif 2298 2299 { 0x0044, UCHAR_HEX_DIGIT, TRUE }, 2300 { 0xff46, UCHAR_HEX_DIGIT, TRUE }, 2301 { 0x0047, UCHAR_HEX_DIGIT, FALSE }, 2302 2303 { 0x30fb, UCHAR_HYPHEN, TRUE }, 2304 { 0xfe58, UCHAR_HYPHEN, FALSE }, 2305 2306 { 0x2172, UCHAR_ID_CONTINUE, TRUE }, 2307 { 0x0307, UCHAR_ID_CONTINUE, TRUE }, 2308 { 0x005c, UCHAR_ID_CONTINUE, FALSE }, 2309 2310 { 0x2172, UCHAR_ID_START, TRUE }, 2311 { 0x007a, UCHAR_ID_START, TRUE }, 2312 { 0x0039, UCHAR_ID_START, FALSE }, 2313 2314 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE }, 2315 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE }, 2316 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE }, 2317 2318 { 0x200c, UCHAR_JOIN_CONTROL, TRUE }, 2319 { 0x2029, UCHAR_JOIN_CONTROL, FALSE }, 2320 2321 { 0x1d7bc, UCHAR_LOWERCASE, TRUE }, 2322 { 0x0345, UCHAR_LOWERCASE, TRUE }, 2323 { 0x0030, UCHAR_LOWERCASE, FALSE }, 2324 2325 { 0x1d7a9, UCHAR_MATH, TRUE }, 2326 { 0x2135, UCHAR_MATH, TRUE }, 2327 { 0x0062, UCHAR_MATH, FALSE }, 2328 2329 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE }, 2330 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE }, 2331 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE }, 2332 2333 { 0x0022, UCHAR_QUOTATION_MARK, TRUE }, 2334 { 0xff62, UCHAR_QUOTATION_MARK, TRUE }, 2335 { 0xd840, UCHAR_QUOTATION_MARK, FALSE }, 2336 2337 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE }, 2338 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE }, 2339 2340 { 0x1d44a, UCHAR_UPPERCASE, TRUE }, 2341 { 0x2162, UCHAR_UPPERCASE, TRUE }, 2342 { 0x0345, UCHAR_UPPERCASE, FALSE }, 2343 2344 { 0x0020, UCHAR_WHITE_SPACE, TRUE }, 2345 { 0x202f, UCHAR_WHITE_SPACE, TRUE }, 2346 { 0x3001, UCHAR_WHITE_SPACE, FALSE }, 2347 2348 { 0x0711, UCHAR_XID_CONTINUE, TRUE }, 2349 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE }, 2350 { 0x007c, UCHAR_XID_CONTINUE, FALSE }, 2351 2352 { 0x16ee, UCHAR_XID_START, TRUE }, 2353 { 0x23456, UCHAR_XID_START, TRUE }, 2354 { 0x1d1aa, UCHAR_XID_START, FALSE }, 2355 2356 /* 2357 * Version break: 2358 * The following properties are only supported starting with the 2359 * Unicode version indicated in the second field. 2360 */ 2361 { -1, 0x320, 0 }, 2362 2363 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE }, 2364 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE }, 2365 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE }, 2366 2367 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */ 2368 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */ 2369 { 0xe0041, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */ 2370 { 0xe0100, UCHAR_DEPRECATED, FALSE }, 2371 2372 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE }, 2373 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE }, 2374 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE }, 2375 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */ 2376 2377 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE }, 2378 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE }, 2379 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */ 2380 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE }, 2381 2382 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE }, 2383 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE }, 2384 2385 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE }, 2386 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE }, 2387 2388 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE }, 2389 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE }, 2390 2391 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE }, 2392 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE }, 2393 2394 { 0x2e9b, UCHAR_RADICAL, TRUE }, 2395 { 0x4e00, UCHAR_RADICAL, FALSE }, 2396 2397 { 0x012f, UCHAR_SOFT_DOTTED, TRUE }, 2398 { 0x0049, UCHAR_SOFT_DOTTED, FALSE }, 2399 2400 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE }, 2401 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE }, 2402 2403 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */ 2404 2405 { 0x002e, UCHAR_S_TERM, TRUE }, 2406 { 0x0061, UCHAR_S_TERM, FALSE }, 2407 2408 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE }, 2409 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE }, 2410 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE }, 2411 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE }, 2412 2413 /* enum/integer type properties */ 2414 2415 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */ 2416 /* test default Bidi classes for unassigned code points */ 2417 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2418 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2419 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2420 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */ 2421 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */ 2422 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2423 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2424 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2425 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2426 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2427 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2428 2429 { 0x0605, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2430 { 0x061c, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2431 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2432 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2433 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2434 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2435 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2436 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2437 2438 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS }, 2439 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU }, 2440 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS }, 2441 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG }, 2442 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU }, 2443 { 0x1AFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK }, 2444 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA }, 2445 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS }, 2446 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK }, 2447 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK }, 2448 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B }, 2449 2450 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */ 2451 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 }, 2452 2453 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK }, 2454 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT }, 2455 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE }, 2456 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL }, 2457 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL }, 2458 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL }, 2459 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL }, 2460 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT }, 2461 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE }, 2462 2463 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL }, 2464 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW }, 2465 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2466 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH }, 2467 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2468 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH }, 2469 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2470 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2471 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2472 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2473 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL }, 2474 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2475 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2476 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */ 2477 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL }, 2478 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2479 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2480 2481 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */ 2482 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 }, 2483 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */ 2484 2485 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP }, 2486 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN }, 2487 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH }, 2488 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH }, 2489 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL }, 2490 2491 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING }, 2492 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING }, 2493 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING }, 2494 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING }, 2495 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING }, 2496 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT }, 2497 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT }, 2498 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT }, 2499 2500 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */ 2501 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN }, 2502 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN }, 2503 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION }, 2504 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION }, 2505 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2506 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2507 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2508 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2509 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE }, 2510 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE }, 2511 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE }, 2512 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION }, 2513 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS }, 2514 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC }, 2515 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC }, 2516 2517 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */ 2518 2519 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */ 2520 2521 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2522 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2523 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2524 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2525 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */ 2526 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */ 2527 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2528 2529 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2530 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */ 2531 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */ 2532 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2533 2534 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2535 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2536 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2537 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2538 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */ 2539 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */ 2540 2541 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2542 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */ 2543 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */ 2544 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2545 2546 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2547 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2548 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2549 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2550 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */ 2551 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */ 2552 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2553 2554 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2555 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */ 2556 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */ 2557 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2558 2559 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2560 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2561 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2562 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2563 2564 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2565 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2566 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2567 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2568 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2569 2570 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2571 2572 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */ 2573 2574 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE }, 2575 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE }, 2576 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE }, 2577 2578 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE }, 2579 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE }, 2580 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE }, 2581 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE }, 2582 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE }, 2583 2584 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION }, 2585 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC }, 2586 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS }, 2587 2588 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE }, 2589 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC }, 2590 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI }, 2591 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN }, 2592 2593 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 }, 2594 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 }, 2595 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 }, 2596 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL }, 2597 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT }, 2598 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV }, 2599 2600 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT }, 2601 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND }, 2602 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL }, 2603 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V }, 2604 2605 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER }, 2606 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER }, 2607 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC }, 2608 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM }, 2609 2610 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER }, 2611 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER }, 2612 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE }, 2613 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP }, 2614 2615 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */ 2616 2617 /* unassigned code points in new default Bidi R blocks */ 2618 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2619 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2620 2621 /* test some script codes >127 */ 2622 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM }, 2623 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU }, 2624 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN }, 2625 2626 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */ 2627 2628 /* value changed in Unicode 6.0 */ 2629 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL }, 2630 2631 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */ 2632 2633 /* unassigned code points in new/changed default Bidi AL blocks */ 2634 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2635 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2636 2637 { -1, 0x630, 0 }, /* version break for Unicode 6.3 */ 2638 2639 /* unassigned code points in the currency symbols block now default to ET */ 2640 { 0x20C0, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR }, 2641 { 0x20CF, UCHAR_BIDI_CLASS, U_EUROPEAN_NUMBER_TERMINATOR }, 2642 2643 /* new property in Unicode 6.3 */ 2644 { 0x0027, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE }, 2645 { 0x0028, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN }, 2646 { 0x0029, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE }, 2647 { 0xFF5C, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_NONE }, 2648 { 0xFF5B, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_OPEN }, 2649 { 0xFF5D, UCHAR_BIDI_PAIRED_BRACKET_TYPE, U_BPT_CLOSE }, 2650 2651 /* undefined UProperty values */ 2652 { 0x61, 0x4a7, 0 }, 2653 { 0x234bc, 0x15ed, 0 } 2654 }; 2655 2656 UVersionInfo version; 2657 UChar32 c; 2658 int32_t i, result, uVersion; 2659 UProperty which; 2660 2661 /* what is our Unicode version? */ 2662 u_getUnicodeVersion(version); 2663 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */ 2664 2665 u_charAge(0x20, version); 2666 if(version[0]==0) { 2667 /* no additional properties available */ 2668 log_err("TestAdditionalProperties: no additional properties available, not tested\n"); 2669 return; 2670 } 2671 2672 /* test u_charAge() */ 2673 for(i=0; i<sizeof(charAges)/sizeof(charAges[0]); ++i) { 2674 u_charAge(charAges[i].c, version); 2675 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) { 2676 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n", 2677 charAges[i].c, 2678 version[0], version[1], version[2], version[3], 2679 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]); 2680 } 2681 } 2682 2683 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 || 2684 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 || 2685 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */ 2686 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/ 2687 u_getIntPropertyMinValue(0x2345)!=0 2688 ) { 2689 log_err("error: u_getIntPropertyMinValue() wrong\n"); 2690 } 2691 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) { 2692 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n"); 2693 } 2694 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) { 2695 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n"); 2696 } 2697 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) { 2698 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n"); 2699 } 2700 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) { 2701 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n"); 2702 } 2703 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) { 2704 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n"); 2705 } 2706 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) { 2707 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n"); 2708 } 2709 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) { 2710 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n"); 2711 } 2712 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) { 2713 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n"); 2714 } 2715 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) { 2716 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n"); 2717 } 2718 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) { 2719 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n"); 2720 } 2721 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) { 2722 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n"); 2723 } 2724 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) { 2725 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n"); 2726 } 2727 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) { 2728 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n"); 2729 } 2730 if(u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE)!=(int32_t)U_BPT_COUNT-1) { 2731 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_PAIRED_BRACKET_TYPE) wrong\n"); 2732 } 2733 /*JB#2410*/ 2734 if( u_getIntPropertyMaxValue(0x2345)!=-1) { 2735 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n"); 2736 } 2737 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) { 2738 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n"); 2739 } 2740 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) { 2741 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n"); 2742 } 2743 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) { 2744 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n"); 2745 } 2746 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) { 2747 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n"); 2748 } 2749 2750 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */ 2751 for(i=0; i<sizeof(props)/sizeof(props[0]); ++i) { 2752 const char *whichName; 2753 2754 if(props[i][0]<0) { 2755 /* Unicode version break */ 2756 if(uVersion<props[i][1]) { 2757 break; /* do not test properties that are not yet supported */ 2758 } else { 2759 continue; /* skip this row */ 2760 } 2761 } 2762 2763 c=(UChar32)props[i][0]; 2764 which=(UProperty)props[i][1]; 2765 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME); 2766 2767 if(which<UCHAR_INT_START) { 2768 result=u_hasBinaryProperty(c, which); 2769 if(result!=props[i][2]) { 2770 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n", 2771 c, whichName, result, i); 2772 } 2773 } 2774 2775 result=u_getIntPropertyValue(c, which); 2776 if(result!=props[i][2]) { 2777 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n", 2778 c, whichName, result, props[i][2], i); 2779 } 2780 2781 /* test separate functions, too */ 2782 switch((UProperty)props[i][1]) { 2783 case UCHAR_ALPHABETIC: 2784 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) { 2785 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n", 2786 props[i][0], result, i); 2787 } 2788 break; 2789 case UCHAR_LOWERCASE: 2790 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) { 2791 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n", 2792 props[i][0], result, i); 2793 } 2794 break; 2795 case UCHAR_UPPERCASE: 2796 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) { 2797 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n", 2798 props[i][0], result, i); 2799 } 2800 break; 2801 case UCHAR_WHITE_SPACE: 2802 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) { 2803 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n", 2804 props[i][0], result, i); 2805 } 2806 break; 2807 default: 2808 break; 2809 } 2810 } 2811} 2812 2813static void 2814TestNumericProperties(void) { 2815 /* see UnicodeData.txt, DerivedNumericValues.txt */ 2816 static const struct { 2817 UChar32 c; 2818 int32_t type; 2819 double numValue; 2820 } values[]={ 2821 { 0x0F33, U_NT_NUMERIC, -1./2. }, 2822 { 0x0C66, U_NT_DECIMAL, 0 }, 2823 { 0x96f6, U_NT_NUMERIC, 0 }, 2824 { 0xa833, U_NT_NUMERIC, 1./16. }, 2825 { 0x2152, U_NT_NUMERIC, 1./10. }, 2826 { 0x2151, U_NT_NUMERIC, 1./9. }, 2827 { 0x1245f, U_NT_NUMERIC, 1./8. }, 2828 { 0x2150, U_NT_NUMERIC, 1./7. }, 2829 { 0x2159, U_NT_NUMERIC, 1./6. }, 2830 { 0x09f6, U_NT_NUMERIC, 3./16. }, 2831 { 0x2155, U_NT_NUMERIC, 1./5. }, 2832 { 0x00BD, U_NT_NUMERIC, 1./2. }, 2833 { 0x0031, U_NT_DECIMAL, 1. }, 2834 { 0x4e00, U_NT_NUMERIC, 1. }, 2835 { 0x58f1, U_NT_NUMERIC, 1. }, 2836 { 0x10320, U_NT_NUMERIC, 1. }, 2837 { 0x0F2B, U_NT_NUMERIC, 3./2. }, 2838 { 0x00B2, U_NT_DIGIT, 2. }, 2839 { 0x5f10, U_NT_NUMERIC, 2. }, 2840 { 0x1813, U_NT_DECIMAL, 3. }, 2841 { 0x5f0e, U_NT_NUMERIC, 3. }, 2842 { 0x2173, U_NT_NUMERIC, 4. }, 2843 { 0x8086, U_NT_NUMERIC, 4. }, 2844 { 0x278E, U_NT_DIGIT, 5. }, 2845 { 0x1D7F2, U_NT_DECIMAL, 6. }, 2846 { 0x247A, U_NT_DIGIT, 7. }, 2847 { 0x7396, U_NT_NUMERIC, 9. }, 2848 { 0x1372, U_NT_NUMERIC, 10. }, 2849 { 0x216B, U_NT_NUMERIC, 12. }, 2850 { 0x16EE, U_NT_NUMERIC, 17. }, 2851 { 0x249A, U_NT_NUMERIC, 19. }, 2852 { 0x303A, U_NT_NUMERIC, 30. }, 2853 { 0x5345, U_NT_NUMERIC, 30. }, 2854 { 0x32B2, U_NT_NUMERIC, 37. }, 2855 { 0x1375, U_NT_NUMERIC, 40. }, 2856 { 0x10323, U_NT_NUMERIC, 50. }, 2857 { 0x0BF1, U_NT_NUMERIC, 100. }, 2858 { 0x964c, U_NT_NUMERIC, 100. }, 2859 { 0x217E, U_NT_NUMERIC, 500. }, 2860 { 0x2180, U_NT_NUMERIC, 1000. }, 2861 { 0x4edf, U_NT_NUMERIC, 1000. }, 2862 { 0x2181, U_NT_NUMERIC, 5000. }, 2863 { 0x137C, U_NT_NUMERIC, 10000. }, 2864 { 0x4e07, U_NT_NUMERIC, 10000. }, 2865 { 0x12432, U_NT_NUMERIC, 216000. }, 2866 { 0x12433, U_NT_NUMERIC, 432000. }, 2867 { 0x4ebf, U_NT_NUMERIC, 100000000. }, 2868 { 0x5146, U_NT_NUMERIC, 1000000000000. }, 2869 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2870 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2871 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2872 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2873 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2874 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2875 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2876 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE } 2877 }; 2878 2879 double nv; 2880 UChar32 c; 2881 int32_t i, type; 2882 2883 for(i=0; i<LENGTHOF(values); ++i) { 2884 c=values[i].c; 2885 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE); 2886 nv=u_getNumericValue(c); 2887 2888 if(type!=values[i].type) { 2889 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type); 2890 } 2891 if(0.000001 <= fabs(nv - values[i].numValue)) { 2892 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue); 2893 } 2894 } 2895} 2896 2897/** 2898 * Test the property names and property value names API. 2899 */ 2900static void 2901TestPropertyNames(void) { 2902 int32_t p, v, choice=0, rev; 2903 UBool atLeastSomething = FALSE; 2904 2905 for (p=0; ; ++p) { 2906 UProperty propEnum = (UProperty)p; 2907 UBool sawProp = FALSE; 2908 if(p > 10 && !atLeastSomething) { 2909 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice); 2910 return; 2911 } 2912 2913 for (choice=0; ; ++choice) { 2914 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice); 2915 if (name) { 2916 if (!sawProp) 2917 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff); 2918 log_verbose("%d=\"%s\"", choice, name); 2919 sawProp = TRUE; 2920 atLeastSomething = TRUE; 2921 2922 /* test reverse mapping */ 2923 rev = u_getPropertyEnum(name); 2924 if (rev != p) { 2925 log_err("Property round-trip failure: %d -> %s -> %d\n", 2926 p, name, rev); 2927 } 2928 } 2929 if (!name && choice>0) break; 2930 } 2931 if (sawProp) { 2932 /* looks like a valid property; check the values */ 2933 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME); 2934 int32_t max = 0; 2935 if (p == UCHAR_CANONICAL_COMBINING_CLASS) { 2936 max = 255; 2937 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) { 2938 /* it's far too slow to iterate all the way up to 2939 the real max, U_GC_P_MASK */ 2940 max = U_GC_NL_MASK; 2941 } else if (p == UCHAR_BLOCK) { 2942 /* UBlockCodes, unlike other values, start at 1 */ 2943 max = 1; 2944 } 2945 log_verbose("\n"); 2946 for (v=-1; ; ++v) { 2947 UBool sawValue = FALSE; 2948 for (choice=0; ; ++choice) { 2949 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice); 2950 if (vname) { 2951 if (!sawValue) log_verbose(" %s, value %d:", pname, v); 2952 log_verbose("%d=\"%s\"", choice, vname); 2953 sawValue = TRUE; 2954 2955 /* test reverse mapping */ 2956 rev = u_getPropertyValueEnum(propEnum, vname); 2957 if (rev != v) { 2958 log_err("Value round-trip failure (%s): %d -> %s -> %d\n", 2959 pname, v, vname, rev); 2960 } 2961 } 2962 if (!vname && choice>0) break; 2963 } 2964 if (sawValue) { 2965 log_verbose("\n"); 2966 } 2967 if (!sawValue && v>=max) break; 2968 } 2969 } 2970 if (!sawProp) { 2971 if (p>=UCHAR_STRING_LIMIT) { 2972 break; 2973 } else if (p>=UCHAR_DOUBLE_LIMIT) { 2974 p = UCHAR_STRING_START - 1; 2975 } else if (p>=UCHAR_MASK_LIMIT) { 2976 p = UCHAR_DOUBLE_START - 1; 2977 } else if (p>=UCHAR_INT_LIMIT) { 2978 p = UCHAR_MASK_START - 1; 2979 } else if (p>=UCHAR_BINARY_LIMIT) { 2980 p = UCHAR_INT_START - 1; 2981 } 2982 } 2983 } 2984} 2985 2986/** 2987 * Test the property values API. See JB#2410. 2988 */ 2989static void 2990TestPropertyValues(void) { 2991 int32_t i, p, min, max; 2992 UErrorCode ec; 2993 2994 /* Min should be 0 for everything. */ 2995 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */ 2996 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) { 2997 UProperty propEnum = (UProperty)p; 2998 min = u_getIntPropertyMinValue(propEnum); 2999 if (min != 0) { 3000 if (p == UCHAR_BLOCK) { 3001 /* This is okay...for now. See JB#2487. 3002 TODO Update this for JB#2487. */ 3003 } else { 3004 const char* name; 3005 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME); 3006 if (name == NULL) 3007 name = "<ERROR>"; 3008 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n", 3009 name, min); 3010 } 3011 } 3012 } 3013 3014 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 || 3015 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) { 3016 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n"); 3017 } 3018 3019 /* Max should be -1 for invalid properties. */ 3020 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE); 3021 if (max != -1) { 3022 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n", 3023 max); 3024 } 3025 3026 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */ 3027 for (i=0; i<2; ++i) { 3028 int32_t script; 3029 const char* desc; 3030 ec = U_ZERO_ERROR; 3031 switch (i) { 3032 case 0: 3033 script = uscript_getScript(-1, &ec); 3034 desc = "uscript_getScript(-1)"; 3035 break; 3036 case 1: 3037 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT); 3038 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)"; 3039 break; 3040 default: 3041 log_err("Internal test error. Too many scripts\n"); 3042 return; 3043 } 3044 /* We don't explicitly test ec. It should be U_FAILURE but it 3045 isn't documented as such. */ 3046 if (script != (int32_t)USCRIPT_INVALID_CODE) { 3047 log_err("FAIL: %s = %d, exp. 0\n", 3048 desc, script); 3049 } 3050 } 3051} 3052 3053/* various tests for consistency of UCD data and API behavior */ 3054static void 3055TestConsistency() { 3056 char buffer[300]; 3057 USet *set1, *set2, *set3, *set4; 3058 UErrorCode errorCode; 3059 3060 UChar32 start, end; 3061 int32_t i, length; 3062 3063 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10); 3064 U_STRING_DECL(dashPattern, "[:Dash:]", 8); 3065 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13); 3066 U_STRING_DECL(formatPattern, "[:Cf:]", 6); 3067 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14); 3068 3069 U_STRING_DECL(mathBlocksPattern, 3070 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]", 3071 1+32+46+46+45+43+1+1); /* +1 for NUL */ 3072 U_STRING_DECL(mathPattern, "[:Math:]", 8); 3073 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6); 3074 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14); 3075 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20); 3076 3077 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10); 3078 U_STRING_INIT(dashPattern, "[:Dash:]", 8); 3079 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13); 3080 U_STRING_INIT(formatPattern, "[:Cf:]", 6); 3081 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14); 3082 3083 U_STRING_INIT(mathBlocksPattern, 3084 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]", 3085 1+32+46+46+45+43+1+1); /* +1 for NUL */ 3086 U_STRING_INIT(mathPattern, "[:Math:]", 8); 3087 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6); 3088 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14); 3089 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20); 3090 3091 /* 3092 * It used to be that UCD.html and its precursors said 3093 * "Those dashes used to mark connections between pieces of words, 3094 * plus the Katakana middle dot." 3095 * 3096 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash 3097 * but not from Hyphen. 3098 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html. 3099 * Therefore, do not show errors when testing the Hyphen property. 3100 */ 3101 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n" 3102 "known to the UTC and not considered errors.\n"); 3103 3104 errorCode=U_ZERO_ERROR; 3105 set1=uset_openPattern(hyphenPattern, 10, &errorCode); 3106 set2=uset_openPattern(dashPattern, 8, &errorCode); 3107 if(U_SUCCESS(errorCode)) { 3108 /* remove the Katakana middle dot(s) from set1 */ 3109 uset_remove(set1, 0x30fb); 3110 uset_remove(set1, 0xff65); /* halfwidth variant */ 3111 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE); 3112 } else { 3113 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3114 } 3115 3116 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */ 3117 set3=uset_openPattern(formatPattern, 6, &errorCode); 3118 set4=uset_openPattern(alphaPattern, 14, &errorCode); 3119 if(U_SUCCESS(errorCode)) { 3120 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE); 3121 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE); 3122 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE); 3123 } else { 3124 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3125 } 3126 3127 uset_close(set1); 3128 uset_close(set2); 3129 uset_close(set3); 3130 uset_close(set4); 3131 3132 /* 3133 * Check that each lowercase character has "small" in its name 3134 * and not "capital". 3135 * There are some such characters, some of which seem odd. 3136 * Use the verbose flag to see these notices. 3137 */ 3138 errorCode=U_ZERO_ERROR; 3139 set1=uset_openPattern(lowerPattern, 13, &errorCode); 3140 if(U_SUCCESS(errorCode)) { 3141 for(i=0;; ++i) { 3142 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode); 3143 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 3144 break; /* done */ 3145 } 3146 if(U_FAILURE(errorCode)) { 3147 log_err("error iterating over [:Lowercase:] at item %d: %s\n", 3148 i, u_errorName(errorCode)); 3149 break; 3150 } 3151 if(length!=0) { 3152 break; /* done with code points, got a string or -1 */ 3153 } 3154 3155 while(start<=end) { 3156 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode); 3157 if(U_FAILURE(errorCode)) { 3158 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode)); 3159 errorCode=U_ZERO_ERROR; 3160 } 3161 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) && 3162 strstr(buffer, "SMALL CAPITAL")==NULL 3163 ) { 3164 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer); 3165 } 3166 ++start; 3167 } 3168 } 3169 } else { 3170 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3171 } 3172 uset_close(set1); 3173 3174 /* verify that all assigned characters in Math blocks are exactly Math characters */ 3175 errorCode=U_ZERO_ERROR; 3176 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode); 3177 set2=uset_openPattern(mathPattern, 8, &errorCode); 3178 set3=uset_openPattern(unassignedPattern, 6, &errorCode); 3179 if(U_SUCCESS(errorCode)) { 3180 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */ 3181 uset_complement(set3); /* assigned characters */ 3182 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */ 3183 compareUSets(set1, set2, 3184 "[assigned Math block chars]", "[math blocks]&[:Math:]", 3185 TRUE); 3186 } else { 3187 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3188 } 3189 uset_close(set1); 3190 uset_close(set2); 3191 uset_close(set3); 3192 3193 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */ 3194 errorCode=U_ZERO_ERROR; 3195 set1=uset_openPattern(unknownPattern, 14, &errorCode); 3196 set2=uset_openPattern(reservedPattern, 20, &errorCode); 3197 if(U_SUCCESS(errorCode)) { 3198 compareUSets(set1, set2, 3199 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]", 3200 TRUE); 3201 } else { 3202 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3203 } 3204 uset_close(set1); 3205 uset_close(set2); 3206} 3207 3208/* 3209 * Starting with ICU4C 3.4, the core Unicode properties files 3210 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu) 3211 * are hardcoded in the common DLL and therefore not included 3212 * in the data package any more. 3213 * Test requiring these files are disabled so that 3214 * we need not jump through hoops (like adding snapshots of these files 3215 * to testdata). 3216 * See Jitterbug 4497. 3217 */ 3218#define HARDCODED_DATA_4497 1 3219 3220/* API coverage for ucase.c */ 3221static void TestUCase() { 3222#if !HARDCODED_DATA_4497 3223 UDataMemory *pData; 3224 UCaseProps *csp; 3225 const UCaseProps *ccsp; 3226 UErrorCode errorCode; 3227 3228 /* coverage for ucase_openBinary() */ 3229 errorCode=U_ZERO_ERROR; 3230 pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode); 3231 if(U_FAILURE(errorCode)) { 3232 log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n", 3233 u_errorName(errorCode)); 3234 return; 3235 } 3236 3237 csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode); 3238 if(U_FAILURE(errorCode)) { 3239 log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n", 3240 u_errorName(errorCode)); 3241 udata_close(pData); 3242 return; 3243 } 3244 3245 if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */ 3246 log_err("ucase_openBinary() does not seem to return working UCaseProps\n"); 3247 } 3248 3249 ucase_close(csp); 3250 udata_close(pData); 3251 3252 /* coverage for ucase_getDummy() */ 3253 errorCode=U_ZERO_ERROR; 3254 ccsp=ucase_getDummy(&errorCode); 3255 if(ucase_tolower(ccsp, 0x41)!=0x41) { 3256 log_err("ucase_tolower(dummy, A)!=A\n"); 3257 } 3258#endif 3259} 3260 3261/* API coverage for ubidi_props.c */ 3262static void TestUBiDiProps() { 3263#if !HARDCODED_DATA_4497 3264 UDataMemory *pData; 3265 UBiDiProps *bdp; 3266 const UBiDiProps *cbdp; 3267 UErrorCode errorCode; 3268 3269 /* coverage for ubidi_openBinary() */ 3270 errorCode=U_ZERO_ERROR; 3271 pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode); 3272 if(U_FAILURE(errorCode)) { 3273 log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n", 3274 u_errorName(errorCode)); 3275 return; 3276 } 3277 3278 bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode); 3279 if(U_FAILURE(errorCode)) { 3280 log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n", 3281 u_errorName(errorCode)); 3282 udata_close(pData); 3283 return; 3284 } 3285 3286 if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */ 3287 log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n"); 3288 } 3289 3290 ubidi_closeProps(bdp); 3291 udata_close(pData); 3292 3293 /* coverage for ubidi_getDummy() */ 3294 errorCode=U_ZERO_ERROR; 3295 cbdp=ubidi_getDummy(&errorCode); 3296 if(ubidi_getClass(cbdp, 0x20)!=0) { 3297 log_err("ubidi_getClass(dummy, space)!=0\n"); 3298 } 3299#endif 3300} 3301 3302/* test case folding, compare return values with CaseFolding.txt ------------ */ 3303 3304/* bit set for which case foldings for a character have been tested already */ 3305enum { 3306 CF_SIMPLE=1, 3307 CF_FULL=2, 3308 CF_TURKIC=4, 3309 CF_ALL=7 3310}; 3311 3312static void 3313testFold(UChar32 c, int which, 3314 UChar32 simple, UChar32 turkic, 3315 const UChar *full, int32_t fullLength, 3316 const UChar *turkicFull, int32_t turkicFullLength) { 3317 UChar s[2], t[32]; 3318 UChar32 c2; 3319 int32_t length, length2; 3320 3321 UErrorCode errorCode=U_ZERO_ERROR; 3322 3323 length=0; 3324 U16_APPEND_UNSAFE(s, length, c); 3325 3326 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) { 3327 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple); 3328 } 3329 if((which&CF_FULL)!=0) { 3330 length2=u_strFoldCase(t, LENGTHOF(t), s, length, 0, &errorCode); 3331 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) { 3332 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c); 3333 } 3334 } 3335 if((which&CF_TURKIC)!=0) { 3336 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) { 3337 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple); 3338 } 3339 3340 length2=u_strFoldCase(t, LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode); 3341 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) { 3342 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c); 3343 } 3344 } 3345} 3346 3347/* test that c case-folds to itself */ 3348static void 3349testFoldToSelf(UChar32 c, int which) { 3350 UChar s[2]; 3351 int32_t length; 3352 3353 length=0; 3354 U16_APPEND_UNSAFE(s, length, c); 3355 testFold(c, which, c, c, s, length, s, length); 3356} 3357 3358struct CaseFoldingData { 3359 USet *notSeen; 3360 UChar32 prev, prevSimple; 3361 UChar prevFull[32]; 3362 int32_t prevFullLength; 3363 int which; 3364}; 3365typedef struct CaseFoldingData CaseFoldingData; 3366 3367static void U_CALLCONV 3368caseFoldingLineFn(void *context, 3369 char *fields[][2], int32_t fieldCount, 3370 UErrorCode *pErrorCode) { 3371 CaseFoldingData *pData=(CaseFoldingData *)context; 3372 char *end; 3373 UChar full[32]; 3374 UChar32 c, prev, simple; 3375 int32_t count; 3376 int which; 3377 char status; 3378 3379 /* get code point */ 3380 const char *s=u_skipWhitespace(fields[0][0]); 3381 if(0==strncmp(s, "0000..10FFFF", 12)) { 3382 /* 3383 * Ignore the line 3384 * # @missing: 0000..10FFFF; C; <code point> 3385 * because maps-to-self is already our default, and this line breaks this parser. 3386 */ 3387 return; 3388 } 3389 c=(UChar32)strtoul(s, &end, 16); 3390 end=(char *)u_skipWhitespace(end); 3391 if(end<=fields[0][0] || end!=fields[0][1]) { 3392 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); 3393 *pErrorCode=U_PARSE_ERROR; 3394 return; 3395 } 3396 3397 /* get the status of this mapping */ 3398 status=*u_skipWhitespace(fields[1][0]); 3399 if(status!='C' && status!='S' && status!='F' && status!='T') { 3400 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); 3401 *pErrorCode=U_PARSE_ERROR; 3402 return; 3403 } 3404 3405 /* get the mapping */ 3406 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode); 3407 if(U_FAILURE(*pErrorCode)) { 3408 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); 3409 return; 3410 } 3411 3412 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ 3413 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) { 3414 simple=c; 3415 } 3416 3417 if(c!=(prev=pData->prev)) { 3418 /* 3419 * Test remaining mappings for the previous code point. 3420 * If a turkic folding was not mentioned, then it should fold the same 3421 * as the regular simple case folding. 3422 */ 3423 UChar prevString[2]; 3424 int32_t length; 3425 3426 length=0; 3427 U16_APPEND_UNSAFE(prevString, length, prev); 3428 testFold(prev, (~pData->which)&CF_ALL, 3429 prev, pData->prevSimple, 3430 prevString, length, 3431 pData->prevFull, pData->prevFullLength); 3432 pData->prev=pData->prevSimple=c; 3433 length=0; 3434 U16_APPEND_UNSAFE(pData->prevFull, length, c); 3435 pData->prevFullLength=length; 3436 pData->which=0; 3437 } 3438 3439 /* 3440 * Turn the status into a bit set of case foldings to test. 3441 * Remember non-Turkic case foldings as defaults for Turkic mode. 3442 */ 3443 switch(status) { 3444 case 'C': 3445 which=CF_SIMPLE|CF_FULL; 3446 pData->prevSimple=simple; 3447 u_memcpy(pData->prevFull, full, count); 3448 pData->prevFullLength=count; 3449 break; 3450 case 'S': 3451 which=CF_SIMPLE; 3452 pData->prevSimple=simple; 3453 break; 3454 case 'F': 3455 which=CF_FULL; 3456 u_memcpy(pData->prevFull, full, count); 3457 pData->prevFullLength=count; 3458 break; 3459 case 'T': 3460 which=CF_TURKIC; 3461 break; 3462 default: 3463 which=0; 3464 break; /* won't happen because of test above */ 3465 } 3466 3467 testFold(c, which, simple, simple, full, count, full, count); 3468 3469 /* remember which case foldings of c have been tested */ 3470 pData->which|=which; 3471 3472 /* remove c from the set of ones not mentioned in CaseFolding.txt */ 3473 uset_remove(pData->notSeen, c); 3474} 3475 3476static void 3477TestCaseFolding() { 3478 CaseFoldingData data={ NULL }; 3479 char *fields[3][2]; 3480 UErrorCode errorCode; 3481 3482 static char *lastLine= (char *)"10FFFF; C; 10FFFF;"; 3483 3484 errorCode=U_ZERO_ERROR; 3485 /* test BMP & plane 1 - nothing interesting above */ 3486 data.notSeen=uset_open(0, 0x1ffff); 3487 data.prevFullLength=1; /* length of full case folding of U+0000 */ 3488 3489 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode); 3490 if(U_SUCCESS(errorCode)) { 3491 int32_t i, start, end; 3492 3493 /* add a pseudo-last line to finish testing of the actual last one */ 3494 fields[0][0]=lastLine; 3495 fields[0][1]=lastLine+6; 3496 fields[1][0]=lastLine+7; 3497 fields[1][1]=lastLine+9; 3498 fields[2][0]=lastLine+10; 3499 fields[2][1]=lastLine+17; 3500 caseFoldingLineFn(&data, fields, 3, &errorCode); 3501 3502 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */ 3503 for(i=0; 3504 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) && 3505 U_SUCCESS(errorCode); 3506 ++i 3507 ) { 3508 do { 3509 testFoldToSelf(start, CF_ALL); 3510 } while(++start<=end); 3511 } 3512 } 3513 3514 uset_close(data.notSeen); 3515} 3516