1/* 2******************************************************************************* 3* 4* Copyright (C) 2004-2009, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: gencase.c 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2004aug28 14* created by: Markus W. Scherer 15* 16* This program reads several of the Unicode character database text files, 17* parses them, and the case mapping properties for each character. 18* It then writes a binary file containing the properties 19* that is designed to be used directly for random-access to 20* the properties of each Unicode character. 21*/ 22 23#include <stdio.h> 24#include "unicode/utypes.h" 25#include "unicode/uchar.h" 26#include "unicode/uset.h" 27#include "unicode/putil.h" 28#include "unicode/uclean.h" 29#include "cmemory.h" 30#include "cstring.h" 31#include "uarrsort.h" 32#include "unewdata.h" 33#include "uoptions.h" 34#include "uparse.h" 35#include "uprops.h" 36#include "propsvec.h" 37#include "gencase.h" 38 39#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) 40 41/* data --------------------------------------------------------------------- */ 42 43UPropsVectors *pv; 44 45UBool beVerbose=FALSE, haveCopyright=TRUE; 46 47/* 48 * Unicode set collecting the case-sensitive characters; 49 * see uchar.h UCHAR_CASE_SENSITIVE. 50 * Add code points from case mappings/foldings in 51 * the root locale and with default options. 52 */ 53static USet *caseSensitive; 54 55/* prototypes --------------------------------------------------------------- */ 56 57static void 58parseSpecialCasing(const char *filename, UErrorCode *pErrorCode); 59 60static void 61parseCaseFolding(const char *filename, UErrorCode *pErrorCode); 62 63static void 64parseDB(const char *filename, UErrorCode *pErrorCode); 65 66/* parse files with multiple binary properties ------------------------------ */ 67 68/* TODO: more common code, move functions to uparse.h|c */ 69 70/* TODO: similar to genprops/props2.c but not the same */ 71 72struct Binary { 73 const char *propName; 74 int32_t vecWord; 75 uint32_t vecValue, vecMask; 76}; 77typedef struct Binary Binary; 78 79struct Binaries { 80 const char *ucdFile; 81 const Binary *binaries; 82 int32_t binariesCount; 83}; 84typedef struct Binaries Binaries; 85 86static const Binary 87propListNames[]={ 88 { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK } 89}; 90 91static const Binaries 92propListBinaries={ 93 "PropList", propListNames, LENGTHOF(propListNames) 94}; 95 96static const Binary 97derCorePropsNames[]={ 98 { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK }, 99 { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK }, 100 /* Unicode 5.2 adds Case_Ignorable as a public property. See comments in store.c. */ 101 { "Case_Ignorable", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) } 102}; 103 104static const Binaries 105derCorePropsBinaries={ 106 "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames) 107}; 108 109/* 110 * Treat Word_Break=MidLetter and MidNumLet as a single binary property. 111 * We need not distinguish between them because both add to case-ignorable. 112 * We ignore all other Word_Break values. 113 */ 114static const Binary 115wordBreakNames[]={ 116 { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }, 117 { "MidNumLet", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) } 118}; 119 120static const Binaries 121wordBreakBinaries={ 122 "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames) 123}; 124 125static void U_CALLCONV 126binariesLineFn(void *context, 127 char *fields[][2], int32_t fieldCount, 128 UErrorCode *pErrorCode) { 129 const Binaries *bin; 130 char *s; 131 uint32_t start, end; 132 int32_t i; 133 134 bin=(const Binaries *)context; 135 136 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); 137 if(U_FAILURE(*pErrorCode)) { 138 fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); 139 exit(*pErrorCode); 140 } 141 142 /* parse binary property name */ 143 s=(char *)u_skipWhitespace(fields[1][0]); 144 for(i=0;; ++i) { 145 if(i==bin->binariesCount) { 146 /* ignore unrecognized properties */ 147 return; 148 } 149 if(isToken(bin->binaries[i].propName, s)) { 150 break; 151 } 152 } 153 154 if(bin->binaries[i].vecMask==0) { 155 fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n", 156 (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName); 157 exit(U_INTERNAL_PROGRAM_ERROR); 158 } 159 160 upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode); 161 if(U_FAILURE(*pErrorCode)) { 162 fprintf(stderr, "gencase error: unable to set %s, code: %s\n", 163 bin->binaries[i].propName, u_errorName(*pErrorCode)); 164 exit(*pErrorCode); 165 } 166} 167 168static void 169parseBinariesFile(char *filename, char *basename, const char *suffix, 170 const Binaries *bin, 171 UErrorCode *pErrorCode) { 172 char *fields[2][2]; 173 174 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 175 return; 176 } 177 178 writeUCDFilename(basename, bin->ucdFile, suffix); 179 180 u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); 181 if(U_FAILURE(*pErrorCode)) { 182 fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); 183 } 184} 185 186/* -------------------------------------------------------------------------- */ 187 188enum 189{ 190 HELP_H, 191 HELP_QUESTION_MARK, 192 VERBOSE, 193 COPYRIGHT, 194 DESTDIR, 195 SOURCEDIR, 196 UNICODE_VERSION, 197 ICUDATADIR, 198 CSOURCE 199}; 200 201/* Keep these values in sync with the above enums */ 202static UOption options[]={ 203 UOPTION_HELP_H, 204 UOPTION_HELP_QUESTION_MARK, 205 UOPTION_VERBOSE, 206 UOPTION_COPYRIGHT, 207 UOPTION_DESTDIR, 208 UOPTION_SOURCEDIR, 209 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), 210 UOPTION_ICUDATADIR, 211 UOPTION_DEF("csource", 'C', UOPT_NO_ARG) 212}; 213 214extern int 215main(int argc, char* argv[]) { 216 char filename[300]; 217 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; 218 char *basename=NULL; 219 UErrorCode errorCode=U_ZERO_ERROR; 220 221 U_MAIN_INIT_ARGS(argc, argv); 222 223 /* preset then read command line options */ 224 options[DESTDIR].value=u_getDataDirectory(); 225 options[SOURCEDIR].value=""; 226 options[UNICODE_VERSION].value=""; 227 options[ICUDATADIR].value=u_getDataDirectory(); 228 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 229 230 /* error handling, printing usage message */ 231 if(argc<0) { 232 fprintf(stderr, 233 "error in command line argument \"%s\"\n", 234 argv[-argc]); 235 } 236 if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { 237 /* 238 * Broken into chunks because the C89 standard says the minimum 239 * required supported string length is 509 bytes. 240 */ 241 fprintf(stderr, 242 "Usage: %s [-options] [suffix]\n" 243 "\n" 244 "read the UnicodeData.txt file and other Unicode properties files and\n" 245 "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n" 246 "\n", 247 argv[0]); 248 fprintf(stderr, 249 "Options:\n" 250 "\t-h or -? or --help this usage text\n" 251 "\t-v or --verbose verbose output\n" 252 "\t-c or --copyright include a copyright notice\n" 253 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" 254 "\t-C or --csource generate a .c source file rather than the .icu binary\n"); 255 fprintf(stderr, 256 "\t-d or --destdir destination directory, followed by the path\n" 257 "\t-s or --sourcedir source directory, followed by the path\n" 258 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 259 "\t followed by path, defaults to %s\n" 260 "\tsuffix suffix that is to be appended with a '-'\n" 261 "\t to the source file basenames before opening;\n" 262 "\t 'gencase new' will read UnicodeData-new.txt etc.\n", 263 u_getDataDirectory()); 264 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 265 } 266 267 /* get the options values */ 268 beVerbose=options[VERBOSE].doesOccur; 269 haveCopyright=options[COPYRIGHT].doesOccur; 270 srcDir=options[SOURCEDIR].value; 271 destDir=options[DESTDIR].value; 272 273 if(argc>=2) { 274 suffix=argv[1]; 275 } else { 276 suffix=NULL; 277 } 278 279 if(options[UNICODE_VERSION].doesOccur) { 280 setUnicodeVersion(options[UNICODE_VERSION].value); 281 } 282 /* else use the default dataVersion in store.c */ 283 284 if (options[ICUDATADIR].doesOccur) { 285 u_setDataDirectory(options[ICUDATADIR].value); 286 } 287 288 /* prepare the filename beginning with the source dir */ 289 uprv_strcpy(filename, srcDir); 290 basename=filename+uprv_strlen(filename); 291 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 292 *basename++=U_FILE_SEP_CHAR; 293 } 294 295 /* initialize */ 296 pv=upvec_open(2, &errorCode); 297 caseSensitive=uset_open(1, 0); /* empty set (start>end) */ 298 299 /* process SpecialCasing.txt */ 300 writeUCDFilename(basename, "SpecialCasing", suffix); 301 parseSpecialCasing(filename, &errorCode); 302 303 /* process CaseFolding.txt */ 304 writeUCDFilename(basename, "CaseFolding", suffix); 305 parseCaseFolding(filename, &errorCode); 306 307 /* process additional properties files */ 308 *basename=0; 309 310 parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode); 311 312 parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode); 313 314 if(ucdVersion>=UNI_4_1) { 315 parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode); 316 } 317 318 /* process UnicodeData.txt */ 319 writeUCDFilename(basename, "UnicodeData", suffix); 320 parseDB(filename, &errorCode); 321 322 /* process parsed data */ 323 makeCaseClosure(); 324 325 makeExceptions(); 326 327 if(U_SUCCESS(errorCode)) { 328 /* write the properties data file */ 329 generateData(destDir, options[CSOURCE].doesOccur); 330 } 331 332 u_cleanup(); 333 return errorCode; 334} 335 336U_CFUNC void 337writeUCDFilename(char *basename, const char *filename, const char *suffix) { 338 int32_t length=(int32_t)uprv_strlen(filename); 339 uprv_strcpy(basename, filename); 340 if(suffix!=NULL) { 341 basename[length++]='-'; 342 uprv_strcpy(basename+length, suffix); 343 length+=(int32_t)uprv_strlen(suffix); 344 } 345 uprv_strcpy(basename+length, ".txt"); 346} 347 348/* TODO: move to toolutil */ 349U_CFUNC UBool 350isToken(const char *token, const char *s) { 351 const char *z; 352 int32_t j; 353 354 s=u_skipWhitespace(s); 355 for(j=0;; ++j) { 356 if(token[j]!=0) { 357 if(s[j]!=token[j]) { 358 break; 359 } 360 } else { 361 z=u_skipWhitespace(s+j); 362 if(*z==';' || *z==0) { 363 return TRUE; 364 } else { 365 break; 366 } 367 } 368 } 369 370 return FALSE; 371} 372 373static int32_t 374getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { 375 const char *t, *z; 376 int32_t i, j; 377 378 s=u_skipWhitespace(s); 379 for(i=0; i<countTokens; ++i) { 380 t=tokens[i]; 381 if(t!=NULL) { 382 for(j=0;; ++j) { 383 if(t[j]!=0) { 384 if(s[j]!=t[j]) { 385 break; 386 } 387 } else { 388 z=u_skipWhitespace(s+j); 389 if(*z==';' || *z==0 || *z=='#' || *z=='\r' || *z=='\n') { 390 return i; 391 } else { 392 break; 393 } 394 } 395 } 396 } 397 } 398 return -1; 399} 400 401static void 402_set_addAll(USet *set, const UChar *s, int32_t length) { 403 UChar32 c; 404 int32_t i; 405 406 /* needs length>=0 */ 407 for(i=0; i<length; /* U16_NEXT advances i */) { 408 U16_NEXT(s, i, length, c); 409 uset_add(set, c); 410 } 411} 412 413/* parser for SpecialCasing.txt --------------------------------------------- */ 414 415#define MAX_SPECIAL_CASING_COUNT 500 416 417static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT]; 418static int32_t specialCasingCount=0; 419 420static void U_CALLCONV 421specialCasingLineFn(void *context, 422 char *fields[][2], int32_t fieldCount, 423 UErrorCode *pErrorCode) { 424 char *end; 425 426 /* get code point */ 427 specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); 428 end=(char *)u_skipWhitespace(end); 429 if(end<=fields[0][0] || end!=fields[0][1]) { 430 fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); 431 *pErrorCode=U_PARSE_ERROR; 432 exit(U_PARSE_ERROR); 433 } 434 435 /* is this a complex mapping? */ 436 if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') { 437 /* there is some condition text in the fifth field */ 438 specialCasings[specialCasingCount].isComplex=TRUE; 439 440 /* do not store any actual mappings for this */ 441 specialCasings[specialCasingCount].lowerCase[0]=0; 442 specialCasings[specialCasingCount].upperCase[0]=0; 443 specialCasings[specialCasingCount].titleCase[0]=0; 444 } else { 445 /* just set the "complex" flag and get the case mappings */ 446 specialCasings[specialCasingCount].isComplex=FALSE; 447 specialCasings[specialCasingCount].lowerCase[0]= 448 (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode); 449 specialCasings[specialCasingCount].upperCase[0]= 450 (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode); 451 specialCasings[specialCasingCount].titleCase[0]= 452 (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode); 453 if(U_FAILURE(*pErrorCode)) { 454 fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]); 455 exit(*pErrorCode); 456 } 457 458 uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code); 459 _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]); 460 _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]); 461 _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]); 462 } 463 464 if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) { 465 fprintf(stderr, "gencase: too many special casing mappings\n"); 466 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 467 exit(U_INDEX_OUTOFBOUNDS_ERROR); 468 } 469} 470 471static int32_t U_CALLCONV 472compareSpecialCasings(const void *context, const void *left, const void *right) { 473 return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code; 474} 475 476static void 477parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) { 478 char *fields[5][2]; 479 int32_t i, j; 480 481 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 482 return; 483 } 484 485 u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode); 486 487 /* sort the special casing entries by code point */ 488 if(specialCasingCount>0) { 489 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), 490 compareSpecialCasings, NULL, FALSE, pErrorCode); 491 } 492 if(U_FAILURE(*pErrorCode)) { 493 return; 494 } 495 496 /* replace multiple entries for any code point by one "complex" one */ 497 j=0; 498 for(i=1; i<specialCasingCount; ++i) { 499 if(specialCasings[i-1].code==specialCasings[i].code) { 500 /* there is a duplicate code point */ 501 specialCasings[i-1].code=0x7fffffff; /* remove this entry in the following sorting */ 502 specialCasings[i].isComplex=TRUE; /* make the following one complex */ 503 specialCasings[i].lowerCase[0]=0; 504 specialCasings[i].upperCase[0]=0; 505 specialCasings[i].titleCase[0]=0; 506 ++j; 507 } 508 } 509 510 /* if some entries just were removed, then re-sort */ 511 if(j>0) { 512 uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), 513 compareSpecialCasings, NULL, FALSE, pErrorCode); 514 specialCasingCount-=j; 515 } 516 if(U_FAILURE(*pErrorCode)) { 517 return; 518 } 519 520 /* 521 * Add one complex mapping to caseSensitive that was filtered out above: 522 * Greek final Sigma has a conditional mapping but not locale-sensitive, 523 * and it is taken when lowercasing just U+03A3 alone. 524 * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 525 */ 526 uset_add(caseSensitive, 0x3c2); 527} 528 529/* parser for CaseFolding.txt ----------------------------------------------- */ 530 531#define MAX_CASE_FOLDING_COUNT 2000 532 533static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT]; 534static int32_t caseFoldingCount=0; 535 536static void U_CALLCONV 537caseFoldingLineFn(void *context, 538 char *fields[][2], int32_t fieldCount, 539 UErrorCode *pErrorCode) { 540 char *end; 541 static UChar32 prevCode=0; 542 int32_t count; 543 char status; 544 545 /* get code point */ 546 caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); 547 end=(char *)u_skipWhitespace(end); 548 if(end<=fields[0][0] || end!=fields[0][1]) { 549 fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); 550 *pErrorCode=U_PARSE_ERROR; 551 exit(U_PARSE_ERROR); 552 } 553 554 /* get the status of this mapping */ 555 caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); 556 if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') { 557 fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); 558 *pErrorCode=U_PARSE_ERROR; 559 exit(U_PARSE_ERROR); 560 } 561 562 /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */ 563 if(status=='L') { 564 return; 565 } 566 567 /* get the mapping */ 568 count=caseFoldings[caseFoldingCount].full[0]= 569 (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode); 570 if(U_FAILURE(*pErrorCode)) { 571 fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); 572 exit(*pErrorCode); 573 } 574 575 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ 576 if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) { 577 caseFoldings[caseFoldingCount].simple=0; 578 } 579 580 /* update the case-sensitive set */ 581 if(status!='T') { 582 uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code); 583 _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]); 584 } 585 586 /* check the status */ 587 if(status=='S') { 588 /* check if there was a full mapping for this code point before */ 589 if( caseFoldingCount>0 && 590 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && 591 caseFoldings[caseFoldingCount-1].status=='F' 592 ) { 593 /* merge the two entries */ 594 caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple; 595 return; 596 } 597 } else if(status=='F') { 598 /* check if there was a simple mapping for this code point before */ 599 if( caseFoldingCount>0 && 600 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && 601 caseFoldings[caseFoldingCount-1].status=='S' 602 ) { 603 /* merge the two entries */ 604 uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR); 605 return; 606 } 607 } else if(status=='I' || status=='T') { 608 /* check if there was a default mapping for this code point before (remove it) */ 609 while(caseFoldingCount>0 && 610 caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code 611 ) { 612 prevCode=0; 613 --caseFoldingCount; 614 } 615 /* store only a marker for special handling for cases like dotless i */ 616 caseFoldings[caseFoldingCount].simple=0; 617 caseFoldings[caseFoldingCount].full[0]=0; 618 } 619 620 /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */ 621 if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) { 622 fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n", 623 (unsigned long)caseFoldings[caseFoldingCount].code, 624 (unsigned long)prevCode); 625 *pErrorCode=U_PARSE_ERROR; 626 exit(U_PARSE_ERROR); 627 } 628 prevCode=caseFoldings[caseFoldingCount].code; 629 630 if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) { 631 fprintf(stderr, "gencase: too many case folding mappings\n"); 632 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 633 exit(U_INDEX_OUTOFBOUNDS_ERROR); 634 } 635} 636 637static void 638parseCaseFolding(const char *filename, UErrorCode *pErrorCode) { 639 char *fields[3][2]; 640 641 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 642 return; 643 } 644 645 u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode); 646} 647 648/* parser for UnicodeData.txt ----------------------------------------------- */ 649 650/* general categories */ 651const char *const 652genCategoryNames[U_CHAR_CATEGORY_COUNT]={ 653 "Cn", 654 "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", 655 "Mc", "Nd", "Nl", "No", 656 "Zs", "Zl", "Zp", 657 "Cc", "Cf", "Co", "Cs", 658 "Pd", "Ps", "Pe", "Pc", "Po", 659 "Sm", "Sc", "Sk", "So", 660 "Pi", "Pf" 661}; 662 663static int32_t specialCasingIndex=0, caseFoldingIndex=0; 664 665static void U_CALLCONV 666unicodeDataLineFn(void *context, 667 char *fields[][2], int32_t fieldCount, 668 UErrorCode *pErrorCode) { 669 Props p; 670 char *end; 671 static UChar32 prevCode=0; 672 UChar32 value; 673 int32_t i; 674 675 /* reset the properties */ 676 uprv_memset(&p, 0, sizeof(Props)); 677 678 /* get the character code, field 0 */ 679 p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16); 680 if(end<=fields[0][0] || end!=fields[0][1]) { 681 fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]); 682 *pErrorCode=U_PARSE_ERROR; 683 exit(U_PARSE_ERROR); 684 } 685 686 /* get general category, field 2 */ 687 i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]); 688 if(i>=0) { 689 p.gc=(uint8_t)i; 690 } else { 691 fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n", 692 fields[2][0], (unsigned long)p.code); 693 *pErrorCode=U_PARSE_ERROR; 694 exit(U_PARSE_ERROR); 695 } 696 697 /* get canonical combining class, field 3 */ 698 value=(UChar32)uprv_strtoul(fields[3][0], &end, 10); 699 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { 700 fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]); 701 *pErrorCode=U_PARSE_ERROR; 702 exit(U_PARSE_ERROR); 703 } 704 p.cc=(uint8_t)value; 705 706 /* get uppercase mapping, field 12 */ 707 value=(UChar32)uprv_strtoul(fields[12][0], &end, 16); 708 if(end!=fields[12][1]) { 709 fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n", 710 (unsigned long)p.code); 711 *pErrorCode=U_PARSE_ERROR; 712 exit(U_PARSE_ERROR); 713 } 714 if(value!=0 && value!=p.code) { 715 p.upperCase=value; 716 uset_add(caseSensitive, p.code); 717 uset_add(caseSensitive, value); 718 } 719 720 /* get lowercase value, field 13 */ 721 value=(UChar32)uprv_strtoul(fields[13][0], &end, 16); 722 if(end!=fields[13][1]) { 723 fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n", 724 (unsigned long)p.code); 725 *pErrorCode=U_PARSE_ERROR; 726 exit(U_PARSE_ERROR); 727 } 728 if(value!=0 && value!=p.code) { 729 p.lowerCase=value; 730 uset_add(caseSensitive, p.code); 731 uset_add(caseSensitive, value); 732 } 733 734 /* get titlecase value, field 14 */ 735 value=(UChar32)uprv_strtoul(fields[14][0], &end, 16); 736 if(end!=fields[14][1]) { 737 fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n", 738 (unsigned long)p.code); 739 *pErrorCode=U_PARSE_ERROR; 740 exit(U_PARSE_ERROR); 741 } 742 if(value!=0 && value!=p.code) { 743 p.titleCase=value; 744 uset_add(caseSensitive, p.code); 745 uset_add(caseSensitive, value); 746 } 747 748 /* set additional properties from previously parsed files */ 749 if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) { 750 p.specialCasing=specialCasings+specialCasingIndex++; 751 } else { 752 p.specialCasing=NULL; 753 } 754 if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) { 755 p.caseFolding=caseFoldings+caseFoldingIndex++; 756 757 /* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */ 758 if( p.caseFolding->status=='C' && 759 p.caseFolding->simple==p.lowerCase 760 ) { 761 p.caseFolding=NULL; 762 } 763 } else { 764 p.caseFolding=NULL; 765 } 766 767 /* check for non-character code points */ 768 if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) { 769 fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n", 770 (unsigned long)p.code); 771 *pErrorCode=U_PARSE_ERROR; 772 exit(U_PARSE_ERROR); 773 } 774 775 /* check that the code points (p.code) are in ascending order */ 776 if(p.code<=prevCode && p.code>0) { 777 fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", 778 (unsigned long)p.code, (unsigned long)prevCode); 779 *pErrorCode=U_PARSE_ERROR; 780 exit(U_PARSE_ERROR); 781 } 782 783 /* properties for a single code point */ 784 setProps(&p); 785 786 prevCode=p.code; 787} 788 789static void 790parseDB(const char *filename, UErrorCode *pErrorCode) { 791 char *fields[15][2]; 792 UChar32 start, end; 793 int32_t i; 794 795 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 796 return; 797 } 798 799 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); 800 801 /* are all sub-properties consumed? */ 802 if(specialCasingIndex<specialCasingCount) { 803 fprintf(stderr, "gencase: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n"); 804 *pErrorCode=U_PARSE_ERROR; 805 exit(U_PARSE_ERROR); 806 } 807 if(caseFoldingIndex<caseFoldingCount) { 808 fprintf(stderr, "gencase: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n"); 809 *pErrorCode=U_PARSE_ERROR; 810 exit(U_PARSE_ERROR); 811 } 812 813 if(U_FAILURE(*pErrorCode)) { 814 return; 815 } 816 817 for(i=0; 818 0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode); 819 ++i 820 ) { 821 addCaseSensitive(start, end); 822 } 823 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 824 *pErrorCode=U_ZERO_ERROR; 825 } 826} 827 828/* 829 * Hey, Emacs, please set the following: 830 * 831 * Local Variables: 832 * indent-tabs-mode: nil 833 * End: 834 * 835 */ 836