1/* 2******************************************************************************* 3* 4* Copyright (C) 2001-2005, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: gennorm.c 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2001may25 14* created by: Markus W. Scherer 15* 16* This program reads the Unicode character database text file, 17* parses it, and extracts the data for normalization. 18* It then preprocesses it and writes a binary file for efficient use 19* in various Unicode text normalization processes. 20*/ 21 22#include <stdio.h> 23#include <stdlib.h> 24#include "unicode/utypes.h" 25#include "unicode/uchar.h" 26#include "unicode/ustring.h" 27#include "unicode/putil.h" 28#include "unicode/uclean.h" 29#include "unicode/udata.h" 30#include "unicode/uset.h" 31#include "cmemory.h" 32#include "cstring.h" 33#include "unewdata.h" 34#include "uoptions.h" 35#include "uparse.h" 36#include "unormimp.h" 37 38U_CDECL_BEGIN 39#include "gennorm.h" 40U_CDECL_END 41 42UBool beVerbose=FALSE, haveCopyright=TRUE; 43 44/* prototypes --------------------------------------------------------------- */ 45 46static void 47parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError); 48 49static void 50parseDB(const char *filename, UErrorCode *pErrorCode); 51 52/* -------------------------------------------------------------------------- */ 53 54enum { 55 HELP_H, 56 HELP_QUESTION_MARK, 57 VERBOSE, 58 COPYRIGHT, 59 DESTDIR, 60 SOURCEDIR, 61 UNICODE_VERSION, 62 ICUDATADIR, 63 CSOURCE, 64 STORE_FLAGS 65}; 66 67static UOption options[]={ 68 UOPTION_HELP_H, 69 UOPTION_HELP_QUESTION_MARK, 70 UOPTION_VERBOSE, 71 UOPTION_COPYRIGHT, 72 UOPTION_DESTDIR, 73 UOPTION_SOURCEDIR, 74 UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), 75 UOPTION_ICUDATADIR, 76 UOPTION_DEF("csource", 'C', UOPT_NO_ARG), 77 UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG) 78}; 79 80extern int 81main(int argc, char* argv[]) { 82#if !UCONFIG_NO_NORMALIZATION 83 char filename[300]; 84#endif 85 const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; 86 char *basename=NULL; 87 UErrorCode errorCode=U_ZERO_ERROR; 88 89 U_MAIN_INIT_ARGS(argc, argv); 90 91 /* preset then read command line options */ 92 options[4].value=u_getDataDirectory(); 93 options[5].value=""; 94 options[6].value="3.0.0"; 95 options[ICUDATADIR].value=u_getDataDirectory(); 96 argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); 97 98 /* error handling, printing usage message */ 99 if(argc<0) { 100 fprintf(stderr, 101 "error in command line argument \"%s\"\n", 102 argv[-argc]); 103 } 104 if(argc<0 || options[0].doesOccur || options[1].doesOccur) { 105 /* 106 * Broken into chucks because the C89 standard says the minimum 107 * required supported string length is 509 bytes. 108 */ 109 fprintf(stderr, 110 "Usage: %s [-options] [suffix]\n" 111 "\n" 112 "Read the UnicodeData.txt file and other Unicode properties files and\n" 113 "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n" 114 "\n", 115 argv[0]); 116 fprintf(stderr, 117 "Options:\n" 118 "\t-h or -? or --help this usage text\n" 119 "\t-v or --verbose verbose output\n" 120 "\t-c or --copyright include a copyright notice\n" 121 "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" 122 "\t-C or --csource generate a .c source file rather than the .icu binary\n"); 123 fprintf(stderr, 124 "\t-p or --prune flags Prune for data modularization:\n" 125 "\t Determine what data is to be stored.\n" 126 "\t 0 (zero) stores minimal data (only for NFD)\n" 127 "\t lowercase letters turn off data, uppercase turn on (use with 0)\n"); 128 fprintf(stderr, 129 "\t k: compatibility decompositions (NFKC, NFKD)\n" 130 "\t c: composition data (NFC, NFKC)\n" 131 "\t f: FCD data (will be generated at load time)\n" 132 "\t a: auxiliary data (canonical closure etc.)\n" 133 "\t x: exclusion sets (Unicode 3.2-level normalization)\n"); 134 fprintf(stderr, 135 "\t-d or --destdir destination directory, followed by the path\n" 136 "\t-s or --sourcedir source directory, followed by the path\n" 137 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 138 "\t followed by path, defaults to <%s>\n" 139 "\tsuffix suffix that is to be appended with a '-'\n" 140 "\t to the source file basenames before opening;\n" 141 "\t 'gennorm new' will read UnicodeData-new.txt etc.\n", 142 u_getDataDirectory()); 143 return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; 144 } 145 146 /* get the options values */ 147 beVerbose=options[2].doesOccur; 148 haveCopyright=options[3].doesOccur; 149 srcDir=options[5].value; 150 destDir=options[4].value; 151 152 if(argc>=2) { 153 suffix=argv[1]; 154 } else { 155 suffix=NULL; 156 } 157 158#if UCONFIG_NO_NORMALIZATION 159 160 fprintf(stderr, 161 "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE 162 " because UCONFIG_NO_NORMALIZATION is set, \n" 163 "see icu/source/common/unicode/uconfig.h\n"); 164 generateData(destDir, options[CSOURCE].doesOccur); 165 166#else 167 168 setUnicodeVersion(options[6].value); 169 170 if (options[ICUDATADIR].doesOccur) { 171 u_setDataDirectory(options[ICUDATADIR].value); 172 } 173 174 if(options[STORE_FLAGS].doesOccur) { 175 const char *s=options[STORE_FLAGS].value; 176 char c; 177 178 while((c=*s++)!=0) { 179 switch(c) { 180 case '0': 181 gStoreFlags=0; /* store minimal data (only for NFD) */ 182 break; 183 184 /* lowercase letters: omit data */ 185 case 'k': 186 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPAT); 187 break; 188 case 'c': 189 gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPOSITION); 190 break; 191 case 'f': 192 gStoreFlags&=~U_MASK(UGENNORM_STORE_FCD); 193 break; 194 case 'a': 195 gStoreFlags&=~U_MASK(UGENNORM_STORE_AUX); 196 break; 197 case 'x': 198 gStoreFlags&=~U_MASK(UGENNORM_STORE_EXCLUSIONS); 199 break; 200 201 /* uppercase letters: include data (use with 0) */ 202 case 'K': 203 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPAT); 204 break; 205 case 'C': 206 gStoreFlags|=U_MASK(UGENNORM_STORE_COMPOSITION); 207 break; 208 case 'F': 209 gStoreFlags|=U_MASK(UGENNORM_STORE_FCD); 210 break; 211 case 'A': 212 gStoreFlags|=U_MASK(UGENNORM_STORE_AUX); 213 break; 214 case 'X': 215 gStoreFlags|=U_MASK(UGENNORM_STORE_EXCLUSIONS); 216 break; 217 218 default: 219 fprintf(stderr, "ignoring undefined prune flag '%c'\n", c); 220 break; 221 } 222 } 223 } 224 225 /* 226 * Verify that we can work with properties 227 * but don't call u_init() because that needs unorm.icu which we are just 228 * going to build here. 229 */ 230 { 231 U_STRING_DECL(ideo, "[:Ideographic:]", 15); 232 USet *set; 233 234 U_STRING_INIT(ideo, "[:Ideographic:]", 15); 235 set=uset_openPattern(ideo, -1, &errorCode); 236 if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) { 237 fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode)); 238 exit(errorCode); 239 } 240 uset_close(set); 241 } 242 243 /* prepare the filename beginning with the source dir */ 244 uprv_strcpy(filename, srcDir); 245 basename=filename+uprv_strlen(filename); 246 if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { 247 *basename++=U_FILE_SEP_CHAR; 248 } 249 250 /* initialize */ 251 init(); 252 253 /* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */ 254 if(suffix==NULL) { 255 uprv_strcpy(basename, "DerivedNormalizationProps.txt"); 256 } else { 257 uprv_strcpy(basename, "DerivedNormalizationProps"); 258 basename[30]='-'; 259 uprv_strcpy(basename+31, suffix); 260 uprv_strcat(basename+31, ".txt"); 261 } 262 parseDerivedNormalizationProperties(filename, &errorCode, FALSE); 263 if(U_FAILURE(errorCode)) { 264 /* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */ 265 if(suffix==NULL) { 266 uprv_strcpy(basename, "DerivedNormalizationProperties.txt"); 267 } else { 268 uprv_strcpy(basename, "DerivedNormalizationProperties"); 269 basename[30]='-'; 270 uprv_strcpy(basename+31, suffix); 271 uprv_strcat(basename+31, ".txt"); 272 } 273 parseDerivedNormalizationProperties(filename, &errorCode, TRUE); 274 } 275 276 /* process UnicodeData.txt */ 277 if(suffix==NULL) { 278 uprv_strcpy(basename, "UnicodeData.txt"); 279 } else { 280 uprv_strcpy(basename, "UnicodeData"); 281 basename[11]='-'; 282 uprv_strcpy(basename+12, suffix); 283 uprv_strcat(basename+12, ".txt"); 284 } 285 parseDB(filename, &errorCode); 286 287 /* process parsed data */ 288 if(U_SUCCESS(errorCode)) { 289 processData(); 290 291 /* write the properties data file */ 292 generateData(destDir, options[CSOURCE].doesOccur); 293 294 cleanUpData(); 295 } 296 297#endif 298 299 return errorCode; 300} 301 302#if !UCONFIG_NO_NORMALIZATION 303 304/* parser for DerivedNormalizationProperties.txt ---------------------------- */ 305 306static void U_CALLCONV 307derivedNormalizationPropertiesLineFn(void *context, 308 char *fields[][2], int32_t fieldCount, 309 UErrorCode *pErrorCode) { 310 UChar string[32]; 311 char *s; 312 uint32_t start, end; 313 int32_t count; 314 uint8_t qcFlags; 315 316 /* get code point range */ 317 count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); 318 if(U_FAILURE(*pErrorCode)) { 319 fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]); 320 exit(*pErrorCode); 321 } 322 323 /* ignore hangul - handle explicitly */ 324 if(start==0xac00) { 325 return; 326 } 327 328 /* get property - ignore unrecognized ones */ 329 s=(char *)u_skipWhitespace(fields[1][0]); 330 if(*s=='N' && s[1]=='F') { 331 /* quick check flag */ 332 qcFlags=0x11; 333 s+=2; 334 if(*s=='K') { 335 qcFlags<<=1; 336 ++s; 337 } 338 339 if(*s=='C' && s[1]=='_') { 340 s+=2; 341 } else if(*s=='D' && s[1]=='_') { 342 qcFlags<<=2; 343 s+=2; 344 } else { 345 return; 346 } 347 348 if(0==uprv_strncmp(s, "NO", 2)) { 349 qcFlags&=0xf; 350 } else if(0==uprv_strncmp(s, "MAYBE", 5)) { 351 qcFlags&=0x30; 352 } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') { 353 /* 354 * Unicode 4.0.1: 355 * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc. 356 */ 357 /* start of the field */ 358 s=(char *)u_skipWhitespace(s+1); 359 if(*s=='N') { 360 qcFlags&=0xf; 361 } else if(*s=='M') { 362 qcFlags&=0x30; 363 } else { 364 return; /* do nothing for "Yes" because it's the default value */ 365 } 366 } else { 367 return; /* do nothing for "Yes" because it's the default value */ 368 } 369 370 /* set this flag for all code points in this range */ 371 while(start<=end) { 372 setQCFlags(start++, qcFlags); 373 } 374 } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) { 375 /* full composition exclusion */ 376 while(start<=end) { 377 setCompositionExclusion(start++); 378 } 379 } else if( 380 ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') || 381 (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';')) 382 383 ) { 384 /* FC_NFKC_Closure, parse field 2 to get the string */ 385 char *t; 386 387 /* start of the field */ 388 s=(char *)u_skipWhitespace(s+1); 389 390 /* find the end of the field */ 391 for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {} 392 *t=0; 393 394 string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode); 395 if(U_FAILURE(*pErrorCode)) { 396 fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]); 397 exit(*pErrorCode); 398 } 399 while(start<=end) { 400 setFNC(start++, string); 401 } 402 } 403} 404 405static void 406parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) { 407 char *fields[2][2]; 408 409 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 410 return; 411 } 412 413 u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode); 414 if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) { 415 fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); 416 exit(*pErrorCode); 417 } 418} 419 420/* parser for UnicodeData.txt ----------------------------------------------- */ 421 422static void U_CALLCONV 423unicodeDataLineFn(void *context, 424 char *fields[][2], int32_t fieldCount, 425 UErrorCode *pErrorCode) { 426 uint32_t decomp[40]; 427 Norm norm; 428 const char *s; 429 char *end; 430 uint32_t code, value; 431 int32_t length; 432 UBool isCompat, something=FALSE; 433 434 /* ignore First and Last entries for ranges */ 435 if( *fields[1][0]=='<' && 436 (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 && 437 (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) 438 ) { 439 return; 440 } 441 442 /* reset the properties */ 443 uprv_memset(&norm, 0, sizeof(Norm)); 444 445 /* 446 * The combiningIndex must not be initialized to 0 because 0 is the 447 * combiningIndex of the first forward-combining character. 448 */ 449 norm.combiningIndex=0xffff; 450 451 /* get the character code, field 0 */ 452 code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); 453 if(end<=fields[0][0] || end!=fields[0][1]) { 454 fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]); 455 *pErrorCode=U_PARSE_ERROR; 456 exit(U_PARSE_ERROR); 457 } 458 459 /* get canonical combining class, field 3 */ 460 value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10); 461 if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { 462 fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]); 463 *pErrorCode=U_PARSE_ERROR; 464 exit(U_PARSE_ERROR); 465 } 466 if(value>0) { 467 norm.udataCC=(uint8_t)value; 468 something=TRUE; 469 } 470 471 /* get the decomposition, field 5 */ 472 if(fields[5][0]<fields[5][1]) { 473 if(*(s=fields[5][0])=='<') { 474 ++s; 475 isCompat=TRUE; 476 477 /* skip and ignore the compatibility type name */ 478 do { 479 if(s==fields[5][1]) { 480 /* missing '>' */ 481 fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]); 482 *pErrorCode=U_PARSE_ERROR; 483 exit(U_PARSE_ERROR); 484 } 485 } while(*s++!='>'); 486 } else { 487 isCompat=FALSE; 488 } 489 490 /* parse the decomposition string */ 491 length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode); 492 if(U_FAILURE(*pErrorCode)) { 493 fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n", 494 (long)code, u_errorName(*pErrorCode)); 495 exit(*pErrorCode); 496 } 497 498 /* store the string */ 499 if(length>0) { 500 something=TRUE; 501 if(isCompat) { 502 norm.lenNFKD=(uint8_t)length; 503 norm.nfkd=decomp; 504 } else { 505 if(length>2) { 506 fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n", 507 (long)code, (long)length); 508 *pErrorCode=U_PARSE_ERROR; 509 exit(U_PARSE_ERROR); 510 } 511 norm.lenNFD=(uint8_t)length; 512 norm.nfd=decomp; 513 } 514 } 515 } 516 517 /* check for non-character code points */ 518 if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) { 519 fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n", 520 (long)code); 521 *pErrorCode=U_PARSE_ERROR; 522 exit(U_PARSE_ERROR); 523 } 524 525 if(something) { 526 /* there are normalization values, so store them */ 527#if 0 528 if(beVerbose) { 529 printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n", 530 (long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD); 531 } 532#endif 533 storeNorm(code, &norm); 534 } 535} 536 537static void 538parseDB(const char *filename, UErrorCode *pErrorCode) { 539 char *fields[15][2]; 540 541 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 542 return; 543 } 544 545 u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); 546 if(U_FAILURE(*pErrorCode)) { 547 fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode)); 548 exit(*pErrorCode); 549 } 550} 551 552#endif /* #if !UCONFIG_NO_NORMALIZATION */ 553 554/* 555 * Hey, Emacs, please set the following: 556 * 557 * Local Variables: 558 * indent-tabs-mode: nil 559 * End: 560 * 561 */ 562