1/* 2******************************************************************************* 3* Copyright (C) 2011-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* file name: ppucd.cpp 7* encoding: US-ASCII 8* tab size: 8 (not used) 9* indentation:4 10* 11* created on: 2011dec11 12* created by: Markus W. Scherer 13*/ 14 15#include "unicode/utypes.h" 16#include "unicode/uchar.h" 17#include "charstr.h" 18#include "cstring.h" 19#include "ppucd.h" 20#include "uassert.h" 21#include "uparse.h" 22 23#include <stdio.h> 24#include <string.h> 25 26U_NAMESPACE_BEGIN 27 28PropertyNames::~PropertyNames() {} 29 30int32_t 31PropertyNames::getPropertyEnum(const char *name) const { 32 return u_getPropertyEnum(name); 33} 34 35int32_t 36PropertyNames::getPropertyValueEnum(int32_t property, const char *name) const { 37 return u_getPropertyValueEnum((UProperty)property, name); 38} 39 40UniProps::UniProps() 41 : start(U_SENTINEL), end(U_SENTINEL), 42 bmg(U_SENTINEL), bpb(U_SENTINEL), 43 scf(U_SENTINEL), slc(U_SENTINEL), stc(U_SENTINEL), suc(U_SENTINEL), 44 digitValue(-1), numericValue(NULL), 45 name(NULL), nameAlias(NULL) { 46 memset(binProps, 0, sizeof(binProps)); 47 memset(intProps, 0, sizeof(intProps)); 48 memset(age, 0, 4); 49} 50 51UniProps::~UniProps() {} 52 53const int32_t PreparsedUCD::kNumLineBuffers; 54 55PreparsedUCD::PreparsedUCD(const char *filename, UErrorCode &errorCode) 56 : icuPnames(new PropertyNames()), pnames(icuPnames), 57 file(NULL), 58 defaultLineIndex(-1), blockLineIndex(-1), lineIndex(0), 59 lineNumber(0), 60 lineType(NO_LINE), 61 fieldLimit(NULL), lineLimit(NULL) { 62 if(U_FAILURE(errorCode)) { return; } 63 64 if(filename==NULL || *filename==0 || (*filename=='-' && filename[1]==0)) { 65 filename=NULL; 66 file=stdin; 67 } else { 68 file=fopen(filename, "r"); 69 } 70 if(file==NULL) { 71 perror("error opening preparsed UCD"); 72 fprintf(stderr, "error opening preparsed UCD file %s\n", filename ? filename : "\"no file name given\""); 73 errorCode=U_FILE_ACCESS_ERROR; 74 return; 75 } 76 77 memset(ucdVersion, 0, 4); 78 lines[0][0]=0; 79} 80 81PreparsedUCD::~PreparsedUCD() { 82 if(file!=stdin) { 83 fclose(file); 84 } 85 delete icuPnames; 86} 87 88// Same order as the LineType values. 89static const char *lineTypeStrings[]={ 90 NULL, 91 NULL, 92 "ucd", 93 "property", 94 "binary", 95 "value", 96 "defaults", 97 "block", 98 "cp", 99 "algnamesrange" 100}; 101 102PreparsedUCD::LineType 103PreparsedUCD::readLine(UErrorCode &errorCode) { 104 if(U_FAILURE(errorCode)) { return NO_LINE; } 105 // Select the next available line buffer. 106 while(!isLineBufferAvailable(lineIndex)) { 107 ++lineIndex; 108 if (lineIndex == kNumLineBuffers) { 109 lineIndex = 0; 110 } 111 } 112 char *line=lines[lineIndex]; 113 *line=0; 114 lineLimit=fieldLimit=line; 115 lineType=NO_LINE; 116 char *result=fgets(line, sizeof(lines[0]), file); 117 if(result==NULL) { 118 if(ferror(file)) { 119 perror("error reading preparsed UCD"); 120 fprintf(stderr, "error reading preparsed UCD before line %ld\n", (long)lineNumber); 121 errorCode=U_FILE_ACCESS_ERROR; 122 } 123 return NO_LINE; 124 } 125 ++lineNumber; 126 if(*line=='#') { 127 fieldLimit=strchr(line, 0); 128 return lineType=EMPTY_LINE; 129 } 130 // Remove trailing /r/n. 131 char c; 132 char *limit=strchr(line, 0); 133 while(line<limit && ((c=*(limit-1))=='\n' || c=='\r')) { --limit; } 134 // Remove trailing white space. 135 while(line<limit && ((c=*(limit-1))==' ' || c=='\t')) { --limit; } 136 *limit=0; 137 lineLimit=limit; 138 if(line==limit) { 139 fieldLimit=limit; 140 return lineType=EMPTY_LINE; 141 } 142 // Split by ';'. 143 char *semi=line; 144 while((semi=strchr(semi, ';'))!=NULL) { *semi++=0; } 145 fieldLimit=strchr(line, 0); 146 // Determine the line type. 147 int32_t type; 148 for(type=EMPTY_LINE+1;; ++type) { 149 if(type==LINE_TYPE_COUNT) { 150 fprintf(stderr, 151 "error in preparsed UCD: unknown line type (first field) '%s' on line %ld\n", 152 line, (long)lineNumber); 153 errorCode=U_PARSE_ERROR; 154 return NO_LINE; 155 } 156 if(0==strcmp(line, lineTypeStrings[type])) { 157 break; 158 } 159 } 160 lineType=(LineType)type; 161 if(lineType==UNICODE_VERSION_LINE && fieldLimit<lineLimit) { 162 u_versionFromString(ucdVersion, fieldLimit+1); 163 } 164 return lineType; 165} 166 167const char * 168PreparsedUCD::firstField() { 169 char *field=lines[lineIndex]; 170 fieldLimit=strchr(field, 0); 171 return field; 172} 173 174const char * 175PreparsedUCD::nextField() { 176 if(fieldLimit==lineLimit) { return NULL; } 177 char *field=fieldLimit+1; 178 fieldLimit=strchr(field, 0); 179 return field; 180} 181 182const UniProps * 183PreparsedUCD::getProps(UnicodeSet &newValues, UErrorCode &errorCode) { 184 if(U_FAILURE(errorCode)) { return NULL; } 185 newValues.clear(); 186 if(!lineHasPropertyValues()) { 187 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 188 return NULL; 189 } 190 firstField(); 191 const char *field=nextField(); 192 if(field==NULL) { 193 // No range field after the type. 194 fprintf(stderr, 195 "error in preparsed UCD: missing default/block/cp range field " 196 "(no second field) on line %ld\n", 197 (long)lineNumber); 198 errorCode=U_PARSE_ERROR; 199 return NULL; 200 } 201 UChar32 start, end; 202 if(!parseCodePointRange(field, start, end, errorCode)) { return NULL; } 203 UniProps *props; 204 switch(lineType) { 205 case DEFAULTS_LINE: 206 if(defaultLineIndex>=0) { 207 fprintf(stderr, 208 "error in preparsed UCD: second line with default properties on line %ld\n", 209 (long)lineNumber); 210 errorCode=U_PARSE_ERROR; 211 return NULL; 212 } 213 if(start!=0 || end!=0x10ffff) { 214 fprintf(stderr, 215 "error in preparsed UCD: default range must be 0..10FFFF, not '%s' on line %ld\n", 216 field, (long)lineNumber); 217 errorCode=U_PARSE_ERROR; 218 return NULL; 219 } 220 props=&defaultProps; 221 defaultLineIndex=lineIndex; 222 break; 223 case BLOCK_LINE: 224 blockProps=defaultProps; // Block inherits default properties. 225 props=&blockProps; 226 blockLineIndex=lineIndex; 227 break; 228 case CP_LINE: 229 if(blockProps.start<=start && end<=blockProps.end) { 230 // Code point range fully inside the last block inherits the block properties. 231 cpProps=blockProps; 232 } else if(start>blockProps.end || end<blockProps.start) { 233 // Code point range fully outside the last block inherits the default properties. 234 cpProps=defaultProps; 235 } else { 236 // Code point range partially overlapping with the last block is illegal. 237 fprintf(stderr, 238 "error in preparsed UCD: cp range %s on line %ld only " 239 "partially overlaps with block range %04lX..%04lX\n", 240 field, (long)lineNumber, (long)blockProps.start, (long)blockProps.end); 241 errorCode=U_PARSE_ERROR; 242 return NULL; 243 } 244 props=&cpProps; 245 break; 246 default: 247 // Will not occur because of the range check above. 248 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 249 return NULL; 250 } 251 props->start=start; 252 props->end=end; 253 while((field=nextField())!=NULL) { 254 if(!parseProperty(*props, field, newValues, errorCode)) { return NULL; } 255 } 256 return props; 257} 258 259static const struct { 260 const char *name; 261 int32_t prop; 262} ppucdProperties[]={ 263 { "Name_Alias", PPUCD_NAME_ALIAS }, 264 { "Conditional_Case_Mappings", PPUCD_CONDITIONAL_CASE_MAPPINGS }, 265 { "Turkic_Case_Folding", PPUCD_TURKIC_CASE_FOLDING } 266}; 267 268// Returns TRUE for "ok to continue parsing fields". 269UBool 270PreparsedUCD::parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, 271 UErrorCode &errorCode) { 272 CharString pBuffer; 273 const char *p=field; 274 const char *v=strchr(p, '='); 275 int binaryValue; 276 if(*p=='-') { 277 if(v!=NULL) { 278 fprintf(stderr, 279 "error in preparsed UCD: mix of binary-property-no and " 280 "enum-property syntax '%s' on line %ld\n", 281 field, (long)lineNumber); 282 errorCode=U_PARSE_ERROR; 283 return FALSE; 284 } 285 binaryValue=0; 286 ++p; 287 } else if(v==NULL) { 288 binaryValue=1; 289 } else { 290 binaryValue=-1; 291 // Copy out the property name rather than modifying the field (writing a NUL). 292 pBuffer.append(p, (int32_t)(v-p), errorCode); 293 p=pBuffer.data(); 294 ++v; 295 } 296 int32_t prop=pnames->getPropertyEnum(p); 297 if(prop<0) { 298 for(int32_t i=0;; ++i) { 299 if(i==UPRV_LENGTHOF(ppucdProperties)) { 300 // Ignore unknown property names. 301 return TRUE; 302 } 303 if(0==uprv_stricmp(p, ppucdProperties[i].name)) { 304 prop=ppucdProperties[i].prop; 305 U_ASSERT(prop>=0); 306 break; 307 } 308 } 309 } 310 if(prop<UCHAR_BINARY_LIMIT) { 311 if(binaryValue>=0) { 312 props.binProps[prop]=(UBool)binaryValue; 313 } else { 314 // No binary value for a binary property. 315 fprintf(stderr, 316 "error in preparsed UCD: enum-property syntax '%s' " 317 "for binary property on line %ld\n", 318 field, (long)lineNumber); 319 errorCode=U_PARSE_ERROR; 320 } 321 } else if(binaryValue>=0) { 322 // Binary value for a non-binary property. 323 fprintf(stderr, 324 "error in preparsed UCD: binary-property syntax '%s' " 325 "for non-binary property on line %ld\n", 326 field, (long)lineNumber); 327 errorCode=U_PARSE_ERROR; 328 } else if (prop < UCHAR_INT_START) { 329 fprintf(stderr, 330 "error in preparsed UCD: prop value is invalid: '%d' for line %ld\n", 331 prop, (long)lineNumber); 332 errorCode=U_PARSE_ERROR; 333 } else if(prop<UCHAR_INT_LIMIT) { 334 int32_t value=pnames->getPropertyValueEnum(prop, v); 335 if(value==UCHAR_INVALID_CODE && prop==UCHAR_CANONICAL_COMBINING_CLASS) { 336 // TODO: Make getPropertyValueEnum(UCHAR_CANONICAL_COMBINING_CLASS, v) work. 337 char *end; 338 unsigned long ccc=uprv_strtoul(v, &end, 10); 339 if(v<end && *end==0 && ccc<=254) { 340 value=(int32_t)ccc; 341 } 342 } 343 if(value==UCHAR_INVALID_CODE) { 344 fprintf(stderr, 345 "error in preparsed UCD: '%s' is not a valid value on line %ld\n", 346 field, (long)lineNumber); 347 errorCode=U_PARSE_ERROR; 348 } else { 349 props.intProps[prop-UCHAR_INT_START]=value; 350 } 351 } else if(*v=='<') { 352 // Do not parse default values like <code point>, just set null values. 353 switch(prop) { 354 case UCHAR_BIDI_MIRRORING_GLYPH: 355 props.bmg=U_SENTINEL; 356 break; 357 case UCHAR_BIDI_PAIRED_BRACKET: 358 props.bpb=U_SENTINEL; 359 break; 360 case UCHAR_SIMPLE_CASE_FOLDING: 361 props.scf=U_SENTINEL; 362 break; 363 case UCHAR_SIMPLE_LOWERCASE_MAPPING: 364 props.slc=U_SENTINEL; 365 break; 366 case UCHAR_SIMPLE_TITLECASE_MAPPING: 367 props.stc=U_SENTINEL; 368 break; 369 case UCHAR_SIMPLE_UPPERCASE_MAPPING: 370 props.suc=U_SENTINEL; 371 break; 372 case UCHAR_CASE_FOLDING: 373 props.cf.remove(); 374 break; 375 case UCHAR_LOWERCASE_MAPPING: 376 props.lc.remove(); 377 break; 378 case UCHAR_TITLECASE_MAPPING: 379 props.tc.remove(); 380 break; 381 case UCHAR_UPPERCASE_MAPPING: 382 props.uc.remove(); 383 break; 384 case UCHAR_SCRIPT_EXTENSIONS: 385 props.scx.clear(); 386 break; 387 default: 388 fprintf(stderr, 389 "error in preparsed UCD: '%s' is not a valid default value on line %ld\n", 390 field, (long)lineNumber); 391 errorCode=U_PARSE_ERROR; 392 } 393 } else { 394 char c; 395 switch(prop) { 396 case UCHAR_NUMERIC_VALUE: 397 props.numericValue=v; 398 c=*v; 399 if('0'<=c && c<='9' && v[1]==0) { 400 props.digitValue=c-'0'; 401 } else { 402 props.digitValue=-1; 403 } 404 break; 405 case UCHAR_NAME: 406 props.name=v; 407 break; 408 case UCHAR_AGE: 409 u_versionFromString(props.age, v); // Writes 0.0.0.0 if v is not numeric. 410 break; 411 case UCHAR_BIDI_MIRRORING_GLYPH: 412 props.bmg=parseCodePoint(v, errorCode); 413 break; 414 case UCHAR_BIDI_PAIRED_BRACKET: 415 props.bpb=parseCodePoint(v, errorCode); 416 break; 417 case UCHAR_SIMPLE_CASE_FOLDING: 418 props.scf=parseCodePoint(v, errorCode); 419 break; 420 case UCHAR_SIMPLE_LOWERCASE_MAPPING: 421 props.slc=parseCodePoint(v, errorCode); 422 break; 423 case UCHAR_SIMPLE_TITLECASE_MAPPING: 424 props.stc=parseCodePoint(v, errorCode); 425 break; 426 case UCHAR_SIMPLE_UPPERCASE_MAPPING: 427 props.suc=parseCodePoint(v, errorCode); 428 break; 429 case UCHAR_CASE_FOLDING: 430 parseString(v, props.cf, errorCode); 431 break; 432 case UCHAR_LOWERCASE_MAPPING: 433 parseString(v, props.lc, errorCode); 434 break; 435 case UCHAR_TITLECASE_MAPPING: 436 parseString(v, props.tc, errorCode); 437 break; 438 case UCHAR_UPPERCASE_MAPPING: 439 parseString(v, props.uc, errorCode); 440 break; 441 case PPUCD_NAME_ALIAS: 442 props.nameAlias=v; 443 break; 444 case PPUCD_CONDITIONAL_CASE_MAPPINGS: 445 case PPUCD_TURKIC_CASE_FOLDING: 446 // No need to parse their values: They are hardcoded in the runtime library. 447 break; 448 case UCHAR_SCRIPT_EXTENSIONS: 449 parseScriptExtensions(v, props.scx, errorCode); 450 break; 451 default: 452 // Ignore unhandled properties. 453 return TRUE; 454 } 455 } 456 if(U_SUCCESS(errorCode)) { 457 newValues.add((UChar32)prop); 458 return TRUE; 459 } else { 460 return FALSE; 461 } 462} 463 464UBool 465PreparsedUCD::getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode) { 466 if(U_FAILURE(errorCode)) { return FALSE; } 467 if(lineType!=ALG_NAMES_RANGE_LINE) { 468 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 469 return FALSE; 470 } 471 firstField(); 472 const char *field=nextField(); 473 if(field==NULL) { 474 // No range field after the type. 475 fprintf(stderr, 476 "error in preparsed UCD: missing algnamesrange range field " 477 "(no second field) on line %ld\n", 478 (long)lineNumber); 479 errorCode=U_PARSE_ERROR; 480 return FALSE; 481 } 482 return parseCodePointRange(field, start, end, errorCode); 483} 484 485UChar32 486PreparsedUCD::parseCodePoint(const char *s, UErrorCode &errorCode) { 487 char *end; 488 uint32_t value=(uint32_t)uprv_strtoul(s, &end, 16); 489 if(end<=s || *end!=0 || value>=0x110000) { 490 fprintf(stderr, 491 "error in preparsed UCD: '%s' is not a valid code point on line %ld\n", 492 s, (long)lineNumber); 493 errorCode=U_PARSE_ERROR; 494 return U_SENTINEL; 495 } 496 return (UChar32)value; 497} 498 499UBool 500PreparsedUCD::parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode) { 501 uint32_t st, e; 502 u_parseCodePointRange(s, &st, &e, &errorCode); 503 if(U_FAILURE(errorCode)) { 504 fprintf(stderr, 505 "error in preparsed UCD: '%s' is not a valid code point range on line %ld\n", 506 s, (long)lineNumber); 507 return FALSE; 508 } 509 start=(UChar32)st; 510 end=(UChar32)e; 511 return TRUE; 512} 513 514void 515PreparsedUCD::parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode) { 516 UChar *buffer=uni.getBuffer(-1); 517 int32_t length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); 518 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 519 errorCode=U_ZERO_ERROR; 520 uni.releaseBuffer(0); 521 buffer=uni.getBuffer(length); 522 length=u_parseString(s, buffer, uni.getCapacity(), NULL, &errorCode); 523 } 524 uni.releaseBuffer(length); 525 if(U_FAILURE(errorCode)) { 526 fprintf(stderr, 527 "error in preparsed UCD: '%s' is not a valid Unicode string on line %ld\n", 528 s, (long)lineNumber); 529 } 530} 531 532void 533PreparsedUCD::parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode) { 534 if(U_FAILURE(errorCode)) { return; } 535 scx.clear(); 536 CharString scString; 537 for(;;) { 538 const char *scs; 539 const char *scLimit=strchr(s, ' '); 540 if(scLimit!=NULL) { 541 scs=scString.clear().append(s, (int32_t)(scLimit-s), errorCode).data(); 542 if(U_FAILURE(errorCode)) { return; } 543 } else { 544 scs=s; 545 } 546 int32_t script=pnames->getPropertyValueEnum(UCHAR_SCRIPT, scs); 547 if(script==UCHAR_INVALID_CODE) { 548 fprintf(stderr, 549 "error in preparsed UCD: '%s' is not a valid script code on line %ld\n", 550 scs, (long)lineNumber); 551 errorCode=U_PARSE_ERROR; 552 return; 553 } else if(scx.contains(script)) { 554 fprintf(stderr, 555 "error in preparsed UCD: scx has duplicate '%s' codes on line %ld\n", 556 scs, (long)lineNumber); 557 errorCode=U_PARSE_ERROR; 558 return; 559 } else { 560 scx.add(script); 561 } 562 if(scLimit!=NULL) { 563 s=scLimit+1; 564 } else { 565 break; 566 } 567 } 568 if(scx.isEmpty()) { 569 fprintf(stderr, "error in preparsed UCD: empty scx= on line %ld\n", (long)lineNumber); 570 errorCode=U_PARSE_ERROR; 571 } 572} 573 574U_NAMESPACE_END 575