1/* 2******************************************************************************* 3* 4* Copyright (C) 1999-2012, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: uniset_props.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2004aug25 14* created by: Markus W. Scherer 15* 16* Character property dependent functions moved here from uniset.cpp 17*/ 18 19#include "unicode/utypes.h" 20#include "unicode/uniset.h" 21#include "unicode/parsepos.h" 22#include "unicode/uchar.h" 23#include "unicode/uscript.h" 24#include "unicode/symtable.h" 25#include "unicode/uset.h" 26#include "unicode/locid.h" 27#include "unicode/brkiter.h" 28#include "uset_imp.h" 29#include "ruleiter.h" 30#include "cmemory.h" 31#include "ucln_cmn.h" 32#include "util.h" 33#include "uvector.h" 34#include "uprops.h" 35#include "propname.h" 36#include "normalizer2impl.h" 37#include "ucase.h" 38#include "ubidi_props.h" 39#include "uinvchar.h" 40#include "uprops.h" 41#include "charstr.h" 42#include "cstring.h" 43#include "mutex.h" 44#include "umutex.h" 45#include "uassert.h" 46#include "hash.h" 47 48U_NAMESPACE_USE 49 50#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 51 52// initial storage. Must be >= 0 53// *** same as in uniset.cpp ! *** 54#define START_EXTRA 16 55 56// Define UChar constants using hex for EBCDIC compatibility 57// Used #define to reduce private static exports and memory access time. 58#define SET_OPEN ((UChar)0x005B) /*[*/ 59#define SET_CLOSE ((UChar)0x005D) /*]*/ 60#define HYPHEN ((UChar)0x002D) /*-*/ 61#define COMPLEMENT ((UChar)0x005E) /*^*/ 62#define COLON ((UChar)0x003A) /*:*/ 63#define BACKSLASH ((UChar)0x005C) /*\*/ 64#define INTERSECTION ((UChar)0x0026) /*&*/ 65#define UPPER_U ((UChar)0x0055) /*U*/ 66#define LOWER_U ((UChar)0x0075) /*u*/ 67#define OPEN_BRACE ((UChar)123) /*{*/ 68#define CLOSE_BRACE ((UChar)125) /*}*/ 69#define UPPER_P ((UChar)0x0050) /*P*/ 70#define LOWER_P ((UChar)0x0070) /*p*/ 71#define UPPER_N ((UChar)78) /*N*/ 72#define EQUALS ((UChar)0x003D) /*=*/ 73 74//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" 75static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" 76//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" 77//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" 78//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" 79static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ 80 81// Special property set IDs 82static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] 83static const char ASCII[] = "ASCII"; // [\u0000-\u007F] 84static const char ASSIGNED[] = "Assigned"; // [:^Cn:] 85 86// Unicode name property alias 87#define NAME_PROP "na" 88#define NAME_PROP_LENGTH 2 89 90/** 91 * Delimiter string used in patterns to close a category reference: 92 * ":]". Example: "[:Lu:]". 93 */ 94//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ 95 96// Cached sets ------------------------------------------------------------- *** 97 98U_CDECL_BEGIN 99static UBool U_CALLCONV uset_cleanup(); 100U_CDECL_END 101 102// Not a TriStateSingletonWrapper because we think the UnicodeSet constructor 103// can only fail with an out-of-memory error 104// if we have a correct pattern and the properties data is hardcoded and always available. 105class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> { 106public: 107 UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) : 108 SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {} 109 UnicodeSet *getInstance(UErrorCode &errorCode) { 110 return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode); 111 } 112private: 113 static void *createInstance(const void *context, UErrorCode &errorCode) { 114 UnicodeString pattern((const char *)context, -1, US_INV); 115 UnicodeSet *set=new UnicodeSet(pattern, errorCode); 116 if(set==NULL) { 117 errorCode=U_MEMORY_ALLOCATION_ERROR; 118 return NULL; 119 } 120 set->freeze(); 121 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 122 return set; 123 } 124 125 const char *fPattern; 126}; 127 128U_CDECL_BEGIN 129 130static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions() 131 132STATIC_SIMPLE_SINGLETON(uni32Singleton); 133 134//---------------------------------------------------------------- 135// Inclusions list 136//---------------------------------------------------------------- 137 138// USetAdder implementation 139// Does not use uset.h to reduce code dependencies 140static void U_CALLCONV 141_set_add(USet *set, UChar32 c) { 142 ((UnicodeSet *)set)->add(c); 143} 144 145static void U_CALLCONV 146_set_addRange(USet *set, UChar32 start, UChar32 end) { 147 ((UnicodeSet *)set)->add(start, end); 148} 149 150static void U_CALLCONV 151_set_addString(USet *set, const UChar *str, int32_t length) { 152 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 153} 154 155/** 156 * Cleanup function for UnicodeSet 157 */ 158static UBool U_CALLCONV uset_cleanup(void) { 159 int32_t i; 160 161 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { 162 if (INCLUSIONS[i] != NULL) { 163 delete INCLUSIONS[i]; 164 INCLUSIONS[i] = NULL; 165 } 166 } 167 UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance(); 168 return TRUE; 169} 170 171U_CDECL_END 172 173U_NAMESPACE_BEGIN 174 175/* 176Reduce excessive reallocation, and make it easier to detect initialization 177problems. 178Usually you don't see smaller sets than this for Unicode 5.0. 179*/ 180#define DEFAULT_INCLUSION_CAPACITY 3072 181 182const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { 183 UBool needInit; 184 UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit); 185 if (needInit) { 186 UnicodeSet* incl = new UnicodeSet(); 187 USetAdder sa = { 188 (USet *)incl, 189 _set_add, 190 _set_addRange, 191 _set_addString, 192 NULL, // don't need remove() 193 NULL // don't need removeRange() 194 }; 195 if (incl != NULL) { 196 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); 197 switch(src) { 198 case UPROPS_SRC_CHAR: 199 uchar_addPropertyStarts(&sa, &status); 200 break; 201 case UPROPS_SRC_PROPSVEC: 202 upropsvec_addPropertyStarts(&sa, &status); 203 break; 204 case UPROPS_SRC_CHAR_AND_PROPSVEC: 205 uchar_addPropertyStarts(&sa, &status); 206 upropsvec_addPropertyStarts(&sa, &status); 207 break; 208#if !UCONFIG_NO_NORMALIZATION 209 case UPROPS_SRC_CASE_AND_NORM: { 210 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 211 if(U_SUCCESS(status)) { 212 impl->addPropertyStarts(&sa, status); 213 } 214 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 215 break; 216 } 217 case UPROPS_SRC_NFC: { 218 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 219 if(U_SUCCESS(status)) { 220 impl->addPropertyStarts(&sa, status); 221 } 222 break; 223 } 224 case UPROPS_SRC_NFKC: { 225 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); 226 if(U_SUCCESS(status)) { 227 impl->addPropertyStarts(&sa, status); 228 } 229 break; 230 } 231 case UPROPS_SRC_NFKC_CF: { 232 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); 233 if(U_SUCCESS(status)) { 234 impl->addPropertyStarts(&sa, status); 235 } 236 break; 237 } 238 case UPROPS_SRC_NFC_CANON_ITER: { 239 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 240 if(U_SUCCESS(status)) { 241 impl->addCanonIterPropertyStarts(&sa, status); 242 } 243 break; 244 } 245#endif 246 case UPROPS_SRC_CASE: 247 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 248 break; 249 case UPROPS_SRC_BIDI: 250 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); 251 break; 252 default: 253 status = U_INTERNAL_PROGRAM_ERROR; 254 break; 255 } 256 if (U_SUCCESS(status)) { 257 // Compact for caching 258 incl->compact(); 259 umtx_lock(NULL); 260 if (INCLUSIONS[src] == NULL) { 261 INCLUSIONS[src] = incl; 262 incl = NULL; 263 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 264 } 265 umtx_unlock(NULL); 266 } 267 delete incl; 268 } else { 269 status = U_MEMORY_ALLOCATION_ERROR; 270 } 271 } 272 return INCLUSIONS[src]; 273} 274 275// Cache some sets for other services -------------------------------------- *** 276 277U_CFUNC UnicodeSet * 278uniset_getUnicode32Instance(UErrorCode &errorCode) { 279 return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode); 280} 281 282// helper functions for matching of pattern syntax pieces ------------------ *** 283// these functions are parallel to the PERL_OPEN etc. strings above 284 285// using these functions is not only faster than UnicodeString::compare() and 286// caseCompare(), but they also make UnicodeSet work for simple patterns when 287// no Unicode properties data is available - when caseCompare() fails 288 289static inline UBool 290isPerlOpen(const UnicodeString &pattern, int32_t pos) { 291 UChar c; 292 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); 293} 294 295/*static inline UBool 296isPerlClose(const UnicodeString &pattern, int32_t pos) { 297 return pattern.charAt(pos)==CLOSE_BRACE; 298}*/ 299 300static inline UBool 301isNameOpen(const UnicodeString &pattern, int32_t pos) { 302 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; 303} 304 305static inline UBool 306isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { 307 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; 308} 309 310/*static inline UBool 311isPOSIXClose(const UnicodeString &pattern, int32_t pos) { 312 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; 313}*/ 314 315// TODO memory debugging provided inside uniset.cpp 316// could be made available here but probably obsolete with use of modern 317// memory leak checker tools 318#define _dbgct(me) 319 320//---------------------------------------------------------------- 321// Constructors &c 322//---------------------------------------------------------------- 323 324/** 325 * Constructs a set from the given pattern, optionally ignoring 326 * white space. See the class description for the syntax of the 327 * pattern language. 328 * @param pattern a string specifying what characters are in the set 329 */ 330UnicodeSet::UnicodeSet(const UnicodeString& pattern, 331 UErrorCode& status) : 332 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 333 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 334 fFlags(0) 335{ 336 if(U_SUCCESS(status)){ 337 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 338 /* test for NULL */ 339 if(list == NULL) { 340 status = U_MEMORY_ALLOCATION_ERROR; 341 }else{ 342 allocateStrings(status); 343 applyPattern(pattern, status); 344 } 345 } 346 _dbgct(this); 347} 348 349//---------------------------------------------------------------- 350// Public API 351//---------------------------------------------------------------- 352 353UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 354 UErrorCode& status) { 355 // Equivalent to 356 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 357 // but without dependency on closeOver(). 358 ParsePosition pos(0); 359 applyPatternIgnoreSpace(pattern, pos, NULL, status); 360 if (U_FAILURE(status)) return *this; 361 362 int32_t i = pos.getIndex(); 363 // Skip over trailing whitespace 364 ICU_Utility::skipWhitespace(pattern, i, TRUE); 365 if (i != pattern.length()) { 366 status = U_ILLEGAL_ARGUMENT_ERROR; 367 } 368 return *this; 369} 370 371void 372UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, 373 ParsePosition& pos, 374 const SymbolTable* symbols, 375 UErrorCode& status) { 376 if (U_FAILURE(status)) { 377 return; 378 } 379 if (isFrozen()) { 380 status = U_NO_WRITE_PERMISSION; 381 return; 382 } 383 // Need to build the pattern in a temporary string because 384 // _applyPattern calls add() etc., which set pat to empty. 385 UnicodeString rebuiltPat; 386 RuleCharacterIterator chars(pattern, symbols, pos); 387 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, status); 388 if (U_FAILURE(status)) return; 389 if (chars.inVariable()) { 390 // syntaxError(chars, "Extra chars in variable value"); 391 status = U_MALFORMED_SET; 392 return; 393 } 394 setPattern(rebuiltPat); 395} 396 397/** 398 * Return true if the given position, in the given pattern, appears 399 * to be the start of a UnicodeSet pattern. 400 */ 401UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { 402 return ((pos+1) < pattern.length() && 403 pattern.charAt(pos) == (UChar)91/*[*/) || 404 resemblesPropertyPattern(pattern, pos); 405} 406 407//---------------------------------------------------------------- 408// Implementation: Pattern parsing 409//---------------------------------------------------------------- 410 411/** 412 * A small all-inline class to manage a UnicodeSet pointer. Add 413 * operator->() etc. as needed. 414 */ 415class UnicodeSetPointer { 416 UnicodeSet* p; 417public: 418 inline UnicodeSetPointer() : p(0) {} 419 inline ~UnicodeSetPointer() { delete p; } 420 inline UnicodeSet* pointer() { return p; } 421 inline UBool allocate() { 422 if (p == 0) { 423 p = new UnicodeSet(); 424 } 425 return p != 0; 426 } 427}; 428 429/** 430 * Parse the pattern from the given RuleCharacterIterator. The 431 * iterator is advanced over the parsed pattern. 432 * @param chars iterator over the pattern characters. Upon return 433 * it will be advanced to the first character after the parsed 434 * pattern, or the end of the iteration if all characters are 435 * parsed. 436 * @param symbols symbol table to use to parse and dereference 437 * variables, or null if none. 438 * @param rebuiltPat the pattern that was parsed, rebuilt or 439 * copied from the input pattern, as appropriate. 440 * @param options a bit mask of zero or more of the following: 441 * IGNORE_SPACE, CASE. 442 */ 443void UnicodeSet::applyPattern(RuleCharacterIterator& chars, 444 const SymbolTable* symbols, 445 UnicodeString& rebuiltPat, 446 uint32_t options, 447 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), 448 UErrorCode& ec) { 449 if (U_FAILURE(ec)) return; 450 451 // Syntax characters: [ ] ^ - & { } 452 453 // Recognized special forms for chars, sets: c-c s-s s&s 454 455 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | 456 RuleCharacterIterator::PARSE_ESCAPES; 457 if ((options & USET_IGNORE_SPACE) != 0) { 458 opts |= RuleCharacterIterator::SKIP_WHITESPACE; 459 } 460 461 UnicodeString patLocal, buf; 462 UBool usePat = FALSE; 463 UnicodeSetPointer scratch; 464 RuleCharacterIterator::Pos backup; 465 466 // mode: 0=before [, 1=between [...], 2=after ] 467 // lastItem: 0=none, 1=char, 2=set 468 int8_t lastItem = 0, mode = 0; 469 UChar32 lastChar = 0; 470 UChar op = 0; 471 472 UBool invert = FALSE; 473 474 clear(); 475 476 while (mode != 2 && !chars.atEnd()) { 477 U_ASSERT((lastItem == 0 && op == 0) || 478 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || 479 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || 480 op == INTERSECTION /*'&'*/))); 481 482 UChar32 c = 0; 483 UBool literal = FALSE; 484 UnicodeSet* nested = 0; // alias - do not delete 485 486 // -------- Check for property pattern 487 488 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 489 int8_t setMode = 0; 490 if (resemblesPropertyPattern(chars, opts)) { 491 setMode = 2; 492 } 493 494 // -------- Parse '[' of opening delimiter OR nested set. 495 // If there is a nested set, use `setMode' to define how 496 // the set should be parsed. If the '[' is part of the 497 // opening delimiter for this pattern, parse special 498 // strings "[", "[^", "[-", and "[^-". Check for stand-in 499 // characters representing a nested set in the symbol 500 // table. 501 502 else { 503 // Prepare to backup if necessary 504 chars.getPos(backup); 505 c = chars.next(opts, literal, ec); 506 if (U_FAILURE(ec)) return; 507 508 if (c == 0x5B /*'['*/ && !literal) { 509 if (mode == 1) { 510 chars.setPos(backup); // backup 511 setMode = 1; 512 } else { 513 // Handle opening '[' delimiter 514 mode = 1; 515 patLocal.append((UChar) 0x5B /*'['*/); 516 chars.getPos(backup); // prepare to backup 517 c = chars.next(opts, literal, ec); 518 if (U_FAILURE(ec)) return; 519 if (c == 0x5E /*'^'*/ && !literal) { 520 invert = TRUE; 521 patLocal.append((UChar) 0x5E /*'^'*/); 522 chars.getPos(backup); // prepare to backup 523 c = chars.next(opts, literal, ec); 524 if (U_FAILURE(ec)) return; 525 } 526 // Fall through to handle special leading '-'; 527 // otherwise restart loop for nested [], \p{}, etc. 528 if (c == HYPHEN /*'-'*/) { 529 literal = TRUE; 530 // Fall through to handle literal '-' below 531 } else { 532 chars.setPos(backup); // backup 533 continue; 534 } 535 } 536 } else if (symbols != 0) { 537 const UnicodeFunctor *m = symbols->lookupMatcher(c); 538 if (m != 0) { 539 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m); 540 if (ms == NULL) { 541 ec = U_MALFORMED_SET; 542 return; 543 } 544 // casting away const, but `nested' won't be modified 545 // (important not to modify stored set) 546 nested = const_cast<UnicodeSet*>(ms); 547 setMode = 3; 548 } 549 } 550 } 551 552 // -------- Handle a nested set. This either is inline in 553 // the pattern or represented by a stand-in that has 554 // previously been parsed and was looked up in the symbol 555 // table. 556 557 if (setMode != 0) { 558 if (lastItem == 1) { 559 if (op != 0) { 560 // syntaxError(chars, "Char expected after operator"); 561 ec = U_MALFORMED_SET; 562 return; 563 } 564 add(lastChar, lastChar); 565 _appendToPat(patLocal, lastChar, FALSE); 566 lastItem = 0; 567 op = 0; 568 } 569 570 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { 571 patLocal.append(op); 572 } 573 574 if (nested == 0) { 575 // lazy allocation 576 if (!scratch.allocate()) { 577 ec = U_MEMORY_ALLOCATION_ERROR; 578 return; 579 } 580 nested = scratch.pointer(); 581 } 582 switch (setMode) { 583 case 1: 584 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, ec); 585 break; 586 case 2: 587 chars.skipIgnored(opts); 588 nested->applyPropertyPattern(chars, patLocal, ec); 589 if (U_FAILURE(ec)) return; 590 break; 591 case 3: // `nested' already parsed 592 nested->_toPattern(patLocal, FALSE); 593 break; 594 } 595 596 usePat = TRUE; 597 598 if (mode == 0) { 599 // Entire pattern is a category; leave parse loop 600 *this = *nested; 601 mode = 2; 602 break; 603 } 604 605 switch (op) { 606 case HYPHEN: /*'-'*/ 607 removeAll(*nested); 608 break; 609 case INTERSECTION: /*'&'*/ 610 retainAll(*nested); 611 break; 612 case 0: 613 addAll(*nested); 614 break; 615 } 616 617 op = 0; 618 lastItem = 2; 619 620 continue; 621 } 622 623 if (mode == 0) { 624 // syntaxError(chars, "Missing '['"); 625 ec = U_MALFORMED_SET; 626 return; 627 } 628 629 // -------- Parse special (syntax) characters. If the 630 // current character is not special, or if it is escaped, 631 // then fall through and handle it below. 632 633 if (!literal) { 634 switch (c) { 635 case 0x5D /*']'*/: 636 if (lastItem == 1) { 637 add(lastChar, lastChar); 638 _appendToPat(patLocal, lastChar, FALSE); 639 } 640 // Treat final trailing '-' as a literal 641 if (op == HYPHEN /*'-'*/) { 642 add(op, op); 643 patLocal.append(op); 644 } else if (op == INTERSECTION /*'&'*/) { 645 // syntaxError(chars, "Trailing '&'"); 646 ec = U_MALFORMED_SET; 647 return; 648 } 649 patLocal.append((UChar) 0x5D /*']'*/); 650 mode = 2; 651 continue; 652 case HYPHEN /*'-'*/: 653 if (op == 0) { 654 if (lastItem != 0) { 655 op = (UChar) c; 656 continue; 657 } else { 658 // Treat final trailing '-' as a literal 659 add(c, c); 660 c = chars.next(opts, literal, ec); 661 if (U_FAILURE(ec)) return; 662 if (c == 0x5D /*']'*/ && !literal) { 663 patLocal.append(HYPHEN_RIGHT_BRACE, 2); 664 mode = 2; 665 continue; 666 } 667 } 668 } 669 // syntaxError(chars, "'-' not after char or set"); 670 ec = U_MALFORMED_SET; 671 return; 672 case INTERSECTION /*'&'*/: 673 if (lastItem == 2 && op == 0) { 674 op = (UChar) c; 675 continue; 676 } 677 // syntaxError(chars, "'&' not after set"); 678 ec = U_MALFORMED_SET; 679 return; 680 case 0x5E /*'^'*/: 681 // syntaxError(chars, "'^' not after '['"); 682 ec = U_MALFORMED_SET; 683 return; 684 case 0x7B /*'{'*/: 685 if (op != 0) { 686 // syntaxError(chars, "Missing operand after operator"); 687 ec = U_MALFORMED_SET; 688 return; 689 } 690 if (lastItem == 1) { 691 add(lastChar, lastChar); 692 _appendToPat(patLocal, lastChar, FALSE); 693 } 694 lastItem = 0; 695 buf.truncate(0); 696 { 697 UBool ok = FALSE; 698 while (!chars.atEnd()) { 699 c = chars.next(opts, literal, ec); 700 if (U_FAILURE(ec)) return; 701 if (c == 0x7D /*'}'*/ && !literal) { 702 ok = TRUE; 703 break; 704 } 705 buf.append(c); 706 } 707 if (buf.length() < 1 || !ok) { 708 // syntaxError(chars, "Invalid multicharacter string"); 709 ec = U_MALFORMED_SET; 710 return; 711 } 712 } 713 // We have new string. Add it to set and continue; 714 // we don't need to drop through to the further 715 // processing 716 add(buf); 717 patLocal.append((UChar) 0x7B /*'{'*/); 718 _appendToPat(patLocal, buf, FALSE); 719 patLocal.append((UChar) 0x7D /*'}'*/); 720 continue; 721 case SymbolTable::SYMBOL_REF: 722 // symbols nosymbols 723 // [a-$] error error (ambiguous) 724 // [a$] anchor anchor 725 // [a-$x] var "x"* literal '$' 726 // [a-$.] error literal '$' 727 // *We won't get here in the case of var "x" 728 { 729 chars.getPos(backup); 730 c = chars.next(opts, literal, ec); 731 if (U_FAILURE(ec)) return; 732 UBool anchor = (c == 0x5D /*']'*/ && !literal); 733 if (symbols == 0 && !anchor) { 734 c = SymbolTable::SYMBOL_REF; 735 chars.setPos(backup); 736 break; // literal '$' 737 } 738 if (anchor && op == 0) { 739 if (lastItem == 1) { 740 add(lastChar, lastChar); 741 _appendToPat(patLocal, lastChar, FALSE); 742 } 743 add(U_ETHER); 744 usePat = TRUE; 745 patLocal.append((UChar) SymbolTable::SYMBOL_REF); 746 patLocal.append((UChar) 0x5D /*']'*/); 747 mode = 2; 748 continue; 749 } 750 // syntaxError(chars, "Unquoted '$'"); 751 ec = U_MALFORMED_SET; 752 return; 753 } 754 default: 755 break; 756 } 757 } 758 759 // -------- Parse literal characters. This includes both 760 // escaped chars ("\u4E01") and non-syntax characters 761 // ("a"). 762 763 switch (lastItem) { 764 case 0: 765 lastItem = 1; 766 lastChar = c; 767 break; 768 case 1: 769 if (op == HYPHEN /*'-'*/) { 770 if (lastChar >= c) { 771 // Don't allow redundant (a-a) or empty (b-a) ranges; 772 // these are most likely typos. 773 // syntaxError(chars, "Invalid range"); 774 ec = U_MALFORMED_SET; 775 return; 776 } 777 add(lastChar, c); 778 _appendToPat(patLocal, lastChar, FALSE); 779 patLocal.append(op); 780 _appendToPat(patLocal, c, FALSE); 781 lastItem = 0; 782 op = 0; 783 } else { 784 add(lastChar, lastChar); 785 _appendToPat(patLocal, lastChar, FALSE); 786 lastChar = c; 787 } 788 break; 789 case 2: 790 if (op != 0) { 791 // syntaxError(chars, "Set expected after operator"); 792 ec = U_MALFORMED_SET; 793 return; 794 } 795 lastChar = c; 796 lastItem = 1; 797 break; 798 } 799 } 800 801 if (mode != 2) { 802 // syntaxError(chars, "Missing ']'"); 803 ec = U_MALFORMED_SET; 804 return; 805 } 806 807 chars.skipIgnored(opts); 808 809 /** 810 * Handle global flags (invert, case insensitivity). If this 811 * pattern should be compiled case-insensitive, then we need 812 * to close over case BEFORE COMPLEMENTING. This makes 813 * patterns like /[^abc]/i work. 814 */ 815 if ((options & USET_CASE_INSENSITIVE) != 0) { 816 (this->*caseClosure)(USET_CASE_INSENSITIVE); 817 } 818 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { 819 (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); 820 } 821 if (invert) { 822 complement(); 823 } 824 825 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the 826 // generated pattern. 827 if (usePat) { 828 rebuiltPat.append(patLocal); 829 } else { 830 _generatePattern(rebuiltPat, FALSE); 831 } 832 if (isBogus() && U_SUCCESS(ec)) { 833 // We likely ran out of memory. AHHH! 834 ec = U_MEMORY_ALLOCATION_ERROR; 835 } 836} 837 838//---------------------------------------------------------------- 839// Property set implementation 840//---------------------------------------------------------------- 841 842static UBool numericValueFilter(UChar32 ch, void* context) { 843 return u_getNumericValue(ch) == *(double*)context; 844} 845 846static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { 847 int32_t value = *(int32_t*)context; 848 return (U_GET_GC_MASK((UChar32) ch) & value) != 0; 849} 850 851static UBool versionFilter(UChar32 ch, void* context) { 852 static const UVersionInfo none = { 0, 0, 0, 0 }; 853 UVersionInfo v; 854 u_charAge(ch, v); 855 UVersionInfo* version = (UVersionInfo*)context; 856 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; 857} 858 859typedef struct { 860 UProperty prop; 861 int32_t value; 862} IntPropertyContext; 863 864static UBool intPropertyFilter(UChar32 ch, void* context) { 865 IntPropertyContext* c = (IntPropertyContext*)context; 866 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; 867} 868 869static UBool scriptExtensionsFilter(UChar32 ch, void* context) { 870 return uscript_hasScript(ch, *(UScriptCode*)context); 871} 872 873/** 874 * Generic filter-based scanning code for UCD property UnicodeSets. 875 */ 876void UnicodeSet::applyFilter(UnicodeSet::Filter filter, 877 void* context, 878 int32_t src, 879 UErrorCode &status) { 880 if (U_FAILURE(status)) return; 881 882 // Logically, walk through all Unicode characters, noting the start 883 // and end of each range for which filter.contain(c) is 884 // true. Add each range to a set. 885 // 886 // To improve performance, use an inclusions set which 887 // encodes information about character ranges that are known 888 // to have identical properties. 889 // getInclusions(src) contains exactly the first characters of 890 // same-value ranges for the given properties "source". 891 const UnicodeSet* inclusions = getInclusions(src, status); 892 if (U_FAILURE(status)) { 893 return; 894 } 895 896 clear(); 897 898 UChar32 startHasProperty = -1; 899 int32_t limitRange = inclusions->getRangeCount(); 900 901 for (int j=0; j<limitRange; ++j) { 902 // get current range 903 UChar32 start = inclusions->getRangeStart(j); 904 UChar32 end = inclusions->getRangeEnd(j); 905 906 // for all the code points in the range, process 907 for (UChar32 ch = start; ch <= end; ++ch) { 908 // only add to this UnicodeSet on inflection points -- 909 // where the hasProperty value changes to false 910 if ((*filter)(ch, context)) { 911 if (startHasProperty < 0) { 912 startHasProperty = ch; 913 } 914 } else if (startHasProperty >= 0) { 915 add(startHasProperty, ch-1); 916 startHasProperty = -1; 917 } 918 } 919 } 920 if (startHasProperty >= 0) { 921 add((UChar32)startHasProperty, (UChar32)0x10FFFF); 922 } 923 if (isBogus() && U_SUCCESS(status)) { 924 // We likely ran out of memory. AHHH! 925 status = U_MEMORY_ALLOCATION_ERROR; 926 } 927} 928 929static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { 930 /* Note: we use ' ' in compiler code page */ 931 int32_t j = 0; 932 char ch; 933 --dstCapacity; /* make room for term. zero */ 934 while ((ch = *src++) != 0) { 935 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { 936 continue; 937 } 938 if (j >= dstCapacity) return FALSE; 939 dst[j++] = ch; 940 } 941 if (j > 0 && dst[j-1] == ' ') --j; 942 dst[j] = 0; 943 return TRUE; 944} 945 946//---------------------------------------------------------------- 947// Property set API 948//---------------------------------------------------------------- 949 950#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} 951 952UnicodeSet& 953UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { 954 if (U_FAILURE(ec) || isFrozen()) return *this; 955 956 if (prop == UCHAR_GENERAL_CATEGORY_MASK) { 957 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); 958 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { 959 UScriptCode script = (UScriptCode)value; 960 applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec); 961 } else { 962 IntPropertyContext c = {prop, value}; 963 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); 964 } 965 return *this; 966} 967 968UnicodeSet& 969UnicodeSet::applyPropertyAlias(const UnicodeString& prop, 970 const UnicodeString& value, 971 UErrorCode& ec) { 972 if (U_FAILURE(ec) || isFrozen()) return *this; 973 974 // prop and value used to be converted to char * using the default 975 // converter instead of the invariant conversion. 976 // This should not be necessary because all Unicode property and value 977 // names use only invariant characters. 978 // If there are any variant characters, then we won't find them anyway. 979 // Checking first avoids assertion failures in the conversion. 980 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || 981 !uprv_isInvariantUString(value.getBuffer(), value.length()) 982 ) { 983 FAIL(ec); 984 } 985 CharString pname, vname; 986 pname.appendInvariantChars(prop, ec); 987 vname.appendInvariantChars(value, ec); 988 if (U_FAILURE(ec)) return *this; 989 990 UProperty p; 991 int32_t v; 992 UBool mustNotBeEmpty = FALSE, invert = FALSE; 993 994 if (value.length() > 0) { 995 p = u_getPropertyEnum(pname.data()); 996 if (p == UCHAR_INVALID_CODE) FAIL(ec); 997 998 // Treat gc as gcm 999 if (p == UCHAR_GENERAL_CATEGORY) { 1000 p = UCHAR_GENERAL_CATEGORY_MASK; 1001 } 1002 1003 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || 1004 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || 1005 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { 1006 v = u_getPropertyValueEnum(p, vname.data()); 1007 if (v == UCHAR_INVALID_CODE) { 1008 // Handle numeric CCC 1009 if (p == UCHAR_CANONICAL_COMBINING_CLASS || 1010 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || 1011 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { 1012 char* end; 1013 double value = uprv_strtod(vname.data(), &end); 1014 v = (int32_t) value; 1015 if (v != value || v < 0 || *end != 0) { 1016 // non-integral or negative value, or trailing junk 1017 FAIL(ec); 1018 } 1019 // If the resultant set is empty then the numeric value 1020 // was invalid. 1021 mustNotBeEmpty = TRUE; 1022 } else { 1023 FAIL(ec); 1024 } 1025 } 1026 } 1027 1028 else { 1029 1030 switch (p) { 1031 case UCHAR_NUMERIC_VALUE: 1032 { 1033 char* end; 1034 double value = uprv_strtod(vname.data(), &end); 1035 if (*end != 0) { 1036 FAIL(ec); 1037 } 1038 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); 1039 return *this; 1040 } 1041 case UCHAR_NAME: 1042 { 1043 // Must munge name, since u_charFromName() does not do 1044 // 'loose' matching. 1045 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength 1046 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1047 UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); 1048 if (U_SUCCESS(ec)) { 1049 clear(); 1050 add(ch); 1051 return *this; 1052 } else { 1053 FAIL(ec); 1054 } 1055 } 1056 case UCHAR_UNICODE_1_NAME: 1057 // ICU 49 deprecates the Unicode_1_Name property APIs. 1058 FAIL(ec); 1059 case UCHAR_AGE: 1060 { 1061 // Must munge name, since u_versionFromString() does not do 1062 // 'loose' matching. 1063 char buf[128]; 1064 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1065 UVersionInfo version; 1066 u_versionFromString(version, buf); 1067 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); 1068 return *this; 1069 } 1070 case UCHAR_SCRIPT_EXTENSIONS: 1071 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); 1072 if (v == UCHAR_INVALID_CODE) { 1073 FAIL(ec); 1074 } 1075 // fall through to calling applyIntPropertyValue() 1076 break; 1077 default: 1078 // p is a non-binary, non-enumerated property that we 1079 // don't support (yet). 1080 FAIL(ec); 1081 } 1082 } 1083 } 1084 1085 else { 1086 // value is empty. Interpret as General Category, Script, or 1087 // Binary property. 1088 p = UCHAR_GENERAL_CATEGORY_MASK; 1089 v = u_getPropertyValueEnum(p, pname.data()); 1090 if (v == UCHAR_INVALID_CODE) { 1091 p = UCHAR_SCRIPT; 1092 v = u_getPropertyValueEnum(p, pname.data()); 1093 if (v == UCHAR_INVALID_CODE) { 1094 p = u_getPropertyEnum(pname.data()); 1095 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { 1096 v = 1; 1097 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { 1098 set(MIN_VALUE, MAX_VALUE); 1099 return *this; 1100 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { 1101 set(0, 0x7F); 1102 return *this; 1103 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { 1104 // [:Assigned:]=[:^Cn:] 1105 p = UCHAR_GENERAL_CATEGORY_MASK; 1106 v = U_GC_CN_MASK; 1107 invert = TRUE; 1108 } else { 1109 FAIL(ec); 1110 } 1111 } 1112 } 1113 } 1114 1115 applyIntPropertyValue(p, v, ec); 1116 if(invert) { 1117 complement(); 1118 } 1119 1120 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { 1121 // mustNotBeEmpty is set to true if an empty set indicates 1122 // invalid input. 1123 ec = U_ILLEGAL_ARGUMENT_ERROR; 1124 } 1125 1126 if (isBogus() && U_SUCCESS(ec)) { 1127 // We likely ran out of memory. AHHH! 1128 ec = U_MEMORY_ALLOCATION_ERROR; 1129 } 1130 return *this; 1131} 1132 1133//---------------------------------------------------------------- 1134// Property set patterns 1135//---------------------------------------------------------------- 1136 1137/** 1138 * Return true if the given position, in the given pattern, appears 1139 * to be the start of a property set pattern. 1140 */ 1141UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, 1142 int32_t pos) { 1143 // Patterns are at least 5 characters long 1144 if ((pos+5) > pattern.length()) { 1145 return FALSE; 1146 } 1147 1148 // Look for an opening [:, [:^, \p, or \P 1149 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); 1150} 1151 1152/** 1153 * Return true if the given iterator appears to point at a 1154 * property pattern. Regardless of the result, return with the 1155 * iterator unchanged. 1156 * @param chars iterator over the pattern characters. Upon return 1157 * it will be unchanged. 1158 * @param iterOpts RuleCharacterIterator options 1159 */ 1160UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, 1161 int32_t iterOpts) { 1162 // NOTE: literal will always be FALSE, because we don't parse escapes. 1163 UBool result = FALSE, literal; 1164 UErrorCode ec = U_ZERO_ERROR; 1165 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; 1166 RuleCharacterIterator::Pos pos; 1167 chars.getPos(pos); 1168 UChar32 c = chars.next(iterOpts, literal, ec); 1169 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { 1170 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, 1171 literal, ec); 1172 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : 1173 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); 1174 } 1175 chars.setPos(pos); 1176 return result && U_SUCCESS(ec); 1177} 1178 1179/** 1180 * Parse the given property pattern at the given parse position. 1181 */ 1182UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, 1183 ParsePosition& ppos, 1184 UErrorCode &ec) { 1185 int32_t pos = ppos.getIndex(); 1186 1187 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1188 UBool isName = FALSE; // true for \N{pat}, o/w false 1189 UBool invert = FALSE; 1190 1191 if (U_FAILURE(ec)) return *this; 1192 1193 // Minimum length is 5 characters, e.g. \p{L} 1194 if ((pos+5) > pattern.length()) { 1195 FAIL(ec); 1196 } 1197 1198 // On entry, ppos should point to one of the following locations: 1199 // Look for an opening [:, [:^, \p, or \P 1200 if (isPOSIXOpen(pattern, pos)) { 1201 posix = TRUE; 1202 pos += 2; 1203 pos = ICU_Utility::skipWhitespace(pattern, pos); 1204 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { 1205 ++pos; 1206 invert = TRUE; 1207 } 1208 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { 1209 UChar c = pattern.charAt(pos+1); 1210 invert = (c == UPPER_P); 1211 isName = (c == UPPER_N); 1212 pos += 2; 1213 pos = ICU_Utility::skipWhitespace(pattern, pos); 1214 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { 1215 // Syntax error; "\p" or "\P" not followed by "{" 1216 FAIL(ec); 1217 } 1218 } else { 1219 // Open delimiter not seen 1220 FAIL(ec); 1221 } 1222 1223 // Look for the matching close delimiter, either :] or } 1224 int32_t close; 1225 if (posix) { 1226 close = pattern.indexOf(POSIX_CLOSE, 2, pos); 1227 } else { 1228 close = pattern.indexOf(CLOSE_BRACE, pos); 1229 } 1230 if (close < 0) { 1231 // Syntax error; close delimiter missing 1232 FAIL(ec); 1233 } 1234 1235 // Look for an '=' sign. If this is present, we will parse a 1236 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1237 // pattern. 1238 int32_t equals = pattern.indexOf(EQUALS, pos); 1239 UnicodeString propName, valueName; 1240 if (equals >= 0 && equals < close && !isName) { 1241 // Equals seen; parse medium/long pattern 1242 pattern.extractBetween(pos, equals, propName); 1243 pattern.extractBetween(equals+1, close, valueName); 1244 } 1245 1246 else { 1247 // Handle case where no '=' is seen, and \N{} 1248 pattern.extractBetween(pos, close, propName); 1249 1250 // Handle \N{name} 1251 if (isName) { 1252 // This is a little inefficient since it means we have to 1253 // parse NAME_PROP back to UCHAR_NAME even though we already 1254 // know it's UCHAR_NAME. If we refactor the API to 1255 // support args of (UProperty, char*) then we can remove 1256 // NAME_PROP and make this a little more efficient. 1257 valueName = propName; 1258 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); 1259 } 1260 } 1261 1262 applyPropertyAlias(propName, valueName, ec); 1263 1264 if (U_SUCCESS(ec)) { 1265 if (invert) { 1266 complement(); 1267 } 1268 1269 // Move to the limit position after the close delimiter if the 1270 // parse succeeded. 1271 ppos.setIndex(close + (posix ? 2 : 1)); 1272 } 1273 1274 return *this; 1275} 1276 1277/** 1278 * Parse a property pattern. 1279 * @param chars iterator over the pattern characters. Upon return 1280 * it will be advanced to the first character after the parsed 1281 * pattern, or the end of the iteration if all characters are 1282 * parsed. 1283 * @param rebuiltPat the pattern that was parsed, rebuilt or 1284 * copied from the input pattern, as appropriate. 1285 */ 1286void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, 1287 UnicodeString& rebuiltPat, 1288 UErrorCode& ec) { 1289 if (U_FAILURE(ec)) return; 1290 UnicodeString pattern; 1291 chars.lookahead(pattern); 1292 ParsePosition pos(0); 1293 applyPropertyPattern(pattern, pos, ec); 1294 if (U_FAILURE(ec)) return; 1295 if (pos.getIndex() == 0) { 1296 // syntaxError(chars, "Invalid property pattern"); 1297 ec = U_MALFORMED_SET; 1298 return; 1299 } 1300 chars.jumpahead(pos.getIndex()); 1301 rebuiltPat.append(pattern, 0, pos.getIndex()); 1302} 1303 1304U_NAMESPACE_END 1305