1/* 2******************************************************************************* 3* 4* Copyright (C) 1999-2010, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: uniset_props.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2004aug25 14* created by: Markus W. Scherer 15* 16* Character property dependent functions moved here from uniset.cpp 17*/ 18 19#include "unicode/utypes.h" 20#include "unicode/uniset.h" 21#include "unicode/parsepos.h" 22#include "unicode/uchar.h" 23#include "unicode/uscript.h" 24#include "unicode/symtable.h" 25#include "unicode/uset.h" 26#include "unicode/locid.h" 27#include "unicode/brkiter.h" 28#include "uset_imp.h" 29#include "ruleiter.h" 30#include "cmemory.h" 31#include "ucln_cmn.h" 32#include "util.h" 33#include "uvector.h" 34#include "uprops.h" 35#include "propname.h" 36#include "normalizer2impl.h" 37#include "unormimp.h" 38#include "ucase.h" 39#include "ubidi_props.h" 40#include "uinvchar.h" 41#include "uprops.h" 42#include "charstr.h" 43#include "cstring.h" 44#include "mutex.h" 45#include "umutex.h" 46#include "uassert.h" 47#include "hash.h" 48 49U_NAMESPACE_USE 50 51#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 52 53// initial storage. Must be >= 0 54// *** same as in uniset.cpp ! *** 55#define START_EXTRA 16 56 57// Define UChar constants using hex for EBCDIC compatibility 58// Used #define to reduce private static exports and memory access time. 
59#define SET_OPEN ((UChar)0x005B) /*[*/ 60#define SET_CLOSE ((UChar)0x005D) /*]*/ 61#define HYPHEN ((UChar)0x002D) /*-*/ 62#define COMPLEMENT ((UChar)0x005E) /*^*/ 63#define COLON ((UChar)0x003A) /*:*/ 64#define BACKSLASH ((UChar)0x005C) /*\*/ 65#define INTERSECTION ((UChar)0x0026) /*&*/ 66#define UPPER_U ((UChar)0x0055) /*U*/ 67#define LOWER_U ((UChar)0x0075) /*u*/ 68#define OPEN_BRACE ((UChar)123) /*{*/ 69#define CLOSE_BRACE ((UChar)125) /*}*/ 70#define UPPER_P ((UChar)0x0050) /*P*/ 71#define LOWER_P ((UChar)0x0070) /*p*/ 72#define UPPER_N ((UChar)78) /*N*/ 73#define EQUALS ((UChar)0x003D) /*=*/ 74 75//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" 76static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" 77//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" 78static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" 79//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" 80static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ 81 82// Special property set IDs 83static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] 84static const char ASCII[] = "ASCII"; // [\u0000-\u007F] 85static const char ASSIGNED[] = "Assigned"; // [:^Cn:] 86 87// Unicode name property alias 88#define NAME_PROP "na" 89#define NAME_PROP_LENGTH 2 90 91/** 92 * Delimiter string used in patterns to close a category reference: 93 * ":]". Example: "[:Lu:]". 94 */ 95//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ 96 97// Cached sets ------------------------------------------------------------- *** 98 99U_CDECL_BEGIN 100static UBool U_CALLCONV uset_cleanup(); 101U_CDECL_END 102 103// Not a TriStateSingletonWrapper because we think the UnicodeSet constructor 104// can only fail with an out-of-memory error 105// if we have a correct pattern and the properties data is hardcoded and always available. 
106class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> { 107public: 108 UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) : 109 SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {} 110 UnicodeSet *getInstance(UErrorCode &errorCode) { 111 return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode); 112 } 113private: 114 static void *createInstance(const void *context, UErrorCode &errorCode) { 115 UnicodeString pattern((const char *)context, -1, US_INV); 116 UnicodeSet *set=new UnicodeSet(pattern, errorCode); 117 if(set==NULL) { 118 errorCode=U_MEMORY_ALLOCATION_ERROR; 119 } 120 set->freeze(); 121 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 122 return set; 123 } 124 125 const char *fPattern; 126}; 127 128U_CDECL_BEGIN 129 130static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions() 131 132STATIC_SIMPLE_SINGLETON(uni32Singleton); 133 134//---------------------------------------------------------------- 135// Inclusions list 136//---------------------------------------------------------------- 137 138// USetAdder implementation 139// Does not use uset.h to reduce code dependencies 140static void U_CALLCONV 141_set_add(USet *set, UChar32 c) { 142 ((UnicodeSet *)set)->add(c); 143} 144 145static void U_CALLCONV 146_set_addRange(USet *set, UChar32 start, UChar32 end) { 147 ((UnicodeSet *)set)->add(start, end); 148} 149 150static void U_CALLCONV 151_set_addString(USet *set, const UChar *str, int32_t length) { 152 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 153} 154 155/** 156 * Cleanup function for UnicodeSet 157 */ 158static UBool U_CALLCONV uset_cleanup(void) { 159 int32_t i; 160 161 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { 162 if (INCLUSIONS[i] != NULL) { 163 delete INCLUSIONS[i]; 164 INCLUSIONS[i] = NULL; 165 } 166 } 167 UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance(); 168 return TRUE; 169} 170 
171U_CDECL_END 172 173U_NAMESPACE_BEGIN 174 175/* 176Reduce excessive reallocation, and make it easier to detect initialization 177problems. 178Usually you don't see smaller sets than this for Unicode 5.0. 179*/ 180#define DEFAULT_INCLUSION_CAPACITY 3072 181 182const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { 183 UBool needInit; 184 UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit); 185 if (needInit) { 186 UnicodeSet* incl = new UnicodeSet(); 187 USetAdder sa = { 188 (USet *)incl, 189 _set_add, 190 _set_addRange, 191 _set_addString, 192 NULL, // don't need remove() 193 NULL // don't need removeRange() 194 }; 195 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); 196 if (incl != NULL) { 197 switch(src) { 198 case UPROPS_SRC_CHAR: 199 uchar_addPropertyStarts(&sa, &status); 200 break; 201 case UPROPS_SRC_PROPSVEC: 202 upropsvec_addPropertyStarts(&sa, &status); 203 break; 204 case UPROPS_SRC_CHAR_AND_PROPSVEC: 205 uchar_addPropertyStarts(&sa, &status); 206 upropsvec_addPropertyStarts(&sa, &status); 207 break; 208#if !UCONFIG_NO_NORMALIZATION 209 case UPROPS_SRC_NORM: 210 unorm_addPropertyStarts(&sa, &status); 211 break; 212 case UPROPS_SRC_CASE_AND_NORM: 213 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status); 214 unorm_addPropertyStarts(&sa, &status); 215 break; 216 case UPROPS_SRC_NFC: { 217 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 218 if(U_SUCCESS(status)) { 219 impl->addPropertyStarts(&sa, status); 220 } 221 break; 222 } 223 case UPROPS_SRC_NFKC: { 224 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); 225 if(U_SUCCESS(status)) { 226 impl->addPropertyStarts(&sa, status); 227 } 228 break; 229 } 230 case UPROPS_SRC_NFKC_CF: { 231 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); 232 if(U_SUCCESS(status)) { 233 impl->addPropertyStarts(&sa, status); 234 } 235 break; 236 } 237#endif 238 case UPROPS_SRC_CASE: 239 
ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status); 240 break; 241 case UPROPS_SRC_BIDI: 242 ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status); 243 break; 244 default: 245 status = U_INTERNAL_PROGRAM_ERROR; 246 break; 247 } 248 if (U_SUCCESS(status)) { 249 // Compact for caching 250 incl->compact(); 251 umtx_lock(NULL); 252 if (INCLUSIONS[src] == NULL) { 253 INCLUSIONS[src] = incl; 254 incl = NULL; 255 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 256 } 257 umtx_unlock(NULL); 258 } 259 delete incl; 260 } else { 261 status = U_MEMORY_ALLOCATION_ERROR; 262 } 263 } 264 return INCLUSIONS[src]; 265} 266 267// Cache some sets for other services -------------------------------------- *** 268 269U_CFUNC UnicodeSet * 270uniset_getUnicode32Instance(UErrorCode &errorCode) { 271 return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode); 272} 273 274// helper functions for matching of pattern syntax pieces ------------------ *** 275// these functions are parallel to the PERL_OPEN etc. 
strings above 276 277// using these functions is not only faster than UnicodeString::compare() and 278// caseCompare(), but they also make UnicodeSet work for simple patterns when 279// no Unicode properties data is available - when caseCompare() fails 280 281static inline UBool 282isPerlOpen(const UnicodeString &pattern, int32_t pos) { 283 UChar c; 284 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); 285} 286 287/*static inline UBool 288isPerlClose(const UnicodeString &pattern, int32_t pos) { 289 return pattern.charAt(pos)==CLOSE_BRACE; 290}*/ 291 292static inline UBool 293isNameOpen(const UnicodeString &pattern, int32_t pos) { 294 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; 295} 296 297static inline UBool 298isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { 299 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; 300} 301 302/*static inline UBool 303isPOSIXClose(const UnicodeString &pattern, int32_t pos) { 304 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; 305}*/ 306 307// TODO memory debugging provided inside uniset.cpp 308// could be made available here but probably obsolete with use of modern 309// memory leak checker tools 310#define _dbgct(me) 311 312//---------------------------------------------------------------- 313// Constructors &c 314//---------------------------------------------------------------- 315 316/** 317 * Constructs a set from the given pattern, optionally ignoring 318 * white space. See the class description for the syntax of the 319 * pattern language. 
320 * @param pattern a string specifying what characters are in the set 321 */ 322UnicodeSet::UnicodeSet(const UnicodeString& pattern, 323 UErrorCode& status) : 324 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 325 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 326 fFlags(0) 327{ 328 if(U_SUCCESS(status)){ 329 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 330 /* test for NULL */ 331 if(list == NULL) { 332 status = U_MEMORY_ALLOCATION_ERROR; 333 }else{ 334 allocateStrings(status); 335 applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 336 } 337 } 338 _dbgct(this); 339} 340 341/** 342 * Constructs a set from the given pattern, optionally ignoring 343 * white space. See the class description for the syntax of the 344 * pattern language. 345 * @param pattern a string specifying what characters are in the set 346 * @param options bitmask for options to apply to the pattern. 347 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 
348 */ 349UnicodeSet::UnicodeSet(const UnicodeString& pattern, 350 uint32_t options, 351 const SymbolTable* symbols, 352 UErrorCode& status) : 353 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 354 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 355 fFlags(0) 356{ 357 if(U_SUCCESS(status)){ 358 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 359 /* test for NULL */ 360 if(list == NULL) { 361 status = U_MEMORY_ALLOCATION_ERROR; 362 }else{ 363 allocateStrings(status); 364 applyPattern(pattern, options, symbols, status); 365 } 366 } 367 _dbgct(this); 368} 369 370UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 371 uint32_t options, 372 const SymbolTable* symbols, 373 UErrorCode& status) : 374 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 375 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 376 fFlags(0) 377{ 378 if(U_SUCCESS(status)){ 379 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 380 /* test for NULL */ 381 if(list == NULL) { 382 status = U_MEMORY_ALLOCATION_ERROR; 383 }else{ 384 allocateStrings(status); 385 applyPattern(pattern, pos, options, symbols, status); 386 } 387 } 388 _dbgct(this); 389} 390 391//---------------------------------------------------------------- 392// Public API 393//---------------------------------------------------------------- 394 395/** 396 * Modifies this set to represent the set specified by the given 397 * pattern, optionally ignoring white space. See the class 398 * description for the syntax of the pattern language. 399 * @param pattern a string specifying what characters are in the set 400 * @param ignoreSpaces if <code>true</code>, all spaces in the 401 * pattern are ignored. Spaces are those characters for which 402 * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>. 403 * Characters preceded by '\\' are escaped, losing any special 404 * meaning they otherwise have. 
Spaces may be included by 405 * escaping them. 406 * @exception <code>IllegalArgumentException</code> if the pattern 407 * contains a syntax error. 408 */ 409UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 410 UErrorCode& status) { 411 return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 412} 413 414 415/** 416 * Modifies this set to represent the set specified by the given 417 * pattern, optionally ignoring white space. See the class 418 * description for the syntax of the pattern language. 419 * @param pattern a string specifying what characters are in the set 420 * @param options bitmask for options to apply to the pattern. 421 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 422 */ 423UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 424 uint32_t options, 425 const SymbolTable* symbols, 426 UErrorCode& status) { 427 if (U_FAILURE(status) || isFrozen()) { 428 return *this; 429 } 430 431 ParsePosition pos(0); 432 applyPattern(pattern, pos, options, symbols, status); 433 if (U_FAILURE(status)) return *this; 434 435 int32_t i = pos.getIndex(); 436 437 if (options & USET_IGNORE_SPACE) { 438 // Skip over trailing whitespace 439 ICU_Utility::skipWhitespace(pattern, i, TRUE); 440 } 441 442 if (i != pattern.length()) { 443 status = U_ILLEGAL_ARGUMENT_ERROR; 444 } 445 return *this; 446} 447 448UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 449 ParsePosition& pos, 450 uint32_t options, 451 const SymbolTable* symbols, 452 UErrorCode& status) { 453 if (U_FAILURE(status) || isFrozen()) { 454 return *this; 455 } 456 // Need to build the pattern in a temporary string because 457 // _applyPattern calls add() etc., which set pat to empty. 
458 UnicodeString rebuiltPat; 459 RuleCharacterIterator chars(pattern, symbols, pos); 460 applyPattern(chars, symbols, rebuiltPat, options, status); 461 if (U_FAILURE(status)) return *this; 462 if (chars.inVariable()) { 463 // syntaxError(chars, "Extra chars in variable value"); 464 status = U_MALFORMED_SET; 465 return *this; 466 } 467 setPattern(rebuiltPat); 468 return *this; 469} 470 471/** 472 * Return true if the given position, in the given pattern, appears 473 * to be the start of a UnicodeSet pattern. 474 */ 475UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { 476 return ((pos+1) < pattern.length() && 477 pattern.charAt(pos) == (UChar)91/*[*/) || 478 resemblesPropertyPattern(pattern, pos); 479} 480 481//---------------------------------------------------------------- 482// Implementation: Pattern parsing 483//---------------------------------------------------------------- 484 485/** 486 * A small all-inline class to manage a UnicodeSet pointer. Add 487 * operator->() etc. as needed. 488 */ 489class UnicodeSetPointer { 490 UnicodeSet* p; 491public: 492 inline UnicodeSetPointer() : p(0) {} 493 inline ~UnicodeSetPointer() { delete p; } 494 inline UnicodeSet* pointer() { return p; } 495 inline UBool allocate() { 496 if (p == 0) { 497 p = new UnicodeSet(); 498 } 499 return p != 0; 500 } 501}; 502 503/** 504 * Parse the pattern from the given RuleCharacterIterator. The 505 * iterator is advanced over the parsed pattern. 506 * @param chars iterator over the pattern characters. Upon return 507 * it will be advanced to the first character after the parsed 508 * pattern, or the end of the iteration if all characters are 509 * parsed. 510 * @param symbols symbol table to use to parse and dereference 511 * variables, or null if none. 512 * @param rebuiltPat the pattern that was parsed, rebuilt or 513 * copied from the input pattern, as appropriate. 514 * @param options a bit mask of zero or more of the following: 515 * IGNORE_SPACE, CASE. 
*/
void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                              const SymbolTable* symbols,
                              UnicodeString& rebuiltPat,
                              uint32_t options,
                              UErrorCode& ec) {
    if (U_FAILURE(ec)) return;

    // Syntax characters: [ ] ^ - & { }

    // Recognized special forms for chars, sets: c-c s-s s&s

    // Iterator options used for every chars.next() call in this parser.
    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
                   RuleCharacterIterator::PARSE_ESCAPES;
    if ((options & USET_IGNORE_SPACE) != 0) {
        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    }

    // patLocal accumulates the rebuilt pattern text; buf collects the
    // contents of a {multi-character string}.
    UnicodeString patLocal, buf;
    // usePat: TRUE once patLocal (rather than a generated pattern) must be
    // appended to rebuiltPat at the end.
    UBool usePat = FALSE;
    // Lazily allocated set used for nested sets parsed inline.
    UnicodeSetPointer scratch;
    RuleCharacterIterator::Pos backup;

    // mode: 0=before [, 1=between [...], 2=after ]
    // lastItem: 0=none, 1=char, 2=set
    int8_t lastItem = 0, mode = 0;
    UChar32 lastChar = 0;
    // Pending binary operator: 0, HYPHEN or INTERSECTION.
    UChar op = 0;

    UBool invert = FALSE;

    clear();

    while (mode != 2 && !chars.atEnd()) {
        U_ASSERT((lastItem == 0 && op == 0) ||
                 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
                 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
                                    op == INTERSECTION /*'&'*/)));

        UChar32 c = 0;
        UBool literal = FALSE;
        UnicodeSet* nested = 0; // alias - do not delete

        // -------- Check for property pattern

        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
        int8_t setMode = 0;
        if (resemblesPropertyPattern(chars, opts)) {
            setMode = 2;
        }

        // -------- Parse '[' of opening delimiter OR nested set.
        // If there is a nested set, use `setMode' to define how
        // the set should be parsed.  If the '[' is part of the
        // opening delimiter for this pattern, parse special
        // strings "[", "[^", "[-", and "[^-".  Check for stand-in
        // characters representing a nested set in the symbol
        // table.

        else {
            // Prepare to backup if necessary
            chars.getPos(backup);
            c = chars.next(opts, literal, ec);
            if (U_FAILURE(ec)) return;

            if (c == 0x5B /*'['*/ && !literal) {
                if (mode == 1) {
                    chars.setPos(backup); // backup
                    setMode = 1;
                } else {
                    // Handle opening '[' delimiter
                    mode = 1;
                    patLocal.append((UChar) 0x5B /*'['*/);
                    chars.getPos(backup); // prepare to backup
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    if (c == 0x5E /*'^'*/ && !literal) {
                        invert = TRUE;
                        patLocal.append((UChar) 0x5E /*'^'*/);
                        chars.getPos(backup); // prepare to backup
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                    }
                    // Fall through to handle special leading '-';
                    // otherwise restart loop for nested [], \p{}, etc.
                    if (c == HYPHEN /*'-'*/) {
                        literal = TRUE;
                        // Fall through to handle literal '-' below
                    } else {
                        chars.setPos(backup); // backup
                        continue;
                    }
                }
            } else if (symbols != 0) {
                const UnicodeFunctor *m = symbols->lookupMatcher(c);
                if (m != 0) {
                    // Only UnicodeSet stand-ins are accepted here.
                    if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
                        ec = U_MALFORMED_SET;
                        return;
                    }
                    // casting away const, but `nested' won't be modified
                    // (important not to modify stored set)
                    nested = (UnicodeSet*) m;
                    setMode = 3;
                }
            }
        }

        // -------- Handle a nested set.  This either is inline in
        // the pattern or represented by a stand-in that has
        // previously been parsed and was looked up in the symbol
        // table.

        if (setMode != 0) {
            if (lastItem == 1) {
                if (op != 0) {
                    // syntaxError(chars, "Char expected after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                // Flush the pending single character before the nested set.
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastItem = 0;
                op = 0;
            }

            if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
                patLocal.append(op);
            }

            if (nested == 0) {
                // lazy allocation
                if (!scratch.allocate()) {
                    ec = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                nested = scratch.pointer();
            }
            switch (setMode) {
            case 1:
                nested->applyPattern(chars, symbols, patLocal, options, ec);
                break;
            case 2:
                chars.skipIgnored(opts);
                nested->applyPropertyPattern(chars, patLocal, ec);
                if (U_FAILURE(ec)) return;
                break;
            case 3: // `nested' already parsed
                nested->_toPattern(patLocal, FALSE);
                break;
            }

            usePat = TRUE;

            if (mode == 0) {
                // Entire pattern is a category; leave parse loop
                *this = *nested;
                mode = 2;
                break;
            }

            // Combine the nested set with this set according to the
            // pending operator.
            switch (op) {
            case HYPHEN: /*'-'*/
                removeAll(*nested);
                break;
            case INTERSECTION: /*'&'*/
                retainAll(*nested);
                break;
            case 0:
                addAll(*nested);
                break;
            }

            op = 0;
            lastItem = 2;

            continue;
        }

        if (mode == 0) {
            // syntaxError(chars, "Missing '['");
            ec = U_MALFORMED_SET;
            return;
        }

        // -------- Parse special (syntax) characters.  If the
        // current character is not special, or if it is escaped,
        // then fall through and handle it below.

        if (!literal) {
            switch (c) {
            case 0x5D /*']'*/:
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                // Treat final trailing '-' as a literal
                if (op == HYPHEN /*'-'*/) {
                    add(op, op);
                    patLocal.append(op);
                } else if (op == INTERSECTION /*'&'*/) {
                    // syntaxError(chars, "Trailing '&'");
                    ec = U_MALFORMED_SET;
                    return;
                }
                patLocal.append((UChar) 0x5D /*']'*/);
                mode = 2;
                continue;
            case HYPHEN /*'-'*/:
                if (op == 0) {
                    if (lastItem != 0) {
                        op = (UChar) c;
                        continue;
                    } else {
                        // Treat final trailing '-' as a literal
                        add(c, c);
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x5D /*']'*/ && !literal) {
                            patLocal.append(HYPHEN_RIGHT_BRACE);
                            mode = 2;
                            continue;
                        }
                    }
                }
                // syntaxError(chars, "'-' not after char or set");
                ec = U_MALFORMED_SET;
                return;
            case INTERSECTION /*'&'*/:
                if (lastItem == 2 && op == 0) {
                    op = (UChar) c;
                    continue;
                }
                // syntaxError(chars, "'&' not after set");
                ec = U_MALFORMED_SET;
                return;
            case 0x5E /*'^'*/:
                // syntaxError(chars, "'^' not after '['");
                ec = U_MALFORMED_SET;
                return;
            case 0x7B /*'{'*/:
                if (op != 0) {
                    // syntaxError(chars, "Missing operand after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                lastItem = 0;
                buf.truncate(0);
                {
                    // Collect everything up to the unescaped '}' into buf.
                    UBool ok = FALSE;
                    while (!chars.atEnd()) {
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == 0x7D /*'}'*/ && !literal) {
                            ok = TRUE;
                            break;
                        }
                        buf.append(c);
                    }
                    if (buf.length() < 1 || !ok) {
                        // syntaxError(chars, "Invalid multicharacter string");
                        ec = U_MALFORMED_SET;
                        return;
                    }
                }
                // We have new string. Add it to set and continue;
                // we don't need to drop through to the further
                // processing
                add(buf);
                patLocal.append((UChar) 0x7B /*'{'*/);
                _appendToPat(patLocal, buf, FALSE);
                patLocal.append((UChar) 0x7D /*'}'*/);
                continue;
            case SymbolTable::SYMBOL_REF:
                //         symbols  nosymbols
                // [a-$]   error    error (ambiguous)
                // [a$]    anchor   anchor
                // [a-$x]  var "x"* literal '$'
                // [a-$.]  error    literal '$'
                // *We won't get here in the case of var "x"
                {
                    chars.getPos(backup);
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    UBool anchor = (c == 0x5D /*']'*/ && !literal);
                    if (symbols == 0 && !anchor) {
                        c = SymbolTable::SYMBOL_REF;
                        chars.setPos(backup);
                        break; // literal '$'
                    }
                    if (anchor && op == 0) {
                        if (lastItem == 1) {
                            add(lastChar, lastChar);
                            _appendToPat(patLocal, lastChar, FALSE);
                        }
                        add(U_ETHER);
                        usePat = TRUE;
                        patLocal.append((UChar) SymbolTable::SYMBOL_REF);
                        patLocal.append((UChar) 0x5D /*']'*/);
                        mode = 2;
                        continue;
                    }
                    // syntaxError(chars, "Unquoted '$'");
                    ec = U_MALFORMED_SET;
                    return;
                }
            default:
                break;
            }
        }

        // -------- Parse literal characters.  This includes both
        // escaped chars ("\u4E01") and non-syntax characters
        // ("a").

        switch (lastItem) {
        case 0:
            lastItem = 1;
            lastChar = c;
            break;
        case 1:
            if (op == HYPHEN /*'-'*/) {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
                    // these are most likely typos.
                    // syntaxError(chars, "Invalid range");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, c);
                _appendToPat(patLocal, lastChar, FALSE);
                patLocal.append(op);
                _appendToPat(patLocal, c, FALSE);
                lastItem = 0;
                op = 0;
            } else {
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastChar = c;
            }
            break;
        case 2:
            if (op != 0) {
                // syntaxError(chars, "Set expected after operator");
                ec = U_MALFORMED_SET;
                return;
            }
            lastChar = c;
            lastItem = 1;
            break;
        }
    }

    if (mode != 2) {
        // syntaxError(chars, "Missing ']'");
        ec = U_MALFORMED_SET;
        return;
    }

    chars.skipIgnored(opts);

    /**
     * Handle global flags (invert, case insensitivity).  If this
     * pattern should be compiled case-insensitive, then we need
     * to close over case BEFORE COMPLEMENTING.  This makes
     * patterns like /[^abc]/i work.
     */
    if ((options & USET_CASE_INSENSITIVE) != 0) {
        closeOver(USET_CASE_INSENSITIVE);
    }
    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
        closeOver(USET_ADD_CASE_MAPPINGS);
    }
    if (invert) {
        complement();
    }

    // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
    // generated pattern.
    if (usePat) {
        rebuiltPat.append(patLocal);
    } else {
        _generatePattern(rebuiltPat, FALSE);
    }
    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
}

//----------------------------------------------------------------
// Property set implementation
//----------------------------------------------------------------

/**
 * Filter: TRUE if the numeric value of ch equals the double that
 * context points to.
 */
static UBool numericValueFilter(UChar32 ch, void* context) {
    return u_getNumericValue(ch) == *(double*)context;
}

/**
 * Filter: TRUE if the general-category mask of ch shares a bit with the
 * int32_t mask that context points to.
 */
static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
    int32_t value = *(int32_t*)context;
    return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
}

/**
 * Filter: TRUE if the age of ch compares greater than 0.0.0.0 and not
 * greater than the UVersionInfo that context points to (bytewise
 * comparison via uprv_memcmp).
 */
static UBool versionFilter(UChar32 ch, void* context) {
    UVersionInfo v, none = { 0, 0, 0, 0};
    UVersionInfo* version = (UVersionInfo*)context;
    u_charAge(ch, v);
    return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
}

/** Context for intPropertyFilter: the property and the value to match. */
typedef struct {
    UProperty prop;
    int32_t value;
} IntPropertyContext;

/**
 * Filter: TRUE if ch has the property value described by the
 * IntPropertyContext that context points to.
 */
static UBool intPropertyFilter(UChar32 ch, void* context) {
    IntPropertyContext* c = (IntPropertyContext*)context;
    return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
}


/**
 * Generic filter-based scanning code for UCD property UnicodeSets.
 * Clears this set and then adds every run of code points for which the
 * filter returns true.
 * @param filter  callback invoked as (*filter)(ch, context)
 * @param context opaque pointer passed through to the filter
 * @param src     source selector passed to getInclusions()
 * @param status  in/out error code; set to U_MEMORY_ALLOCATION_ERROR
 *                if this set is bogus afterwards
 */
void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
                             void* context,
                             int32_t src,
                             UErrorCode &status) {
    // Walk through all Unicode characters, noting the start
    // and end of each range for which filter.contain(c) is
    // true.  Add each range to a set.
    //
    // To improve performance, use the INCLUSIONS set, which
    // encodes information about character ranges that are known
    // to have identical properties. INCLUSIONS contains
    // only the first characters of such ranges.
    //
    // TODO Where possible, instead of scanning over code points,
    // use internal property data to initialize UnicodeSets for
    // those properties.  Scanning code points is slow.
    if (U_FAILURE(status)) return;

    const UnicodeSet* inclusions = getInclusions(src, status);
    if (U_FAILURE(status)) {
        return;
    }

    clear();

    // Start of the currently open run of matching code points, or -1.
    UChar32 startHasProperty = -1;
    int32_t limitRange = inclusions->getRangeCount();

    for (int j=0; j<limitRange; ++j) {
        // get current range
        UChar32 start = inclusions->getRangeStart(j);
        UChar32 end = inclusions->getRangeEnd(j);

        // for all the code points in the range, process
        for (UChar32 ch = start; ch <= end; ++ch) {
            // only add to this UnicodeSet on inflection points --
            // where the hasProperty value changes to false
            if ((*filter)(ch, context)) {
                if (startHasProperty < 0) {
                    startHasProperty = ch;
                }
            } else if (startHasProperty >= 0) {
                add(startHasProperty, ch-1);
                startHasProperty = -1;
            }
        }
    }
    // A run still open after the last inclusion extends to U+10FFFF.
    if (startHasProperty >= 0) {
        add((UChar32)startHasProperty, (UChar32)0x10FFFF);
    }
    if (isBogus() && U_SUCCESS(status)) {
        // We likely ran out of memory. AHHH!
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}

/**
 * Copies src to dst, dropping leading spaces, collapsing each run of
 * spaces to one space and removing a trailing space; dst is always
 * NUL-terminated on success.
 * @param dst         output buffer
 * @param src         NUL-terminated input
 * @param dstCapacity size of dst in chars, including the terminator
 * @return FALSE if the result does not fit into dst, else TRUE
 */
static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
    /* Note: we use ' ' in compiler code page */
    int32_t j = 0;
    char ch;
    --dstCapacity; /* make room for term. zero */
    while ((ch = *src++) != 0) {
        // Skip a space at the start or directly after another space.
        if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
            continue;
        }
        if (j >= dstCapacity) return FALSE;
        dst[j++] = ch;
    }
    if (j > 0 && dst[j-1] == ' ') --j;
    dst[j] = 0;
    return TRUE;
}

//----------------------------------------------------------------
// Property set API
//----------------------------------------------------------------

// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the
// enclosing member function.
#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}

/**
 * Replaces the contents of this set with all code points whose property
 * `prop` has the given value.  UCHAR_GENERAL_CATEGORY_MASK is matched as
 * a bit mask; every other property is matched by equality.
 * Does nothing if ec already indicates failure or the set is frozen.
 * @return a reference to this set
 */
UnicodeSet&
UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
    if (U_FAILURE(ec) || isFrozen()) return *this;

    if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
        applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
    } else {
        IntPropertyContext c = {prop, value};
        applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
    }
    return *this;
}

/**
 * Replaces the contents of this set with the code points selected by a
 * property alias and value alias.  With an empty value, `prop` is tried
 * as a General_Category value, then a Script value, then a binary
 * property, then one of the special names ANY, ASCII and Assigned.
 * Sets U_ILLEGAL_ARGUMENT_ERROR for unknown or unsupported input.
 * Does nothing if ec already indicates failure or the set is frozen.
 * @param prop  property alias (or value alias when `value` is empty)
 * @param value property value alias; may be empty
 * @param ec    in/out error code
 * @return a reference to this set
 */
UnicodeSet&
UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                               const UnicodeString& value,
                               UErrorCode& ec) {
    if (U_FAILURE(ec) || isFrozen()) return *this;

    // prop and value used to be converted to char * using the default
    // converter instead of the invariant conversion.
    // This should not be necessary because all Unicode property and value
    // names use only invariant characters.
    // If there are any variant characters, then we won't find them anyway.
    // Checking first avoids assertion failures in the conversion.
    if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
        !uprv_isInvariantUString(value.getBuffer(), value.length())
    ) {
        FAIL(ec);
    }
    CharString pname(prop);
    CharString vname(value);

    UProperty p;
    int32_t v;
    UBool mustNotBeEmpty = FALSE, invert = FALSE;

    if (value.length() > 0) {
        p = u_getPropertyEnum(pname);
        if (p == UCHAR_INVALID_CODE) FAIL(ec);

        // Treat gc as gcm
        if (p == UCHAR_GENERAL_CATEGORY) {
            p = UCHAR_GENERAL_CATEGORY_MASK;
        }

        if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
            (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
            (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
            v = u_getPropertyValueEnum(p, vname);
            if (v == UCHAR_INVALID_CODE) {
                // Handle numeric CCC
                if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
                    char* end;
                    // NOTE(review): this local `value` shadows the `value`
                    // parameter; and the cast below is applied before any
                    // range check, so a huge numeric string may not convert
                    // to int32_t in a defined way -- confirm.
                    double value = uprv_strtod(vname, &end);
                    v = (int32_t) value;
                    if (v != value || v < 0 || *end != 0) {
                        // non-integral or negative value, or trailing junk
                        FAIL(ec);
                    }
                    // If the resultant set is empty then the numeric value
                    // was invalid.
                    mustNotBeEmpty = TRUE;
                } else {
                    FAIL(ec);
                }
            }
        }

        else {

            switch (p) {
            case UCHAR_NUMERIC_VALUE:
                {
                    char* end;
                    double value = uprv_strtod(vname, &end);
                    if (*end != 0) {
                        FAIL(ec);
                    }
                    applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
                    return *this;
                }
                break;
            case UCHAR_NAME:
            case UCHAR_UNICODE_1_NAME:
                {
                    // Must munge name, since u_charFromName() does not do
                    // 'loose' matching.
                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
                    if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
                    UCharNameChoice choice = (p == UCHAR_NAME) ?
                        U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
                    UChar32 ch = u_charFromName(choice, buf, &ec);
                    if (U_SUCCESS(ec)) {
                        clear();
                        add(ch);
                        return *this;
                    } else {
                        FAIL(ec);
                    }
                }
                break;
            case UCHAR_AGE:
                {
                    // Must munge name, since u_versionFromString() does not do
                    // 'loose' matching.
                    char buf[128];
                    if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
                    UVersionInfo version;
                    u_versionFromString(version, buf);
                    applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
                    return *this;
                }
                break;
            default:
                // p is a non-binary, non-enumerated property that we
                // don't support (yet).
                FAIL(ec);
            }
        }
    }

    else {
        // value is empty.  Interpret as General Category, Script, or
        // Binary property.
        p = UCHAR_GENERAL_CATEGORY_MASK;
        v = u_getPropertyValueEnum(p, pname);
        if (v == UCHAR_INVALID_CODE) {
            p = UCHAR_SCRIPT;
            v = u_getPropertyValueEnum(p, pname);
            if (v == UCHAR_INVALID_CODE) {
                p = u_getPropertyEnum(pname);
                if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
                    v = 1;
                } else if (0 == uprv_comparePropertyNames(ANY, pname)) {
                    set(MIN_VALUE, MAX_VALUE);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
                    set(0, 0x7F);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
                    // [:Assigned:]=[:^Cn:]
                    p = UCHAR_GENERAL_CATEGORY_MASK;
                    v = U_GC_CN_MASK;
                    invert = TRUE;
                } else {
                    FAIL(ec);
                }
            }
        }
    }

    applyIntPropertyValue(p, v, ec);
    if(invert) {
        complement();
    }

    if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
        // mustNotBeEmpty is set to true if an empty set indicates
        // invalid input.
        ec = U_ILLEGAL_ARGUMENT_ERROR;
    }

    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
    return *this;
}

//----------------------------------------------------------------
// Property set patterns
//----------------------------------------------------------------

/**
 * Return true if the given position, in the given pattern, appears
 * to be the start of a property set pattern.
 */
UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
                                           int32_t pos) {
    // Patterns are at least 5 characters long
    if ((pos+5) > pattern.length()) {
        return FALSE;
    }

    // Look for an opening [:, [:^, \p, \P, or \N
    return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
}

/**
 * Return true if the given iterator appears to point at a
 * property pattern.  Regardless of the result, return with the
 * iterator unchanged.
 * @param chars iterator over the pattern characters.  Upon return
 * it will be unchanged.
 * @param iterOpts RuleCharacterIterator options
 */
UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
                                           int32_t iterOpts) {
    // NOTE: literal will always be FALSE, because we don't parse escapes.
    UBool result = FALSE, literal;
    UErrorCode ec = U_ZERO_ERROR;
    iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
    // Remember the position so it can be restored before returning.
    RuleCharacterIterator::Pos pos;
    chars.getPos(pos);
    UChar32 c = chars.next(iterOpts, literal, ec);
    if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
        // The second character is read without skipping white space.
        UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
                               literal, ec);
        result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
                 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
    }
    chars.setPos(pos);
    return result && U_SUCCESS(ec);
}

/**
 * Parse the given property pattern at the given parse position.
1243 */ 1244UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, 1245 ParsePosition& ppos, 1246 UErrorCode &ec) { 1247 int32_t pos = ppos.getIndex(); 1248 1249 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1250 UBool isName = FALSE; // true for \N{pat}, o/w false 1251 UBool invert = FALSE; 1252 1253 if (U_FAILURE(ec)) return *this; 1254 1255 // Minimum length is 5 characters, e.g. \p{L} 1256 if ((pos+5) > pattern.length()) { 1257 FAIL(ec); 1258 } 1259 1260 // On entry, ppos should point to one of the following locations: 1261 // Look for an opening [:, [:^, \p, or \P 1262 if (isPOSIXOpen(pattern, pos)) { 1263 posix = TRUE; 1264 pos += 2; 1265 pos = ICU_Utility::skipWhitespace(pattern, pos); 1266 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { 1267 ++pos; 1268 invert = TRUE; 1269 } 1270 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { 1271 UChar c = pattern.charAt(pos+1); 1272 invert = (c == UPPER_P); 1273 isName = (c == UPPER_N); 1274 pos += 2; 1275 pos = ICU_Utility::skipWhitespace(pattern, pos); 1276 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { 1277 // Syntax error; "\p" or "\P" not followed by "{" 1278 FAIL(ec); 1279 } 1280 } else { 1281 // Open delimiter not seen 1282 FAIL(ec); 1283 } 1284 1285 // Look for the matching close delimiter, either :] or } 1286 int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos); 1287 if (close < 0) { 1288 // Syntax error; close delimiter missing 1289 FAIL(ec); 1290 } 1291 1292 // Look for an '=' sign. If this is present, we will parse a 1293 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1294 // pattern. 
1295 int32_t equals = pattern.indexOf(EQUALS, pos); 1296 UnicodeString propName, valueName; 1297 if (equals >= 0 && equals < close && !isName) { 1298 // Equals seen; parse medium/long pattern 1299 pattern.extractBetween(pos, equals, propName); 1300 pattern.extractBetween(equals+1, close, valueName); 1301 } 1302 1303 else { 1304 // Handle case where no '=' is seen, and \N{} 1305 pattern.extractBetween(pos, close, propName); 1306 1307 // Handle \N{name} 1308 if (isName) { 1309 // This is a little inefficient since it means we have to 1310 // parse NAME_PROP back to UCHAR_NAME even though we already 1311 // know it's UCHAR_NAME. If we refactor the API to 1312 // support args of (UProperty, char*) then we can remove 1313 // NAME_PROP and make this a little more efficient. 1314 valueName = propName; 1315 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); 1316 } 1317 } 1318 1319 applyPropertyAlias(propName, valueName, ec); 1320 1321 if (U_SUCCESS(ec)) { 1322 if (invert) { 1323 complement(); 1324 } 1325 1326 // Move to the limit position after the close delimiter if the 1327 // parse succeeded. 1328 ppos.setIndex(close + (posix ? 2 : 1)); 1329 } 1330 1331 return *this; 1332} 1333 1334/** 1335 * Parse a property pattern. 1336 * @param chars iterator over the pattern characters. Upon return 1337 * it will be advanced to the first character after the parsed 1338 * pattern, or the end of the iteration if all characters are 1339 * parsed. 1340 * @param rebuiltPat the pattern that was parsed, rebuilt or 1341 * copied from the input pattern, as appropriate. 
1342 */ 1343void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, 1344 UnicodeString& rebuiltPat, 1345 UErrorCode& ec) { 1346 if (U_FAILURE(ec)) return; 1347 UnicodeString pattern; 1348 chars.lookahead(pattern); 1349 ParsePosition pos(0); 1350 applyPropertyPattern(pattern, pos, ec); 1351 if (U_FAILURE(ec)) return; 1352 if (pos.getIndex() == 0) { 1353 // syntaxError(chars, "Invalid property pattern"); 1354 ec = U_MALFORMED_SET; 1355 return; 1356 } 1357 chars.jumpahead(pos.getIndex()); 1358 rebuiltPat.append(pattern, 0, pos.getIndex()); 1359} 1360 1361//---------------------------------------------------------------- 1362// Case folding API 1363//---------------------------------------------------------------- 1364 1365// add the result of a full case mapping to the set 1366// use str as a temporary string to avoid constructing one 1367static inline void 1368addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { 1369 if(result >= 0) { 1370 if(result > UCASE_MAX_STRING_LENGTH) { 1371 // add a single-code point case mapping 1372 set.add(result); 1373 } else { 1374 // add a string case mapping from full with length result 1375 str.setTo((UBool)FALSE, full, result); 1376 set.add(str); 1377 } 1378 } 1379 // result < 0: the code point mapped to itself, no need to add it 1380 // see ucase.h 1381} 1382 1383UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { 1384 if (isFrozen() || isBogus()) { 1385 return *this; 1386 } 1387 if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { 1388 UErrorCode status = U_ZERO_ERROR; 1389 const UCaseProps *csp = ucase_getSingleton(&status); 1390 if (U_SUCCESS(status)) { 1391 UnicodeSet foldSet(*this); 1392 UnicodeString str; 1393 USetAdder sa = { 1394 (USet *)&foldSet, 1395 _set_add, 1396 _set_addRange, 1397 _set_addString, 1398 NULL, // don't need remove() 1399 NULL // don't need removeRange() 1400 }; 1401 1402 // start with input set to guarantee inclusion 1403 // USET_CASE: 
remove strings because the strings will actually be reduced (folded); 1404 // therefore, start with no strings and add only those needed 1405 if (attribute & USET_CASE_INSENSITIVE) { 1406 foldSet.strings->removeAllElements(); 1407 } 1408 1409 int32_t n = getRangeCount(); 1410 UChar32 result; 1411 const UChar *full; 1412 int32_t locCache = 0; 1413 1414 for (int32_t i=0; i<n; ++i) { 1415 UChar32 start = getRangeStart(i); 1416 UChar32 end = getRangeEnd(i); 1417 1418 if (attribute & USET_CASE_INSENSITIVE) { 1419 // full case closure 1420 for (UChar32 cp=start; cp<=end; ++cp) { 1421 ucase_addCaseClosure(csp, cp, &sa); 1422 } 1423 } else { 1424 // add case mappings 1425 // (does not add long s for regular s, or Kelvin for k, for example) 1426 for (UChar32 cp=start; cp<=end; ++cp) { 1427 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache); 1428 addCaseMapping(foldSet, result, full, str); 1429 1430 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache); 1431 addCaseMapping(foldSet, result, full, str); 1432 1433 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache); 1434 addCaseMapping(foldSet, result, full, str); 1435 1436 result = ucase_toFullFolding(csp, cp, &full, 0); 1437 addCaseMapping(foldSet, result, full, str); 1438 } 1439 } 1440 } 1441 if (strings != NULL && strings->size() > 0) { 1442 if (attribute & USET_CASE_INSENSITIVE) { 1443 for (int32_t j=0; j<strings->size(); ++j) { 1444 str = *(const UnicodeString *) strings->elementAt(j); 1445 str.foldCase(); 1446 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { 1447 foldSet.add(str); // does not map to code points: add the folded string itself 1448 } 1449 } 1450 } else { 1451 Locale root(""); 1452#if !UCONFIG_NO_BREAK_ITERATION 1453 BreakIterator *bi = BreakIterator::createWordInstance(root, status); 1454#endif 1455 if (U_SUCCESS(status)) { 1456 const UnicodeString *pStr; 1457 1458 for (int32_t j=0; j<strings->size(); ++j) { 1459 pStr = (const 
UnicodeString *) strings->elementAt(j); 1460 (str = *pStr).toLower(root); 1461 foldSet.add(str); 1462#if !UCONFIG_NO_BREAK_ITERATION 1463 (str = *pStr).toTitle(bi, root); 1464 foldSet.add(str); 1465#endif 1466 (str = *pStr).toUpper(root); 1467 foldSet.add(str); 1468 (str = *pStr).foldCase(); 1469 foldSet.add(str); 1470 } 1471 } 1472#if !UCONFIG_NO_BREAK_ITERATION 1473 delete bi; 1474#endif 1475 } 1476 } 1477 *this = foldSet; 1478 } 1479 } 1480 return *this; 1481} 1482 1483U_NAMESPACE_END 1484