1/* 2******************************************************************************* 3* 4* Copyright (C) 1999-2011, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: uniset_props.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2004aug25 14* created by: Markus W. Scherer 15* 16* Character property dependent functions moved here from uniset.cpp 17*/ 18 19#include "unicode/utypes.h" 20#include "unicode/uniset.h" 21#include "unicode/parsepos.h" 22#include "unicode/uchar.h" 23#include "unicode/uscript.h" 24#include "unicode/symtable.h" 25#include "unicode/uset.h" 26#include "unicode/locid.h" 27#include "unicode/brkiter.h" 28#include "uset_imp.h" 29#include "ruleiter.h" 30#include "cmemory.h" 31#include "ucln_cmn.h" 32#include "util.h" 33#include "uvector.h" 34#include "uprops.h" 35#include "propname.h" 36#include "normalizer2impl.h" 37#include "ucase.h" 38#include "ubidi_props.h" 39#include "uinvchar.h" 40#include "uprops.h" 41#include "charstr.h" 42#include "cstring.h" 43#include "mutex.h" 44#include "umutex.h" 45#include "uassert.h" 46#include "hash.h" 47 48U_NAMESPACE_USE 49 50#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 51 52// initial storage. Must be >= 0 53// *** same as in uniset.cpp ! *** 54#define START_EXTRA 16 55 56// Define UChar constants using hex for EBCDIC compatibility 57// Used #define to reduce private static exports and memory access time. 58#define SET_OPEN ((UChar)0x005B) /*[*/ 59#define SET_CLOSE ((UChar)0x005D) /*]*/ 60#define HYPHEN ((UChar)0x002D) /*-*/ 61#define COMPLEMENT ((UChar)0x005E) /*^*/ 62#define COLON ((UChar)0x003A) /*:*/ 63#define BACKSLASH ((UChar)0x005C) /*\*/ 64#define INTERSECTION ((UChar)0x0026) /*&*/ 65#define UPPER_U ((UChar)0x0055) /*U*/ 66#define LOWER_U ((UChar)0x0075) /*u*/ 67#define OPEN_BRACE ((UChar)123) /*{*/ 68#define CLOSE_BRACE ((UChar)125) /*}*/ 69#define UPPER_P ((UChar)0x0050) /*P*/ 70#define LOWER_P ((UChar)0x0070) /*p*/ 71#define UPPER_N ((UChar)78) /*N*/ 72#define EQUALS ((UChar)0x003D) /*=*/ 73 74//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" 75static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" 76//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" 77static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" 78//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" 79static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ 80 81// Special property set IDs 82static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] 83static const char ASCII[] = "ASCII"; // [\u0000-\u007F] 84static const char ASSIGNED[] = "Assigned"; // [:^Cn:] 85 86// Unicode name property alias 87#define NAME_PROP "na" 88#define NAME_PROP_LENGTH 2 89 90/** 91 * Delimiter string used in patterns to close a category reference: 92 * ":]". Example: "[:Lu:]". 93 */ 94//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ 95 96// Cached sets ------------------------------------------------------------- *** 97 98U_CDECL_BEGIN 99static UBool U_CALLCONV uset_cleanup(); 100U_CDECL_END 101 102// Not a TriStateSingletonWrapper because we think the UnicodeSet constructor 103// can only fail with an out-of-memory error 104// if we have a correct pattern and the properties data is hardcoded and always available. 105class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> { 106public: 107 UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) : 108 SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {} 109 UnicodeSet *getInstance(UErrorCode &errorCode) { 110 return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode); 111 } 112private: 113 static void *createInstance(const void *context, UErrorCode &errorCode) { 114 UnicodeString pattern((const char *)context, -1, US_INV); 115 UnicodeSet *set=new UnicodeSet(pattern, errorCode); 116 if(set==NULL) { 117 errorCode=U_MEMORY_ALLOCATION_ERROR; 118 return NULL; 119 } 120 set->freeze(); 121 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 122 return set; 123 } 124 125 const char *fPattern; 126}; 127 128U_CDECL_BEGIN 129 130static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions() 131 132STATIC_SIMPLE_SINGLETON(uni32Singleton); 133 134//---------------------------------------------------------------- 135// Inclusions list 136//---------------------------------------------------------------- 137 138// USetAdder implementation 139// Does not use uset.h to reduce code dependencies 140static void U_CALLCONV 141_set_add(USet *set, UChar32 c) { 142 ((UnicodeSet *)set)->add(c); 143} 144 145static void U_CALLCONV 146_set_addRange(USet *set, UChar32 start, UChar32 end) { 147 ((UnicodeSet *)set)->add(start, end); 148} 149 150static void U_CALLCONV 151_set_addString(USet *set, const UChar *str, int32_t length) { 152 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 153} 154 155/** 156 * Cleanup function for UnicodeSet 157 */ 158static UBool U_CALLCONV uset_cleanup(void) { 159 int32_t i; 160 161 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { 162 if (INCLUSIONS[i] != NULL) { 163 delete INCLUSIONS[i]; 164 INCLUSIONS[i] = NULL; 165 } 166 } 167 UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance(); 168 return TRUE; 169} 170 171U_CDECL_END 172 173U_NAMESPACE_BEGIN 174 175/* 176Reduce excessive reallocation, and make it easier to detect initialization 177problems. 178Usually you don't see smaller sets than this for Unicode 5.0. 179*/ 180#define DEFAULT_INCLUSION_CAPACITY 3072 181 182const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { 183 UBool needInit; 184 UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit); 185 if (needInit) { 186 UnicodeSet* incl = new UnicodeSet(); 187 USetAdder sa = { 188 (USet *)incl, 189 _set_add, 190 _set_addRange, 191 _set_addString, 192 NULL, // don't need remove() 193 NULL // don't need removeRange() 194 }; 195 if (incl != NULL) { 196 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); 197 switch(src) { 198 case UPROPS_SRC_CHAR: 199 uchar_addPropertyStarts(&sa, &status); 200 break; 201 case UPROPS_SRC_PROPSVEC: 202 upropsvec_addPropertyStarts(&sa, &status); 203 break; 204 case UPROPS_SRC_CHAR_AND_PROPSVEC: 205 uchar_addPropertyStarts(&sa, &status); 206 upropsvec_addPropertyStarts(&sa, &status); 207 break; 208#if !UCONFIG_NO_NORMALIZATION 209 case UPROPS_SRC_CASE_AND_NORM: { 210 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 211 if(U_SUCCESS(status)) { 212 impl->addPropertyStarts(&sa, status); 213 } 214 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 215 break; 216 } 217 case UPROPS_SRC_NFC: { 218 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 219 if(U_SUCCESS(status)) { 220 impl->addPropertyStarts(&sa, status); 221 } 222 break; 223 } 224 case UPROPS_SRC_NFKC: { 225 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status); 226 if(U_SUCCESS(status)) { 227 impl->addPropertyStarts(&sa, status); 228 } 229 break; 230 } 231 case UPROPS_SRC_NFKC_CF: { 232 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status); 233 if(U_SUCCESS(status)) { 234 impl->addPropertyStarts(&sa, status); 235 } 236 break; 237 } 238 case UPROPS_SRC_NFC_CANON_ITER: { 239 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status); 240 if(U_SUCCESS(status)) { 241 impl->addCanonIterPropertyStarts(&sa, status); 242 } 243 break; 244 } 245#endif 246 case UPROPS_SRC_CASE: 247 ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status); 248 break; 249 case UPROPS_SRC_BIDI: 250 ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status); 251 break; 252 default: 253 status = U_INTERNAL_PROGRAM_ERROR; 254 break; 255 } 256 if (U_SUCCESS(status)) { 257 // Compact for caching 258 incl->compact(); 259 umtx_lock(NULL); 260 if (INCLUSIONS[src] == NULL) { 261 INCLUSIONS[src] = incl; 262 incl = NULL; 263 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 264 } 265 umtx_unlock(NULL); 266 } 267 delete incl; 268 } else { 269 status = U_MEMORY_ALLOCATION_ERROR; 270 } 271 } 272 return INCLUSIONS[src]; 273} 274 275// Cache some sets for other services -------------------------------------- *** 276 277U_CFUNC UnicodeSet * 278uniset_getUnicode32Instance(UErrorCode &errorCode) { 279 return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode); 280} 281 282// helper functions for matching of pattern syntax pieces ------------------ *** 283// these functions are parallel to the PERL_OPEN etc. strings above 284 285// using these functions is not only faster than UnicodeString::compare() and 286// caseCompare(), but they also make UnicodeSet work for simple patterns when 287// no Unicode properties data is available - when caseCompare() fails 288 289static inline UBool 290isPerlOpen(const UnicodeString &pattern, int32_t pos) { 291 UChar c; 292 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); 293} 294 295/*static inline UBool 296isPerlClose(const UnicodeString &pattern, int32_t pos) { 297 return pattern.charAt(pos)==CLOSE_BRACE; 298}*/ 299 300static inline UBool 301isNameOpen(const UnicodeString &pattern, int32_t pos) { 302 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; 303} 304 305static inline UBool 306isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { 307 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; 308} 309 310/*static inline UBool 311isPOSIXClose(const UnicodeString &pattern, int32_t pos) { 312 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; 313}*/ 314 315// TODO memory debugging provided inside uniset.cpp 316// could be made available here but probably obsolete with use of modern 317// memory leak checker tools 318#define _dbgct(me) 319 320//---------------------------------------------------------------- 321// Constructors &c 322//---------------------------------------------------------------- 323 324/** 325 * Constructs a set from the given pattern, optionally ignoring 326 * white space. See the class description for the syntax of the 327 * pattern language. 328 * @param pattern a string specifying what characters are in the set 329 */ 330UnicodeSet::UnicodeSet(const UnicodeString& pattern, 331 UErrorCode& status) : 332 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 333 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 334 fFlags(0) 335{ 336 if(U_SUCCESS(status)){ 337 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 338 /* test for NULL */ 339 if(list == NULL) { 340 status = U_MEMORY_ALLOCATION_ERROR; 341 }else{ 342 allocateStrings(status); 343 applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 344 } 345 } 346 _dbgct(this); 347} 348 349/** 350 * Constructs a set from the given pattern, optionally ignoring 351 * white space. See the class description for the syntax of the 352 * pattern language. 353 * @param pattern a string specifying what characters are in the set 354 * @param options bitmask for options to apply to the pattern. 355 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 356 */ 357UnicodeSet::UnicodeSet(const UnicodeString& pattern, 358 uint32_t options, 359 const SymbolTable* symbols, 360 UErrorCode& status) : 361 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 362 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 363 fFlags(0) 364{ 365 if(U_SUCCESS(status)){ 366 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 367 /* test for NULL */ 368 if(list == NULL) { 369 status = U_MEMORY_ALLOCATION_ERROR; 370 }else{ 371 allocateStrings(status); 372 applyPattern(pattern, options, symbols, status); 373 } 374 } 375 _dbgct(this); 376} 377 378UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 379 uint32_t options, 380 const SymbolTable* symbols, 381 UErrorCode& status) : 382 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 383 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 384 fFlags(0) 385{ 386 if(U_SUCCESS(status)){ 387 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 388 /* test for NULL */ 389 if(list == NULL) { 390 status = U_MEMORY_ALLOCATION_ERROR; 391 }else{ 392 allocateStrings(status); 393 applyPattern(pattern, pos, options, symbols, status); 394 } 395 } 396 _dbgct(this); 397} 398 399//---------------------------------------------------------------- 400// Public API 401//---------------------------------------------------------------- 402 403UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 404 UErrorCode& status) { 405 return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 406} 407 408 409/** 410 * Modifies this set to represent the set specified by the given 411 * pattern, optionally ignoring white space. See the class 412 * description for the syntax of the pattern language. 413 * @param pattern a string specifying what characters are in the set 414 * @param options bitmask for options to apply to the pattern. 415 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 416 */ 417UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 418 uint32_t options, 419 const SymbolTable* symbols, 420 UErrorCode& status) { 421 if (U_FAILURE(status) || isFrozen()) { 422 return *this; 423 } 424 425 ParsePosition pos(0); 426 applyPattern(pattern, pos, options, symbols, status); 427 if (U_FAILURE(status)) return *this; 428 429 int32_t i = pos.getIndex(); 430 431 if (options & USET_IGNORE_SPACE) { 432 // Skip over trailing whitespace 433 ICU_Utility::skipWhitespace(pattern, i, TRUE); 434 } 435 436 if (i != pattern.length()) { 437 status = U_ILLEGAL_ARGUMENT_ERROR; 438 } 439 return *this; 440} 441 442UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 443 ParsePosition& pos, 444 uint32_t options, 445 const SymbolTable* symbols, 446 UErrorCode& status) { 447 if (U_FAILURE(status) || isFrozen()) { 448 return *this; 449 } 450 // Need to build the pattern in a temporary string because 451 // _applyPattern calls add() etc., which set pat to empty. 452 UnicodeString rebuiltPat; 453 RuleCharacterIterator chars(pattern, symbols, pos); 454 applyPattern(chars, symbols, rebuiltPat, options, status); 455 if (U_FAILURE(status)) return *this; 456 if (chars.inVariable()) { 457 // syntaxError(chars, "Extra chars in variable value"); 458 status = U_MALFORMED_SET; 459 return *this; 460 } 461 setPattern(rebuiltPat); 462 return *this; 463} 464 465/** 466 * Return true if the given position, in the given pattern, appears 467 * to be the start of a UnicodeSet pattern. 468 */ 469UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { 470 return ((pos+1) < pattern.length() && 471 pattern.charAt(pos) == (UChar)91/*[*/) || 472 resemblesPropertyPattern(pattern, pos); 473} 474 475//---------------------------------------------------------------- 476// Implementation: Pattern parsing 477//---------------------------------------------------------------- 478 479/** 480 * A small all-inline class to manage a UnicodeSet pointer. Add 481 * operator->() etc. as needed. 482 */ 483class UnicodeSetPointer { 484 UnicodeSet* p; 485public: 486 inline UnicodeSetPointer() : p(0) {} 487 inline ~UnicodeSetPointer() { delete p; } 488 inline UnicodeSet* pointer() { return p; } 489 inline UBool allocate() { 490 if (p == 0) { 491 p = new UnicodeSet(); 492 } 493 return p != 0; 494 } 495}; 496 497/** 498 * Parse the pattern from the given RuleCharacterIterator. The 499 * iterator is advanced over the parsed pattern. 500 * @param chars iterator over the pattern characters. Upon return 501 * it will be advanced to the first character after the parsed 502 * pattern, or the end of the iteration if all characters are 503 * parsed. 504 * @param symbols symbol table to use to parse and dereference 505 * variables, or null if none. 506 * @param rebuiltPat the pattern that was parsed, rebuilt or 507 * copied from the input pattern, as appropriate. 508 * @param options a bit mask of zero or more of the following: 509 * IGNORE_SPACE, CASE. 510 */ 511void UnicodeSet::applyPattern(RuleCharacterIterator& chars, 512 const SymbolTable* symbols, 513 UnicodeString& rebuiltPat, 514 uint32_t options, 515 UErrorCode& ec) { 516 if (U_FAILURE(ec)) return; 517 518 // Syntax characters: [ ] ^ - & { } 519 520 // Recognized special forms for chars, sets: c-c s-s s&s 521 522 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | 523 RuleCharacterIterator::PARSE_ESCAPES; 524 if ((options & USET_IGNORE_SPACE) != 0) { 525 opts |= RuleCharacterIterator::SKIP_WHITESPACE; 526 } 527 528 UnicodeString patLocal, buf; 529 UBool usePat = FALSE; 530 UnicodeSetPointer scratch; 531 RuleCharacterIterator::Pos backup; 532 533 // mode: 0=before [, 1=between [...], 2=after ] 534 // lastItem: 0=none, 1=char, 2=set 535 int8_t lastItem = 0, mode = 0; 536 UChar32 lastChar = 0; 537 UChar op = 0; 538 539 UBool invert = FALSE; 540 541 clear(); 542 543 while (mode != 2 && !chars.atEnd()) { 544 U_ASSERT((lastItem == 0 && op == 0) || 545 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || 546 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || 547 op == INTERSECTION /*'&'*/))); 548 549 UChar32 c = 0; 550 UBool literal = FALSE; 551 UnicodeSet* nested = 0; // alias - do not delete 552 553 // -------- Check for property pattern 554 555 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 556 int8_t setMode = 0; 557 if (resemblesPropertyPattern(chars, opts)) { 558 setMode = 2; 559 } 560 561 // -------- Parse '[' of opening delimiter OR nested set. 562 // If there is a nested set, use `setMode' to define how 563 // the set should be parsed. If the '[' is part of the 564 // opening delimiter for this pattern, parse special 565 // strings "[", "[^", "[-", and "[^-". Check for stand-in 566 // characters representing a nested set in the symbol 567 // table. 568 569 else { 570 // Prepare to backup if necessary 571 chars.getPos(backup); 572 c = chars.next(opts, literal, ec); 573 if (U_FAILURE(ec)) return; 574 575 if (c == 0x5B /*'['*/ && !literal) { 576 if (mode == 1) { 577 chars.setPos(backup); // backup 578 setMode = 1; 579 } else { 580 // Handle opening '[' delimiter 581 mode = 1; 582 patLocal.append((UChar) 0x5B /*'['*/); 583 chars.getPos(backup); // prepare to backup 584 c = chars.next(opts, literal, ec); 585 if (U_FAILURE(ec)) return; 586 if (c == 0x5E /*'^'*/ && !literal) { 587 invert = TRUE; 588 patLocal.append((UChar) 0x5E /*'^'*/); 589 chars.getPos(backup); // prepare to backup 590 c = chars.next(opts, literal, ec); 591 if (U_FAILURE(ec)) return; 592 } 593 // Fall through to handle special leading '-'; 594 // otherwise restart loop for nested [], \p{}, etc. 595 if (c == HYPHEN /*'-'*/) { 596 literal = TRUE; 597 // Fall through to handle literal '-' below 598 } else { 599 chars.setPos(backup); // backup 600 continue; 601 } 602 } 603 } else if (symbols != 0) { 604 const UnicodeFunctor *m = symbols->lookupMatcher(c); 605 if (m != 0) { 606 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m); 607 if (ms == NULL) { 608 ec = U_MALFORMED_SET; 609 return; 610 } 611 // casting away const, but `nested' won't be modified 612 // (important not to modify stored set) 613 nested = const_cast<UnicodeSet*>(ms); 614 setMode = 3; 615 } 616 } 617 } 618 619 // -------- Handle a nested set. This either is inline in 620 // the pattern or represented by a stand-in that has 621 // previously been parsed and was looked up in the symbol 622 // table. 623 624 if (setMode != 0) { 625 if (lastItem == 1) { 626 if (op != 0) { 627 // syntaxError(chars, "Char expected after operator"); 628 ec = U_MALFORMED_SET; 629 return; 630 } 631 add(lastChar, lastChar); 632 _appendToPat(patLocal, lastChar, FALSE); 633 lastItem = 0; 634 op = 0; 635 } 636 637 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { 638 patLocal.append(op); 639 } 640 641 if (nested == 0) { 642 // lazy allocation 643 if (!scratch.allocate()) { 644 ec = U_MEMORY_ALLOCATION_ERROR; 645 return; 646 } 647 nested = scratch.pointer(); 648 } 649 switch (setMode) { 650 case 1: 651 nested->applyPattern(chars, symbols, patLocal, options, ec); 652 break; 653 case 2: 654 chars.skipIgnored(opts); 655 nested->applyPropertyPattern(chars, patLocal, ec); 656 if (U_FAILURE(ec)) return; 657 break; 658 case 3: // `nested' already parsed 659 nested->_toPattern(patLocal, FALSE); 660 break; 661 } 662 663 usePat = TRUE; 664 665 if (mode == 0) { 666 // Entire pattern is a category; leave parse loop 667 *this = *nested; 668 mode = 2; 669 break; 670 } 671 672 switch (op) { 673 case HYPHEN: /*'-'*/ 674 removeAll(*nested); 675 break; 676 case INTERSECTION: /*'&'*/ 677 retainAll(*nested); 678 break; 679 case 0: 680 addAll(*nested); 681 break; 682 } 683 684 op = 0; 685 lastItem = 2; 686 687 continue; 688 } 689 690 if (mode == 0) { 691 // syntaxError(chars, "Missing '['"); 692 ec = U_MALFORMED_SET; 693 return; 694 } 695 696 // -------- Parse special (syntax) characters. If the 697 // current character is not special, or if it is escaped, 698 // then fall through and handle it below. 699 700 if (!literal) { 701 switch (c) { 702 case 0x5D /*']'*/: 703 if (lastItem == 1) { 704 add(lastChar, lastChar); 705 _appendToPat(patLocal, lastChar, FALSE); 706 } 707 // Treat final trailing '-' as a literal 708 if (op == HYPHEN /*'-'*/) { 709 add(op, op); 710 patLocal.append(op); 711 } else if (op == INTERSECTION /*'&'*/) { 712 // syntaxError(chars, "Trailing '&'"); 713 ec = U_MALFORMED_SET; 714 return; 715 } 716 patLocal.append((UChar) 0x5D /*']'*/); 717 mode = 2; 718 continue; 719 case HYPHEN /*'-'*/: 720 if (op == 0) { 721 if (lastItem != 0) { 722 op = (UChar) c; 723 continue; 724 } else { 725 // Treat final trailing '-' as a literal 726 add(c, c); 727 c = chars.next(opts, literal, ec); 728 if (U_FAILURE(ec)) return; 729 if (c == 0x5D /*']'*/ && !literal) { 730 patLocal.append(HYPHEN_RIGHT_BRACE); 731 mode = 2; 732 continue; 733 } 734 } 735 } 736 // syntaxError(chars, "'-' not after char or set"); 737 ec = U_MALFORMED_SET; 738 return; 739 case INTERSECTION /*'&'*/: 740 if (lastItem == 2 && op == 0) { 741 op = (UChar) c; 742 continue; 743 } 744 // syntaxError(chars, "'&' not after set"); 745 ec = U_MALFORMED_SET; 746 return; 747 case 0x5E /*'^'*/: 748 // syntaxError(chars, "'^' not after '['"); 749 ec = U_MALFORMED_SET; 750 return; 751 case 0x7B /*'{'*/: 752 if (op != 0) { 753 // syntaxError(chars, "Missing operand after operator"); 754 ec = U_MALFORMED_SET; 755 return; 756 } 757 if (lastItem == 1) { 758 add(lastChar, lastChar); 759 _appendToPat(patLocal, lastChar, FALSE); 760 } 761 lastItem = 0; 762 buf.truncate(0); 763 { 764 UBool ok = FALSE; 765 while (!chars.atEnd()) { 766 c = chars.next(opts, literal, ec); 767 if (U_FAILURE(ec)) return; 768 if (c == 0x7D /*'}'*/ && !literal) { 769 ok = TRUE; 770 break; 771 } 772 buf.append(c); 773 } 774 if (buf.length() < 1 || !ok) { 775 // syntaxError(chars, "Invalid multicharacter string"); 776 ec = U_MALFORMED_SET; 777 return; 778 } 779 } 780 // We have new string. Add it to set and continue; 781 // we don't need to drop through to the further 782 // processing 783 add(buf); 784 patLocal.append((UChar) 0x7B /*'{'*/); 785 _appendToPat(patLocal, buf, FALSE); 786 patLocal.append((UChar) 0x7D /*'}'*/); 787 continue; 788 case SymbolTable::SYMBOL_REF: 789 // symbols nosymbols 790 // [a-$] error error (ambiguous) 791 // [a$] anchor anchor 792 // [a-$x] var "x"* literal '$' 793 // [a-$.] error literal '$' 794 // *We won't get here in the case of var "x" 795 { 796 chars.getPos(backup); 797 c = chars.next(opts, literal, ec); 798 if (U_FAILURE(ec)) return; 799 UBool anchor = (c == 0x5D /*']'*/ && !literal); 800 if (symbols == 0 && !anchor) { 801 c = SymbolTable::SYMBOL_REF; 802 chars.setPos(backup); 803 break; // literal '$' 804 } 805 if (anchor && op == 0) { 806 if (lastItem == 1) { 807 add(lastChar, lastChar); 808 _appendToPat(patLocal, lastChar, FALSE); 809 } 810 add(U_ETHER); 811 usePat = TRUE; 812 patLocal.append((UChar) SymbolTable::SYMBOL_REF); 813 patLocal.append((UChar) 0x5D /*']'*/); 814 mode = 2; 815 continue; 816 } 817 // syntaxError(chars, "Unquoted '$'"); 818 ec = U_MALFORMED_SET; 819 return; 820 } 821 default: 822 break; 823 } 824 } 825 826 // -------- Parse literal characters. This includes both 827 // escaped chars ("\u4E01") and non-syntax characters 828 // ("a"). 829 830 switch (lastItem) { 831 case 0: 832 lastItem = 1; 833 lastChar = c; 834 break; 835 case 1: 836 if (op == HYPHEN /*'-'*/) { 837 if (lastChar >= c) { 838 // Don't allow redundant (a-a) or empty (b-a) ranges; 839 // these are most likely typos. 840 // syntaxError(chars, "Invalid range"); 841 ec = U_MALFORMED_SET; 842 return; 843 } 844 add(lastChar, c); 845 _appendToPat(patLocal, lastChar, FALSE); 846 patLocal.append(op); 847 _appendToPat(patLocal, c, FALSE); 848 lastItem = 0; 849 op = 0; 850 } else { 851 add(lastChar, lastChar); 852 _appendToPat(patLocal, lastChar, FALSE); 853 lastChar = c; 854 } 855 break; 856 case 2: 857 if (op != 0) { 858 // syntaxError(chars, "Set expected after operator"); 859 ec = U_MALFORMED_SET; 860 return; 861 } 862 lastChar = c; 863 lastItem = 1; 864 break; 865 } 866 } 867 868 if (mode != 2) { 869 // syntaxError(chars, "Missing ']'"); 870 ec = U_MALFORMED_SET; 871 return; 872 } 873 874 chars.skipIgnored(opts); 875 876 /** 877 * Handle global flags (invert, case insensitivity). If this 878 * pattern should be compiled case-insensitive, then we need 879 * to close over case BEFORE COMPLEMENTING. This makes 880 * patterns like /[^abc]/i work. 881 */ 882 if ((options & USET_CASE_INSENSITIVE) != 0) { 883 closeOver(USET_CASE_INSENSITIVE); 884 } 885 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { 886 closeOver(USET_ADD_CASE_MAPPINGS); 887 } 888 if (invert) { 889 complement(); 890 } 891 892 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the 893 // generated pattern. 894 if (usePat) { 895 rebuiltPat.append(patLocal); 896 } else { 897 _generatePattern(rebuiltPat, FALSE); 898 } 899 if (isBogus() && U_SUCCESS(ec)) { 900 // We likely ran out of memory. AHHH! 901 ec = U_MEMORY_ALLOCATION_ERROR; 902 } 903} 904 905//---------------------------------------------------------------- 906// Property set implementation 907//---------------------------------------------------------------- 908 909static UBool numericValueFilter(UChar32 ch, void* context) { 910 return u_getNumericValue(ch) == *(double*)context; 911} 912 913static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { 914 int32_t value = *(int32_t*)context; 915 return (U_GET_GC_MASK((UChar32) ch) & value) != 0; 916} 917 918static UBool versionFilter(UChar32 ch, void* context) { 919 static const UVersionInfo none = { 0, 0, 0, 0 }; 920 UVersionInfo v; 921 u_charAge(ch, v); 922 UVersionInfo* version = (UVersionInfo*)context; 923 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; 924} 925 926typedef struct { 927 UProperty prop; 928 int32_t value; 929} IntPropertyContext; 930 931static UBool intPropertyFilter(UChar32 ch, void* context) { 932 IntPropertyContext* c = (IntPropertyContext*)context; 933 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; 934} 935 936static UBool scriptExtensionsFilter(UChar32 ch, void* context) { 937 return uscript_hasScript(ch, *(UScriptCode*)context); 938} 939 940/** 941 * Generic filter-based scanning code for UCD property UnicodeSets. 942 */ 943void UnicodeSet::applyFilter(UnicodeSet::Filter filter, 944 void* context, 945 int32_t src, 946 UErrorCode &status) { 947 if (U_FAILURE(status)) return; 948 949 // Logically, walk through all Unicode characters, noting the start 950 // and end of each range for which filter.contain(c) is 951 // true. Add each range to a set. 952 // 953 // To improve performance, use an inclusions set which 954 // encodes information about character ranges that are known 955 // to have identical properties. 956 // getInclusions(src) contains exactly the first characters of 957 // same-value ranges for the given properties "source". 958 const UnicodeSet* inclusions = getInclusions(src, status); 959 if (U_FAILURE(status)) { 960 return; 961 } 962 963 clear(); 964 965 UChar32 startHasProperty = -1; 966 int32_t limitRange = inclusions->getRangeCount(); 967 968 for (int j=0; j<limitRange; ++j) { 969 // get current range 970 UChar32 start = inclusions->getRangeStart(j); 971 UChar32 end = inclusions->getRangeEnd(j); 972 973 // for all the code points in the range, process 974 for (UChar32 ch = start; ch <= end; ++ch) { 975 // only add to this UnicodeSet on inflection points -- 976 // where the hasProperty value changes to false 977 if ((*filter)(ch, context)) { 978 if (startHasProperty < 0) { 979 startHasProperty = ch; 980 } 981 } else if (startHasProperty >= 0) { 982 add(startHasProperty, ch-1); 983 startHasProperty = -1; 984 } 985 } 986 } 987 if (startHasProperty >= 0) { 988 add((UChar32)startHasProperty, (UChar32)0x10FFFF); 989 } 990 if (isBogus() && U_SUCCESS(status)) { 991 // We likely ran out of memory. AHHH! 992 status = U_MEMORY_ALLOCATION_ERROR; 993 } 994} 995 996static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { 997 /* Note: we use ' ' in compiler code page */ 998 int32_t j = 0; 999 char ch; 1000 --dstCapacity; /* make room for term. zero */ 1001 while ((ch = *src++) != 0) { 1002 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { 1003 continue; 1004 } 1005 if (j >= dstCapacity) return FALSE; 1006 dst[j++] = ch; 1007 } 1008 if (j > 0 && dst[j-1] == ' ') --j; 1009 dst[j] = 0; 1010 return TRUE; 1011} 1012 1013//---------------------------------------------------------------- 1014// Property set API 1015//---------------------------------------------------------------- 1016 1017#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} 1018 1019UnicodeSet& 1020UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { 1021 if (U_FAILURE(ec) || isFrozen()) return *this; 1022 1023 if (prop == UCHAR_GENERAL_CATEGORY_MASK) { 1024 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); 1025 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { 1026 UScriptCode script = (UScriptCode)value; 1027 applyFilter(scriptExtensionsFilter, &script, UPROPS_SRC_PROPSVEC, ec); 1028 } else { 1029 IntPropertyContext c = {prop, value}; 1030 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); 1031 } 1032 return *this; 1033} 1034 1035UnicodeSet& 1036UnicodeSet::applyPropertyAlias(const UnicodeString& prop, 1037 const UnicodeString& value, 1038 UErrorCode& ec) { 1039 if (U_FAILURE(ec) || isFrozen()) return *this; 1040 1041 // prop and value used to be converted to char * using the default 1042 // converter instead of the invariant conversion. 1043 // This should not be necessary because all Unicode property and value 1044 // names use only invariant characters. 1045 // If there are any variant characters, then we won't find them anyway. 1046 // Checking first avoids assertion failures in the conversion. 1047 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || 1048 !uprv_isInvariantUString(value.getBuffer(), value.length()) 1049 ) { 1050 FAIL(ec); 1051 } 1052 CharString pname, vname; 1053 pname.appendInvariantChars(prop, ec); 1054 vname.appendInvariantChars(value, ec); 1055 if (U_FAILURE(ec)) return *this; 1056 1057 UProperty p; 1058 int32_t v; 1059 UBool mustNotBeEmpty = FALSE, invert = FALSE; 1060 1061 if (value.length() > 0) { 1062 p = u_getPropertyEnum(pname.data()); 1063 if (p == UCHAR_INVALID_CODE) FAIL(ec); 1064 1065 // Treat gc as gcm 1066 if (p == UCHAR_GENERAL_CATEGORY) { 1067 p = UCHAR_GENERAL_CATEGORY_MASK; 1068 } 1069 1070 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || 1071 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || 1072 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { 1073 v = u_getPropertyValueEnum(p, vname.data()); 1074 if (v == UCHAR_INVALID_CODE) { 1075 // Handle numeric CCC 1076 if (p == UCHAR_CANONICAL_COMBINING_CLASS || 1077 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || 1078 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { 1079 char* end; 1080 double value = uprv_strtod(vname.data(), &end); 1081 v = (int32_t) value; 1082 if (v != value || v < 0 || *end != 0) { 1083 // non-integral or negative value, or trailing junk 1084 FAIL(ec); 1085 } 1086 // If the resultant set is empty then the numeric value 1087 // was invalid. 1088 mustNotBeEmpty = TRUE; 1089 } else { 1090 FAIL(ec); 1091 } 1092 } 1093 } 1094 1095 else { 1096 1097 switch (p) { 1098 case UCHAR_NUMERIC_VALUE: 1099 { 1100 char* end; 1101 double value = uprv_strtod(vname.data(), &end); 1102 if (*end != 0) { 1103 FAIL(ec); 1104 } 1105 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); 1106 return *this; 1107 } 1108 break; 1109 case UCHAR_NAME: 1110 case UCHAR_UNICODE_1_NAME: 1111 { 1112 // Must munge name, since u_charFromName() does not do 1113 // 'loose' matching. 1114 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength 1115 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1116 UCharNameChoice choice = (p == UCHAR_NAME) ? 1117 U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME; 1118 UChar32 ch = u_charFromName(choice, buf, &ec); 1119 if (U_SUCCESS(ec)) { 1120 clear(); 1121 add(ch); 1122 return *this; 1123 } else { 1124 FAIL(ec); 1125 } 1126 } 1127 break; 1128 case UCHAR_AGE: 1129 { 1130 // Must munge name, since u_versionFromString() does not do 1131 // 'loose' matching. 1132 char buf[128]; 1133 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); 1134 UVersionInfo version; 1135 u_versionFromString(version, buf); 1136 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); 1137 return *this; 1138 } 1139 break; 1140 case UCHAR_SCRIPT_EXTENSIONS: 1141 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); 1142 if (v == UCHAR_INVALID_CODE) { 1143 FAIL(ec); 1144 } 1145 // fall through to calling applyIntPropertyValue() 1146 break; 1147 default: 1148 // p is a non-binary, non-enumerated property that we 1149 // don't support (yet). 1150 FAIL(ec); 1151 } 1152 } 1153 } 1154 1155 else { 1156 // value is empty. Interpret as General Category, Script, or 1157 // Binary property. 1158 p = UCHAR_GENERAL_CATEGORY_MASK; 1159 v = u_getPropertyValueEnum(p, pname.data()); 1160 if (v == UCHAR_INVALID_CODE) { 1161 p = UCHAR_SCRIPT; 1162 v = u_getPropertyValueEnum(p, pname.data()); 1163 if (v == UCHAR_INVALID_CODE) { 1164 p = u_getPropertyEnum(pname.data()); 1165 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { 1166 v = 1; 1167 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { 1168 set(MIN_VALUE, MAX_VALUE); 1169 return *this; 1170 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { 1171 set(0, 0x7F); 1172 return *this; 1173 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { 1174 // [:Assigned:]=[:^Cn:] 1175 p = UCHAR_GENERAL_CATEGORY_MASK; 1176 v = U_GC_CN_MASK; 1177 invert = TRUE; 1178 } else { 1179 FAIL(ec); 1180 } 1181 } 1182 } 1183 } 1184 1185 applyIntPropertyValue(p, v, ec); 1186 if(invert) { 1187 complement(); 1188 } 1189 1190 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { 1191 // mustNotBeEmpty is set to true if an empty set indicates 1192 // invalid input. 1193 ec = U_ILLEGAL_ARGUMENT_ERROR; 1194 } 1195 1196 if (isBogus() && U_SUCCESS(ec)) { 1197 // We likely ran out of memory. AHHH! 1198 ec = U_MEMORY_ALLOCATION_ERROR; 1199 } 1200 return *this; 1201} 1202 1203//---------------------------------------------------------------- 1204// Property set patterns 1205//---------------------------------------------------------------- 1206 1207/** 1208 * Return true if the given position, in the given pattern, appears 1209 * to be the start of a property set pattern. 1210 */ 1211UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, 1212 int32_t pos) { 1213 // Patterns are at least 5 characters long 1214 if ((pos+5) > pattern.length()) { 1215 return FALSE; 1216 } 1217 1218 // Look for an opening [:, [:^, \p, or \P 1219 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); 1220} 1221 1222/** 1223 * Return true if the given iterator appears to point at a 1224 * property pattern. Regardless of the result, return with the 1225 * iterator unchanged. 1226 * @param chars iterator over the pattern characters. Upon return 1227 * it will be unchanged. 1228 * @param iterOpts RuleCharacterIterator options 1229 */ 1230UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, 1231 int32_t iterOpts) { 1232 // NOTE: literal will always be FALSE, because we don't parse escapes. 1233 UBool result = FALSE, literal; 1234 UErrorCode ec = U_ZERO_ERROR; 1235 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; 1236 RuleCharacterIterator::Pos pos; 1237 chars.getPos(pos); 1238 UChar32 c = chars.next(iterOpts, literal, ec); 1239 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { 1240 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, 1241 literal, ec); 1242 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : 1243 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); 1244 } 1245 chars.setPos(pos); 1246 return result && U_SUCCESS(ec); 1247} 1248 1249/** 1250 * Parse the given property pattern at the given parse position. 1251 */ 1252UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, 1253 ParsePosition& ppos, 1254 UErrorCode &ec) { 1255 int32_t pos = ppos.getIndex(); 1256 1257 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1258 UBool isName = FALSE; // true for \N{pat}, o/w false 1259 UBool invert = FALSE; 1260 1261 if (U_FAILURE(ec)) return *this; 1262 1263 // Minimum length is 5 characters, e.g. \p{L} 1264 if ((pos+5) > pattern.length()) { 1265 FAIL(ec); 1266 } 1267 1268 // On entry, ppos should point to one of the following locations: 1269 // Look for an opening [:, [:^, \p, or \P 1270 if (isPOSIXOpen(pattern, pos)) { 1271 posix = TRUE; 1272 pos += 2; 1273 pos = ICU_Utility::skipWhitespace(pattern, pos); 1274 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { 1275 ++pos; 1276 invert = TRUE; 1277 } 1278 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { 1279 UChar c = pattern.charAt(pos+1); 1280 invert = (c == UPPER_P); 1281 isName = (c == UPPER_N); 1282 pos += 2; 1283 pos = ICU_Utility::skipWhitespace(pattern, pos); 1284 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { 1285 // Syntax error; "\p" or "\P" not followed by "{" 1286 FAIL(ec); 1287 } 1288 } else { 1289 // Open delimiter not seen 1290 FAIL(ec); 1291 } 1292 1293 // Look for the matching close delimiter, either :] or } 1294 int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos); 1295 if (close < 0) { 1296 // Syntax error; close delimiter missing 1297 FAIL(ec); 1298 } 1299 1300 // Look for an '=' sign. If this is present, we will parse a 1301 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1302 // pattern. 1303 int32_t equals = pattern.indexOf(EQUALS, pos); 1304 UnicodeString propName, valueName; 1305 if (equals >= 0 && equals < close && !isName) { 1306 // Equals seen; parse medium/long pattern 1307 pattern.extractBetween(pos, equals, propName); 1308 pattern.extractBetween(equals+1, close, valueName); 1309 } 1310 1311 else { 1312 // Handle case where no '=' is seen, and \N{} 1313 pattern.extractBetween(pos, close, propName); 1314 1315 // Handle \N{name} 1316 if (isName) { 1317 // This is a little inefficient since it means we have to 1318 // parse NAME_PROP back to UCHAR_NAME even though we already 1319 // know it's UCHAR_NAME. If we refactor the API to 1320 // support args of (UProperty, char*) then we can remove 1321 // NAME_PROP and make this a little more efficient. 1322 valueName = propName; 1323 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); 1324 } 1325 } 1326 1327 applyPropertyAlias(propName, valueName, ec); 1328 1329 if (U_SUCCESS(ec)) { 1330 if (invert) { 1331 complement(); 1332 } 1333 1334 // Move to the limit position after the close delimiter if the 1335 // parse succeeded. 1336 ppos.setIndex(close + (posix ? 2 : 1)); 1337 } 1338 1339 return *this; 1340} 1341 1342/** 1343 * Parse a property pattern. 1344 * @param chars iterator over the pattern characters. Upon return 1345 * it will be advanced to the first character after the parsed 1346 * pattern, or the end of the iteration if all characters are 1347 * parsed. 1348 * @param rebuiltPat the pattern that was parsed, rebuilt or 1349 * copied from the input pattern, as appropriate. 1350 */ 1351void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, 1352 UnicodeString& rebuiltPat, 1353 UErrorCode& ec) { 1354 if (U_FAILURE(ec)) return; 1355 UnicodeString pattern; 1356 chars.lookahead(pattern); 1357 ParsePosition pos(0); 1358 applyPropertyPattern(pattern, pos, ec); 1359 if (U_FAILURE(ec)) return; 1360 if (pos.getIndex() == 0) { 1361 // syntaxError(chars, "Invalid property pattern"); 1362 ec = U_MALFORMED_SET; 1363 return; 1364 } 1365 chars.jumpahead(pos.getIndex()); 1366 rebuiltPat.append(pattern, 0, pos.getIndex()); 1367} 1368 1369//---------------------------------------------------------------- 1370// Case folding API 1371//---------------------------------------------------------------- 1372 1373// add the result of a full case mapping to the set 1374// use str as a temporary string to avoid constructing one 1375static inline void 1376addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { 1377 if(result >= 0) { 1378 if(result > UCASE_MAX_STRING_LENGTH) { 1379 // add a single-code point case mapping 1380 set.add(result); 1381 } else { 1382 // add a string case mapping from full with length result 1383 str.setTo((UBool)FALSE, full, result); 1384 set.add(str); 1385 } 1386 } 1387 // result < 0: the code point mapped to itself, no need to add it 1388 // see ucase.h 1389} 1390 1391UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { 1392 if (isFrozen() || isBogus()) { 1393 return *this; 1394 } 1395 if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { 1396 const UCaseProps *csp = ucase_getSingleton(); 1397 { 1398 UnicodeSet foldSet(*this); 1399 UnicodeString str; 1400 USetAdder sa = { 1401 foldSet.toUSet(), 1402 _set_add, 1403 _set_addRange, 1404 _set_addString, 1405 NULL, // don't need remove() 1406 NULL // don't need removeRange() 1407 }; 1408 1409 // start with input set to guarantee inclusion 1410 // USET_CASE: remove strings because the strings will actually be reduced (folded); 1411 // therefore, start with no strings and add only those needed 1412 if (attribute & USET_CASE_INSENSITIVE) { 1413 foldSet.strings->removeAllElements(); 1414 } 1415 1416 int32_t n = getRangeCount(); 1417 UChar32 result; 1418 const UChar *full; 1419 int32_t locCache = 0; 1420 1421 for (int32_t i=0; i<n; ++i) { 1422 UChar32 start = getRangeStart(i); 1423 UChar32 end = getRangeEnd(i); 1424 1425 if (attribute & USET_CASE_INSENSITIVE) { 1426 // full case closure 1427 for (UChar32 cp=start; cp<=end; ++cp) { 1428 ucase_addCaseClosure(csp, cp, &sa); 1429 } 1430 } else { 1431 // add case mappings 1432 // (does not add long s for regular s, or Kelvin for k, for example) 1433 for (UChar32 cp=start; cp<=end; ++cp) { 1434 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache); 1435 addCaseMapping(foldSet, result, full, str); 1436 1437 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache); 1438 addCaseMapping(foldSet, result, full, str); 1439 1440 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache); 1441 addCaseMapping(foldSet, result, full, str); 1442 1443 result = ucase_toFullFolding(csp, cp, &full, 0); 1444 addCaseMapping(foldSet, result, full, str); 1445 } 1446 } 1447 } 1448 if (strings != NULL && strings->size() > 0) { 1449 if (attribute & USET_CASE_INSENSITIVE) { 1450 for (int32_t j=0; j<strings->size(); ++j) { 1451 str = *(const UnicodeString *) strings->elementAt(j); 1452 str.foldCase(); 1453 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { 1454 foldSet.add(str); // does not map to code points: add the folded string itself 1455 } 1456 } 1457 } else { 1458 Locale root(""); 1459#if !UCONFIG_NO_BREAK_ITERATION 1460 UErrorCode status = U_ZERO_ERROR; 1461 BreakIterator *bi = BreakIterator::createWordInstance(root, status); 1462 if (U_SUCCESS(status)) { 1463#endif 1464 const UnicodeString *pStr; 1465 1466 for (int32_t j=0; j<strings->size(); ++j) { 1467 pStr = (const UnicodeString *) strings->elementAt(j); 1468 (str = *pStr).toLower(root); 1469 foldSet.add(str); 1470#if !UCONFIG_NO_BREAK_ITERATION 1471 (str = *pStr).toTitle(bi, root); 1472 foldSet.add(str); 1473#endif 1474 (str = *pStr).toUpper(root); 1475 foldSet.add(str); 1476 (str = *pStr).foldCase(); 1477 foldSet.add(str); 1478 } 1479#if !UCONFIG_NO_BREAK_ITERATION 1480 } 1481 delete bi; 1482#endif 1483 } 1484 } 1485 *this = foldSet; 1486 } 1487 } 1488 return *this; 1489} 1490 1491U_NAMESPACE_END 1492