1/* 2******************************************************************************* 3* 4* Copyright (C) 1999-2008, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: uniset_props.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2004aug25 14* created by: Markus W. Scherer 15* 16* Character property dependent functions moved here from uniset.cpp 17*/ 18 19#include "unicode/utypes.h" 20#include "unicode/uniset.h" 21#include "unicode/parsepos.h" 22#include "unicode/uchar.h" 23#include "unicode/uscript.h" 24#include "unicode/symtable.h" 25#include "unicode/uset.h" 26#include "unicode/locid.h" 27#include "unicode/brkiter.h" 28#include "uset_imp.h" 29#include "ruleiter.h" 30#include "cmemory.h" 31#include "ucln_cmn.h" 32#include "util.h" 33#include "uvector.h" 34#include "uprops.h" 35#include "propname.h" 36#include "unormimp.h" 37#include "ucase.h" 38#include "ubidi_props.h" 39#include "uinvchar.h" 40#include "charstr.h" 41#include "cstring.h" 42#include "umutex.h" 43#include "uassert.h" 44#include "hash.h" 45 46U_NAMESPACE_USE 47 48#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 49 50// initial storage. Must be >= 0 51// *** same as in uniset.cpp ! *** 52#define START_EXTRA 16 53 54// Define UChar constants using hex for EBCDIC compatibility 55// Used #define to reduce private static exports and memory access time. 
// Pattern syntax characters, spelled as numeric UChar values so that the
// constants do not depend on the compiler's source character set.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)123)    /*{*/
#define CLOSE_BRACE     ((UChar)125)    /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)78)     /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/

// NUL-terminated delimiter strings.  The *_OPEN variants are disabled; the
// isPOSIXOpen()/isPerlOpen()/isNameOpen() helpers in this file test the
// opening delimiters character by character instead.
//static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
//static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
//static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
// Appended to the rebuilt pattern when a set ends with a literal '-'
// directly before the closing ']'.
static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/

// Special property set IDs; applyPropertyAlias() compares an otherwise
// unrecognized property name against these.
static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Unicode name property alias
// NOTE(review): no use of NAME_PROP is visible in this part of the file;
// presumably it is consumed by the \N{...} handling further down - confirm.
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2

/**
 * Delimiter string used in patterns to close a category reference:
 * ":]".  Example: "[:Lu:]".
91 */ 92//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ 93 94U_CDECL_BEGIN 95 96static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions() 97 98//---------------------------------------------------------------- 99// Inclusions list 100//---------------------------------------------------------------- 101 102// USetAdder implementation 103// Does not use uset.h to reduce code dependencies 104static void U_CALLCONV 105_set_add(USet *set, UChar32 c) { 106 ((UnicodeSet *)set)->add(c); 107} 108 109static void U_CALLCONV 110_set_addRange(USet *set, UChar32 start, UChar32 end) { 111 ((UnicodeSet *)set)->add(start, end); 112} 113 114static void U_CALLCONV 115_set_addString(USet *set, const UChar *str, int32_t length) { 116 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length)); 117} 118 119/** 120 * Cleanup function for UnicodeSet 121 */ 122static UBool U_CALLCONV uset_cleanup(void) { 123 int32_t i; 124 125 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) { 126 if (INCLUSIONS[i] != NULL) { 127 delete INCLUSIONS[i]; 128 INCLUSIONS[i] = NULL; 129 } 130 } 131 132 return TRUE; 133} 134 135U_CDECL_END 136 137U_NAMESPACE_BEGIN 138 139/* 140Reduce excessive reallocation, and make it easier to detect initialization 141problems. 142Usually you don't see smaller sets than this for Unicode 5.0. 
143*/ 144#define DEFAULT_INCLUSION_CAPACITY 3072 145 146const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) { 147 UBool needInit; 148 UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit); 149 if (needInit) { 150 UnicodeSet* incl = new UnicodeSet(); 151 USetAdder sa = { 152 (USet *)incl, 153 _set_add, 154 _set_addRange, 155 _set_addString, 156 NULL, // don't need remove() 157 NULL // don't need removeRange() 158 }; 159 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status); 160 if (incl != NULL) { 161 switch(src) { 162 case UPROPS_SRC_CHAR: 163 uchar_addPropertyStarts(&sa, &status); 164 break; 165 case UPROPS_SRC_PROPSVEC: 166 upropsvec_addPropertyStarts(&sa, &status); 167 break; 168 case UPROPS_SRC_CHAR_AND_PROPSVEC: 169 uchar_addPropertyStarts(&sa, &status); 170 upropsvec_addPropertyStarts(&sa, &status); 171 break; 172 case UPROPS_SRC_HST: 173 uhst_addPropertyStarts(&sa, &status); 174 break; 175#if !UCONFIG_NO_NORMALIZATION 176 case UPROPS_SRC_NORM: 177 unorm_addPropertyStarts(&sa, &status); 178 break; 179#endif 180 case UPROPS_SRC_CASE: 181 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status); 182 break; 183 case UPROPS_SRC_BIDI: 184 ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status); 185 break; 186 default: 187 status = U_INTERNAL_PROGRAM_ERROR; 188 break; 189 } 190 if (U_SUCCESS(status)) { 191 // Compact for caching 192 incl->compact(); 193 umtx_lock(NULL); 194 if (INCLUSIONS[src] == NULL) { 195 INCLUSIONS[src] = incl; 196 incl = NULL; 197 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); 198 } 199 umtx_unlock(NULL); 200 } 201 delete incl; 202 } else { 203 status = U_MEMORY_ALLOCATION_ERROR; 204 } 205 } 206 return INCLUSIONS[src]; 207} 208 209// helper functions for matching of pattern syntax pieces ------------------ *** 210// these functions are parallel to the PERL_OPEN etc. 
strings above 211 212// using these functions is not only faster than UnicodeString::compare() and 213// caseCompare(), but they also make UnicodeSet work for simple patterns when 214// no Unicode properties data is available - when caseCompare() fails 215 216static inline UBool 217isPerlOpen(const UnicodeString &pattern, int32_t pos) { 218 UChar c; 219 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); 220} 221 222/*static inline UBool 223isPerlClose(const UnicodeString &pattern, int32_t pos) { 224 return pattern.charAt(pos)==CLOSE_BRACE; 225}*/ 226 227static inline UBool 228isNameOpen(const UnicodeString &pattern, int32_t pos) { 229 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; 230} 231 232static inline UBool 233isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { 234 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; 235} 236 237/*static inline UBool 238isPOSIXClose(const UnicodeString &pattern, int32_t pos) { 239 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; 240}*/ 241 242// TODO memory debugging provided inside uniset.cpp 243// could be made available here but probably obsolete with use of modern 244// memory leak checker tools 245#define _dbgct(me) 246 247//---------------------------------------------------------------- 248// Constructors &c 249//---------------------------------------------------------------- 250 251/** 252 * Constructs a set from the given pattern, optionally ignoring 253 * white space. See the class description for the syntax of the 254 * pattern language. 
255 * @param pattern a string specifying what characters are in the set 256 */ 257UnicodeSet::UnicodeSet(const UnicodeString& pattern, 258 UErrorCode& status) : 259 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 260 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 261 fFlags(0) 262{ 263 if(U_SUCCESS(status)){ 264 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 265 /* test for NULL */ 266 if(list == NULL) { 267 status = U_MEMORY_ALLOCATION_ERROR; 268 }else{ 269 allocateStrings(status); 270 applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 271 } 272 } 273 _dbgct(this); 274} 275 276/** 277 * Constructs a set from the given pattern, optionally ignoring 278 * white space. See the class description for the syntax of the 279 * pattern language. 280 * @param pattern a string specifying what characters are in the set 281 * @param options bitmask for options to apply to the pattern. 282 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 
283 */ 284UnicodeSet::UnicodeSet(const UnicodeString& pattern, 285 uint32_t options, 286 const SymbolTable* symbols, 287 UErrorCode& status) : 288 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 289 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 290 fFlags(0) 291{ 292 if(U_SUCCESS(status)){ 293 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 294 /* test for NULL */ 295 if(list == NULL) { 296 status = U_MEMORY_ALLOCATION_ERROR; 297 }else{ 298 allocateStrings(status); 299 applyPattern(pattern, options, symbols, status); 300 } 301 } 302 _dbgct(this); 303} 304 305UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 306 uint32_t options, 307 const SymbolTable* symbols, 308 UErrorCode& status) : 309 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 310 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 311 fFlags(0) 312{ 313 if(U_SUCCESS(status)){ 314 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 315 /* test for NULL */ 316 if(list == NULL) { 317 status = U_MEMORY_ALLOCATION_ERROR; 318 }else{ 319 allocateStrings(status); 320 applyPattern(pattern, pos, options, symbols, status); 321 } 322 } 323 _dbgct(this); 324} 325 326//---------------------------------------------------------------- 327// Public API 328//---------------------------------------------------------------- 329 330/** 331 * Modifies this set to represent the set specified by the given 332 * pattern, optionally ignoring white space. See the class 333 * description for the syntax of the pattern language. 334 * @param pattern a string specifying what characters are in the set 335 * @param ignoreSpaces if <code>true</code>, all spaces in the 336 * pattern are ignored. Spaces are those characters for which 337 * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>. 338 * Characters preceded by '\\' are escaped, losing any special 339 * meaning they otherwise have. 
Spaces may be included by 340 * escaping them. 341 * @exception <code>IllegalArgumentException</code> if the pattern 342 * contains a syntax error. 343 */ 344UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 345 UErrorCode& status) { 346 return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); 347} 348 349 350/** 351 * Modifies this set to represent the set specified by the given 352 * pattern, optionally ignoring white space. See the class 353 * description for the syntax of the pattern language. 354 * @param pattern a string specifying what characters are in the set 355 * @param options bitmask for options to apply to the pattern. 356 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 357 */ 358UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 359 uint32_t options, 360 const SymbolTable* symbols, 361 UErrorCode& status) { 362 if (U_FAILURE(status) || isFrozen()) { 363 return *this; 364 } 365 366 ParsePosition pos(0); 367 applyPattern(pattern, pos, options, symbols, status); 368 if (U_FAILURE(status)) return *this; 369 370 int32_t i = pos.getIndex(); 371 372 if (options & USET_IGNORE_SPACE) { 373 // Skip over trailing whitespace 374 ICU_Utility::skipWhitespace(pattern, i, TRUE); 375 } 376 377 if (i != pattern.length()) { 378 status = U_ILLEGAL_ARGUMENT_ERROR; 379 } 380 return *this; 381} 382 383UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, 384 ParsePosition& pos, 385 uint32_t options, 386 const SymbolTable* symbols, 387 UErrorCode& status) { 388 if (U_FAILURE(status) || isFrozen()) { 389 return *this; 390 } 391 // Need to build the pattern in a temporary string because 392 // _applyPattern calls add() etc., which set pat to empty. 
393 UnicodeString rebuiltPat; 394 RuleCharacterIterator chars(pattern, symbols, pos); 395 applyPattern(chars, symbols, rebuiltPat, options, status); 396 if (U_FAILURE(status)) return *this; 397 if (chars.inVariable()) { 398 // syntaxError(chars, "Extra chars in variable value"); 399 status = U_MALFORMED_SET; 400 return *this; 401 } 402 setPattern(rebuiltPat); 403 return *this; 404} 405 406/** 407 * Return true if the given position, in the given pattern, appears 408 * to be the start of a UnicodeSet pattern. 409 */ 410UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { 411 return ((pos+1) < pattern.length() && 412 pattern.charAt(pos) == (UChar)91/*[*/) || 413 resemblesPropertyPattern(pattern, pos); 414} 415 416//---------------------------------------------------------------- 417// Implementation: Pattern parsing 418//---------------------------------------------------------------- 419 420/** 421 * A small all-inline class to manage a UnicodeSet pointer. Add 422 * operator->() etc. as needed. 423 */ 424class UnicodeSetPointer { 425 UnicodeSet* p; 426public: 427 inline UnicodeSetPointer() : p(0) {} 428 inline ~UnicodeSetPointer() { delete p; } 429 inline UnicodeSet* pointer() { return p; } 430 inline UBool allocate() { 431 if (p == 0) { 432 p = new UnicodeSet(); 433 } 434 return p != 0; 435 } 436}; 437 438/** 439 * Parse the pattern from the given RuleCharacterIterator. The 440 * iterator is advanced over the parsed pattern. 441 * @param chars iterator over the pattern characters. Upon return 442 * it will be advanced to the first character after the parsed 443 * pattern, or the end of the iteration if all characters are 444 * parsed. 445 * @param symbols symbol table to use to parse and dereference 446 * variables, or null if none. 447 * @param rebuiltPat the pattern that was parsed, rebuilt or 448 * copied from the input pattern, as appropriate. 449 * @param options a bit mask of zero or more of the following: 450 * IGNORE_SPACE, CASE. 
451 */ 452void UnicodeSet::applyPattern(RuleCharacterIterator& chars, 453 const SymbolTable* symbols, 454 UnicodeString& rebuiltPat, 455 uint32_t options, 456 UErrorCode& ec) { 457 if (U_FAILURE(ec)) return; 458 459 // Syntax characters: [ ] ^ - & { } 460 461 // Recognized special forms for chars, sets: c-c s-s s&s 462 463 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | 464 RuleCharacterIterator::PARSE_ESCAPES; 465 if ((options & USET_IGNORE_SPACE) != 0) { 466 opts |= RuleCharacterIterator::SKIP_WHITESPACE; 467 } 468 469 UnicodeString patLocal, buf; 470 UBool usePat = FALSE; 471 UnicodeSetPointer scratch; 472 RuleCharacterIterator::Pos backup; 473 474 // mode: 0=before [, 1=between [...], 2=after ] 475 // lastItem: 0=none, 1=char, 2=set 476 int8_t lastItem = 0, mode = 0; 477 UChar32 lastChar = 0; 478 UChar op = 0; 479 480 UBool invert = FALSE; 481 482 clear(); 483 484 while (mode != 2 && !chars.atEnd()) { 485 U_ASSERT((lastItem == 0 && op == 0) || 486 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || 487 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || 488 op == INTERSECTION /*'&'*/))); 489 490 UChar32 c = 0; 491 UBool literal = FALSE; 492 UnicodeSet* nested = 0; // alias - do not delete 493 494 // -------- Check for property pattern 495 496 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed 497 int8_t setMode = 0; 498 if (resemblesPropertyPattern(chars, opts)) { 499 setMode = 2; 500 } 501 502 // -------- Parse '[' of opening delimiter OR nested set. 503 // If there is a nested set, use `setMode' to define how 504 // the set should be parsed. If the '[' is part of the 505 // opening delimiter for this pattern, parse special 506 // strings "[", "[^", "[-", and "[^-". Check for stand-in 507 // characters representing a nested set in the symbol 508 // table. 
509 510 else { 511 // Prepare to backup if necessary 512 chars.getPos(backup); 513 c = chars.next(opts, literal, ec); 514 if (U_FAILURE(ec)) return; 515 516 if (c == 0x5B /*'['*/ && !literal) { 517 if (mode == 1) { 518 chars.setPos(backup); // backup 519 setMode = 1; 520 } else { 521 // Handle opening '[' delimiter 522 mode = 1; 523 patLocal.append((UChar) 0x5B /*'['*/); 524 chars.getPos(backup); // prepare to backup 525 c = chars.next(opts, literal, ec); 526 if (U_FAILURE(ec)) return; 527 if (c == 0x5E /*'^'*/ && !literal) { 528 invert = TRUE; 529 patLocal.append((UChar) 0x5E /*'^'*/); 530 chars.getPos(backup); // prepare to backup 531 c = chars.next(opts, literal, ec); 532 if (U_FAILURE(ec)) return; 533 } 534 // Fall through to handle special leading '-'; 535 // otherwise restart loop for nested [], \p{}, etc. 536 if (c == HYPHEN /*'-'*/) { 537 literal = TRUE; 538 // Fall through to handle literal '-' below 539 } else { 540 chars.setPos(backup); // backup 541 continue; 542 } 543 } 544 } else if (symbols != 0) { 545 const UnicodeFunctor *m = symbols->lookupMatcher(c); 546 if (m != 0) { 547 if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) { 548 ec = U_MALFORMED_SET; 549 return; 550 } 551 // casting away const, but `nested' won't be modified 552 // (important not to modify stored set) 553 nested = (UnicodeSet*) m; 554 setMode = 3; 555 } 556 } 557 } 558 559 // -------- Handle a nested set. This either is inline in 560 // the pattern or represented by a stand-in that has 561 // previously been parsed and was looked up in the symbol 562 // table. 
563 564 if (setMode != 0) { 565 if (lastItem == 1) { 566 if (op != 0) { 567 // syntaxError(chars, "Char expected after operator"); 568 ec = U_MALFORMED_SET; 569 return; 570 } 571 add(lastChar, lastChar); 572 _appendToPat(patLocal, lastChar, FALSE); 573 lastItem = 0; 574 op = 0; 575 } 576 577 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { 578 patLocal.append(op); 579 } 580 581 if (nested == 0) { 582 // lazy allocation 583 if (!scratch.allocate()) { 584 ec = U_MEMORY_ALLOCATION_ERROR; 585 return; 586 } 587 nested = scratch.pointer(); 588 } 589 switch (setMode) { 590 case 1: 591 nested->applyPattern(chars, symbols, patLocal, options, ec); 592 break; 593 case 2: 594 chars.skipIgnored(opts); 595 nested->applyPropertyPattern(chars, patLocal, ec); 596 if (U_FAILURE(ec)) return; 597 break; 598 case 3: // `nested' already parsed 599 nested->_toPattern(patLocal, FALSE); 600 break; 601 } 602 603 usePat = TRUE; 604 605 if (mode == 0) { 606 // Entire pattern is a category; leave parse loop 607 *this = *nested; 608 mode = 2; 609 break; 610 } 611 612 switch (op) { 613 case HYPHEN: /*'-'*/ 614 removeAll(*nested); 615 break; 616 case INTERSECTION: /*'&'*/ 617 retainAll(*nested); 618 break; 619 case 0: 620 addAll(*nested); 621 break; 622 } 623 624 op = 0; 625 lastItem = 2; 626 627 continue; 628 } 629 630 if (mode == 0) { 631 // syntaxError(chars, "Missing '['"); 632 ec = U_MALFORMED_SET; 633 return; 634 } 635 636 // -------- Parse special (syntax) characters. If the 637 // current character is not special, or if it is escaped, 638 // then fall through and handle it below. 
639 640 if (!literal) { 641 switch (c) { 642 case 0x5D /*']'*/: 643 if (lastItem == 1) { 644 add(lastChar, lastChar); 645 _appendToPat(patLocal, lastChar, FALSE); 646 } 647 // Treat final trailing '-' as a literal 648 if (op == HYPHEN /*'-'*/) { 649 add(op, op); 650 patLocal.append(op); 651 } else if (op == INTERSECTION /*'&'*/) { 652 // syntaxError(chars, "Trailing '&'"); 653 ec = U_MALFORMED_SET; 654 return; 655 } 656 patLocal.append((UChar) 0x5D /*']'*/); 657 mode = 2; 658 continue; 659 case HYPHEN /*'-'*/: 660 if (op == 0) { 661 if (lastItem != 0) { 662 op = (UChar) c; 663 continue; 664 } else { 665 // Treat final trailing '-' as a literal 666 add(c, c); 667 c = chars.next(opts, literal, ec); 668 if (U_FAILURE(ec)) return; 669 if (c == 0x5D /*']'*/ && !literal) { 670 patLocal.append(HYPHEN_RIGHT_BRACE); 671 mode = 2; 672 continue; 673 } 674 } 675 } 676 // syntaxError(chars, "'-' not after char or set"); 677 ec = U_MALFORMED_SET; 678 return; 679 case INTERSECTION /*'&'*/: 680 if (lastItem == 2 && op == 0) { 681 op = (UChar) c; 682 continue; 683 } 684 // syntaxError(chars, "'&' not after set"); 685 ec = U_MALFORMED_SET; 686 return; 687 case 0x5E /*'^'*/: 688 // syntaxError(chars, "'^' not after '['"); 689 ec = U_MALFORMED_SET; 690 return; 691 case 0x7B /*'{'*/: 692 if (op != 0) { 693 // syntaxError(chars, "Missing operand after operator"); 694 ec = U_MALFORMED_SET; 695 return; 696 } 697 if (lastItem == 1) { 698 add(lastChar, lastChar); 699 _appendToPat(patLocal, lastChar, FALSE); 700 } 701 lastItem = 0; 702 buf.truncate(0); 703 { 704 UBool ok = FALSE; 705 while (!chars.atEnd()) { 706 c = chars.next(opts, literal, ec); 707 if (U_FAILURE(ec)) return; 708 if (c == 0x7D /*'}'*/ && !literal) { 709 ok = TRUE; 710 break; 711 } 712 buf.append(c); 713 } 714 if (buf.length() < 1 || !ok) { 715 // syntaxError(chars, "Invalid multicharacter string"); 716 ec = U_MALFORMED_SET; 717 return; 718 } 719 } 720 // We have new string. 
Add it to set and continue; 721 // we don't need to drop through to the further 722 // processing 723 add(buf); 724 patLocal.append((UChar) 0x7B /*'{'*/); 725 _appendToPat(patLocal, buf, FALSE); 726 patLocal.append((UChar) 0x7D /*'}'*/); 727 continue; 728 case SymbolTable::SYMBOL_REF: 729 // symbols nosymbols 730 // [a-$] error error (ambiguous) 731 // [a$] anchor anchor 732 // [a-$x] var "x"* literal '$' 733 // [a-$.] error literal '$' 734 // *We won't get here in the case of var "x" 735 { 736 chars.getPos(backup); 737 c = chars.next(opts, literal, ec); 738 if (U_FAILURE(ec)) return; 739 UBool anchor = (c == 0x5D /*']'*/ && !literal); 740 if (symbols == 0 && !anchor) { 741 c = SymbolTable::SYMBOL_REF; 742 chars.setPos(backup); 743 break; // literal '$' 744 } 745 if (anchor && op == 0) { 746 if (lastItem == 1) { 747 add(lastChar, lastChar); 748 _appendToPat(patLocal, lastChar, FALSE); 749 } 750 add(U_ETHER); 751 usePat = TRUE; 752 patLocal.append((UChar) SymbolTable::SYMBOL_REF); 753 patLocal.append((UChar) 0x5D /*']'*/); 754 mode = 2; 755 continue; 756 } 757 // syntaxError(chars, "Unquoted '$'"); 758 ec = U_MALFORMED_SET; 759 return; 760 } 761 default: 762 break; 763 } 764 } 765 766 // -------- Parse literal characters. This includes both 767 // escaped chars ("\u4E01") and non-syntax characters 768 // ("a"). 769 770 switch (lastItem) { 771 case 0: 772 lastItem = 1; 773 lastChar = c; 774 break; 775 case 1: 776 if (op == HYPHEN /*'-'*/) { 777 if (lastChar >= c) { 778 // Don't allow redundant (a-a) or empty (b-a) ranges; 779 // these are most likely typos. 
780 // syntaxError(chars, "Invalid range"); 781 ec = U_MALFORMED_SET; 782 return; 783 } 784 add(lastChar, c); 785 _appendToPat(patLocal, lastChar, FALSE); 786 patLocal.append(op); 787 _appendToPat(patLocal, c, FALSE); 788 lastItem = 0; 789 op = 0; 790 } else { 791 add(lastChar, lastChar); 792 _appendToPat(patLocal, lastChar, FALSE); 793 lastChar = c; 794 } 795 break; 796 case 2: 797 if (op != 0) { 798 // syntaxError(chars, "Set expected after operator"); 799 ec = U_MALFORMED_SET; 800 return; 801 } 802 lastChar = c; 803 lastItem = 1; 804 break; 805 } 806 } 807 808 if (mode != 2) { 809 // syntaxError(chars, "Missing ']'"); 810 ec = U_MALFORMED_SET; 811 return; 812 } 813 814 chars.skipIgnored(opts); 815 816 /** 817 * Handle global flags (invert, case insensitivity). If this 818 * pattern should be compiled case-insensitive, then we need 819 * to close over case BEFORE COMPLEMENTING. This makes 820 * patterns like /[^abc]/i work. 821 */ 822 if ((options & USET_CASE_INSENSITIVE) != 0) { 823 closeOver(USET_CASE_INSENSITIVE); 824 } 825 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { 826 closeOver(USET_ADD_CASE_MAPPINGS); 827 } 828 if (invert) { 829 complement(); 830 } 831 832 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the 833 // generated pattern. 834 if (usePat) { 835 rebuiltPat.append(patLocal); 836 } else { 837 _generatePattern(rebuiltPat, FALSE); 838 } 839 if (isBogus() && U_SUCCESS(ec)) { 840 // We likely ran out of memory. AHHH! 
841 ec = U_MEMORY_ALLOCATION_ERROR; 842 } 843} 844 845//---------------------------------------------------------------- 846// Property set implementation 847//---------------------------------------------------------------- 848 849static UBool numericValueFilter(UChar32 ch, void* context) { 850 return u_getNumericValue(ch) == *(double*)context; 851} 852 853static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { 854 int32_t value = *(int32_t*)context; 855 return (U_GET_GC_MASK((UChar32) ch) & value) != 0; 856} 857 858static UBool versionFilter(UChar32 ch, void* context) { 859 UVersionInfo v, none = { 0, 0, 0, 0}; 860 UVersionInfo* version = (UVersionInfo*)context; 861 u_charAge(ch, v); 862 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; 863} 864 865typedef struct { 866 UProperty prop; 867 int32_t value; 868} IntPropertyContext; 869 870static UBool intPropertyFilter(UChar32 ch, void* context) { 871 IntPropertyContext* c = (IntPropertyContext*)context; 872 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; 873} 874 875 876/** 877 * Generic filter-based scanning code for UCD property UnicodeSets. 878 */ 879void UnicodeSet::applyFilter(UnicodeSet::Filter filter, 880 void* context, 881 int32_t src, 882 UErrorCode &status) { 883 // Walk through all Unicode characters, noting the start 884 // and end of each range for which filter.contain(c) is 885 // true. Add each range to a set. 886 // 887 // To improve performance, use the INCLUSIONS set, which 888 // encodes information about character ranges that are known 889 // to have identical properties. INCLUSIONS contains 890 // only the first characters of such ranges. 891 // 892 // TODO Where possible, instead of scanning over code points, 893 // use internal property data to initialize UnicodeSets for 894 // those properties. Scanning code points is slow. 
895 if (U_FAILURE(status)) return; 896 897 const UnicodeSet* inclusions = getInclusions(src, status); 898 if (U_FAILURE(status)) { 899 return; 900 } 901 902 clear(); 903 904 UChar32 startHasProperty = -1; 905 int32_t limitRange = inclusions->getRangeCount(); 906 907 for (int j=0; j<limitRange; ++j) { 908 // get current range 909 UChar32 start = inclusions->getRangeStart(j); 910 UChar32 end = inclusions->getRangeEnd(j); 911 912 // for all the code points in the range, process 913 for (UChar32 ch = start; ch <= end; ++ch) { 914 // only add to this UnicodeSet on inflection points -- 915 // where the hasProperty value changes to false 916 if ((*filter)(ch, context)) { 917 if (startHasProperty < 0) { 918 startHasProperty = ch; 919 } 920 } else if (startHasProperty >= 0) { 921 add(startHasProperty, ch-1); 922 startHasProperty = -1; 923 } 924 } 925 } 926 if (startHasProperty >= 0) { 927 add((UChar32)startHasProperty, (UChar32)0x10FFFF); 928 } 929 if (isBogus() && U_SUCCESS(status)) { 930 // We likely ran out of memory. AHHH! 931 status = U_MEMORY_ALLOCATION_ERROR; 932 } 933} 934 935static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { 936 /* Note: we use ' ' in compiler code page */ 937 int32_t j = 0; 938 char ch; 939 --dstCapacity; /* make room for term. 
zero */ 940 while ((ch = *src++) != 0) { 941 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { 942 continue; 943 } 944 if (j >= dstCapacity) return FALSE; 945 dst[j++] = ch; 946 } 947 if (j > 0 && dst[j-1] == ' ') --j; 948 dst[j] = 0; 949 return TRUE; 950} 951 952//---------------------------------------------------------------- 953// Property set API 954//---------------------------------------------------------------- 955 956#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} 957 958UnicodeSet& 959UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { 960 if (U_FAILURE(ec) || isFrozen()) return *this; 961 962 if (prop == UCHAR_GENERAL_CATEGORY_MASK) { 963 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec); 964 } else { 965 IntPropertyContext c = {prop, value}; 966 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec); 967 } 968 return *this; 969} 970 971UnicodeSet& 972UnicodeSet::applyPropertyAlias(const UnicodeString& prop, 973 const UnicodeString& value, 974 UErrorCode& ec) { 975 if (U_FAILURE(ec) || isFrozen()) return *this; 976 977 // prop and value used to be converted to char * using the default 978 // converter instead of the invariant conversion. 979 // This should not be necessary because all Unicode property and value 980 // names use only invariant characters. 981 // If there are any variant characters, then we won't find them anyway. 982 // Checking first avoids assertion failures in the conversion. 
983 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || 984 !uprv_isInvariantUString(value.getBuffer(), value.length()) 985 ) { 986 FAIL(ec); 987 } 988 CharString pname(prop); 989 CharString vname(value); 990 991 UProperty p; 992 int32_t v; 993 UBool mustNotBeEmpty = FALSE, invert = FALSE; 994 995 if (value.length() > 0) { 996 p = u_getPropertyEnum(pname); 997 if (p == UCHAR_INVALID_CODE) FAIL(ec); 998 999 // Treat gc as gcm 1000 if (p == UCHAR_GENERAL_CATEGORY) { 1001 p = UCHAR_GENERAL_CATEGORY_MASK; 1002 } 1003 1004 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || 1005 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || 1006 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { 1007 v = u_getPropertyValueEnum(p, vname); 1008 if (v == UCHAR_INVALID_CODE) { 1009 // Handle numeric CCC 1010 if (p == UCHAR_CANONICAL_COMBINING_CLASS || 1011 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || 1012 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { 1013 char* end; 1014 double value = uprv_strtod(vname, &end); 1015 v = (int32_t) value; 1016 if (v != value || v < 0 || *end != 0) { 1017 // non-integral or negative value, or trailing junk 1018 FAIL(ec); 1019 } 1020 // If the resultant set is empty then the numeric value 1021 // was invalid. 1022 mustNotBeEmpty = TRUE; 1023 } else { 1024 FAIL(ec); 1025 } 1026 } 1027 } 1028 1029 else { 1030 1031 switch (p) { 1032 case UCHAR_NUMERIC_VALUE: 1033 { 1034 char* end; 1035 double value = uprv_strtod(vname, &end); 1036 if (*end != 0) { 1037 FAIL(ec); 1038 } 1039 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec); 1040 return *this; 1041 } 1042 break; 1043 case UCHAR_NAME: 1044 case UCHAR_UNICODE_1_NAME: 1045 { 1046 // Must munge name, since u_charFromName() does not do 1047 // 'loose' matching. 1048 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength 1049 if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec); 1050 UCharNameChoice choice = (p == UCHAR_NAME) ? 
1051 U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME; 1052 UChar32 ch = u_charFromName(choice, buf, &ec); 1053 if (U_SUCCESS(ec)) { 1054 clear(); 1055 add(ch); 1056 return *this; 1057 } else { 1058 FAIL(ec); 1059 } 1060 } 1061 break; 1062 case UCHAR_AGE: 1063 { 1064 // Must munge name, since u_versionFromString() does not do 1065 // 'loose' matching. 1066 char buf[128]; 1067 if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec); 1068 UVersionInfo version; 1069 u_versionFromString(version, buf); 1070 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec); 1071 return *this; 1072 } 1073 break; 1074 default: 1075 // p is a non-binary, non-enumerated property that we 1076 // don't support (yet). 1077 FAIL(ec); 1078 } 1079 } 1080 } 1081 1082 else { 1083 // value is empty. Interpret as General Category, Script, or 1084 // Binary property. 1085 p = UCHAR_GENERAL_CATEGORY_MASK; 1086 v = u_getPropertyValueEnum(p, pname); 1087 if (v == UCHAR_INVALID_CODE) { 1088 p = UCHAR_SCRIPT; 1089 v = u_getPropertyValueEnum(p, pname); 1090 if (v == UCHAR_INVALID_CODE) { 1091 p = u_getPropertyEnum(pname); 1092 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { 1093 v = 1; 1094 } else if (0 == uprv_comparePropertyNames(ANY, pname)) { 1095 set(MIN_VALUE, MAX_VALUE); 1096 return *this; 1097 } else if (0 == uprv_comparePropertyNames(ASCII, pname)) { 1098 set(0, 0x7F); 1099 return *this; 1100 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) { 1101 // [:Assigned:]=[:^Cn:] 1102 p = UCHAR_GENERAL_CATEGORY_MASK; 1103 v = U_GC_CN_MASK; 1104 invert = TRUE; 1105 } else { 1106 FAIL(ec); 1107 } 1108 } 1109 } 1110 } 1111 1112 applyIntPropertyValue(p, v, ec); 1113 if(invert) { 1114 complement(); 1115 } 1116 1117 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) { 1118 // mustNotBeEmpty is set to true if an empty set indicates 1119 // invalid input. 1120 ec = U_ILLEGAL_ARGUMENT_ERROR; 1121 } 1122 1123 if (isBogus() && U_SUCCESS(ec)) { 1124 // We likely ran out of memory. AHHH! 
1125 ec = U_MEMORY_ALLOCATION_ERROR; 1126 } 1127 return *this; 1128} 1129 1130//---------------------------------------------------------------- 1131// Property set patterns 1132//---------------------------------------------------------------- 1133 1134/** 1135 * Return true if the given position, in the given pattern, appears 1136 * to be the start of a property set pattern. 1137 */ 1138UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, 1139 int32_t pos) { 1140 // Patterns are at least 5 characters long 1141 if ((pos+5) > pattern.length()) { 1142 return FALSE; 1143 } 1144 1145 // Look for an opening [:, [:^, \p, or \P 1146 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); 1147} 1148 1149/** 1150 * Return true if the given iterator appears to point at a 1151 * property pattern. Regardless of the result, return with the 1152 * iterator unchanged. 1153 * @param chars iterator over the pattern characters. Upon return 1154 * it will be unchanged. 1155 * @param iterOpts RuleCharacterIterator options 1156 */ 1157UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, 1158 int32_t iterOpts) { 1159 // NOTE: literal will always be FALSE, because we don't parse escapes. 1160 UBool result = FALSE, literal; 1161 UErrorCode ec = U_ZERO_ERROR; 1162 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; 1163 RuleCharacterIterator::Pos pos; 1164 chars.getPos(pos); 1165 UChar32 c = chars.next(iterOpts, literal, ec); 1166 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { 1167 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, 1168 literal, ec); 1169 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : 1170 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); 1171 } 1172 chars.setPos(pos); 1173 return result && U_SUCCESS(ec); 1174} 1175 1176/** 1177 * Parse the given property pattern at the given parse position. 
1178 */ 1179UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, 1180 ParsePosition& ppos, 1181 UErrorCode &ec) { 1182 int32_t pos = ppos.getIndex(); 1183 1184 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} 1185 UBool isName = FALSE; // true for \N{pat}, o/w false 1186 UBool invert = FALSE; 1187 1188 if (U_FAILURE(ec)) return *this; 1189 1190 // Minimum length is 5 characters, e.g. \p{L} 1191 if ((pos+5) > pattern.length()) { 1192 FAIL(ec); 1193 } 1194 1195 // On entry, ppos should point to one of the following locations: 1196 // Look for an opening [:, [:^, \p, or \P 1197 if (isPOSIXOpen(pattern, pos)) { 1198 posix = TRUE; 1199 pos += 2; 1200 pos = ICU_Utility::skipWhitespace(pattern, pos); 1201 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { 1202 ++pos; 1203 invert = TRUE; 1204 } 1205 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { 1206 UChar c = pattern.charAt(pos+1); 1207 invert = (c == UPPER_P); 1208 isName = (c == UPPER_N); 1209 pos += 2; 1210 pos = ICU_Utility::skipWhitespace(pattern, pos); 1211 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { 1212 // Syntax error; "\p" or "\P" not followed by "{" 1213 FAIL(ec); 1214 } 1215 } else { 1216 // Open delimiter not seen 1217 FAIL(ec); 1218 } 1219 1220 // Look for the matching close delimiter, either :] or } 1221 int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos); 1222 if (close < 0) { 1223 // Syntax error; close delimiter missing 1224 FAIL(ec); 1225 } 1226 1227 // Look for an '=' sign. If this is present, we will parse a 1228 // medium \p{gc=Cf} or long \p{GeneralCategory=Format} 1229 // pattern. 
1230 int32_t equals = pattern.indexOf(EQUALS, pos); 1231 UnicodeString propName, valueName; 1232 if (equals >= 0 && equals < close && !isName) { 1233 // Equals seen; parse medium/long pattern 1234 pattern.extractBetween(pos, equals, propName); 1235 pattern.extractBetween(equals+1, close, valueName); 1236 } 1237 1238 else { 1239 // Handle case where no '=' is seen, and \N{} 1240 pattern.extractBetween(pos, close, propName); 1241 1242 // Handle \N{name} 1243 if (isName) { 1244 // This is a little inefficient since it means we have to 1245 // parse NAME_PROP back to UCHAR_NAME even though we already 1246 // know it's UCHAR_NAME. If we refactor the API to 1247 // support args of (UProperty, char*) then we can remove 1248 // NAME_PROP and make this a little more efficient. 1249 valueName = propName; 1250 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); 1251 } 1252 } 1253 1254 applyPropertyAlias(propName, valueName, ec); 1255 1256 if (U_SUCCESS(ec)) { 1257 if (invert) { 1258 complement(); 1259 } 1260 1261 // Move to the limit position after the close delimiter if the 1262 // parse succeeded. 1263 ppos.setIndex(close + (posix ? 2 : 1)); 1264 } 1265 1266 return *this; 1267} 1268 1269/** 1270 * Parse a property pattern. 1271 * @param chars iterator over the pattern characters. Upon return 1272 * it will be advanced to the first character after the parsed 1273 * pattern, or the end of the iteration if all characters are 1274 * parsed. 1275 * @param rebuiltPat the pattern that was parsed, rebuilt or 1276 * copied from the input pattern, as appropriate. 
1277 */ 1278void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, 1279 UnicodeString& rebuiltPat, 1280 UErrorCode& ec) { 1281 if (U_FAILURE(ec)) return; 1282 UnicodeString pattern; 1283 chars.lookahead(pattern); 1284 ParsePosition pos(0); 1285 applyPropertyPattern(pattern, pos, ec); 1286 if (U_FAILURE(ec)) return; 1287 if (pos.getIndex() == 0) { 1288 // syntaxError(chars, "Invalid property pattern"); 1289 ec = U_MALFORMED_SET; 1290 return; 1291 } 1292 chars.jumpahead(pos.getIndex()); 1293 rebuiltPat.append(pattern, 0, pos.getIndex()); 1294} 1295 1296//---------------------------------------------------------------- 1297// Case folding API 1298//---------------------------------------------------------------- 1299 1300// add the result of a full case mapping to the set 1301// use str as a temporary string to avoid constructing one 1302static inline void 1303addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) { 1304 if(result >= 0) { 1305 if(result > UCASE_MAX_STRING_LENGTH) { 1306 // add a single-code point case mapping 1307 set.add(result); 1308 } else { 1309 // add a string case mapping from full with length result 1310 str.setTo((UBool)FALSE, full, result); 1311 set.add(str); 1312 } 1313 } 1314 // result < 0: the code point mapped to itself, no need to add it 1315 // see ucase.h 1316} 1317 1318UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { 1319 if (isFrozen() || isBogus()) { 1320 return *this; 1321 } 1322 if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) { 1323 UErrorCode status = U_ZERO_ERROR; 1324 const UCaseProps *csp = ucase_getSingleton(&status); 1325 if (U_SUCCESS(status)) { 1326 UnicodeSet foldSet(*this); 1327 UnicodeString str; 1328 USetAdder sa = { 1329 (USet *)&foldSet, 1330 _set_add, 1331 _set_addRange, 1332 _set_addString, 1333 NULL, // don't need remove() 1334 NULL // don't need removeRange() 1335 }; 1336 1337 // start with input set to guarantee inclusion 1338 // USET_CASE: 
remove strings because the strings will actually be reduced (folded); 1339 // therefore, start with no strings and add only those needed 1340 if (attribute & USET_CASE_INSENSITIVE) { 1341 foldSet.strings->removeAllElements(); 1342 } 1343 1344 int32_t n = getRangeCount(); 1345 UChar32 result; 1346 const UChar *full; 1347 int32_t locCache = 0; 1348 1349 for (int32_t i=0; i<n; ++i) { 1350 UChar32 start = getRangeStart(i); 1351 UChar32 end = getRangeEnd(i); 1352 1353 if (attribute & USET_CASE_INSENSITIVE) { 1354 // full case closure 1355 for (UChar32 cp=start; cp<=end; ++cp) { 1356 ucase_addCaseClosure(csp, cp, &sa); 1357 } 1358 } else { 1359 // add case mappings 1360 // (does not add long s for regular s, or Kelvin for k, for example) 1361 for (UChar32 cp=start; cp<=end; ++cp) { 1362 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache); 1363 addCaseMapping(foldSet, result, full, str); 1364 1365 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache); 1366 addCaseMapping(foldSet, result, full, str); 1367 1368 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache); 1369 addCaseMapping(foldSet, result, full, str); 1370 1371 result = ucase_toFullFolding(csp, cp, &full, 0); 1372 addCaseMapping(foldSet, result, full, str); 1373 } 1374 } 1375 } 1376 if (strings != NULL && strings->size() > 0) { 1377 if (attribute & USET_CASE_INSENSITIVE) { 1378 for (int32_t j=0; j<strings->size(); ++j) { 1379 str = *(const UnicodeString *) strings->elementAt(j); 1380 str.foldCase(); 1381 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) { 1382 foldSet.add(str); // does not map to code points: add the folded string itself 1383 } 1384 } 1385 } else { 1386 Locale root(""); 1387#if !UCONFIG_NO_BREAK_ITERATION 1388 BreakIterator *bi = BreakIterator::createWordInstance(root, status); 1389#endif 1390 if (U_SUCCESS(status)) { 1391 const UnicodeString *pStr; 1392 1393 for (int32_t j=0; j<strings->size(); ++j) { 1394 pStr = (const 
UnicodeString *) strings->elementAt(j); 1395 (str = *pStr).toLower(root); 1396 foldSet.add(str); 1397#if !UCONFIG_NO_BREAK_ITERATION 1398 (str = *pStr).toTitle(bi, root); 1399 foldSet.add(str); 1400#endif 1401 (str = *pStr).toUpper(root); 1402 foldSet.add(str); 1403 (str = *pStr).foldCase(); 1404 foldSet.add(str); 1405 } 1406 } 1407#if !UCONFIG_NO_BREAK_ITERATION 1408 delete bi; 1409#endif 1410 } 1411 } 1412 *this = foldSet; 1413 } 1414 } 1415 return *this; 1416} 1417 1418U_NAMESPACE_END 1419