1/* 2********************************************************************** 3* Copyright (C) 1999-2011, International Business Machines 4* Corporation and others. All Rights Reserved. 5********************************************************************** 6* Date Name Description 7* 10/20/99 alan Creation. 8********************************************************************** 9*/ 10 11#include "unicode/utypes.h" 12#include "unicode/uniset.h" 13#include "unicode/parsepos.h" 14#include "unicode/symtable.h" 15#include "ruleiter.h" 16#include "cmemory.h" 17#include "cstring.h" 18#include "patternprops.h" 19#include "uhash.h" 20#include "util.h" 21#include "uvector.h" 22#include "charstr.h" 23#include "ustrfmt.h" 24#include "uassert.h" 25#include "hash.h" 26#include "bmpset.h" 27#include "unisetspan.h" 28 29// Define UChar constants using hex for EBCDIC compatibility 30// Used #define to reduce private static exports and memory access time. 31#define SET_OPEN ((UChar)0x005B) /*[*/ 32#define SET_CLOSE ((UChar)0x005D) /*]*/ 33#define HYPHEN ((UChar)0x002D) /*-*/ 34#define COMPLEMENT ((UChar)0x005E) /*^*/ 35#define COLON ((UChar)0x003A) /*:*/ 36#define BACKSLASH ((UChar)0x005C) /*\*/ 37#define INTERSECTION ((UChar)0x0026) /*&*/ 38#define UPPER_U ((UChar)0x0055) /*U*/ 39#define LOWER_U ((UChar)0x0075) /*u*/ 40#define OPEN_BRACE ((UChar)123) /*{*/ 41#define CLOSE_BRACE ((UChar)125) /*}*/ 42#define UPPER_P ((UChar)0x0050) /*P*/ 43#define LOWER_P ((UChar)0x0070) /*p*/ 44#define UPPER_N ((UChar)78) /*N*/ 45#define EQUALS ((UChar)0x003D) /*=*/ 46 47// HIGH_VALUE > all valid values. 110000 for codepoints 48#define UNICODESET_HIGH 0x0110000 49 50// LOW <= all valid values. ZERO for codepoints 51#define UNICODESET_LOW 0x000000 52 53// initial storage. Must be >= 0 54#define START_EXTRA 16 55 56// extra amount for growth. Must be >= 0 57#define GROW_EXTRA START_EXTRA 58 59U_NAMESPACE_BEGIN 60 61SymbolTable::~SymbolTable() {} 62 63UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet) 64 65/** 66 * Modify the given UChar32 variable so that it is in range, by 67 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and 68 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1. 69 * It modifies its argument in-place and also returns it. 70 */ 71static inline UChar32 pinCodePoint(UChar32& c) { 72 if (c < UNICODESET_LOW) { 73 c = UNICODESET_LOW; 74 } else if (c > (UNICODESET_HIGH-1)) { 75 c = (UNICODESET_HIGH-1); 76 } 77 return c; 78} 79 80//---------------------------------------------------------------- 81// Debugging 82//---------------------------------------------------------------- 83 84// DO NOT DELETE THIS CODE. This code is used to debug memory leaks. 85// To enable the debugging, define the symbol DEBUG_MEM in the line 86// below. This will result in text being sent to stdout that looks 87// like this: 88// DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85- 89// DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85- 90// Each line lists a construction (ct) or destruction (dt) event, the 91// object address, the number of outstanding objects after the event, 92// and the pattern of the object in question. 93 94// #define DEBUG_MEM 95 96#ifdef DEBUG_MEM 97#include <stdio.h> 98static int32_t _dbgCount = 0; 99 100static inline void _dbgct(UnicodeSet* set) { 101 UnicodeString str; 102 set->toPattern(str, TRUE); 103 char buf[40]; 104 str.extract(0, 39, buf, ""); 105 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf); 106} 107 108static inline void _dbgdt(UnicodeSet* set) { 109 UnicodeString str; 110 set->toPattern(str, TRUE); 111 char buf[40]; 112 str.extract(0, 39, buf, ""); 113 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf); 114} 115 116#else 117 118#define _dbgct(set) 119#define _dbgdt(set) 120 121#endif 122 123//---------------------------------------------------------------- 124// UnicodeString in UVector support 125//---------------------------------------------------------------- 126 127static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) { 128 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); 129} 130 131static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) { 132 const UnicodeString &a = *(const UnicodeString*)t1.pointer; 133 const UnicodeString &b = *(const UnicodeString*)t2.pointer; 134 return a.compare(b); 135} 136 137//---------------------------------------------------------------- 138// Constructors &c 139//---------------------------------------------------------------- 140 141/** 142 * Constructs an empty set. 143 */ 144UnicodeSet::UnicodeSet() : 145 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 146 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 147 fFlags(0) 148{ 149 UErrorCode status = U_ZERO_ERROR; 150 allocateStrings(status); 151 if (U_FAILURE(status)) { 152 return; 153 } 154 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 155 if(list!=NULL){ 156 list[0] = UNICODESET_HIGH; 157 } else { // If memory allocation failed, set to bogus state. 158 setToBogus(); 159 return; 160 } 161 _dbgct(this); 162} 163 164/** 165 * Constructs a set containing the given range. If <code>end > 166 * start</code> then an empty set is created. 167 * 168 * @param start first character, inclusive, of range 169 * @param end last character, inclusive, of range 170 */ 171UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : 172 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 173 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 174 fFlags(0) 175{ 176 UErrorCode status = U_ZERO_ERROR; 177 allocateStrings(status); 178 if (U_FAILURE(status)) { 179 return; 180 } 181 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 182 if(list!=NULL){ 183 list[0] = UNICODESET_HIGH; 184 complement(start, end); 185 } else { // If memory allocation failed, set to bogus state. 186 setToBogus(); 187 return; 188 } 189 _dbgct(this); 190} 191 192/** 193 * Constructs a set that is identical to the given UnicodeSet. 194 */ 195UnicodeSet::UnicodeSet(const UnicodeSet& o) : 196 UnicodeFilter(o), 197 len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), 198 bmpSet(0), 199 buffer(0), bufferCapacity(0), 200 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 201 fFlags(0) 202{ 203 UErrorCode status = U_ZERO_ERROR; 204 allocateStrings(status); 205 if (U_FAILURE(status)) { 206 return; 207 } 208 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 209 if(list!=NULL){ 210 *this = o; 211 } else { // If memory allocation failed, set to bogus state. 212 setToBogus(); 213 return; 214 } 215 _dbgct(this); 216} 217 218// Copy-construct as thawed. 219UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : 220 UnicodeFilter(o), 221 len(0), capacity(o.len + GROW_EXTRA), list(0), 222 bmpSet(0), 223 buffer(0), bufferCapacity(0), 224 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 225 fFlags(0) 226{ 227 UErrorCode status = U_ZERO_ERROR; 228 allocateStrings(status); 229 if (U_FAILURE(status)) { 230 return; 231 } 232 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 233 if(list!=NULL){ 234 // *this = o except for bmpSet and stringSpan 235 len = o.len; 236 uprv_memcpy(list, o.list, len*sizeof(UChar32)); 237 if (strings != NULL && o.strings != NULL) { 238 strings->assign(*o.strings, cloneUnicodeString, status); 239 } else { // Invalid strings. 240 setToBogus(); 241 return; 242 } 243 if (o.pat) { 244 setPattern(UnicodeString(o.pat, o.patLen)); 245 } 246 } else { // If memory allocation failed, set to bogus state. 247 setToBogus(); 248 return; 249 } 250 _dbgct(this); 251} 252 253/** 254 * Destructs the set. 255 */ 256UnicodeSet::~UnicodeSet() { 257 _dbgdt(this); // first! 258 uprv_free(list); 259 delete bmpSet; 260 if (buffer) { 261 uprv_free(buffer); 262 } 263 delete strings; 264 delete stringSpan; 265 releasePattern(); 266} 267 268/** 269 * Assigns this object to be a copy of another. 270 */ 271UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { 272 if (this == &o) { 273 return *this; 274 } 275 if (isFrozen()) { 276 return *this; 277 } 278 if (o.isBogus()) { 279 setToBogus(); 280 return *this; 281 } 282 UErrorCode ec = U_ZERO_ERROR; 283 ensureCapacity(o.len, ec); 284 if (U_FAILURE(ec)) { 285 return *this; // There is no way to report this error :-( 286 } 287 len = o.len; 288 uprv_memcpy(list, o.list, len*sizeof(UChar32)); 289 if (o.bmpSet == NULL) { 290 bmpSet = NULL; 291 } else { 292 bmpSet = new BMPSet(*o.bmpSet, list, len); 293 if (bmpSet == NULL) { // Check for memory allocation error. 294 setToBogus(); 295 return *this; 296 } 297 } 298 if (strings != NULL && o.strings != NULL) { 299 strings->assign(*o.strings, cloneUnicodeString, ec); 300 } else { // Invalid strings. 301 setToBogus(); 302 return *this; 303 } 304 if (o.stringSpan == NULL) { 305 stringSpan = NULL; 306 } else { 307 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); 308 if (stringSpan == NULL) { // Check for memory allocation error. 309 setToBogus(); 310 return *this; 311 } 312 } 313 releasePattern(); 314 if (o.pat) { 315 setPattern(UnicodeString(o.pat, o.patLen)); 316 } 317 return *this; 318} 319 320/** 321 * Returns a copy of this object. All UnicodeMatcher objects have 322 * to support cloning in order to allow classes using 323 * UnicodeMatchers, such as Transliterator, to implement cloning. 324 */ 325UnicodeFunctor* UnicodeSet::clone() const { 326 return new UnicodeSet(*this); 327} 328 329UnicodeFunctor *UnicodeSet::cloneAsThawed() const { 330 return new UnicodeSet(*this, TRUE); 331} 332 333/** 334 * Compares the specified object with this set for equality. Returns 335 * <tt>true</tt> if the two sets 336 * have the same size, and every member of the specified set is 337 * contained in this set (or equivalently, every member of this set is 338 * contained in the specified set). 339 * 340 * @param o set to be compared for equality with this set. 341 * @return <tt>true</tt> if the specified set is equal to this set. 342 */ 343UBool UnicodeSet::operator==(const UnicodeSet& o) const { 344 if (len != o.len) return FALSE; 345 for (int32_t i = 0; i < len; ++i) { 346 if (list[i] != o.list[i]) return FALSE; 347 } 348 if (*strings != *o.strings) return FALSE; 349 return TRUE; 350} 351 352/** 353 * Returns the hash code value for this set. 354 * 355 * @return the hash code value for this set. 356 * @see Object#hashCode() 357 */ 358int32_t UnicodeSet::hashCode(void) const { 359 int32_t result = len; 360 for (int32_t i = 0; i < len; ++i) { 361 result *= 1000003; 362 result += list[i]; 363 } 364 return result; 365} 366 367//---------------------------------------------------------------- 368// Public API 369//---------------------------------------------------------------- 370 371/** 372 * Returns the number of elements in this set (its cardinality), 373 * Note than the elements of a set may include both individual 374 * codepoints and strings. 375 * 376 * @return the number of elements in this set (its cardinality). 377 */ 378int32_t UnicodeSet::size(void) const { 379 int32_t n = 0; 380 int32_t count = getRangeCount(); 381 for (int32_t i = 0; i < count; ++i) { 382 n += getRangeEnd(i) - getRangeStart(i) + 1; 383 } 384 return n + strings->size(); 385} 386 387/** 388 * Returns <tt>true</tt> if this set contains no elements. 389 * 390 * @return <tt>true</tt> if this set contains no elements. 391 */ 392UBool UnicodeSet::isEmpty(void) const { 393 return len == 1 && strings->size() == 0; 394} 395 396/** 397 * Returns true if this set contains the given character. 398 * @param c character to be checked for containment 399 * @return true if the test condition is met 400 */ 401UBool UnicodeSet::contains(UChar32 c) const { 402 // Set i to the index of the start item greater than ch 403 // We know we will terminate without length test! 404 // LATER: for large sets, add binary search 405 //int32_t i = -1; 406 //for (;;) { 407 // if (c < list[++i]) break; 408 //} 409 if (bmpSet != NULL) { 410 return bmpSet->contains(c); 411 } 412 if (stringSpan != NULL) { 413 return stringSpan->contains(c); 414 } 415 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound 416 return FALSE; 417 } 418 int32_t i = findCodePoint(c); 419 return (UBool)(i & 1); // return true if odd 420} 421 422/** 423 * Returns the smallest value i such that c < list[i]. Caller 424 * must ensure that c is a legal value or this method will enter 425 * an infinite loop. This method performs a binary search. 426 * @param c a character in the range MIN_VALUE..MAX_VALUE 427 * inclusive 428 * @return the smallest integer i in the range 0..len-1, 429 * inclusive, such that c < list[i] 430 */ 431int32_t UnicodeSet::findCodePoint(UChar32 c) const { 432 /* Examples: 433 findCodePoint(c) 434 set list[] c=0 1 3 4 7 8 435 === ============== =========== 436 [] [110000] 0 0 0 0 0 0 437 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 438 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 439 [:Any:] [0, 110000] 1 1 1 1 1 1 440 */ 441 442 // Return the smallest i such that c < list[i]. Assume 443 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 444 if (c < list[0]) 445 return 0; 446 // High runner test. c is often after the last range, so an 447 // initial check for this condition pays off. 448 int32_t lo = 0; 449 int32_t hi = len - 1; 450 if (lo >= hi || c >= list[hi-1]) 451 return hi; 452 // invariant: c >= list[lo] 453 // invariant: c < list[hi] 454 for (;;) { 455 int32_t i = (lo + hi) >> 1; 456 if (i == lo) { 457 break; // Found! 458 } else if (c < list[i]) { 459 hi = i; 460 } else { 461 lo = i; 462 } 463 } 464 return hi; 465} 466 467/** 468 * Returns true if this set contains every character 469 * of the given range. 470 * @param start first character, inclusive, of the range 471 * @param end last character, inclusive, of the range 472 * @return true if the test condition is met 473 */ 474UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { 475 //int32_t i = -1; 476 //for (;;) { 477 // if (start < list[++i]) break; 478 //} 479 int32_t i = findCodePoint(start); 480 return ((i & 1) != 0 && end < list[i]); 481} 482 483/** 484 * Returns <tt>true</tt> if this set contains the given 485 * multicharacter string. 486 * @param s string to be checked for containment 487 * @return <tt>true</tt> if this set contains the specified string 488 */ 489UBool UnicodeSet::contains(const UnicodeString& s) const { 490 if (s.length() == 0) return FALSE; 491 int32_t cp = getSingleCP(s); 492 if (cp < 0) { 493 return strings->contains((void*) &s); 494 } else { 495 return contains((UChar32) cp); 496 } 497} 498 499/** 500 * Returns true if this set contains all the characters and strings 501 * of the given set. 502 * @param c set to be checked for containment 503 * @return true if the test condition is met 504 */ 505UBool UnicodeSet::containsAll(const UnicodeSet& c) const { 506 // The specified set is a subset if all of its pairs are contained in 507 // this set. It's possible to code this more efficiently in terms of 508 // direct manipulation of the inversion lists if the need arises. 509 int32_t n = c.getRangeCount(); 510 for (int i=0; i<n; ++i) { 511 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 512 return FALSE; 513 } 514 } 515 if (!strings->containsAll(*c.strings)) return FALSE; 516 return TRUE; 517} 518 519/** 520 * Returns true if this set contains all the characters 521 * of the given string. 522 * @param s string containing characters to be checked for containment 523 * @return true if the test condition is met 524 */ 525UBool UnicodeSet::containsAll(const UnicodeString& s) const { 526 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == 527 s.length()); 528} 529 530/** 531 * Returns true if this set contains none of the characters 532 * of the given range. 533 * @param start first character, inclusive, of the range 534 * @param end last character, inclusive, of the range 535 * @return true if the test condition is met 536 */ 537UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const { 538 //int32_t i = -1; 539 //for (;;) { 540 // if (start < list[++i]) break; 541 //} 542 int32_t i = findCodePoint(start); 543 return ((i & 1) == 0 && end < list[i]); 544} 545 546/** 547 * Returns true if this set contains none of the characters and strings 548 * of the given set. 549 * @param c set to be checked for containment 550 * @return true if the test condition is met 551 */ 552UBool UnicodeSet::containsNone(const UnicodeSet& c) const { 553 // The specified set is a subset if all of its pairs are contained in 554 // this set. It's possible to code this more efficiently in terms of 555 // direct manipulation of the inversion lists if the need arises. 556 int32_t n = c.getRangeCount(); 557 for (int32_t i=0; i<n; ++i) { 558 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 559 return FALSE; 560 } 561 } 562 if (!strings->containsNone(*c.strings)) return FALSE; 563 return TRUE; 564} 565 566/** 567 * Returns true if this set contains none of the characters 568 * of the given string. 569 * @param s string containing characters to be checked for containment 570 * @return true if the test condition is met 571 */ 572UBool UnicodeSet::containsNone(const UnicodeString& s) const { 573 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == 574 s.length()); 575} 576 577/** 578 * Returns <tt>true</tt> if this set contains any character whose low byte 579 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for 580 * indexing. 581 */ 582UBool UnicodeSet::matchesIndexValue(uint8_t v) const { 583 /* The index value v, in the range [0,255], is contained in this set if 584 * it is contained in any pair of this set. Pairs either have the high 585 * bytes equal, or unequal. If the high bytes are equal, then we have 586 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 587 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 588 * Then v is contained if xx <= v || v <= yy. (This is identical to the 589 * time zone month containment logic.) 590 */ 591 int32_t i; 592 int32_t rangeCount=getRangeCount(); 593 for (i=0; i<rangeCount; ++i) { 594 UChar32 low = getRangeStart(i); 595 UChar32 high = getRangeEnd(i); 596 if ((low & ~0xFF) == (high & ~0xFF)) { 597 if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 598 return TRUE; 599 } 600 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 601 return TRUE; 602 } 603 } 604 if (strings->size() != 0) { 605 for (i=0; i<strings->size(); ++i) { 606 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); 607 //if (s.length() == 0) { 608 // // Empty strings match everything 609 // return TRUE; 610 //} 611 // assert(s.length() != 0); // We enforce this elsewhere 612 UChar32 c = s.char32At(0); 613 if ((c & 0xFF) == v) { 614 return TRUE; 615 } 616 } 617 } 618 return FALSE; 619} 620 621/** 622 * Implementation of UnicodeMatcher::matches(). Always matches the 623 * longest possible multichar string. 624 */ 625UMatchDegree UnicodeSet::matches(const Replaceable& text, 626 int32_t& offset, 627 int32_t limit, 628 UBool incremental) { 629 if (offset == limit) { 630 // Strings, if any, have length != 0, so we don't worry 631 // about them here. If we ever allow zero-length strings 632 // we much check for them here. 633 if (contains(U_ETHER)) { 634 return incremental ? U_PARTIAL_MATCH : U_MATCH; 635 } else { 636 return U_MISMATCH; 637 } 638 } else { 639 if (strings->size() != 0) { // try strings first 640 641 // might separate forward and backward loops later 642 // for now they are combined 643 644 // TODO Improve efficiency of this, at least in the forward 645 // direction, if not in both. In the forward direction we 646 // can assume the strings are sorted. 647 648 int32_t i; 649 UBool forward = offset < limit; 650 651 // firstChar is the leftmost char to match in the 652 // forward direction or the rightmost char to match in 653 // the reverse direction. 654 UChar firstChar = text.charAt(offset); 655 656 // If there are multiple strings that can match we 657 // return the longest match. 658 int32_t highWaterLength = 0; 659 660 for (i=0; i<strings->size(); ++i) { 661 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i); 662 663 //if (trial.length() == 0) { 664 // return U_MATCH; // null-string always matches 665 //} 666 // assert(trial.length() != 0); // We ensure this elsewhere 667 668 UChar c = trial.charAt(forward ? 0 : trial.length() - 1); 669 670 // Strings are sorted, so we can optimize in the 671 // forward direction. 672 if (forward && c > firstChar) break; 673 if (c != firstChar) continue; 674 675 int32_t matchLen = matchRest(text, offset, limit, trial); 676 677 if (incremental) { 678 int32_t maxLen = forward ? limit-offset : offset-limit; 679 if (matchLen == maxLen) { 680 // We have successfully matched but only up to limit. 681 return U_PARTIAL_MATCH; 682 } 683 } 684 685 if (matchLen == trial.length()) { 686 // We have successfully matched the whole string. 687 if (matchLen > highWaterLength) { 688 highWaterLength = matchLen; 689 } 690 // In the forward direction we know strings 691 // are sorted so we can bail early. 692 if (forward && matchLen < highWaterLength) { 693 break; 694 } 695 continue; 696 } 697 } 698 699 // We've checked all strings without a partial match. 700 // If we have full matches, return the longest one. 701 if (highWaterLength != 0) { 702 offset += forward ? highWaterLength : -highWaterLength; 703 return U_MATCH; 704 } 705 } 706 return UnicodeFilter::matches(text, offset, limit, incremental); 707 } 708} 709 710/** 711 * Returns the longest match for s in text at the given position. 712 * If limit > start then match forward from start+1 to limit 713 * matching all characters except s.charAt(0). If limit < start, 714 * go backward starting from start-1 matching all characters 715 * except s.charAt(s.length()-1). This method assumes that the 716 * first character, text.charAt(start), matches s, so it does not 717 * check it. 718 * @param text the text to match 719 * @param start the first character to match. In the forward 720 * direction, text.charAt(start) is matched against s.charAt(0). 721 * In the reverse direction, it is matched against 722 * s.charAt(s.length()-1). 723 * @param limit the limit offset for matching, either last+1 in 724 * the forward direction, or last-1 in the reverse direction, 725 * where last is the index of the last character to match. 726 * @return If part of s matches up to the limit, return |limit - 727 * start|. If all of s matches before reaching the limit, return 728 * s.length(). If there is a mismatch between s and text, return 729 * 0 730 */ 731int32_t UnicodeSet::matchRest(const Replaceable& text, 732 int32_t start, int32_t limit, 733 const UnicodeString& s) { 734 int32_t i; 735 int32_t maxLen; 736 int32_t slen = s.length(); 737 if (start < limit) { 738 maxLen = limit - start; 739 if (maxLen > slen) maxLen = slen; 740 for (i = 1; i < maxLen; ++i) { 741 if (text.charAt(start + i) != s.charAt(i)) return 0; 742 } 743 } else { 744 maxLen = start - limit; 745 if (maxLen > slen) maxLen = slen; 746 --slen; // <=> slen = s.length() - 1; 747 for (i = 1; i < maxLen; ++i) { 748 if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 749 } 750 } 751 return maxLen; 752} 753 754/** 755 * Implement of UnicodeMatcher 756 */ 757void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const { 758 toUnionTo.addAll(*this); 759} 760 761/** 762 * Returns the index of the given character within this set, where 763 * the set is ordered by ascending code point. If the character 764 * is not in this set, return -1. The inverse of this method is 765 * <code>charAt()</code>. 766 * @return an index from 0..size()-1, or -1 767 */ 768int32_t UnicodeSet::indexOf(UChar32 c) const { 769 if (c < MIN_VALUE || c > MAX_VALUE) { 770 return -1; 771 } 772 int32_t i = 0; 773 int32_t n = 0; 774 for (;;) { 775 UChar32 start = list[i++]; 776 if (c < start) { 777 return -1; 778 } 779 UChar32 limit = list[i++]; 780 if (c < limit) { 781 return n + c - start; 782 } 783 n += limit - start; 784 } 785} 786 787/** 788 * Returns the character at the given index within this set, where 789 * the set is ordered by ascending code point. If the index is 790 * out of range, return (UChar32)-1. The inverse of this method is 791 * <code>indexOf()</code>. 792 * @param index an index from 0..size()-1 793 * @return the character at the given index, or (UChar32)-1. 794 */ 795UChar32 UnicodeSet::charAt(int32_t index) const { 796 if (index >= 0) { 797 // len2 is the largest even integer <= len, that is, it is len 798 // for even values and len-1 for odd values. With odd values 799 // the last entry is UNICODESET_HIGH. 800 int32_t len2 = len & ~1; 801 for (int32_t i=0; i < len2;) { 802 UChar32 start = list[i++]; 803 int32_t count = list[i++] - start; 804 if (index < count) { 805 return (UChar32)(start + index); 806 } 807 index -= count; 808 } 809 } 810 return (UChar32)-1; 811} 812 813/** 814 * Make this object represent the range <code>start - end</code>. 815 * If <code>end > start</code> then this object is set to an 816 * an empty range. 817 * 818 * @param start first character in the set, inclusive 819 * @rparam end last character in the set, inclusive 820 */ 821UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { 822 clear(); 823 complement(start, end); 824 return *this; 825} 826 827/** 828 * Adds the specified range to this set if it is not already 829 * present. If this set already contains the specified range, 830 * the call leaves this set unchanged. If <code>end > start</code> 831 * then an empty range is added, leaving the set unchanged. 832 * 833 * @param start first character, inclusive, of range to be added 834 * to this set. 835 * @param end last character, inclusive, of range to be added 836 * to this set. 837 */ 838UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { 839 if (pinCodePoint(start) < pinCodePoint(end)) { 840 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 841 add(range, 2, 0); 842 } else if (start == end) { 843 add(start); 844 } 845 return *this; 846} 847 848// #define DEBUG_US_ADD 849 850#ifdef DEBUG_US_ADD 851#include <stdio.h> 852void dump(UChar32 c) { 853 if (c <= 0xFF) { 854 printf("%c", (char)c); 855 } else { 856 printf("U+%04X", c); 857 } 858} 859void dump(const UChar32* list, int32_t len) { 860 printf("["); 861 for (int32_t i=0; i<len; ++i) { 862 if (i != 0) printf(", "); 863 dump(list[i]); 864 } 865 printf("]"); 866} 867#endif 868 869/** 870 * Adds the specified character to this set if it is not already 871 * present. If this set already contains the specified character, 872 * the call leaves this set unchanged. 873 */ 874UnicodeSet& UnicodeSet::add(UChar32 c) { 875 // find smallest i such that c < list[i] 876 // if odd, then it is IN the set 877 // if even, then it is OUT of the set 878 int32_t i = findCodePoint(pinCodePoint(c)); 879 880 // already in set? 881 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this; 882 883 // HIGH is 0x110000 884 // assert(list[len-1] == HIGH); 885 886 // empty = [HIGH] 887 // [start_0, limit_0, start_1, limit_1, HIGH] 888 889 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 890 // ^ 891 // list[i] 892 893 // i == 0 means c is before the first range 894 895#ifdef DEBUG_US_ADD 896 printf("Add of "); 897 dump(c); 898 printf(" found at %d", i); 899 printf(": "); 900 dump(list, len); 901 printf(" => "); 902#endif 903 904 if (c == list[i]-1) { 905 // c is before start of next range 906 list[i] = c; 907 // if we touched the HIGH mark, then add a new one 908 if (c == (UNICODESET_HIGH - 1)) { 909 UErrorCode status = U_ZERO_ERROR; 910 ensureCapacity(len+1, status); 911 if (U_FAILURE(status)) { 912 return *this; // There is no way to report this error :-( 913 } 914 list[len++] = UNICODESET_HIGH; 915 } 916 if (i > 0 && c == list[i-1]) { 917 // collapse adjacent ranges 918 919 // [..., start_k-1, c, c, limit_k, ..., HIGH] 920 // ^ 921 // list[i] 922 923 //for (int32_t k=i-1; k<len-2; ++k) { 924 // list[k] = list[k+2]; 925 //} 926 UChar32* dst = list + i - 1; 927 UChar32* src = dst + 2; 928 UChar32* srclimit = list + len; 929 while (src < srclimit) *(dst++) = *(src++); 930 931 len -= 2; 932 } 933 } 934 935 else if (i > 0 && c == list[i-1]) { 936 // c is after end of prior range 937 list[i-1]++; 938 // no need to check for collapse here 939 } 940 941 else { 942 // At this point we know the new char is not adjacent to 943 // any existing ranges, and it is not 10FFFF. 944 945 946 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 947 // ^ 948 // list[i] 949 950 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] 951 // ^ 952 // list[i] 953 954 UErrorCode status = U_ZERO_ERROR; 955 ensureCapacity(len+2, status); 956 if (U_FAILURE(status)) { 957 return *this; // There is no way to report this error :-( 958 } 959 960 //for (int32_t k=len-1; k>=i; --k) { 961 // list[k+2] = list[k]; 962 //} 963 UChar32* src = list + len; 964 UChar32* dst = src + 2; 965 UChar32* srclimit = list + i; 966 while (src > srclimit) *(--dst) = *(--src); 967 968 list[i] = c; 969 list[i+1] = c+1; 970 len += 2; 971 } 972 973#ifdef DEBUG_US_ADD 974 dump(list, len); 975 printf("\n"); 976 977 for (i=1; i<len; ++i) { 978 if (list[i] <= list[i-1]) { 979 // Corrupt array! 980 printf("ERROR: list has been corrupted\n"); 981 exit(1); 982 } 983 } 984#endif 985 986 releasePattern(); 987 return *this; 988} 989 990/** 991 * Adds the specified multicharacter to this set if it is not already 992 * present. If this set already contains the multicharacter, 993 * the call leaves this set unchanged. 994 * Thus "ch" => {"ch"} 995 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 996 * @param s the source string 997 * @return the modified set, for chaining 998 */ 999UnicodeSet& UnicodeSet::add(const UnicodeString& s) { 1000 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1001 int32_t cp = getSingleCP(s); 1002 if (cp < 0) { 1003 if (!strings->contains((void*) &s)) { 1004 _add(s); 1005 releasePattern(); 1006 } 1007 } else { 1008 add((UChar32)cp); 1009 } 1010 return *this; 1011} 1012 1013/** 1014 * Adds the given string, in order, to 'strings'. The given string 1015 * must have been checked by the caller to not be empty and to not 1016 * already be in 'strings'. 1017 */ 1018void UnicodeSet::_add(const UnicodeString& s) { 1019 if (isFrozen() || isBogus()) { 1020 return; 1021 } 1022 UnicodeString* t = new UnicodeString(s); 1023 if (t == NULL) { // Check for memory allocation error. 1024 setToBogus(); 1025 return; 1026 } 1027 UErrorCode ec = U_ZERO_ERROR; 1028 strings->sortedInsert(t, compareUnicodeString, ec); 1029 if (U_FAILURE(ec)) { 1030 setToBogus(); 1031 delete t; 1032 } 1033} 1034 1035/** 1036 * @return a code point IF the string consists of a single one. 1037 * otherwise returns -1. 1038 * @param string to test 1039 */ 1040int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { 1041 //if (s.length() < 1) { 1042 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); 1043 //} 1044 if (s.length() > 2) return -1; 1045 if (s.length() == 1) return s.charAt(0); 1046 1047 // at this point, len = 2 1048 UChar32 cp = s.char32At(0); 1049 if (cp > 0xFFFF) { // is surrogate pair 1050 return cp; 1051 } 1052 return -1; 1053} 1054 1055/** 1056 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1057 * If this set already any particular character, it has no effect on that character. 1058 * @param the source string 1059 * @return the modified set, for chaining 1060 */ 1061UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) { 1062 UChar32 cp; 1063 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) { 1064 cp = s.char32At(i); 1065 add(cp); 1066 } 1067 return *this; 1068} 1069 1070/** 1071 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1072 * If this set already any particular character, it has no effect on that character. 1073 * @param the source string 1074 * @return the modified set, for chaining 1075 */ 1076UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) { 1077 UnicodeSet set; 1078 set.addAll(s); 1079 retainAll(set); 1080 return *this; 1081} 1082 1083/** 1084 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1085 * If this set already any particular character, it has no effect on that character. 1086 * @param the source string 1087 * @return the modified set, for chaining 1088 */ 1089UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) { 1090 UnicodeSet set; 1091 set.addAll(s); 1092 complementAll(set); 1093 return *this; 1094} 1095 1096/** 1097 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1098 * If this set already any particular character, it has no effect on that character. 1099 * @param the source string 1100 * @return the modified set, for chaining 1101 */ 1102UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { 1103 UnicodeSet set; 1104 set.addAll(s); 1105 removeAll(set); 1106 return *this; 1107} 1108 1109UnicodeSet& UnicodeSet::removeAllStrings() { 1110 strings->removeAllElements(); 1111 return *this; 1112} 1113 1114 1115/** 1116 * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1117 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1118 * @param the source string 1119 * @return a newly created set containing the given string 1120 */ 1121UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) { 1122 UnicodeSet *set = new UnicodeSet(); 1123 if (set != NULL) { // Check for memory allocation error. 1124 set->add(s); 1125 } 1126 return set; 1127} 1128 1129 1130/** 1131 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1132 * @param the source string 1133 * @return a newly created set containing the given characters 1134 */ 1135UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) { 1136 UnicodeSet *set = new UnicodeSet(); 1137 if (set != NULL) { // Check for memory allocation error. 1138 set->addAll(s); 1139 } 1140 return set; 1141} 1142 1143/** 1144 * Retain only the elements in this set that are contained in the 1145 * specified range. If <code>end > start</code> then an empty range is 1146 * retained, leaving the set empty. 1147 * 1148 * @param start first character, inclusive, of range to be retained 1149 * to this set. 1150 * @param end last character, inclusive, of range to be retained 1151 * to this set. 1152 */ 1153UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) { 1154 if (pinCodePoint(start) <= pinCodePoint(end)) { 1155 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1156 retain(range, 2, 0); 1157 } else { 1158 clear(); 1159 } 1160 return *this; 1161} 1162 1163UnicodeSet& UnicodeSet::retain(UChar32 c) { 1164 return retain(c, c); 1165} 1166 1167/** 1168 * Removes the specified range from this set if it is present. 1169 * The set will not contain the specified range once the call 1170 * returns. If <code>end > start</code> then an empty range is 1171 * removed, leaving the set unchanged. 1172 * 1173 * @param start first character, inclusive, of range to be removed 1174 * from this set. 1175 * @param end last character, inclusive, of range to be removed 1176 * from this set. 1177 */ 1178UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) { 1179 if (pinCodePoint(start) <= pinCodePoint(end)) { 1180 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1181 retain(range, 2, 2); 1182 } 1183 return *this; 1184} 1185 1186/** 1187 * Removes the specified character from this set if it is present. 1188 * The set will not contain the specified range once the call 1189 * returns. 1190 */ 1191UnicodeSet& UnicodeSet::remove(UChar32 c) { 1192 return remove(c, c); 1193} 1194 1195/** 1196 * Removes the specified string from this set if it is present. 1197 * The set will not contain the specified character once the call 1198 * returns. 1199 * @param the source string 1200 * @return the modified set, for chaining 1201 */ 1202UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { 1203 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1204 int32_t cp = getSingleCP(s); 1205 if (cp < 0) { 1206 strings->removeElement((void*) &s); 1207 releasePattern(); 1208 } else { 1209 remove((UChar32)cp, (UChar32)cp); 1210 } 1211 return *this; 1212} 1213 1214/** 1215 * Complements the specified range in this set. Any character in 1216 * the range will be removed if it is in this set, or will be 1217 * added if it is not in this set. If <code>end > start</code> 1218 * then an empty range is xor'ed, leaving the set unchanged. 1219 * 1220 * @param start first character, inclusive, of range to be removed 1221 * from this set. 1222 * @param end last character, inclusive, of range to be removed 1223 * from this set. 1224 */ 1225UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) { 1226 if (isFrozen() || isBogus()) { 1227 return *this; 1228 } 1229 if (pinCodePoint(start) <= pinCodePoint(end)) { 1230 UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1231 exclusiveOr(range, 2, 0); 1232 } 1233 releasePattern(); 1234 return *this; 1235} 1236 1237UnicodeSet& UnicodeSet::complement(UChar32 c) { 1238 return complement(c, c); 1239} 1240 1241/** 1242 * This is equivalent to 1243 * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1244 */ 1245UnicodeSet& UnicodeSet::complement(void) { 1246 if (isFrozen() || isBogus()) { 1247 return *this; 1248 } 1249 UErrorCode status = U_ZERO_ERROR; 1250 if (list[0] == UNICODESET_LOW) { 1251 ensureBufferCapacity(len-1, status); 1252 if (U_FAILURE(status)) { 1253 return *this; 1254 } 1255 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32)); 1256 --len; 1257 } else { 1258 ensureBufferCapacity(len+1, status); 1259 if (U_FAILURE(status)) { 1260 return *this; 1261 } 1262 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32)); 1263 buffer[0] = UNICODESET_LOW; 1264 ++len; 1265 } 1266 swapBuffers(); 1267 releasePattern(); 1268 return *this; 1269} 1270 1271/** 1272 * Complement the specified string in this set. 1273 * The set will not contain the specified string once the call 1274 * returns. 1275 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1276 * @param s the string to complement 1277 * @return this object, for chaining 1278 */ 1279UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { 1280 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1281 int32_t cp = getSingleCP(s); 1282 if (cp < 0) { 1283 if (strings->contains((void*) &s)) { 1284 strings->removeElement((void*) &s); 1285 } else { 1286 _add(s); 1287 } 1288 releasePattern(); 1289 } else { 1290 complement((UChar32)cp, (UChar32)cp); 1291 } 1292 return *this; 1293} 1294 1295/** 1296 * Adds all of the elements in the specified set to this set if 1297 * they're not already present. This operation effectively 1298 * modifies this set so that its value is the <i>union</i> of the two 1299 * sets. The behavior of this operation is unspecified if the specified 1300 * collection is modified while the operation is in progress. 1301 * 1302 * @param c set whose elements are to be added to this set. 1303 * @see #add(char, char) 1304 */ 1305UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { 1306 if ( c.len>0 && c.list!=NULL ) { 1307 add(c.list, c.len, 0); 1308 } 1309 1310 // Add strings in order 1311 if ( c.strings!=NULL ) { 1312 for (int32_t i=0; i<c.strings->size(); ++i) { 1313 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); 1314 if (!strings->contains((void*) s)) { 1315 _add(*s); 1316 } 1317 } 1318 } 1319 return *this; 1320} 1321 1322/** 1323 * Retains only the elements in this set that are contained in the 1324 * specified set. In other words, removes from this set all of 1325 * its elements that are not contained in the specified set. This 1326 * operation effectively modifies this set so that its value is 1327 * the <i>intersection</i> of the two sets. 1328 * 1329 * @param c set that defines which elements this set will retain. 1330 */ 1331UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { 1332 if (isFrozen() || isBogus()) { 1333 return *this; 1334 } 1335 retain(c.list, c.len, 0); 1336 strings->retainAll(*c.strings); 1337 return *this; 1338} 1339 1340/** 1341 * Removes from this set all of its elements that are contained in the 1342 * specified set. This operation effectively modifies this 1343 * set so that its value is the <i>asymmetric set difference</i> of 1344 * the two sets. 1345 * 1346 * @param c set that defines which elements will be removed from 1347 * this set. 1348 */ 1349UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { 1350 if (isFrozen() || isBogus()) { 1351 return *this; 1352 } 1353 retain(c.list, c.len, 2); 1354 strings->removeAll(*c.strings); 1355 return *this; 1356} 1357 1358/** 1359 * Complements in this set all elements contained in the specified 1360 * set. Any character in the other set will be removed if it is 1361 * in this set, or will be added if it is not in this set. 1362 * 1363 * @param c set that defines which elements will be xor'ed from 1364 * this set. 1365 */ 1366UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { 1367 if (isFrozen() || isBogus()) { 1368 return *this; 1369 } 1370 exclusiveOr(c.list, c.len, 0); 1371 1372 for (int32_t i=0; i<c.strings->size(); ++i) { 1373 void* e = c.strings->elementAt(i); 1374 if (!strings->removeElement(e)) { 1375 _add(*(const UnicodeString*)e); 1376 } 1377 } 1378 return *this; 1379} 1380 1381/** 1382 * Removes all of the elements from this set. This set will be 1383 * empty after this call returns. 1384 */ 1385UnicodeSet& UnicodeSet::clear(void) { 1386 if (isFrozen()) { 1387 return *this; 1388 } 1389 if (list != NULL) { 1390 list[0] = UNICODESET_HIGH; 1391 } 1392 len = 1; 1393 releasePattern(); 1394 if (strings != NULL) { 1395 strings->removeAllElements(); 1396 } 1397 if (list != NULL && strings != NULL) { 1398 // Remove bogus 1399 fFlags = 0; 1400 } 1401 return *this; 1402} 1403 1404/** 1405 * Iteration method that returns the number of ranges contained in 1406 * this set. 1407 * @see #getRangeStart 1408 * @see #getRangeEnd 1409 */ 1410int32_t UnicodeSet::getRangeCount() const { 1411 return len/2; 1412} 1413 1414/** 1415 * Iteration method that returns the first character in the 1416 * specified range of this set. 1417 * @see #getRangeCount 1418 * @see #getRangeEnd 1419 */ 1420UChar32 UnicodeSet::getRangeStart(int32_t index) const { 1421 return list[index*2]; 1422} 1423 1424/** 1425 * Iteration method that returns the last character in the 1426 * specified range of this set. 1427 * @see #getRangeStart 1428 * @see #getRangeEnd 1429 */ 1430UChar32 UnicodeSet::getRangeEnd(int32_t index) const { 1431 return list[index*2 + 1] - 1; 1432} 1433 1434int32_t UnicodeSet::getStringCount() const { 1435 return strings->size(); 1436} 1437 1438const UnicodeString* UnicodeSet::getString(int32_t index) const { 1439 return (const UnicodeString*) strings->elementAt(index); 1440} 1441 1442/** 1443 * Reallocate this objects internal structures to take up the least 1444 * possible space, without changing this object's value. 1445 */ 1446UnicodeSet& UnicodeSet::compact() { 1447 if (isFrozen() || isBogus()) { 1448 return *this; 1449 } 1450 // Delete buffer first to defragment memory less. 1451 if (buffer != NULL) { 1452 uprv_free(buffer); 1453 buffer = NULL; 1454 } 1455 if (len < capacity) { 1456 // Make the capacity equal to len or 1. 1457 // We don't want to realloc of 0 size. 1458 int32_t newCapacity = len + (len == 0); 1459 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); 1460 if (temp) { 1461 list = temp; 1462 capacity = newCapacity; 1463 } 1464 // else what the heck happened?! We allocated less memory! 1465 // Oh well. We'll keep our original array. 1466 } 1467 return *this; 1468} 1469 1470int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const { 1471 int32_t bmpLength, length, destLength; 1472 1473 if (U_FAILURE(ec)) { 1474 return 0; 1475 } 1476 1477 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) { 1478 ec=U_ILLEGAL_ARGUMENT_ERROR; 1479 return 0; 1480 } 1481 1482 /* count necessary 16-bit units */ 1483 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH 1484 // assert(length>=0); 1485 if (length==0) { 1486 /* empty set */ 1487 if (destCapacity>0) { 1488 *dest=0; 1489 } else { 1490 ec=U_BUFFER_OVERFLOW_ERROR; 1491 } 1492 return 1; 1493 } 1494 /* now length>0 */ 1495 1496 if (this->list[length-1]<=0xffff) { 1497 /* all BMP */ 1498 bmpLength=length; 1499 } else if (this->list[0]>=0x10000) { 1500 /* all supplementary */ 1501 bmpLength=0; 1502 length*=2; 1503 } else { 1504 /* some BMP, some supplementary */ 1505 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {} 1506 length=bmpLength+2*(length-bmpLength); 1507 } 1508 1509 /* length: number of 16-bit array units */ 1510 if (length>0x7fff) { 1511 /* there are only 15 bits for the length in the first serialized word */ 1512 ec=U_INDEX_OUTOFBOUNDS_ERROR; 1513 return 0; 1514 } 1515 1516 /* 1517 * total serialized length: 1518 * number of 16-bit array units (length) + 1519 * 1 length unit (always) + 1520 * 1 bmpLength unit (if there are supplementary values) 1521 */ 1522 destLength=length+((length>bmpLength)?2:1); 1523 if (destLength<=destCapacity) { 1524 const UChar32 *p; 1525 int32_t i; 1526 1527 *dest=(uint16_t)length; 1528 if (length>bmpLength) { 1529 *dest|=0x8000; 1530 *++dest=(uint16_t)bmpLength; 1531 } 1532 ++dest; 1533 1534 /* write the BMP part of the array */ 1535 p=this->list; 1536 for (i=0; i<bmpLength; ++i) { 1537 *dest++=(uint16_t)*p++; 1538 } 1539 1540 /* write the supplementary part of the array */ 1541 for (; i<length; i+=2) { 1542 *dest++=(uint16_t)(*p>>16); 1543 *dest++=(uint16_t)*p++; 1544 } 1545 } else { 1546 ec=U_BUFFER_OVERFLOW_ERROR; 1547 } 1548 return destLength; 1549} 1550 1551//---------------------------------------------------------------- 1552// Implementation: Utility methods 1553//---------------------------------------------------------------- 1554 1555/** 1556 * Allocate our strings vector and return TRUE if successful. 1557 */ 1558UBool UnicodeSet::allocateStrings(UErrorCode &status) { 1559 if (U_FAILURE(status)) { 1560 return FALSE; 1561 } 1562 strings = new UVector(uhash_deleteUnicodeString, 1563 uhash_compareUnicodeString, 1, status); 1564 if (strings == NULL) { // Check for memory allocation error. 1565 status = U_MEMORY_ALLOCATION_ERROR; 1566 return FALSE; 1567 } 1568 if (U_FAILURE(status)) { 1569 delete strings; 1570 strings = NULL; 1571 return FALSE; 1572 } 1573 return TRUE; 1574} 1575 1576void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) { 1577 if (newLen <= capacity) 1578 return; 1579 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1580 if (temp == NULL) { 1581 ec = U_MEMORY_ALLOCATION_ERROR; 1582 setToBogus(); 1583 return; 1584 } 1585 list = temp; 1586 capacity = newLen + GROW_EXTRA; 1587 // else we keep the original contents on the memory failure. 1588} 1589 1590void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { 1591 if (buffer != NULL && newLen <= bufferCapacity) 1592 return; 1593 UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1594 if (temp == NULL) { 1595 ec = U_MEMORY_ALLOCATION_ERROR; 1596 setToBogus(); 1597 return; 1598 } 1599 buffer = temp; 1600 bufferCapacity = newLen + GROW_EXTRA; 1601 // else we keep the original contents on the memory failure. 1602} 1603 1604/** 1605 * Swap list and buffer. 1606 */ 1607void UnicodeSet::swapBuffers(void) { 1608 // swap list and buffer 1609 UChar32* temp = list; 1610 list = buffer; 1611 buffer = temp; 1612 1613 int32_t c = capacity; 1614 capacity = bufferCapacity; 1615 bufferCapacity = c; 1616} 1617 1618void UnicodeSet::setToBogus() { 1619 clear(); // Remove everything in the set. 1620 fFlags = kIsBogus; 1621} 1622 1623//---------------------------------------------------------------- 1624// Implementation: Fundamental operators 1625//---------------------------------------------------------------- 1626 1627static inline UChar32 max(UChar32 a, UChar32 b) { 1628 return (a > b) ? a : b; 1629} 1630 1631// polarity = 0, 3 is normal: x xor y 1632// polarity = 1, 2: x xor ~y == x === y 1633 1634void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) { 1635 if (isFrozen() || isBogus()) { 1636 return; 1637 } 1638 UErrorCode status = U_ZERO_ERROR; 1639 ensureBufferCapacity(len + otherLen, status); 1640 if (U_FAILURE(status)) { 1641 return; 1642 } 1643 1644 int32_t i = 0, j = 0, k = 0; 1645 UChar32 a = list[i++]; 1646 UChar32 b; 1647 if (polarity == 1 || polarity == 2) { 1648 b = UNICODESET_LOW; 1649 if (other[j] == UNICODESET_LOW) { // skip base if already LOW 1650 ++j; 1651 b = other[j]; 1652 } 1653 } else { 1654 b = other[j++]; 1655 } 1656 // simplest of all the routines 1657 // sort the values, discarding identicals! 1658 for (;;) { 1659 if (a < b) { 1660 buffer[k++] = a; 1661 a = list[i++]; 1662 } else if (b < a) { 1663 buffer[k++] = b; 1664 b = other[j++]; 1665 } else if (a != UNICODESET_HIGH) { // at this point, a == b 1666 // discard both values! 1667 a = list[i++]; 1668 b = other[j++]; 1669 } else { // DONE! 1670 buffer[k++] = UNICODESET_HIGH; 1671 len = k; 1672 break; 1673 } 1674 } 1675 swapBuffers(); 1676 releasePattern(); 1677} 1678 1679// polarity = 0 is normal: x union y 1680// polarity = 2: x union ~y 1681// polarity = 1: ~x union y 1682// polarity = 3: ~x union ~y 1683 1684void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { 1685 if (isFrozen() || isBogus() || other==NULL) { 1686 return; 1687 } 1688 UErrorCode status = U_ZERO_ERROR; 1689 ensureBufferCapacity(len + otherLen, status); 1690 if (U_FAILURE(status)) { 1691 return; 1692 } 1693 1694 int32_t i = 0, j = 0, k = 0; 1695 UChar32 a = list[i++]; 1696 UChar32 b = other[j++]; 1697 // change from xor is that we have to check overlapping pairs 1698 // polarity bit 1 means a is second, bit 2 means b is. 1699 for (;;) { 1700 switch (polarity) { 1701 case 0: // both first; take lower if unequal 1702 if (a < b) { // take a 1703 // Back up over overlapping ranges in buffer[] 1704 if (k > 0 && a <= buffer[k-1]) { 1705 // Pick latter end value in buffer[] vs. list[] 1706 a = max(list[i], buffer[--k]); 1707 } else { 1708 // No overlap 1709 buffer[k++] = a; 1710 a = list[i]; 1711 } 1712 i++; // Common if/else code factored out 1713 polarity ^= 1; 1714 } else if (b < a) { // take b 1715 if (k > 0 && b <= buffer[k-1]) { 1716 b = max(other[j], buffer[--k]); 1717 } else { 1718 buffer[k++] = b; 1719 b = other[j]; 1720 } 1721 j++; 1722 polarity ^= 2; 1723 } else { // a == b, take a, drop b 1724 if (a == UNICODESET_HIGH) goto loop_end; 1725 // This is symmetrical; it doesn't matter if 1726 // we backtrack with a or b. - liu 1727 if (k > 0 && a <= buffer[k-1]) { 1728 a = max(list[i], buffer[--k]); 1729 } else { 1730 // No overlap 1731 buffer[k++] = a; 1732 a = list[i]; 1733 } 1734 i++; 1735 polarity ^= 1; 1736 b = other[j++]; 1737 polarity ^= 2; 1738 } 1739 break; 1740 case 3: // both second; take higher if unequal, and drop other 1741 if (b <= a) { // take a 1742 if (a == UNICODESET_HIGH) goto loop_end; 1743 buffer[k++] = a; 1744 } else { // take b 1745 if (b == UNICODESET_HIGH) goto loop_end; 1746 buffer[k++] = b; 1747 } 1748 a = list[i++]; 1749 polarity ^= 1; // factored common code 1750 b = other[j++]; 1751 polarity ^= 2; 1752 break; 1753 case 1: // a second, b first; if b < a, overlap 1754 if (a < b) { // no overlap, take a 1755 buffer[k++] = a; a = list[i++]; polarity ^= 1; 1756 } else if (b < a) { // OVERLAP, drop b 1757 b = other[j++]; 1758 polarity ^= 2; 1759 } else { // a == b, drop both! 1760 if (a == UNICODESET_HIGH) goto loop_end; 1761 a = list[i++]; 1762 polarity ^= 1; 1763 b = other[j++]; 1764 polarity ^= 2; 1765 } 1766 break; 1767 case 2: // a first, b second; if a < b, overlap 1768 if (b < a) { // no overlap, take b 1769 buffer[k++] = b; 1770 b = other[j++]; 1771 polarity ^= 2; 1772 } else if (a < b) { // OVERLAP, drop a 1773 a = list[i++]; 1774 polarity ^= 1; 1775 } else { // a == b, drop both! 1776 if (a == UNICODESET_HIGH) goto loop_end; 1777 a = list[i++]; 1778 polarity ^= 1; 1779 b = other[j++]; 1780 polarity ^= 2; 1781 } 1782 break; 1783 } 1784 } 1785 loop_end: 1786 buffer[k++] = UNICODESET_HIGH; // terminate 1787 len = k; 1788 swapBuffers(); 1789 releasePattern(); 1790} 1791 1792// polarity = 0 is normal: x intersect y 1793// polarity = 2: x intersect ~y == set-minus 1794// polarity = 1: ~x intersect y 1795// polarity = 3: ~x intersect ~y 1796 1797void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) { 1798 if (isFrozen() || isBogus()) { 1799 return; 1800 } 1801 UErrorCode status = U_ZERO_ERROR; 1802 ensureBufferCapacity(len + otherLen, status); 1803 if (U_FAILURE(status)) { 1804 return; 1805 } 1806 1807 int32_t i = 0, j = 0, k = 0; 1808 UChar32 a = list[i++]; 1809 UChar32 b = other[j++]; 1810 // change from xor is that we have to check overlapping pairs 1811 // polarity bit 1 means a is second, bit 2 means b is. 1812 for (;;) { 1813 switch (polarity) { 1814 case 0: // both first; drop the smaller 1815 if (a < b) { // drop a 1816 a = list[i++]; 1817 polarity ^= 1; 1818 } else if (b < a) { // drop b 1819 b = other[j++]; 1820 polarity ^= 2; 1821 } else { // a == b, take one, drop other 1822 if (a == UNICODESET_HIGH) goto loop_end; 1823 buffer[k++] = a; 1824 a = list[i++]; 1825 polarity ^= 1; 1826 b = other[j++]; 1827 polarity ^= 2; 1828 } 1829 break; 1830 case 3: // both second; take lower if unequal 1831 if (a < b) { // take a 1832 buffer[k++] = a; 1833 a = list[i++]; 1834 polarity ^= 1; 1835 } else if (b < a) { // take b 1836 buffer[k++] = b; 1837 b = other[j++]; 1838 polarity ^= 2; 1839 } else { // a == b, take one, drop other 1840 if (a == UNICODESET_HIGH) goto loop_end; 1841 buffer[k++] = a; 1842 a = list[i++]; 1843 polarity ^= 1; 1844 b = other[j++]; 1845 polarity ^= 2; 1846 } 1847 break; 1848 case 1: // a second, b first; 1849 if (a < b) { // NO OVERLAP, drop a 1850 a = list[i++]; 1851 polarity ^= 1; 1852 } else if (b < a) { // OVERLAP, take b 1853 buffer[k++] = b; 1854 b = other[j++]; 1855 polarity ^= 2; 1856 } else { // a == b, drop both! 1857 if (a == UNICODESET_HIGH) goto loop_end; 1858 a = list[i++]; 1859 polarity ^= 1; 1860 b = other[j++]; 1861 polarity ^= 2; 1862 } 1863 break; 1864 case 2: // a first, b second; if a < b, overlap 1865 if (b < a) { // no overlap, drop b 1866 b = other[j++]; 1867 polarity ^= 2; 1868 } else if (a < b) { // OVERLAP, take a 1869 buffer[k++] = a; 1870 a = list[i++]; 1871 polarity ^= 1; 1872 } else { // a == b, drop both! 1873 if (a == UNICODESET_HIGH) goto loop_end; 1874 a = list[i++]; 1875 polarity ^= 1; 1876 b = other[j++]; 1877 polarity ^= 2; 1878 } 1879 break; 1880 } 1881 } 1882 loop_end: 1883 buffer[k++] = UNICODESET_HIGH; // terminate 1884 len = k; 1885 swapBuffers(); 1886 releasePattern(); 1887} 1888 1889/** 1890 * Append the <code>toPattern()</code> representation of a 1891 * string to the given <code>StringBuffer</code>. 1892 */ 1893void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool 1894escapeUnprintable) { 1895 UChar32 cp; 1896 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) { 1897 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); 1898 } 1899} 1900 1901/** 1902 * Append the <code>toPattern()</code> representation of a 1903 * character to the given <code>StringBuffer</code>. 1904 */ 1905void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool 1906escapeUnprintable) { 1907 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { 1908 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything 1909 // unprintable 1910 if (ICU_Utility::escapeUnprintable(buf, c)) { 1911 return; 1912 } 1913 } 1914 // Okay to let ':' pass through 1915 switch (c) { 1916 case SET_OPEN: 1917 case SET_CLOSE: 1918 case HYPHEN: 1919 case COMPLEMENT: 1920 case INTERSECTION: 1921 case BACKSLASH: 1922 case OPEN_BRACE: 1923 case CLOSE_BRACE: 1924 case COLON: 1925 case SymbolTable::SYMBOL_REF: 1926 buf.append(BACKSLASH); 1927 break; 1928 default: 1929 // Escape whitespace 1930 if (PatternProps::isWhiteSpace(c)) { 1931 buf.append(BACKSLASH); 1932 } 1933 break; 1934 } 1935 buf.append(c); 1936} 1937 1938/** 1939 * Append a string representation of this set to result. This will be 1940 * a cleaned version of the string passed to applyPattern(), if there 1941 * is one. Otherwise it will be generated. 1942 */ 1943UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, 1944 UBool escapeUnprintable) const 1945{ 1946 if (pat != NULL) { 1947 int32_t i; 1948 int32_t backslashCount = 0; 1949 for (i=0; i<patLen; ) { 1950 UChar32 c; 1951 U16_NEXT(pat, i, patLen, c); 1952 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { 1953 // If the unprintable character is preceded by an odd 1954 // number of backslashes, then it has been escaped. 1955 // Before unescaping it, we delete the final 1956 // backslash. 1957 if ((backslashCount % 2) == 1) { 1958 result.truncate(result.length() - 1); 1959 } 1960 ICU_Utility::escapeUnprintable(result, c); 1961 backslashCount = 0; 1962 } else { 1963 result.append(c); 1964 if (c == BACKSLASH) { 1965 ++backslashCount; 1966 } else { 1967 backslashCount = 0; 1968 } 1969 } 1970 } 1971 return result; 1972 } 1973 1974 return _generatePattern(result, escapeUnprintable); 1975} 1976 1977/** 1978 * Returns a string representation of this set. If the result of 1979 * calling this function is passed to a UnicodeSet constructor, it 1980 * will produce another set that is equal to this one. 1981 */ 1982UnicodeString& UnicodeSet::toPattern(UnicodeString& result, 1983 UBool escapeUnprintable) const 1984{ 1985 result.truncate(0); 1986 return _toPattern(result, escapeUnprintable); 1987} 1988 1989/** 1990 * Generate and append a string representation of this set to result. 1991 * This does not use this.pat, the cleaned up copy of the string 1992 * passed to applyPattern(). 1993 */ 1994UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, 1995 UBool escapeUnprintable) const 1996{ 1997 result.append(SET_OPEN); 1998 1999// // Check against the predefined categories. We implicitly build 2000// // up ALL category sets the first time toPattern() is called. 2001// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) { 2002// if (*this == getCategorySet(cat)) { 2003// result.append(COLON); 2004// result.append(CATEGORY_NAMES, cat*2, 2); 2005// return result.append(CATEGORY_CLOSE); 2006// } 2007// } 2008 2009 int32_t count = getRangeCount(); 2010 2011 // If the set contains at least 2 intervals and includes both 2012 // MIN_VALUE and MAX_VALUE, then the inverse representation will 2013 // be more economical. 2014 if (count > 1 && 2015 getRangeStart(0) == MIN_VALUE && 2016 getRangeEnd(count-1) == MAX_VALUE) { 2017 2018 // Emit the inverse 2019 result.append(COMPLEMENT); 2020 2021 for (int32_t i = 1; i < count; ++i) { 2022 UChar32 start = getRangeEnd(i-1)+1; 2023 UChar32 end = getRangeStart(i)-1; 2024 _appendToPat(result, start, escapeUnprintable); 2025 if (start != end) { 2026 if ((start+1) != end) { 2027 result.append(HYPHEN); 2028 } 2029 _appendToPat(result, end, escapeUnprintable); 2030 } 2031 } 2032 } 2033 2034 // Default; emit the ranges as pairs 2035 else { 2036 for (int32_t i = 0; i < count; ++i) { 2037 UChar32 start = getRangeStart(i); 2038 UChar32 end = getRangeEnd(i); 2039 _appendToPat(result, start, escapeUnprintable); 2040 if (start != end) { 2041 if ((start+1) != end) { 2042 result.append(HYPHEN); 2043 } 2044 _appendToPat(result, end, escapeUnprintable); 2045 } 2046 } 2047 } 2048 2049 for (int32_t i = 0; i<strings->size(); ++i) { 2050 result.append(OPEN_BRACE); 2051 _appendToPat(result, 2052 *(const UnicodeString*) strings->elementAt(i), 2053 escapeUnprintable); 2054 result.append(CLOSE_BRACE); 2055 } 2056 return result.append(SET_CLOSE); 2057} 2058 2059/** 2060* Release existing cached pattern 2061*/ 2062void UnicodeSet::releasePattern() { 2063 if (pat) { 2064 uprv_free(pat); 2065 pat = NULL; 2066 patLen = 0; 2067 } 2068} 2069 2070/** 2071* Set the new pattern to cache. 2072*/ 2073void UnicodeSet::setPattern(const UnicodeString& newPat) { 2074 releasePattern(); 2075 int32_t newPatLen = newPat.length(); 2076 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); 2077 if (pat) { 2078 patLen = newPatLen; 2079 newPat.extractBetween(0, patLen, pat); 2080 pat[patLen] = 0; 2081 } 2082 // else we don't care if malloc failed. This was just a nice cache. 2083 // We can regenerate an equivalent pattern later when requested. 2084} 2085 2086UnicodeFunctor *UnicodeSet::freeze() { 2087 if(!isFrozen() && !isBogus()) { 2088 // Do most of what compact() does before freezing because 2089 // compact() will not work when the set is frozen. 2090 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). 2091 2092 // Delete buffer first to defragment memory less. 2093 if (buffer != NULL) { 2094 uprv_free(buffer); 2095 buffer = NULL; 2096 } 2097 if (capacity > (len + GROW_EXTRA)) { 2098 // Make the capacity equal to len or 1. 2099 // We don't want to realloc of 0 size. 2100 capacity = len + (len == 0); 2101 list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); 2102 if (list == NULL) { // Check for memory allocation error. 2103 setToBogus(); 2104 return this; 2105 } 2106 } 2107 2108 // Optimize contains() and span() and similar functions. 2109 if (!strings->isEmpty()) { 2110 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); 2111 if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { 2112 // All strings are irrelevant for span() etc. because 2113 // all of each string's code points are contained in this set. 2114 // Do not check needsStringSpanUTF8() because UTF-8 has at most as 2115 // many relevant strings as UTF-16. 2116 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().) 2117 delete stringSpan; 2118 stringSpan = NULL; 2119 } 2120 } 2121 if (stringSpan == NULL) { 2122 // No span-relevant strings: Optimize for code point spans. 2123 bmpSet=new BMPSet(list, len); 2124 if (bmpSet == NULL) { // Check for memory allocation error. 2125 setToBogus(); 2126 } 2127 } 2128 } 2129 return this; 2130} 2131 2132int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 2133 if(length>0 && bmpSet!=NULL) { 2134 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s); 2135 } 2136 if(length<0) { 2137 length=u_strlen(s); 2138 } 2139 if(length==0) { 2140 return 0; 2141 } 2142 if(stringSpan!=NULL) { 2143 return stringSpan->span(s, length, spanCondition); 2144 } else if(!strings->isEmpty()) { 2145 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2146 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : 2147 UnicodeSetStringSpan::FWD_UTF16_CONTAINED; 2148 UnicodeSetStringSpan strSpan(*this, *strings, which); 2149 if(strSpan.needsStringSpanUTF16()) { 2150 return strSpan.span(s, length, spanCondition); 2151 } 2152 } 2153 2154 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2155 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2156 } 2157 2158 UChar32 c; 2159 int32_t start=0, prev=0; 2160 do { 2161 U16_NEXT(s, start, length, c); 2162 if(spanCondition!=contains(c)) { 2163 break; 2164 } 2165 } while((prev=start)<length); 2166 return prev; 2167} 2168 2169int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 2170 if(length>0 && bmpSet!=NULL) { 2171 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s); 2172 } 2173 if(length<0) { 2174 length=u_strlen(s); 2175 } 2176 if(length==0) { 2177 return 0; 2178 } 2179 if(stringSpan!=NULL) { 2180 return stringSpan->spanBack(s, length, spanCondition); 2181 } else if(!strings->isEmpty()) { 2182 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2183 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : 2184 UnicodeSetStringSpan::BACK_UTF16_CONTAINED; 2185 UnicodeSetStringSpan strSpan(*this, *strings, which); 2186 if(strSpan.needsStringSpanUTF16()) { 2187 return strSpan.spanBack(s, length, spanCondition); 2188 } 2189 } 2190 2191 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2192 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2193 } 2194 2195 UChar32 c; 2196 int32_t prev=length; 2197 do { 2198 U16_PREV(s, 0, length, c); 2199 if(spanCondition!=contains(c)) { 2200 break; 2201 } 2202 } while((prev=length)>0); 2203 return prev; 2204} 2205 2206int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 2207 if(length>0 && bmpSet!=NULL) { 2208 const uint8_t *s0=(const uint8_t *)s; 2209 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0); 2210 } 2211 if(length<0) { 2212 length=(int32_t)uprv_strlen(s); 2213 } 2214 if(length==0) { 2215 return 0; 2216 } 2217 if(stringSpan!=NULL) { 2218 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); 2219 } else if(!strings->isEmpty()) { 2220 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2221 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : 2222 UnicodeSetStringSpan::FWD_UTF8_CONTAINED; 2223 UnicodeSetStringSpan strSpan(*this, *strings, which); 2224 if(strSpan.needsStringSpanUTF8()) { 2225 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition); 2226 } 2227 } 2228 2229 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2230 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2231 } 2232 2233 UChar32 c; 2234 int32_t start=0, prev=0; 2235 do { 2236 U8_NEXT(s, start, length, c); 2237 if(c<0) { 2238 c=0xfffd; 2239 } 2240 if(spanCondition!=contains(c)) { 2241 break; 2242 } 2243 } while((prev=start)<length); 2244 return prev; 2245} 2246 2247int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 2248 if(length>0 && bmpSet!=NULL) { 2249 const uint8_t *s0=(const uint8_t *)s; 2250 return bmpSet->spanBackUTF8(s0, length, spanCondition); 2251 } 2252 if(length<0) { 2253 length=(int32_t)uprv_strlen(s); 2254 } 2255 if(length==0) { 2256 return 0; 2257 } 2258 if(stringSpan!=NULL) { 2259 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); 2260 } else if(!strings->isEmpty()) { 2261 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 2262 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : 2263 UnicodeSetStringSpan::BACK_UTF8_CONTAINED; 2264 UnicodeSetStringSpan strSpan(*this, *strings, which); 2265 if(strSpan.needsStringSpanUTF8()) { 2266 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition); 2267 } 2268 } 2269 2270 if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 2271 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 2272 } 2273 2274 UChar32 c; 2275 int32_t prev=length; 2276 do { 2277 U8_PREV(s, 0, length, c); 2278 if(c<0) { 2279 c=0xfffd; 2280 } 2281 if(spanCondition!=contains(c)) { 2282 break; 2283 } 2284 } while((prev=length)>0); 2285 return prev; 2286} 2287 2288U_NAMESPACE_END 2289