1/*
2**********************************************************************
3*   Copyright (C) 1999-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   10/20/99    alan        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12#include "unicode/parsepos.h"
13#include "unicode/symtable.h"
14#include "unicode/uniset.h"
15#include "unicode/utf8.h"
16#include "unicode/utf16.h"
17#include "ruleiter.h"
18#include "cmemory.h"
19#include "cstring.h"
20#include "patternprops.h"
21#include "uelement.h"
22#include "util.h"
23#include "uvector.h"
24#include "charstr.h"
25#include "ustrfmt.h"
26#include "uassert.h"
27#include "bmpset.h"
28#include "unisetspan.h"
29
30// Define UChar constants using hex for EBCDIC compatibility
31// Used #define to reduce private static exports and memory access time.
32#define SET_OPEN        ((UChar)0x005B) /*[*/
33#define SET_CLOSE       ((UChar)0x005D) /*]*/
34#define HYPHEN          ((UChar)0x002D) /*-*/
35#define COMPLEMENT      ((UChar)0x005E) /*^*/
36#define COLON           ((UChar)0x003A) /*:*/
37#define BACKSLASH       ((UChar)0x005C) /*\*/
38#define INTERSECTION    ((UChar)0x0026) /*&*/
39#define UPPER_U         ((UChar)0x0055) /*U*/
40#define LOWER_U         ((UChar)0x0075) /*u*/
41#define OPEN_BRACE      ((UChar)123)    /*{*/
42#define CLOSE_BRACE     ((UChar)125)    /*}*/
43#define UPPER_P         ((UChar)0x0050) /*P*/
44#define LOWER_P         ((UChar)0x0070) /*p*/
45#define UPPER_N         ((UChar)78)     /*N*/
46#define EQUALS          ((UChar)0x003D) /*=*/
47
48// HIGH_VALUE > all valid values. 110000 for codepoints
49#define UNICODESET_HIGH 0x0110000
50
51// LOW <= all valid values. ZERO for codepoints
52#define UNICODESET_LOW 0x000000
53
54// initial storage. Must be >= 0
55#define START_EXTRA 16
56
57// extra amount for growth. Must be >= 0
58#define GROW_EXTRA START_EXTRA
59
60U_NAMESPACE_BEGIN
61
62SymbolTable::~SymbolTable() {}
63
64UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
65
66/**
67 * Modify the given UChar32 variable so that it is in range, by
68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70 * It modifies its argument in-place and also returns it.
71 */
72static inline UChar32 pinCodePoint(UChar32& c) {
73    if (c < UNICODESET_LOW) {
74        c = UNICODESET_LOW;
75    } else if (c > (UNICODESET_HIGH-1)) {
76        c = (UNICODESET_HIGH-1);
77    }
78    return c;
79}
80
81//----------------------------------------------------------------
82// Debugging
83//----------------------------------------------------------------
84
85// DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
86// To enable the debugging, define the symbol DEBUG_MEM in the line
87// below.  This will result in text being sent to stdout that looks
88// like this:
89//   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
90//   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
91// Each line lists a construction (ct) or destruction (dt) event, the
92// object address, the number of outstanding objects after the event,
93// and the pattern of the object in question.
94
95// #define DEBUG_MEM
96
97#ifdef DEBUG_MEM
98#include <stdio.h>
99static int32_t _dbgCount = 0;
100
101static inline void _dbgct(UnicodeSet* set) {
102    UnicodeString str;
103    set->toPattern(str, TRUE);
104    char buf[40];
105    str.extract(0, 39, buf, "");
106    printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
107}
108
109static inline void _dbgdt(UnicodeSet* set) {
110    UnicodeString str;
111    set->toPattern(str, TRUE);
112    char buf[40];
113    str.extract(0, 39, buf, "");
114    printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
115}
116
117#else
118
119#define _dbgct(set)
120#define _dbgdt(set)
121
122#endif
123
124//----------------------------------------------------------------
125// UnicodeString in UVector support
126//----------------------------------------------------------------
127
128static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
129    dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
130}
131
132static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
133    const UnicodeString &a = *(const UnicodeString*)t1.pointer;
134    const UnicodeString &b = *(const UnicodeString*)t2.pointer;
135    return a.compare(b);
136}
137
138//----------------------------------------------------------------
139// Constructors &c
140//----------------------------------------------------------------
141
142/**
143 * Constructs an empty set.
144 */
145UnicodeSet::UnicodeSet() :
146    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
147    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
148    fFlags(0)
149{
150    UErrorCode status = U_ZERO_ERROR;
151    allocateStrings(status);
152    if (U_FAILURE(status)) {
153        return;
154    }
155    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
156    if(list!=NULL){
157        list[0] = UNICODESET_HIGH;
158    } else { // If memory allocation failed, set to bogus state.
159        setToBogus();
160        return;
161    }
162    _dbgct(this);
163}
164
165/**
166 * Constructs a set containing the given range. If <code>end >
167 * start</code> then an empty set is created.
168 *
169 * @param start first character, inclusive, of range
170 * @param end last character, inclusive, of range
171 */
172UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
173    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
174    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
175    fFlags(0)
176{
177    UErrorCode status = U_ZERO_ERROR;
178    allocateStrings(status);
179    if (U_FAILURE(status)) {
180        return;
181    }
182    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
183    if(list!=NULL){
184        list[0] = UNICODESET_HIGH;
185        complement(start, end);
186    } else { // If memory allocation failed, set to bogus state.
187        setToBogus();
188        return;
189    }
190    _dbgct(this);
191}
192
193/**
194 * Constructs a set that is identical to the given UnicodeSet.
195 */
196UnicodeSet::UnicodeSet(const UnicodeSet& o) :
197    UnicodeFilter(o),
198    len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
199    bmpSet(0),
200    buffer(0), bufferCapacity(0),
201    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
202    fFlags(0)
203{
204    UErrorCode status = U_ZERO_ERROR;
205    allocateStrings(status);
206    if (U_FAILURE(status)) {
207        return;
208    }
209    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
210    if(list!=NULL){
211        *this = o;
212    } else { // If memory allocation failed, set to bogus state.
213        setToBogus();
214        return;
215    }
216    _dbgct(this);
217}
218
219// Copy-construct as thawed.
220UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
221    UnicodeFilter(o),
222    len(0), capacity(o.len + GROW_EXTRA), list(0),
223    bmpSet(0),
224    buffer(0), bufferCapacity(0),
225    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
226    fFlags(0)
227{
228    UErrorCode status = U_ZERO_ERROR;
229    allocateStrings(status);
230    if (U_FAILURE(status)) {
231        return;
232    }
233    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
234    if(list!=NULL){
235        // *this = o except for bmpSet and stringSpan
236        len = o.len;
237        uprv_memcpy(list, o.list, len*sizeof(UChar32));
238        if (strings != NULL && o.strings != NULL) {
239            strings->assign(*o.strings, cloneUnicodeString, status);
240        } else { // Invalid strings.
241            setToBogus();
242            return;
243        }
244        if (o.pat) {
245            setPattern(UnicodeString(o.pat, o.patLen));
246        }
247    } else { // If memory allocation failed, set to bogus state.
248        setToBogus();
249        return;
250    }
251    _dbgct(this);
252}
253
254/**
255 * Destructs the set.
256 */
257UnicodeSet::~UnicodeSet() {
258    _dbgdt(this); // first!
259    uprv_free(list);
260    delete bmpSet;
261    if (buffer) {
262        uprv_free(buffer);
263    }
264    delete strings;
265    delete stringSpan;
266    releasePattern();
267}
268
269/**
270 * Assigns this object to be a copy of another.
271 */
272UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
273    if (this == &o) {
274        return *this;
275    }
276    if (isFrozen()) {
277        return *this;
278    }
279    if (o.isBogus()) {
280        setToBogus();
281        return *this;
282    }
283    UErrorCode ec = U_ZERO_ERROR;
284    ensureCapacity(o.len, ec);
285    if (U_FAILURE(ec)) {
286        return *this; // There is no way to report this error :-(
287    }
288    len = o.len;
289    uprv_memcpy(list, o.list, len*sizeof(UChar32));
290    if (o.bmpSet == NULL) {
291        bmpSet = NULL;
292    } else {
293        bmpSet = new BMPSet(*o.bmpSet, list, len);
294        if (bmpSet == NULL) { // Check for memory allocation error.
295            setToBogus();
296            return *this;
297        }
298    }
299    if (strings != NULL && o.strings != NULL) {
300        strings->assign(*o.strings, cloneUnicodeString, ec);
301    } else { // Invalid strings.
302        setToBogus();
303        return *this;
304    }
305    if (o.stringSpan == NULL) {
306        stringSpan = NULL;
307    } else {
308        stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
309        if (stringSpan == NULL) { // Check for memory allocation error.
310            setToBogus();
311            return *this;
312        }
313    }
314    releasePattern();
315    if (o.pat) {
316        setPattern(UnicodeString(o.pat, o.patLen));
317    }
318    return *this;
319}
320
321/**
322 * Returns a copy of this object.  All UnicodeMatcher objects have
323 * to support cloning in order to allow classes using
324 * UnicodeMatchers, such as Transliterator, to implement cloning.
325 */
326UnicodeFunctor* UnicodeSet::clone() const {
327    return new UnicodeSet(*this);
328}
329
330UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
331    return new UnicodeSet(*this, TRUE);
332}
333
334/**
335 * Compares the specified object with this set for equality.  Returns
336 * <tt>true</tt> if the two sets
337 * have the same size, and every member of the specified set is
338 * contained in this set (or equivalently, every member of this set is
339 * contained in the specified set).
340 *
341 * @param o set to be compared for equality with this set.
342 * @return <tt>true</tt> if the specified set is equal to this set.
343 */
344UBool UnicodeSet::operator==(const UnicodeSet& o) const {
345    if (len != o.len) return FALSE;
346    for (int32_t i = 0; i < len; ++i) {
347        if (list[i] != o.list[i]) return FALSE;
348    }
349    if (*strings != *o.strings) return FALSE;
350    return TRUE;
351}
352
353/**
354 * Returns the hash code value for this set.
355 *
356 * @return the hash code value for this set.
357 * @see Object#hashCode()
358 */
359int32_t UnicodeSet::hashCode(void) const {
360    int32_t result = len;
361    for (int32_t i = 0; i < len; ++i) {
362        result *= 1000003;
363        result += list[i];
364    }
365    return result;
366}
367
368//----------------------------------------------------------------
369// Public API
370//----------------------------------------------------------------
371
372/**
373 * Returns the number of elements in this set (its cardinality),
374 * Note than the elements of a set may include both individual
375 * codepoints and strings.
376 *
377 * @return the number of elements in this set (its cardinality).
378 */
379int32_t UnicodeSet::size(void) const {
380    int32_t n = 0;
381    int32_t count = getRangeCount();
382    for (int32_t i = 0; i < count; ++i) {
383        n += getRangeEnd(i) - getRangeStart(i) + 1;
384    }
385    return n + strings->size();
386}
387
388/**
389 * Returns <tt>true</tt> if this set contains no elements.
390 *
391 * @return <tt>true</tt> if this set contains no elements.
392 */
393UBool UnicodeSet::isEmpty(void) const {
394    return len == 1 && strings->size() == 0;
395}
396
397/**
398 * Returns true if this set contains the given character.
399 * @param c character to be checked for containment
400 * @return true if the test condition is met
401 */
402UBool UnicodeSet::contains(UChar32 c) const {
403    // Set i to the index of the start item greater than ch
404    // We know we will terminate without length test!
405    // LATER: for large sets, add binary search
406    //int32_t i = -1;
407    //for (;;) {
408    //    if (c < list[++i]) break;
409    //}
410    if (bmpSet != NULL) {
411        return bmpSet->contains(c);
412    }
413    if (stringSpan != NULL) {
414        return stringSpan->contains(c);
415    }
416    if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
417        return FALSE;
418    }
419    int32_t i = findCodePoint(c);
420    return (UBool)(i & 1); // return true if odd
421}
422
423/**
424 * Returns the smallest value i such that c < list[i].  Caller
425 * must ensure that c is a legal value or this method will enter
426 * an infinite loop.  This method performs a binary search.
427 * @param c a character in the range MIN_VALUE..MAX_VALUE
428 * inclusive
429 * @return the smallest integer i in the range 0..len-1,
430 * inclusive, such that c < list[i]
431 */
432int32_t UnicodeSet::findCodePoint(UChar32 c) const {
433    /* Examples:
434                                       findCodePoint(c)
435       set              list[]         c=0 1 3 4 7 8
436       ===              ==============   ===========
437       []               [110000]         0 0 0 0 0 0
438       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
439       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
440       [:Any:]          [0, 110000]      1 1 1 1 1 1
441     */
442
443    // Return the smallest i such that c < list[i].  Assume
444    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
445    if (c < list[0])
446        return 0;
447    // High runner test.  c is often after the last range, so an
448    // initial check for this condition pays off.
449    int32_t lo = 0;
450    int32_t hi = len - 1;
451    if (lo >= hi || c >= list[hi-1])
452        return hi;
453    // invariant: c >= list[lo]
454    // invariant: c < list[hi]
455    for (;;) {
456        int32_t i = (lo + hi) >> 1;
457        if (i == lo) {
458            break; // Found!
459        } else if (c < list[i]) {
460            hi = i;
461        } else {
462            lo = i;
463        }
464    }
465    return hi;
466}
467
468/**
469 * Returns true if this set contains every character
470 * of the given range.
471 * @param start first character, inclusive, of the range
472 * @param end last character, inclusive, of the range
473 * @return true if the test condition is met
474 */
475UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
476    //int32_t i = -1;
477    //for (;;) {
478    //    if (start < list[++i]) break;
479    //}
480    int32_t i = findCodePoint(start);
481    return ((i & 1) != 0 && end < list[i]);
482}
483
484/**
485 * Returns <tt>true</tt> if this set contains the given
486 * multicharacter string.
487 * @param s string to be checked for containment
488 * @return <tt>true</tt> if this set contains the specified string
489 */
490UBool UnicodeSet::contains(const UnicodeString& s) const {
491    if (s.length() == 0) return FALSE;
492    int32_t cp = getSingleCP(s);
493    if (cp < 0) {
494        return strings->contains((void*) &s);
495    } else {
496        return contains((UChar32) cp);
497    }
498}
499
500/**
501 * Returns true if this set contains all the characters and strings
502 * of the given set.
503 * @param c set to be checked for containment
504 * @return true if the test condition is met
505 */
506UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
507    // The specified set is a subset if all of its pairs are contained in
508    // this set.  It's possible to code this more efficiently in terms of
509    // direct manipulation of the inversion lists if the need arises.
510    int32_t n = c.getRangeCount();
511    for (int i=0; i<n; ++i) {
512        if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
513            return FALSE;
514        }
515    }
516    if (!strings->containsAll(*c.strings)) return FALSE;
517    return TRUE;
518}
519
520/**
521 * Returns true if this set contains all the characters
522 * of the given string.
523 * @param s string containing characters to be checked for containment
524 * @return true if the test condition is met
525 */
526UBool UnicodeSet::containsAll(const UnicodeString& s) const {
527    return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
528                   s.length());
529}
530
531/**
532 * Returns true if this set contains none of the characters
533 * of the given range.
534 * @param start first character, inclusive, of the range
535 * @param end last character, inclusive, of the range
536 * @return true if the test condition is met
537 */
538UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
539    //int32_t i = -1;
540    //for (;;) {
541    //    if (start < list[++i]) break;
542    //}
543    int32_t i = findCodePoint(start);
544    return ((i & 1) == 0 && end < list[i]);
545}
546
547/**
548 * Returns true if this set contains none of the characters and strings
549 * of the given set.
550 * @param c set to be checked for containment
551 * @return true if the test condition is met
552 */
553UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
554    // The specified set is a subset if all of its pairs are contained in
555    // this set.  It's possible to code this more efficiently in terms of
556    // direct manipulation of the inversion lists if the need arises.
557    int32_t n = c.getRangeCount();
558    for (int32_t i=0; i<n; ++i) {
559        if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
560            return FALSE;
561        }
562    }
563    if (!strings->containsNone(*c.strings)) return FALSE;
564    return TRUE;
565}
566
567/**
568 * Returns true if this set contains none of the characters
569 * of the given string.
570 * @param s string containing characters to be checked for containment
571 * @return true if the test condition is met
572 */
573UBool UnicodeSet::containsNone(const UnicodeString& s) const {
574    return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
575                   s.length());
576}
577
578/**
579 * Returns <tt>true</tt> if this set contains any character whose low byte
580 * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
581 * indexing.
582 */
583UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
584    /* The index value v, in the range [0,255], is contained in this set if
585     * it is contained in any pair of this set.  Pairs either have the high
586     * bytes equal, or unequal.  If the high bytes are equal, then we have
587     * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
588     * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
589     * Then v is contained if xx <= v || v <= yy.  (This is identical to the
590     * time zone month containment logic.)
591     */
592    int32_t i;
593    int32_t rangeCount=getRangeCount();
594    for (i=0; i<rangeCount; ++i) {
595        UChar32 low = getRangeStart(i);
596        UChar32 high = getRangeEnd(i);
597        if ((low & ~0xFF) == (high & ~0xFF)) {
598            if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
599                return TRUE;
600            }
601        } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
602            return TRUE;
603        }
604    }
605    if (strings->size() != 0) {
606        for (i=0; i<strings->size(); ++i) {
607            const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
608            //if (s.length() == 0) {
609            //    // Empty strings match everything
610            //    return TRUE;
611            //}
612            // assert(s.length() != 0); // We enforce this elsewhere
613            UChar32 c = s.char32At(0);
614            if ((c & 0xFF) == v) {
615                return TRUE;
616            }
617        }
618    }
619    return FALSE;
620}
621
622/**
623 * Implementation of UnicodeMatcher::matches().  Always matches the
624 * longest possible multichar string.
625 */
626UMatchDegree UnicodeSet::matches(const Replaceable& text,
627                                 int32_t& offset,
628                                 int32_t limit,
629                                 UBool incremental) {
630    if (offset == limit) {
631        // Strings, if any, have length != 0, so we don't worry
632        // about them here.  If we ever allow zero-length strings
633        // we much check for them here.
634        if (contains(U_ETHER)) {
635            return incremental ? U_PARTIAL_MATCH : U_MATCH;
636        } else {
637            return U_MISMATCH;
638        }
639    } else {
640        if (strings->size() != 0) { // try strings first
641
642            // might separate forward and backward loops later
643            // for now they are combined
644
645            // TODO Improve efficiency of this, at least in the forward
646            // direction, if not in both.  In the forward direction we
647            // can assume the strings are sorted.
648
649            int32_t i;
650            UBool forward = offset < limit;
651
652            // firstChar is the leftmost char to match in the
653            // forward direction or the rightmost char to match in
654            // the reverse direction.
655            UChar firstChar = text.charAt(offset);
656
657            // If there are multiple strings that can match we
658            // return the longest match.
659            int32_t highWaterLength = 0;
660
661            for (i=0; i<strings->size(); ++i) {
662                const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
663
664                //if (trial.length() == 0) {
665                //    return U_MATCH; // null-string always matches
666                //}
667                // assert(trial.length() != 0); // We ensure this elsewhere
668
669                UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
670
671                // Strings are sorted, so we can optimize in the
672                // forward direction.
673                if (forward && c > firstChar) break;
674                if (c != firstChar) continue;
675
676                int32_t matchLen = matchRest(text, offset, limit, trial);
677
678                if (incremental) {
679                    int32_t maxLen = forward ? limit-offset : offset-limit;
680                    if (matchLen == maxLen) {
681                        // We have successfully matched but only up to limit.
682                        return U_PARTIAL_MATCH;
683                    }
684                }
685
686                if (matchLen == trial.length()) {
687                    // We have successfully matched the whole string.
688                    if (matchLen > highWaterLength) {
689                        highWaterLength = matchLen;
690                    }
691                    // In the forward direction we know strings
692                    // are sorted so we can bail early.
693                    if (forward && matchLen < highWaterLength) {
694                        break;
695                    }
696                    continue;
697                }
698            }
699
700            // We've checked all strings without a partial match.
701            // If we have full matches, return the longest one.
702            if (highWaterLength != 0) {
703                offset += forward ? highWaterLength : -highWaterLength;
704                return U_MATCH;
705            }
706        }
707        return UnicodeFilter::matches(text, offset, limit, incremental);
708    }
709}
710
711/**
712 * Returns the longest match for s in text at the given position.
713 * If limit > start then match forward from start+1 to limit
714 * matching all characters except s.charAt(0).  If limit < start,
715 * go backward starting from start-1 matching all characters
716 * except s.charAt(s.length()-1).  This method assumes that the
717 * first character, text.charAt(start), matches s, so it does not
718 * check it.
719 * @param text the text to match
720 * @param start the first character to match.  In the forward
721 * direction, text.charAt(start) is matched against s.charAt(0).
722 * In the reverse direction, it is matched against
723 * s.charAt(s.length()-1).
724 * @param limit the limit offset for matching, either last+1 in
725 * the forward direction, or last-1 in the reverse direction,
726 * where last is the index of the last character to match.
727 * @return If part of s matches up to the limit, return |limit -
728 * start|.  If all of s matches before reaching the limit, return
729 * s.length().  If there is a mismatch between s and text, return
730 * 0
731 */
732int32_t UnicodeSet::matchRest(const Replaceable& text,
733                              int32_t start, int32_t limit,
734                              const UnicodeString& s) {
735    int32_t i;
736    int32_t maxLen;
737    int32_t slen = s.length();
738    if (start < limit) {
739        maxLen = limit - start;
740        if (maxLen > slen) maxLen = slen;
741        for (i = 1; i < maxLen; ++i) {
742            if (text.charAt(start + i) != s.charAt(i)) return 0;
743        }
744    } else {
745        maxLen = start - limit;
746        if (maxLen > slen) maxLen = slen;
747        --slen; // <=> slen = s.length() - 1;
748        for (i = 1; i < maxLen; ++i) {
749            if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
750        }
751    }
752    return maxLen;
753}
754
755/**
756 * Implement of UnicodeMatcher
757 */
758void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
759    toUnionTo.addAll(*this);
760}
761
762/**
763 * Returns the index of the given character within this set, where
764 * the set is ordered by ascending code point.  If the character
765 * is not in this set, return -1.  The inverse of this method is
766 * <code>charAt()</code>.
767 * @return an index from 0..size()-1, or -1
768 */
769int32_t UnicodeSet::indexOf(UChar32 c) const {
770    if (c < MIN_VALUE || c > MAX_VALUE) {
771        return -1;
772    }
773    int32_t i = 0;
774    int32_t n = 0;
775    for (;;) {
776        UChar32 start = list[i++];
777        if (c < start) {
778            return -1;
779        }
780        UChar32 limit = list[i++];
781        if (c < limit) {
782            return n + c - start;
783        }
784        n += limit - start;
785    }
786}
787
788/**
789 * Returns the character at the given index within this set, where
790 * the set is ordered by ascending code point.  If the index is
791 * out of range, return (UChar32)-1.  The inverse of this method is
792 * <code>indexOf()</code>.
793 * @param index an index from 0..size()-1
794 * @return the character at the given index, or (UChar32)-1.
795 */
796UChar32 UnicodeSet::charAt(int32_t index) const {
797    if (index >= 0) {
798        // len2 is the largest even integer <= len, that is, it is len
799        // for even values and len-1 for odd values.  With odd values
800        // the last entry is UNICODESET_HIGH.
801        int32_t len2 = len & ~1;
802        for (int32_t i=0; i < len2;) {
803            UChar32 start = list[i++];
804            int32_t count = list[i++] - start;
805            if (index < count) {
806                return (UChar32)(start + index);
807            }
808            index -= count;
809        }
810    }
811    return (UChar32)-1;
812}
813
814/**
815 * Make this object represent the range <code>start - end</code>.
816 * If <code>end > start</code> then this object is set to an
817 * an empty range.
818 *
819 * @param start first character in the set, inclusive
820 * @rparam end last character in the set, inclusive
821 */
822UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
823    clear();
824    complement(start, end);
825    return *this;
826}
827
828/**
829 * Adds the specified range to this set if it is not already
830 * present.  If this set already contains the specified range,
831 * the call leaves this set unchanged.  If <code>end > start</code>
832 * then an empty range is added, leaving the set unchanged.
833 *
834 * @param start first character, inclusive, of range to be added
835 * to this set.
836 * @param end last character, inclusive, of range to be added
837 * to this set.
838 */
839UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
840    if (pinCodePoint(start) < pinCodePoint(end)) {
841        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
842        add(range, 2, 0);
843    } else if (start == end) {
844        add(start);
845    }
846    return *this;
847}
848
849// #define DEBUG_US_ADD
850
851#ifdef DEBUG_US_ADD
852#include <stdio.h>
853void dump(UChar32 c) {
854    if (c <= 0xFF) {
855        printf("%c", (char)c);
856    } else {
857        printf("U+%04X", c);
858    }
859}
860void dump(const UChar32* list, int32_t len) {
861    printf("[");
862    for (int32_t i=0; i<len; ++i) {
863        if (i != 0) printf(", ");
864        dump(list[i]);
865    }
866    printf("]");
867}
868#endif
869
870/**
871 * Adds the specified character to this set if it is not already
872 * present.  If this set already contains the specified character,
873 * the call leaves this set unchanged.
874 */
875UnicodeSet& UnicodeSet::add(UChar32 c) {
876    // find smallest i such that c < list[i]
877    // if odd, then it is IN the set
878    // if even, then it is OUT of the set
879    int32_t i = findCodePoint(pinCodePoint(c));
880
881    // already in set?
882    if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
883
884    // HIGH is 0x110000
885    // assert(list[len-1] == HIGH);
886
887    // empty = [HIGH]
888    // [start_0, limit_0, start_1, limit_1, HIGH]
889
890    // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
891    //                             ^
892    //                             list[i]
893
894    // i == 0 means c is before the first range
895
896#ifdef DEBUG_US_ADD
897    printf("Add of ");
898    dump(c);
899    printf(" found at %d", i);
900    printf(": ");
901    dump(list, len);
902    printf(" => ");
903#endif
904
905    if (c == list[i]-1) {
906        // c is before start of next range
907        list[i] = c;
908        // if we touched the HIGH mark, then add a new one
909        if (c == (UNICODESET_HIGH - 1)) {
910            UErrorCode status = U_ZERO_ERROR;
911            ensureCapacity(len+1, status);
912            if (U_FAILURE(status)) {
913                return *this; // There is no way to report this error :-(
914            }
915            list[len++] = UNICODESET_HIGH;
916        }
917        if (i > 0 && c == list[i-1]) {
918            // collapse adjacent ranges
919
920            // [..., start_k-1, c, c, limit_k, ..., HIGH]
921            //                     ^
922            //                     list[i]
923
924            //for (int32_t k=i-1; k<len-2; ++k) {
925            //    list[k] = list[k+2];
926            //}
927            UChar32* dst = list + i - 1;
928            UChar32* src = dst + 2;
929            UChar32* srclimit = list + len;
930            while (src < srclimit) *(dst++) = *(src++);
931
932            len -= 2;
933        }
934    }
935
936    else if (i > 0 && c == list[i-1]) {
937        // c is after end of prior range
938        list[i-1]++;
939        // no need to check for collapse here
940    }
941
942    else {
943        // At this point we know the new char is not adjacent to
944        // any existing ranges, and it is not 10FFFF.
945
946
947        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
948        //                             ^
949        //                             list[i]
950
951        // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
952        //                             ^
953        //                             list[i]
954
955        UErrorCode status = U_ZERO_ERROR;
956        ensureCapacity(len+2, status);
957        if (U_FAILURE(status)) {
958            return *this; // There is no way to report this error :-(
959        }
960
961        //for (int32_t k=len-1; k>=i; --k) {
962        //    list[k+2] = list[k];
963        //}
964        UChar32* src = list + len;
965        UChar32* dst = src + 2;
966        UChar32* srclimit = list + i;
967        while (src > srclimit) *(--dst) = *(--src);
968
969        list[i] = c;
970        list[i+1] = c+1;
971        len += 2;
972    }
973
974#ifdef DEBUG_US_ADD
975    dump(list, len);
976    printf("\n");
977
978    for (i=1; i<len; ++i) {
979        if (list[i] <= list[i-1]) {
980            // Corrupt array!
981            printf("ERROR: list has been corrupted\n");
982            exit(1);
983        }
984    }
985#endif
986
987    releasePattern();
988    return *this;
989}
990
991/**
992 * Adds the specified multicharacter to this set if it is not already
993 * present.  If this set already contains the multicharacter,
994 * the call leaves this set unchanged.
995 * Thus "ch" => {"ch"}
996 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
997 * @param s the source string
998 * @return the modified set, for chaining
999 */
1000UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1001    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1002    int32_t cp = getSingleCP(s);
1003    if (cp < 0) {
1004        if (!strings->contains((void*) &s)) {
1005            _add(s);
1006            releasePattern();
1007        }
1008    } else {
1009        add((UChar32)cp);
1010    }
1011    return *this;
1012}
1013
1014/**
1015 * Adds the given string, in order, to 'strings'.  The given string
1016 * must have been checked by the caller to not be empty and to not
1017 * already be in 'strings'.
1018 */
1019void UnicodeSet::_add(const UnicodeString& s) {
1020    if (isFrozen() || isBogus()) {
1021        return;
1022    }
1023    UnicodeString* t = new UnicodeString(s);
1024    if (t == NULL) { // Check for memory allocation error.
1025        setToBogus();
1026        return;
1027    }
1028    UErrorCode ec = U_ZERO_ERROR;
1029    strings->sortedInsert(t, compareUnicodeString, ec);
1030    if (U_FAILURE(ec)) {
1031        setToBogus();
1032        delete t;
1033    }
1034}
1035
1036/**
1037 * @return a code point IF the string consists of a single one.
1038 * otherwise returns -1.
1039 * @param string to test
1040 */
1041int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1042    //if (s.length() < 1) {
1043    //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1044    //}
1045    if (s.length() > 2) return -1;
1046    if (s.length() == 1) return s.charAt(0);
1047
1048    // at this point, len = 2
1049    UChar32 cp = s.char32At(0);
1050    if (cp > 0xFFFF) { // is surrogate pair
1051        return cp;
1052    }
1053    return -1;
1054}
1055
1056/**
1057 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1058 * If this set already any particular character, it has no effect on that character.
1059 * @param the source string
1060 * @return the modified set, for chaining
1061 */
1062UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1063    UChar32 cp;
1064    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1065        cp = s.char32At(i);
1066        add(cp);
1067    }
1068    return *this;
1069}
1070
1071/**
1072 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1073 * If this set already any particular character, it has no effect on that character.
1074 * @param the source string
1075 * @return the modified set, for chaining
1076 */
1077UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1078    UnicodeSet set;
1079    set.addAll(s);
1080    retainAll(set);
1081    return *this;
1082}
1083
1084/**
1085 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1086 * If this set already any particular character, it has no effect on that character.
1087 * @param the source string
1088 * @return the modified set, for chaining
1089 */
1090UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1091    UnicodeSet set;
1092    set.addAll(s);
1093    complementAll(set);
1094    return *this;
1095}
1096
1097/**
1098 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1099 * If this set already any particular character, it has no effect on that character.
1100 * @param the source string
1101 * @return the modified set, for chaining
1102 */
1103UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1104    UnicodeSet set;
1105    set.addAll(s);
1106    removeAll(set);
1107    return *this;
1108}
1109
1110UnicodeSet& UnicodeSet::removeAllStrings() {
1111    strings->removeAllElements();
1112    return *this;
1113}
1114
1115
1116/**
1117 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1118 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1119 * @param the source string
1120 * @return a newly created set containing the given string
1121 */
1122UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1123    UnicodeSet *set = new UnicodeSet();
1124    if (set != NULL) { // Check for memory allocation error.
1125        set->add(s);
1126    }
1127    return set;
1128}
1129
1130
1131/**
1132 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1133 * @param the source string
1134 * @return a newly created set containing the given characters
1135 */
1136UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1137    UnicodeSet *set = new UnicodeSet();
1138    if (set != NULL) { // Check for memory allocation error.
1139        set->addAll(s);
1140    }
1141    return set;
1142}
1143
1144/**
1145 * Retain only the elements in this set that are contained in the
1146 * specified range.  If <code>end > start</code> then an empty range is
1147 * retained, leaving the set empty.
1148 *
1149 * @param start first character, inclusive, of range to be retained
1150 * to this set.
1151 * @param end last character, inclusive, of range to be retained
1152 * to this set.
1153 */
1154UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1155    if (pinCodePoint(start) <= pinCodePoint(end)) {
1156        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1157        retain(range, 2, 0);
1158    } else {
1159        clear();
1160    }
1161    return *this;
1162}
1163
1164UnicodeSet& UnicodeSet::retain(UChar32 c) {
1165    return retain(c, c);
1166}
1167
1168/**
1169 * Removes the specified range from this set if it is present.
1170 * The set will not contain the specified range once the call
1171 * returns.  If <code>end > start</code> then an empty range is
1172 * removed, leaving the set unchanged.
1173 *
1174 * @param start first character, inclusive, of range to be removed
1175 * from this set.
1176 * @param end last character, inclusive, of range to be removed
1177 * from this set.
1178 */
1179UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1180    if (pinCodePoint(start) <= pinCodePoint(end)) {
1181        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1182        retain(range, 2, 2);
1183    }
1184    return *this;
1185}
1186
1187/**
1188 * Removes the specified character from this set if it is present.
1189 * The set will not contain the specified range once the call
1190 * returns.
1191 */
1192UnicodeSet& UnicodeSet::remove(UChar32 c) {
1193    return remove(c, c);
1194}
1195
1196/**
1197 * Removes the specified string from this set if it is present.
1198 * The set will not contain the specified character once the call
1199 * returns.
1200 * @param the source string
1201 * @return the modified set, for chaining
1202 */
1203UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1204    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1205    int32_t cp = getSingleCP(s);
1206    if (cp < 0) {
1207        strings->removeElement((void*) &s);
1208        releasePattern();
1209    } else {
1210        remove((UChar32)cp, (UChar32)cp);
1211    }
1212    return *this;
1213}
1214
1215/**
1216 * Complements the specified range in this set.  Any character in
1217 * the range will be removed if it is in this set, or will be
1218 * added if it is not in this set.  If <code>end > start</code>
1219 * then an empty range is xor'ed, leaving the set unchanged.
1220 *
1221 * @param start first character, inclusive, of range to be removed
1222 * from this set.
1223 * @param end last character, inclusive, of range to be removed
1224 * from this set.
1225 */
1226UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1227    if (isFrozen() || isBogus()) {
1228        return *this;
1229    }
1230    if (pinCodePoint(start) <= pinCodePoint(end)) {
1231        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1232        exclusiveOr(range, 2, 0);
1233    }
1234    releasePattern();
1235    return *this;
1236}
1237
1238UnicodeSet& UnicodeSet::complement(UChar32 c) {
1239    return complement(c, c);
1240}
1241
1242/**
1243 * This is equivalent to
1244 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1245 */
1246UnicodeSet& UnicodeSet::complement(void) {
1247    if (isFrozen() || isBogus()) {
1248        return *this;
1249    }
1250    UErrorCode status = U_ZERO_ERROR;
1251    if (list[0] == UNICODESET_LOW) {
1252        ensureBufferCapacity(len-1, status);
1253        if (U_FAILURE(status)) {
1254            return *this;
1255        }
1256        uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1257        --len;
1258    } else {
1259        ensureBufferCapacity(len+1, status);
1260        if (U_FAILURE(status)) {
1261            return *this;
1262        }
1263        uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1264        buffer[0] = UNICODESET_LOW;
1265        ++len;
1266    }
1267    swapBuffers();
1268    releasePattern();
1269    return *this;
1270}
1271
1272/**
1273 * Complement the specified string in this set.
1274 * The set will not contain the specified string once the call
1275 * returns.
1276 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1277 * @param s the string to complement
1278 * @return this object, for chaining
1279 */
1280UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1281    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1282    int32_t cp = getSingleCP(s);
1283    if (cp < 0) {
1284        if (strings->contains((void*) &s)) {
1285            strings->removeElement((void*) &s);
1286        } else {
1287            _add(s);
1288        }
1289        releasePattern();
1290    } else {
1291        complement((UChar32)cp, (UChar32)cp);
1292    }
1293    return *this;
1294}
1295
1296/**
1297 * Adds all of the elements in the specified set to this set if
1298 * they're not already present.  This operation effectively
1299 * modifies this set so that its value is the <i>union</i> of the two
1300 * sets.  The behavior of this operation is unspecified if the specified
1301 * collection is modified while the operation is in progress.
1302 *
1303 * @param c set whose elements are to be added to this set.
1304 * @see #add(char, char)
1305 */
1306UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1307    if ( c.len>0 && c.list!=NULL ) {
1308        add(c.list, c.len, 0);
1309    }
1310
1311    // Add strings in order
1312    if ( c.strings!=NULL ) {
1313        for (int32_t i=0; i<c.strings->size(); ++i) {
1314            const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1315            if (!strings->contains((void*) s)) {
1316                _add(*s);
1317            }
1318        }
1319    }
1320    return *this;
1321}
1322
1323/**
1324 * Retains only the elements in this set that are contained in the
1325 * specified set.  In other words, removes from this set all of
1326 * its elements that are not contained in the specified set.  This
1327 * operation effectively modifies this set so that its value is
1328 * the <i>intersection</i> of the two sets.
1329 *
1330 * @param c set that defines which elements this set will retain.
1331 */
1332UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1333    if (isFrozen() || isBogus()) {
1334        return *this;
1335    }
1336    retain(c.list, c.len, 0);
1337    strings->retainAll(*c.strings);
1338    return *this;
1339}
1340
1341/**
1342 * Removes from this set all of its elements that are contained in the
1343 * specified set.  This operation effectively modifies this
1344 * set so that its value is the <i>asymmetric set difference</i> of
1345 * the two sets.
1346 *
1347 * @param c set that defines which elements will be removed from
1348 *          this set.
1349 */
1350UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1351    if (isFrozen() || isBogus()) {
1352        return *this;
1353    }
1354    retain(c.list, c.len, 2);
1355    strings->removeAll(*c.strings);
1356    return *this;
1357}
1358
1359/**
1360 * Complements in this set all elements contained in the specified
1361 * set.  Any character in the other set will be removed if it is
1362 * in this set, or will be added if it is not in this set.
1363 *
1364 * @param c set that defines which elements will be xor'ed from
1365 *          this set.
1366 */
1367UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1368    if (isFrozen() || isBogus()) {
1369        return *this;
1370    }
1371    exclusiveOr(c.list, c.len, 0);
1372
1373    for (int32_t i=0; i<c.strings->size(); ++i) {
1374        void* e = c.strings->elementAt(i);
1375        if (!strings->removeElement(e)) {
1376            _add(*(const UnicodeString*)e);
1377        }
1378    }
1379    return *this;
1380}
1381
1382/**
1383 * Removes all of the elements from this set.  This set will be
1384 * empty after this call returns.
1385 */
1386UnicodeSet& UnicodeSet::clear(void) {
1387    if (isFrozen()) {
1388        return *this;
1389    }
1390    if (list != NULL) {
1391        list[0] = UNICODESET_HIGH;
1392    }
1393    len = 1;
1394    releasePattern();
1395    if (strings != NULL) {
1396        strings->removeAllElements();
1397    }
1398    if (list != NULL && strings != NULL) {
1399        // Remove bogus
1400        fFlags = 0;
1401    }
1402    return *this;
1403}
1404
1405/**
1406 * Iteration method that returns the number of ranges contained in
1407 * this set.
1408 * @see #getRangeStart
1409 * @see #getRangeEnd
1410 */
1411int32_t UnicodeSet::getRangeCount() const {
1412    return len/2;
1413}
1414
1415/**
1416 * Iteration method that returns the first character in the
1417 * specified range of this set.
1418 * @see #getRangeCount
1419 * @see #getRangeEnd
1420 */
1421UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1422    return list[index*2];
1423}
1424
1425/**
1426 * Iteration method that returns the last character in the
1427 * specified range of this set.
1428 * @see #getRangeStart
1429 * @see #getRangeEnd
1430 */
1431UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1432    return list[index*2 + 1] - 1;
1433}
1434
1435int32_t UnicodeSet::getStringCount() const {
1436    return strings->size();
1437}
1438
1439const UnicodeString* UnicodeSet::getString(int32_t index) const {
1440    return (const UnicodeString*) strings->elementAt(index);
1441}
1442
1443/**
1444 * Reallocate this objects internal structures to take up the least
1445 * possible space, without changing this object's value.
1446 */
1447UnicodeSet& UnicodeSet::compact() {
1448    if (isFrozen() || isBogus()) {
1449        return *this;
1450    }
1451    // Delete buffer first to defragment memory less.
1452    if (buffer != NULL) {
1453        uprv_free(buffer);
1454        buffer = NULL;
1455    }
1456    if (len < capacity) {
1457        // Make the capacity equal to len or 1.
1458        // We don't want to realloc of 0 size.
1459        int32_t newCapacity = len + (len == 0);
1460        UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1461        if (temp) {
1462            list = temp;
1463            capacity = newCapacity;
1464        }
1465        // else what the heck happened?! We allocated less memory!
1466        // Oh well. We'll keep our original array.
1467    }
1468    return *this;
1469}
1470
1471int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1472    int32_t bmpLength, length, destLength;
1473
1474    if (U_FAILURE(ec)) {
1475        return 0;
1476    }
1477
1478    if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1479        ec=U_ILLEGAL_ARGUMENT_ERROR;
1480        return 0;
1481    }
1482
1483    /* count necessary 16-bit units */
1484    length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1485    // assert(length>=0);
1486    if (length==0) {
1487        /* empty set */
1488        if (destCapacity>0) {
1489            *dest=0;
1490        } else {
1491            ec=U_BUFFER_OVERFLOW_ERROR;
1492        }
1493        return 1;
1494    }
1495    /* now length>0 */
1496
1497    if (this->list[length-1]<=0xffff) {
1498        /* all BMP */
1499        bmpLength=length;
1500    } else if (this->list[0]>=0x10000) {
1501        /* all supplementary */
1502        bmpLength=0;
1503        length*=2;
1504    } else {
1505        /* some BMP, some supplementary */
1506        for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1507        length=bmpLength+2*(length-bmpLength);
1508    }
1509
1510    /* length: number of 16-bit array units */
1511    if (length>0x7fff) {
1512        /* there are only 15 bits for the length in the first serialized word */
1513        ec=U_INDEX_OUTOFBOUNDS_ERROR;
1514        return 0;
1515    }
1516
1517    /*
1518     * total serialized length:
1519     * number of 16-bit array units (length) +
1520     * 1 length unit (always) +
1521     * 1 bmpLength unit (if there are supplementary values)
1522     */
1523    destLength=length+((length>bmpLength)?2:1);
1524    if (destLength<=destCapacity) {
1525        const UChar32 *p;
1526        int32_t i;
1527
1528        *dest=(uint16_t)length;
1529        if (length>bmpLength) {
1530            *dest|=0x8000;
1531            *++dest=(uint16_t)bmpLength;
1532        }
1533        ++dest;
1534
1535        /* write the BMP part of the array */
1536        p=this->list;
1537        for (i=0; i<bmpLength; ++i) {
1538            *dest++=(uint16_t)*p++;
1539        }
1540
1541        /* write the supplementary part of the array */
1542        for (; i<length; i+=2) {
1543            *dest++=(uint16_t)(*p>>16);
1544            *dest++=(uint16_t)*p++;
1545        }
1546    } else {
1547        ec=U_BUFFER_OVERFLOW_ERROR;
1548    }
1549    return destLength;
1550}
1551
1552//----------------------------------------------------------------
1553// Implementation: Utility methods
1554//----------------------------------------------------------------
1555
1556/**
1557 * Allocate our strings vector and return TRUE if successful.
1558 */
1559UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1560    if (U_FAILURE(status)) {
1561        return FALSE;
1562    }
1563    strings = new UVector(uprv_deleteUObject,
1564                          uhash_compareUnicodeString, 1, status);
1565    if (strings == NULL) { // Check for memory allocation error.
1566        status = U_MEMORY_ALLOCATION_ERROR;
1567        return FALSE;
1568    }
1569    if (U_FAILURE(status)) {
1570        delete strings;
1571        strings = NULL;
1572        return FALSE;
1573    }
1574    return TRUE;
1575}
1576
1577void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1578    if (newLen <= capacity)
1579        return;
1580    UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1581    if (temp == NULL) {
1582        ec = U_MEMORY_ALLOCATION_ERROR;
1583        setToBogus();
1584        return;
1585    }
1586    list = temp;
1587    capacity = newLen + GROW_EXTRA;
1588    // else we keep the original contents on the memory failure.
1589}
1590
1591void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1592    if (buffer != NULL && newLen <= bufferCapacity)
1593        return;
1594    UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1595    if (temp == NULL) {
1596        ec = U_MEMORY_ALLOCATION_ERROR;
1597        setToBogus();
1598        return;
1599    }
1600    buffer = temp;
1601    bufferCapacity = newLen + GROW_EXTRA;
1602    // else we keep the original contents on the memory failure.
1603}
1604
1605/**
1606 * Swap list and buffer.
1607 */
1608void UnicodeSet::swapBuffers(void) {
1609    // swap list and buffer
1610    UChar32* temp = list;
1611    list = buffer;
1612    buffer = temp;
1613
1614    int32_t c = capacity;
1615    capacity = bufferCapacity;
1616    bufferCapacity = c;
1617}
1618
1619void UnicodeSet::setToBogus() {
1620    clear(); // Remove everything in the set.
1621    fFlags = kIsBogus;
1622}
1623
1624//----------------------------------------------------------------
1625// Implementation: Fundamental operators
1626//----------------------------------------------------------------
1627
1628static inline UChar32 max(UChar32 a, UChar32 b) {
1629    return (a > b) ? a : b;
1630}
1631
1632// polarity = 0, 3 is normal: x xor y
1633// polarity = 1, 2: x xor ~y == x === y
1634
1635void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1636    if (isFrozen() || isBogus()) {
1637        return;
1638    }
1639    UErrorCode status = U_ZERO_ERROR;
1640    ensureBufferCapacity(len + otherLen, status);
1641    if (U_FAILURE(status)) {
1642        return;
1643    }
1644
1645    int32_t i = 0, j = 0, k = 0;
1646    UChar32 a = list[i++];
1647    UChar32 b;
1648    if (polarity == 1 || polarity == 2) {
1649        b = UNICODESET_LOW;
1650        if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1651            ++j;
1652            b = other[j];
1653        }
1654    } else {
1655        b = other[j++];
1656    }
1657    // simplest of all the routines
1658    // sort the values, discarding identicals!
1659    for (;;) {
1660        if (a < b) {
1661            buffer[k++] = a;
1662            a = list[i++];
1663        } else if (b < a) {
1664            buffer[k++] = b;
1665            b = other[j++];
1666        } else if (a != UNICODESET_HIGH) { // at this point, a == b
1667            // discard both values!
1668            a = list[i++];
1669            b = other[j++];
1670        } else { // DONE!
1671            buffer[k++] = UNICODESET_HIGH;
1672            len = k;
1673            break;
1674        }
1675    }
1676    swapBuffers();
1677    releasePattern();
1678}
1679
1680// polarity = 0 is normal: x union y
1681// polarity = 2: x union ~y
1682// polarity = 1: ~x union y
1683// polarity = 3: ~x union ~y
1684
1685void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1686    if (isFrozen() || isBogus() || other==NULL) {
1687        return;
1688    }
1689    UErrorCode status = U_ZERO_ERROR;
1690    ensureBufferCapacity(len + otherLen, status);
1691    if (U_FAILURE(status)) {
1692        return;
1693    }
1694
1695    int32_t i = 0, j = 0, k = 0;
1696    UChar32 a = list[i++];
1697    UChar32 b = other[j++];
1698    // change from xor is that we have to check overlapping pairs
1699    // polarity bit 1 means a is second, bit 2 means b is.
1700    for (;;) {
1701        switch (polarity) {
1702          case 0: // both first; take lower if unequal
1703            if (a < b) { // take a
1704                // Back up over overlapping ranges in buffer[]
1705                if (k > 0 && a <= buffer[k-1]) {
1706                    // Pick latter end value in buffer[] vs. list[]
1707                    a = max(list[i], buffer[--k]);
1708                } else {
1709                    // No overlap
1710                    buffer[k++] = a;
1711                    a = list[i];
1712                }
1713                i++; // Common if/else code factored out
1714                polarity ^= 1;
1715            } else if (b < a) { // take b
1716                if (k > 0 && b <= buffer[k-1]) {
1717                    b = max(other[j], buffer[--k]);
1718                } else {
1719                    buffer[k++] = b;
1720                    b = other[j];
1721                }
1722                j++;
1723                polarity ^= 2;
1724            } else { // a == b, take a, drop b
1725                if (a == UNICODESET_HIGH) goto loop_end;
1726                // This is symmetrical; it doesn't matter if
1727                // we backtrack with a or b. - liu
1728                if (k > 0 && a <= buffer[k-1]) {
1729                    a = max(list[i], buffer[--k]);
1730                } else {
1731                    // No overlap
1732                    buffer[k++] = a;
1733                    a = list[i];
1734                }
1735                i++;
1736                polarity ^= 1;
1737                b = other[j++];
1738                polarity ^= 2;
1739            }
1740            break;
1741          case 3: // both second; take higher if unequal, and drop other
1742            if (b <= a) { // take a
1743                if (a == UNICODESET_HIGH) goto loop_end;
1744                buffer[k++] = a;
1745            } else { // take b
1746                if (b == UNICODESET_HIGH) goto loop_end;
1747                buffer[k++] = b;
1748            }
1749            a = list[i++];
1750            polarity ^= 1;   // factored common code
1751            b = other[j++];
1752            polarity ^= 2;
1753            break;
1754          case 1: // a second, b first; if b < a, overlap
1755            if (a < b) { // no overlap, take a
1756                buffer[k++] = a; a = list[i++]; polarity ^= 1;
1757            } else if (b < a) { // OVERLAP, drop b
1758                b = other[j++];
1759                polarity ^= 2;
1760            } else { // a == b, drop both!
1761                if (a == UNICODESET_HIGH) goto loop_end;
1762                a = list[i++];
1763                polarity ^= 1;
1764                b = other[j++];
1765                polarity ^= 2;
1766            }
1767            break;
1768          case 2: // a first, b second; if a < b, overlap
1769            if (b < a) { // no overlap, take b
1770                buffer[k++] = b;
1771                b = other[j++];
1772                polarity ^= 2;
1773            } else  if (a < b) { // OVERLAP, drop a
1774                a = list[i++];
1775                polarity ^= 1;
1776            } else { // a == b, drop both!
1777                if (a == UNICODESET_HIGH) goto loop_end;
1778                a = list[i++];
1779                polarity ^= 1;
1780                b = other[j++];
1781                polarity ^= 2;
1782            }
1783            break;
1784        }
1785    }
1786 loop_end:
1787    buffer[k++] = UNICODESET_HIGH;    // terminate
1788    len = k;
1789    swapBuffers();
1790    releasePattern();
1791}
1792
1793// polarity = 0 is normal: x intersect y
1794// polarity = 2: x intersect ~y == set-minus
1795// polarity = 1: ~x intersect y
1796// polarity = 3: ~x intersect ~y
1797
1798void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1799    if (isFrozen() || isBogus()) {
1800        return;
1801    }
1802    UErrorCode status = U_ZERO_ERROR;
1803    ensureBufferCapacity(len + otherLen, status);
1804    if (U_FAILURE(status)) {
1805        return;
1806    }
1807
1808    int32_t i = 0, j = 0, k = 0;
1809    UChar32 a = list[i++];
1810    UChar32 b = other[j++];
1811    // change from xor is that we have to check overlapping pairs
1812    // polarity bit 1 means a is second, bit 2 means b is.
1813    for (;;) {
1814        switch (polarity) {
1815          case 0: // both first; drop the smaller
1816            if (a < b) { // drop a
1817                a = list[i++];
1818                polarity ^= 1;
1819            } else if (b < a) { // drop b
1820                b = other[j++];
1821                polarity ^= 2;
1822            } else { // a == b, take one, drop other
1823                if (a == UNICODESET_HIGH) goto loop_end;
1824                buffer[k++] = a;
1825                a = list[i++];
1826                polarity ^= 1;
1827                b = other[j++];
1828                polarity ^= 2;
1829            }
1830            break;
1831          case 3: // both second; take lower if unequal
1832            if (a < b) { // take a
1833                buffer[k++] = a;
1834                a = list[i++];
1835                polarity ^= 1;
1836            } else if (b < a) { // take b
1837                buffer[k++] = b;
1838                b = other[j++];
1839                polarity ^= 2;
1840            } else { // a == b, take one, drop other
1841                if (a == UNICODESET_HIGH) goto loop_end;
1842                buffer[k++] = a;
1843                a = list[i++];
1844                polarity ^= 1;
1845                b = other[j++];
1846                polarity ^= 2;
1847            }
1848            break;
1849          case 1: // a second, b first;
1850            if (a < b) { // NO OVERLAP, drop a
1851                a = list[i++];
1852                polarity ^= 1;
1853            } else if (b < a) { // OVERLAP, take b
1854                buffer[k++] = b;
1855                b = other[j++];
1856                polarity ^= 2;
1857            } else { // a == b, drop both!
1858                if (a == UNICODESET_HIGH) goto loop_end;
1859                a = list[i++];
1860                polarity ^= 1;
1861                b = other[j++];
1862                polarity ^= 2;
1863            }
1864            break;
1865          case 2: // a first, b second; if a < b, overlap
1866            if (b < a) { // no overlap, drop b
1867                b = other[j++];
1868                polarity ^= 2;
1869            } else  if (a < b) { // OVERLAP, take a
1870                buffer[k++] = a;
1871                a = list[i++];
1872                polarity ^= 1;
1873            } else { // a == b, drop both!
1874                if (a == UNICODESET_HIGH) goto loop_end;
1875                a = list[i++];
1876                polarity ^= 1;
1877                b = other[j++];
1878                polarity ^= 2;
1879            }
1880            break;
1881        }
1882    }
1883 loop_end:
1884    buffer[k++] = UNICODESET_HIGH;    // terminate
1885    len = k;
1886    swapBuffers();
1887    releasePattern();
1888}
1889
1890/**
1891 * Append the <code>toPattern()</code> representation of a
1892 * string to the given <code>StringBuffer</code>.
1893 */
1894void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1895escapeUnprintable) {
1896    UChar32 cp;
1897    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1898        _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1899    }
1900}
1901
1902/**
1903 * Append the <code>toPattern()</code> representation of a
1904 * character to the given <code>StringBuffer</code>.
1905 */
1906void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1907escapeUnprintable) {
1908    if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1909        // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1910        // unprintable
1911        if (ICU_Utility::escapeUnprintable(buf, c)) {
1912            return;
1913        }
1914    }
1915    // Okay to let ':' pass through
1916    switch (c) {
1917    case SET_OPEN:
1918    case SET_CLOSE:
1919    case HYPHEN:
1920    case COMPLEMENT:
1921    case INTERSECTION:
1922    case BACKSLASH:
1923    case OPEN_BRACE:
1924    case CLOSE_BRACE:
1925    case COLON:
1926    case SymbolTable::SYMBOL_REF:
1927        buf.append(BACKSLASH);
1928        break;
1929    default:
1930        // Escape whitespace
1931        if (PatternProps::isWhiteSpace(c)) {
1932            buf.append(BACKSLASH);
1933        }
1934        break;
1935    }
1936    buf.append(c);
1937}
1938
1939/**
1940 * Append a string representation of this set to result.  This will be
1941 * a cleaned version of the string passed to applyPattern(), if there
1942 * is one.  Otherwise it will be generated.
1943 */
1944UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
1945                                      UBool escapeUnprintable) const
1946{
1947    if (pat != NULL) {
1948        int32_t i;
1949        int32_t backslashCount = 0;
1950        for (i=0; i<patLen; ) {
1951            UChar32 c;
1952            U16_NEXT(pat, i, patLen, c);
1953            if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1954                // If the unprintable character is preceded by an odd
1955                // number of backslashes, then it has been escaped.
1956                // Before unescaping it, we delete the final
1957                // backslash.
1958                if ((backslashCount % 2) == 1) {
1959                    result.truncate(result.length() - 1);
1960                }
1961                ICU_Utility::escapeUnprintable(result, c);
1962                backslashCount = 0;
1963            } else {
1964                result.append(c);
1965                if (c == BACKSLASH) {
1966                    ++backslashCount;
1967                } else {
1968                    backslashCount = 0;
1969                }
1970            }
1971        }
1972        return result;
1973    }
1974
1975    return _generatePattern(result, escapeUnprintable);
1976}
1977
1978/**
1979 * Returns a string representation of this set.  If the result of
1980 * calling this function is passed to a UnicodeSet constructor, it
1981 * will produce another set that is equal to this one.
1982 */
1983UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
1984                                     UBool escapeUnprintable) const
1985{
1986    result.truncate(0);
1987    return _toPattern(result, escapeUnprintable);
1988}
1989
1990/**
1991 * Generate and append a string representation of this set to result.
1992 * This does not use this.pat, the cleaned up copy of the string
1993 * passed to applyPattern().
1994 */
1995UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
1996                                            UBool escapeUnprintable) const
1997{
1998    result.append(SET_OPEN);
1999
2000//  // Check against the predefined categories.  We implicitly build
2001//  // up ALL category sets the first time toPattern() is called.
2002//  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2003//      if (*this == getCategorySet(cat)) {
2004//          result.append(COLON);
2005//          result.append(CATEGORY_NAMES, cat*2, 2);
2006//          return result.append(CATEGORY_CLOSE);
2007//      }
2008//  }
2009
2010    int32_t count = getRangeCount();
2011
2012    // If the set contains at least 2 intervals and includes both
2013    // MIN_VALUE and MAX_VALUE, then the inverse representation will
2014    // be more economical.
2015    if (count > 1 &&
2016        getRangeStart(0) == MIN_VALUE &&
2017        getRangeEnd(count-1) == MAX_VALUE) {
2018
2019        // Emit the inverse
2020        result.append(COMPLEMENT);
2021
2022        for (int32_t i = 1; i < count; ++i) {
2023            UChar32 start = getRangeEnd(i-1)+1;
2024            UChar32 end = getRangeStart(i)-1;
2025            _appendToPat(result, start, escapeUnprintable);
2026            if (start != end) {
2027                if ((start+1) != end) {
2028                    result.append(HYPHEN);
2029                }
2030                _appendToPat(result, end, escapeUnprintable);
2031            }
2032        }
2033    }
2034
2035    // Default; emit the ranges as pairs
2036    else {
2037        for (int32_t i = 0; i < count; ++i) {
2038            UChar32 start = getRangeStart(i);
2039            UChar32 end = getRangeEnd(i);
2040            _appendToPat(result, start, escapeUnprintable);
2041            if (start != end) {
2042                if ((start+1) != end) {
2043                    result.append(HYPHEN);
2044                }
2045                _appendToPat(result, end, escapeUnprintable);
2046            }
2047        }
2048    }
2049
2050    for (int32_t i = 0; i<strings->size(); ++i) {
2051        result.append(OPEN_BRACE);
2052        _appendToPat(result,
2053                     *(const UnicodeString*) strings->elementAt(i),
2054                     escapeUnprintable);
2055        result.append(CLOSE_BRACE);
2056    }
2057    return result.append(SET_CLOSE);
2058}
2059
2060/**
2061* Release existing cached pattern
2062*/
2063void UnicodeSet::releasePattern() {
2064    if (pat) {
2065        uprv_free(pat);
2066        pat = NULL;
2067        patLen = 0;
2068    }
2069}
2070
2071/**
2072* Set the new pattern to cache.
2073*/
2074void UnicodeSet::setPattern(const UnicodeString& newPat) {
2075    releasePattern();
2076    int32_t newPatLen = newPat.length();
2077    pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2078    if (pat) {
2079        patLen = newPatLen;
2080        newPat.extractBetween(0, patLen, pat);
2081        pat[patLen] = 0;
2082    }
2083    // else we don't care if malloc failed. This was just a nice cache.
2084    // We can regenerate an equivalent pattern later when requested.
2085}
2086
2087UnicodeFunctor *UnicodeSet::freeze() {
2088    if(!isFrozen() && !isBogus()) {
2089        // Do most of what compact() does before freezing because
2090        // compact() will not work when the set is frozen.
2091        // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2092
2093        // Delete buffer first to defragment memory less.
2094        if (buffer != NULL) {
2095            uprv_free(buffer);
2096            buffer = NULL;
2097        }
2098        if (capacity > (len + GROW_EXTRA)) {
2099            // Make the capacity equal to len or 1.
2100            // We don't want to realloc of 0 size.
2101            capacity = len + (len == 0);
2102            list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2103            if (list == NULL) { // Check for memory allocation error.
2104                setToBogus();
2105                return this;
2106            }
2107        }
2108
2109        // Optimize contains() and span() and similar functions.
2110        if (!strings->isEmpty()) {
2111            stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2112            if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2113                // All strings are irrelevant for span() etc. because
2114                // all of each string's code points are contained in this set.
2115                // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2116                // many relevant strings as UTF-16.
2117                // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2118                delete stringSpan;
2119                stringSpan = NULL;
2120            }
2121        }
2122        if (stringSpan == NULL) {
2123            // No span-relevant strings: Optimize for code point spans.
2124            bmpSet=new BMPSet(list, len);
2125            if (bmpSet == NULL) { // Check for memory allocation error.
2126                setToBogus();
2127            }
2128        }
2129    }
2130    return this;
2131}
2132
2133int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2134    if(length>0 && bmpSet!=NULL) {
2135        return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2136    }
2137    if(length<0) {
2138        length=u_strlen(s);
2139    }
2140    if(length==0) {
2141        return 0;
2142    }
2143    if(stringSpan!=NULL) {
2144        return stringSpan->span(s, length, spanCondition);
2145    } else if(!strings->isEmpty()) {
2146        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2147                            UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2148                            UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2149        UnicodeSetStringSpan strSpan(*this, *strings, which);
2150        if(strSpan.needsStringSpanUTF16()) {
2151            return strSpan.span(s, length, spanCondition);
2152        }
2153    }
2154
2155    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2156        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2157    }
2158
2159    UChar32 c;
2160    int32_t start=0, prev=0;
2161    do {
2162        U16_NEXT(s, start, length, c);
2163        if(spanCondition!=contains(c)) {
2164            break;
2165        }
2166    } while((prev=start)<length);
2167    return prev;
2168}
2169
2170int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2171    if(length>0 && bmpSet!=NULL) {
2172        return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2173    }
2174    if(length<0) {
2175        length=u_strlen(s);
2176    }
2177    if(length==0) {
2178        return 0;
2179    }
2180    if(stringSpan!=NULL) {
2181        return stringSpan->spanBack(s, length, spanCondition);
2182    } else if(!strings->isEmpty()) {
2183        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2184                            UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2185                            UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2186        UnicodeSetStringSpan strSpan(*this, *strings, which);
2187        if(strSpan.needsStringSpanUTF16()) {
2188            return strSpan.spanBack(s, length, spanCondition);
2189        }
2190    }
2191
2192    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2193        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2194    }
2195
2196    UChar32 c;
2197    int32_t prev=length;
2198    do {
2199        U16_PREV(s, 0, length, c);
2200        if(spanCondition!=contains(c)) {
2201            break;
2202        }
2203    } while((prev=length)>0);
2204    return prev;
2205}
2206
2207int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2208    if(length>0 && bmpSet!=NULL) {
2209        const uint8_t *s0=(const uint8_t *)s;
2210        return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2211    }
2212    if(length<0) {
2213        length=(int32_t)uprv_strlen(s);
2214    }
2215    if(length==0) {
2216        return 0;
2217    }
2218    if(stringSpan!=NULL) {
2219        return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2220    } else if(!strings->isEmpty()) {
2221        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2222                            UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2223                            UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2224        UnicodeSetStringSpan strSpan(*this, *strings, which);
2225        if(strSpan.needsStringSpanUTF8()) {
2226            return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2227        }
2228    }
2229
2230    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2231        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2232    }
2233
2234    UChar32 c;
2235    int32_t start=0, prev=0;
2236    do {
2237        U8_NEXT_OR_FFFD(s, start, length, c);
2238        if(spanCondition!=contains(c)) {
2239            break;
2240        }
2241    } while((prev=start)<length);
2242    return prev;
2243}
2244
2245int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2246    if(length>0 && bmpSet!=NULL) {
2247        const uint8_t *s0=(const uint8_t *)s;
2248        return bmpSet->spanBackUTF8(s0, length, spanCondition);
2249    }
2250    if(length<0) {
2251        length=(int32_t)uprv_strlen(s);
2252    }
2253    if(length==0) {
2254        return 0;
2255    }
2256    if(stringSpan!=NULL) {
2257        return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2258    } else if(!strings->isEmpty()) {
2259        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2260                            UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2261                            UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2262        UnicodeSetStringSpan strSpan(*this, *strings, which);
2263        if(strSpan.needsStringSpanUTF8()) {
2264            return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2265        }
2266    }
2267
2268    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2269        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2270    }
2271
2272    UChar32 c;
2273    int32_t prev=length;
2274    do {
2275        U8_PREV_OR_FFFD(s, 0, length, c);
2276        if(spanCondition!=contains(c)) {
2277            break;
2278        }
2279    } while((prev=length)>0);
2280    return prev;
2281}
2282
2283U_NAMESPACE_END
2284