1/*
2**********************************************************************
3*   Copyright (C) 1999-2015, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   10/20/99    alan        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12#include "unicode/parsepos.h"
13#include "unicode/symtable.h"
14#include "unicode/uniset.h"
15#include "unicode/utf8.h"
16#include "unicode/utf16.h"
17#include "ruleiter.h"
18#include "cmemory.h"
19#include "cstring.h"
20#include "patternprops.h"
21#include "uelement.h"
22#include "util.h"
23#include "uvector.h"
24#include "charstr.h"
25#include "ustrfmt.h"
26#include "uassert.h"
27#include "bmpset.h"
28#include "unisetspan.h"
29
30// Define UChar constants using hex for EBCDIC compatibility
31// Used #define to reduce private static exports and memory access time.
32#define SET_OPEN        ((UChar)0x005B) /*[*/
33#define SET_CLOSE       ((UChar)0x005D) /*]*/
34#define HYPHEN          ((UChar)0x002D) /*-*/
35#define COMPLEMENT      ((UChar)0x005E) /*^*/
36#define COLON           ((UChar)0x003A) /*:*/
37#define BACKSLASH       ((UChar)0x005C) /*\*/
38#define INTERSECTION    ((UChar)0x0026) /*&*/
39#define UPPER_U         ((UChar)0x0055) /*U*/
40#define LOWER_U         ((UChar)0x0075) /*u*/
41#define OPEN_BRACE      ((UChar)123)    /*{*/
42#define CLOSE_BRACE     ((UChar)125)    /*}*/
43#define UPPER_P         ((UChar)0x0050) /*P*/
44#define LOWER_P         ((UChar)0x0070) /*p*/
45#define UPPER_N         ((UChar)78)     /*N*/
46#define EQUALS          ((UChar)0x003D) /*=*/
47
48// HIGH_VALUE > all valid values. 110000 for codepoints
49#define UNICODESET_HIGH 0x0110000
50
51// LOW <= all valid values. ZERO for codepoints
52#define UNICODESET_LOW 0x000000
53
54// initial storage. Must be >= 0
55#define START_EXTRA 16
56
57// extra amount for growth. Must be >= 0
58#define GROW_EXTRA START_EXTRA
59
60U_NAMESPACE_BEGIN
61
62SymbolTable::~SymbolTable() {}
63
64UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
65
66/**
67 * Modify the given UChar32 variable so that it is in range, by
68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70 * It modifies its argument in-place and also returns it.
71 */
72static inline UChar32 pinCodePoint(UChar32& c) {
73    if (c < UNICODESET_LOW) {
74        c = UNICODESET_LOW;
75    } else if (c > (UNICODESET_HIGH-1)) {
76        c = (UNICODESET_HIGH-1);
77    }
78    return c;
79}
80
81//----------------------------------------------------------------
82// Debugging
83//----------------------------------------------------------------
84
85// DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
86// To enable the debugging, define the symbol DEBUG_MEM in the line
87// below.  This will result in text being sent to stdout that looks
88// like this:
89//   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
90//   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
91// Each line lists a construction (ct) or destruction (dt) event, the
92// object address, the number of outstanding objects after the event,
93// and the pattern of the object in question.
94
95// #define DEBUG_MEM
96
97#ifdef DEBUG_MEM
98#include <stdio.h>
99static int32_t _dbgCount = 0;
100
101static inline void _dbgct(UnicodeSet* set) {
102    UnicodeString str;
103    set->toPattern(str, TRUE);
104    char buf[40];
105    str.extract(0, 39, buf, "");
106    printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
107}
108
109static inline void _dbgdt(UnicodeSet* set) {
110    UnicodeString str;
111    set->toPattern(str, TRUE);
112    char buf[40];
113    str.extract(0, 39, buf, "");
114    printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
115}
116
117#else
118
119#define _dbgct(set)
120#define _dbgdt(set)
121
122#endif
123
124//----------------------------------------------------------------
125// UnicodeString in UVector support
126//----------------------------------------------------------------
127
128static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
129    dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
130}
131
132static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
133    const UnicodeString &a = *(const UnicodeString*)t1.pointer;
134    const UnicodeString &b = *(const UnicodeString*)t2.pointer;
135    return a.compare(b);
136}
137
138//----------------------------------------------------------------
139// Constructors &c
140//----------------------------------------------------------------
141
142/**
143 * Constructs an empty set.
144 */
145UnicodeSet::UnicodeSet() :
146    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
147    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
148    fFlags(0)
149{
150    UErrorCode status = U_ZERO_ERROR;
151    allocateStrings(status);
152    if (U_FAILURE(status)) {
153        return;
154    }
155    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
156    if(list!=NULL){
157        list[0] = UNICODESET_HIGH;
158    } else { // If memory allocation failed, set to bogus state.
159        setToBogus();
160        return;
161    }
162    _dbgct(this);
163}
164
165/**
166 * Constructs a set containing the given range. If <code>end >
167 * start</code> then an empty set is created.
168 *
169 * @param start first character, inclusive, of range
170 * @param end last character, inclusive, of range
171 */
172UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
173    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
174    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
175    fFlags(0)
176{
177    UErrorCode status = U_ZERO_ERROR;
178    allocateStrings(status);
179    if (U_FAILURE(status)) {
180        return;
181    }
182    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
183    if(list!=NULL){
184        list[0] = UNICODESET_HIGH;
185        complement(start, end);
186    } else { // If memory allocation failed, set to bogus state.
187        setToBogus();
188        return;
189    }
190    _dbgct(this);
191}
192
193/**
194 * Constructs a set that is identical to the given UnicodeSet.
195 */
196UnicodeSet::UnicodeSet(const UnicodeSet& o) :
197    UnicodeFilter(o),
198    len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
199    bmpSet(0),
200    buffer(0), bufferCapacity(0),
201    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
202    fFlags(0)
203{
204    UErrorCode status = U_ZERO_ERROR;
205    allocateStrings(status);
206    if (U_FAILURE(status)) {
207        return;
208    }
209    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
210    if(list!=NULL){
211        *this = o;
212    } else { // If memory allocation failed, set to bogus state.
213        setToBogus();
214        return;
215    }
216    _dbgct(this);
217}
218
219// Copy-construct as thawed.
220UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
221    UnicodeFilter(o),
222    len(0), capacity(o.len + GROW_EXTRA), list(0),
223    bmpSet(0),
224    buffer(0), bufferCapacity(0),
225    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
226    fFlags(0)
227{
228    UErrorCode status = U_ZERO_ERROR;
229    allocateStrings(status);
230    if (U_FAILURE(status)) {
231        return;
232    }
233    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
234    if(list!=NULL){
235        // *this = o except for bmpSet and stringSpan
236        len = o.len;
237        uprv_memcpy(list, o.list, len*sizeof(UChar32));
238        if (strings != NULL && o.strings != NULL) {
239            strings->assign(*o.strings, cloneUnicodeString, status);
240        } else { // Invalid strings.
241            setToBogus();
242            return;
243        }
244        if (o.pat) {
245            setPattern(UnicodeString(o.pat, o.patLen));
246        }
247    } else { // If memory allocation failed, set to bogus state.
248        setToBogus();
249        return;
250    }
251    _dbgct(this);
252}
253
254/**
255 * Destructs the set.
256 */
257UnicodeSet::~UnicodeSet() {
258    _dbgdt(this); // first!
259    uprv_free(list);
260    delete bmpSet;
261    if (buffer) {
262        uprv_free(buffer);
263    }
264    delete strings;
265    delete stringSpan;
266    releasePattern();
267}
268
269/**
270 * Assigns this object to be a copy of another.
271 */
272UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
273    if (this == &o) {
274        return *this;
275    }
276    if (isFrozen()) {
277        return *this;
278    }
279    if (o.isBogus()) {
280        setToBogus();
281        return *this;
282    }
283    UErrorCode ec = U_ZERO_ERROR;
284    ensureCapacity(o.len, ec);
285    if (U_FAILURE(ec)) {
286        return *this; // There is no way to report this error :-(
287    }
288    len = o.len;
289    uprv_memcpy(list, o.list, len*sizeof(UChar32));
290    if (o.bmpSet == NULL) {
291        bmpSet = NULL;
292    } else {
293        bmpSet = new BMPSet(*o.bmpSet, list, len);
294        if (bmpSet == NULL) { // Check for memory allocation error.
295            setToBogus();
296            return *this;
297        }
298    }
299    if (strings != NULL && o.strings != NULL) {
300        strings->assign(*o.strings, cloneUnicodeString, ec);
301    } else { // Invalid strings.
302        setToBogus();
303        return *this;
304    }
305    if (o.stringSpan == NULL) {
306        stringSpan = NULL;
307    } else {
308        stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
309        if (stringSpan == NULL) { // Check for memory allocation error.
310            setToBogus();
311            return *this;
312        }
313    }
314    releasePattern();
315    if (o.pat) {
316        setPattern(UnicodeString(o.pat, o.patLen));
317    }
318    return *this;
319}
320
321/**
322 * Returns a copy of this object.  All UnicodeMatcher objects have
323 * to support cloning in order to allow classes using
324 * UnicodeMatchers, such as Transliterator, to implement cloning.
325 */
326UnicodeFunctor* UnicodeSet::clone() const {
327    return new UnicodeSet(*this);
328}
329
330UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
331    return new UnicodeSet(*this, TRUE);
332}
333
334/**
335 * Compares the specified object with this set for equality.  Returns
336 * <tt>true</tt> if the two sets
337 * have the same size, and every member of the specified set is
338 * contained in this set (or equivalently, every member of this set is
339 * contained in the specified set).
340 *
341 * @param o set to be compared for equality with this set.
342 * @return <tt>true</tt> if the specified set is equal to this set.
343 */
344UBool UnicodeSet::operator==(const UnicodeSet& o) const {
345    if (len != o.len) return FALSE;
346    for (int32_t i = 0; i < len; ++i) {
347        if (list[i] != o.list[i]) return FALSE;
348    }
349    if (*strings != *o.strings) return FALSE;
350    return TRUE;
351}
352
353/**
354 * Returns the hash code value for this set.
355 *
356 * @return the hash code value for this set.
357 * @see Object#hashCode()
358 */
359int32_t UnicodeSet::hashCode(void) const {
360    int32_t result = len;
361    for (int32_t i = 0; i < len; ++i) {
362        result *= 1000003;
363        result += list[i];
364    }
365    return result;
366}
367
368//----------------------------------------------------------------
369// Public API
370//----------------------------------------------------------------
371
372/**
373 * Returns the number of elements in this set (its cardinality),
374 * Note than the elements of a set may include both individual
375 * codepoints and strings.
376 *
377 * @return the number of elements in this set (its cardinality).
378 */
379int32_t UnicodeSet::size(void) const {
380    int32_t n = 0;
381    int32_t count = getRangeCount();
382    for (int32_t i = 0; i < count; ++i) {
383        n += getRangeEnd(i) - getRangeStart(i) + 1;
384    }
385    return n + strings->size();
386}
387
388/**
389 * Returns <tt>true</tt> if this set contains no elements.
390 *
391 * @return <tt>true</tt> if this set contains no elements.
392 */
393UBool UnicodeSet::isEmpty(void) const {
394    return len == 1 && strings->size() == 0;
395}
396
397/**
398 * Returns true if this set contains the given character.
399 * @param c character to be checked for containment
400 * @return true if the test condition is met
401 */
402UBool UnicodeSet::contains(UChar32 c) const {
403    // Set i to the index of the start item greater than ch
404    // We know we will terminate without length test!
405    // LATER: for large sets, add binary search
406    //int32_t i = -1;
407    //for (;;) {
408    //    if (c < list[++i]) break;
409    //}
410    if (bmpSet != NULL) {
411        return bmpSet->contains(c);
412    }
413    if (stringSpan != NULL) {
414        return stringSpan->contains(c);
415    }
416    if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
417        return FALSE;
418    }
419    int32_t i = findCodePoint(c);
420    return (UBool)(i & 1); // return true if odd
421}
422
423/**
424 * Returns the smallest value i such that c < list[i].  Caller
425 * must ensure that c is a legal value or this method will enter
426 * an infinite loop.  This method performs a binary search.
427 * @param c a character in the range MIN_VALUE..MAX_VALUE
428 * inclusive
429 * @return the smallest integer i in the range 0..len-1,
430 * inclusive, such that c < list[i]
431 */
432int32_t UnicodeSet::findCodePoint(UChar32 c) const {
433    /* Examples:
434                                       findCodePoint(c)
435       set              list[]         c=0 1 3 4 7 8
436       ===              ==============   ===========
437       []               [110000]         0 0 0 0 0 0
438       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
439       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
440       [:Any:]          [0, 110000]      1 1 1 1 1 1
441     */
442
443    // Return the smallest i such that c < list[i].  Assume
444    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
445    if (c < list[0])
446        return 0;
447    // High runner test.  c is often after the last range, so an
448    // initial check for this condition pays off.
449    int32_t lo = 0;
450    int32_t hi = len - 1;
451    if (lo >= hi || c >= list[hi-1])
452        return hi;
453    // invariant: c >= list[lo]
454    // invariant: c < list[hi]
455    for (;;) {
456        int32_t i = (lo + hi) >> 1;
457        if (i == lo) {
458            break; // Found!
459        } else if (c < list[i]) {
460            hi = i;
461        } else {
462            lo = i;
463        }
464    }
465    return hi;
466}
467
468/**
469 * Returns true if this set contains every character
470 * of the given range.
471 * @param start first character, inclusive, of the range
472 * @param end last character, inclusive, of the range
473 * @return true if the test condition is met
474 */
475UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
476    //int32_t i = -1;
477    //for (;;) {
478    //    if (start < list[++i]) break;
479    //}
480    int32_t i = findCodePoint(start);
481    return ((i & 1) != 0 && end < list[i]);
482}
483
484/**
485 * Returns <tt>true</tt> if this set contains the given
486 * multicharacter string.
487 * @param s string to be checked for containment
488 * @return <tt>true</tt> if this set contains the specified string
489 */
490UBool UnicodeSet::contains(const UnicodeString& s) const {
491    if (s.length() == 0) return FALSE;
492    int32_t cp = getSingleCP(s);
493    if (cp < 0) {
494        return strings->contains((void*) &s);
495    } else {
496        return contains((UChar32) cp);
497    }
498}
499
500/**
501 * Returns true if this set contains all the characters and strings
502 * of the given set.
503 * @param c set to be checked for containment
504 * @return true if the test condition is met
505 */
506UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
507    // The specified set is a subset if all of its pairs are contained in
508    // this set.  It's possible to code this more efficiently in terms of
509    // direct manipulation of the inversion lists if the need arises.
510    int32_t n = c.getRangeCount();
511    for (int i=0; i<n; ++i) {
512        if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
513            return FALSE;
514        }
515    }
516    if (!strings->containsAll(*c.strings)) return FALSE;
517    return TRUE;
518}
519
520/**
521 * Returns true if this set contains all the characters
522 * of the given string.
523 * @param s string containing characters to be checked for containment
524 * @return true if the test condition is met
525 */
526UBool UnicodeSet::containsAll(const UnicodeString& s) const {
527    return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
528                   s.length());
529}
530
531/**
532 * Returns true if this set contains none of the characters
533 * of the given range.
534 * @param start first character, inclusive, of the range
535 * @param end last character, inclusive, of the range
536 * @return true if the test condition is met
537 */
538UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
539    //int32_t i = -1;
540    //for (;;) {
541    //    if (start < list[++i]) break;
542    //}
543    int32_t i = findCodePoint(start);
544    return ((i & 1) == 0 && end < list[i]);
545}
546
547/**
548 * Returns true if this set contains none of the characters and strings
549 * of the given set.
550 * @param c set to be checked for containment
551 * @return true if the test condition is met
552 */
553UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
554    // The specified set is a subset if all of its pairs are contained in
555    // this set.  It's possible to code this more efficiently in terms of
556    // direct manipulation of the inversion lists if the need arises.
557    int32_t n = c.getRangeCount();
558    for (int32_t i=0; i<n; ++i) {
559        if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
560            return FALSE;
561        }
562    }
563    if (!strings->containsNone(*c.strings)) return FALSE;
564    return TRUE;
565}
566
567/**
568 * Returns true if this set contains none of the characters
569 * of the given string.
570 * @param s string containing characters to be checked for containment
571 * @return true if the test condition is met
572 */
573UBool UnicodeSet::containsNone(const UnicodeString& s) const {
574    return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
575                   s.length());
576}
577
578/**
579 * Returns <tt>true</tt> if this set contains any character whose low byte
580 * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
581 * indexing.
582 */
583UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
584    /* The index value v, in the range [0,255], is contained in this set if
585     * it is contained in any pair of this set.  Pairs either have the high
586     * bytes equal, or unequal.  If the high bytes are equal, then we have
587     * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
588     * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
589     * Then v is contained if xx <= v || v <= yy.  (This is identical to the
590     * time zone month containment logic.)
591     */
592    int32_t i;
593    int32_t rangeCount=getRangeCount();
594    for (i=0; i<rangeCount; ++i) {
595        UChar32 low = getRangeStart(i);
596        UChar32 high = getRangeEnd(i);
597        if ((low & ~0xFF) == (high & ~0xFF)) {
598            if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
599                return TRUE;
600            }
601        } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
602            return TRUE;
603        }
604    }
605    if (strings->size() != 0) {
606        for (i=0; i<strings->size(); ++i) {
607            const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
608            //if (s.length() == 0) {
609            //    // Empty strings match everything
610            //    return TRUE;
611            //}
612            // assert(s.length() != 0); // We enforce this elsewhere
613            UChar32 c = s.char32At(0);
614            if ((c & 0xFF) == v) {
615                return TRUE;
616            }
617        }
618    }
619    return FALSE;
620}
621
622/**
623 * Implementation of UnicodeMatcher::matches().  Always matches the
624 * longest possible multichar string.
625 */
626UMatchDegree UnicodeSet::matches(const Replaceable& text,
627                                 int32_t& offset,
628                                 int32_t limit,
629                                 UBool incremental) {
630    if (offset == limit) {
631        // Strings, if any, have length != 0, so we don't worry
632        // about them here.  If we ever allow zero-length strings
633        // we much check for them here.
634        if (contains(U_ETHER)) {
635            return incremental ? U_PARTIAL_MATCH : U_MATCH;
636        } else {
637            return U_MISMATCH;
638        }
639    } else {
640        if (strings->size() != 0) { // try strings first
641
642            // might separate forward and backward loops later
643            // for now they are combined
644
645            // TODO Improve efficiency of this, at least in the forward
646            // direction, if not in both.  In the forward direction we
647            // can assume the strings are sorted.
648
649            int32_t i;
650            UBool forward = offset < limit;
651
652            // firstChar is the leftmost char to match in the
653            // forward direction or the rightmost char to match in
654            // the reverse direction.
655            UChar firstChar = text.charAt(offset);
656
657            // If there are multiple strings that can match we
658            // return the longest match.
659            int32_t highWaterLength = 0;
660
661            for (i=0; i<strings->size(); ++i) {
662                const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
663
664                //if (trial.length() == 0) {
665                //    return U_MATCH; // null-string always matches
666                //}
667                // assert(trial.length() != 0); // We ensure this elsewhere
668
669                UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
670
671                // Strings are sorted, so we can optimize in the
672                // forward direction.
673                if (forward && c > firstChar) break;
674                if (c != firstChar) continue;
675
676                int32_t matchLen = matchRest(text, offset, limit, trial);
677
678                if (incremental) {
679                    int32_t maxLen = forward ? limit-offset : offset-limit;
680                    if (matchLen == maxLen) {
681                        // We have successfully matched but only up to limit.
682                        return U_PARTIAL_MATCH;
683                    }
684                }
685
686                if (matchLen == trial.length()) {
687                    // We have successfully matched the whole string.
688                    if (matchLen > highWaterLength) {
689                        highWaterLength = matchLen;
690                    }
691                    // In the forward direction we know strings
692                    // are sorted so we can bail early.
693                    if (forward && matchLen < highWaterLength) {
694                        break;
695                    }
696                    continue;
697                }
698            }
699
700            // We've checked all strings without a partial match.
701            // If we have full matches, return the longest one.
702            if (highWaterLength != 0) {
703                offset += forward ? highWaterLength : -highWaterLength;
704                return U_MATCH;
705            }
706        }
707        return UnicodeFilter::matches(text, offset, limit, incremental);
708    }
709}
710
711/**
712 * Returns the longest match for s in text at the given position.
713 * If limit > start then match forward from start+1 to limit
714 * matching all characters except s.charAt(0).  If limit < start,
715 * go backward starting from start-1 matching all characters
716 * except s.charAt(s.length()-1).  This method assumes that the
717 * first character, text.charAt(start), matches s, so it does not
718 * check it.
719 * @param text the text to match
720 * @param start the first character to match.  In the forward
721 * direction, text.charAt(start) is matched against s.charAt(0).
722 * In the reverse direction, it is matched against
723 * s.charAt(s.length()-1).
724 * @param limit the limit offset for matching, either last+1 in
725 * the forward direction, or last-1 in the reverse direction,
726 * where last is the index of the last character to match.
727 * @return If part of s matches up to the limit, return |limit -
728 * start|.  If all of s matches before reaching the limit, return
729 * s.length().  If there is a mismatch between s and text, return
730 * 0
731 */
732int32_t UnicodeSet::matchRest(const Replaceable& text,
733                              int32_t start, int32_t limit,
734                              const UnicodeString& s) {
735    int32_t i;
736    int32_t maxLen;
737    int32_t slen = s.length();
738    if (start < limit) {
739        maxLen = limit - start;
740        if (maxLen > slen) maxLen = slen;
741        for (i = 1; i < maxLen; ++i) {
742            if (text.charAt(start + i) != s.charAt(i)) return 0;
743        }
744    } else {
745        maxLen = start - limit;
746        if (maxLen > slen) maxLen = slen;
747        --slen; // <=> slen = s.length() - 1;
748        for (i = 1; i < maxLen; ++i) {
749            if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
750        }
751    }
752    return maxLen;
753}
754
755/**
756 * Implement of UnicodeMatcher
757 */
758void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
759    toUnionTo.addAll(*this);
760}
761
762/**
763 * Returns the index of the given character within this set, where
764 * the set is ordered by ascending code point.  If the character
765 * is not in this set, return -1.  The inverse of this method is
766 * <code>charAt()</code>.
767 * @return an index from 0..size()-1, or -1
768 */
769int32_t UnicodeSet::indexOf(UChar32 c) const {
770    if (c < MIN_VALUE || c > MAX_VALUE) {
771        return -1;
772    }
773    int32_t i = 0;
774    int32_t n = 0;
775    for (;;) {
776        UChar32 start = list[i++];
777        if (c < start) {
778            return -1;
779        }
780        UChar32 limit = list[i++];
781        if (c < limit) {
782            return n + c - start;
783        }
784        n += limit - start;
785    }
786}
787
788/**
789 * Returns the character at the given index within this set, where
790 * the set is ordered by ascending code point.  If the index is
791 * out of range, return (UChar32)-1.  The inverse of this method is
792 * <code>indexOf()</code>.
793 * @param index an index from 0..size()-1
794 * @return the character at the given index, or (UChar32)-1.
795 */
796UChar32 UnicodeSet::charAt(int32_t index) const {
797    if (index >= 0) {
798        // len2 is the largest even integer <= len, that is, it is len
799        // for even values and len-1 for odd values.  With odd values
800        // the last entry is UNICODESET_HIGH.
801        int32_t len2 = len & ~1;
802        for (int32_t i=0; i < len2;) {
803            UChar32 start = list[i++];
804            int32_t count = list[i++] - start;
805            if (index < count) {
806                return (UChar32)(start + index);
807            }
808            index -= count;
809        }
810    }
811    return (UChar32)-1;
812}
813
814/**
815 * Make this object represent the range <code>start - end</code>.
816 * If <code>end > start</code> then this object is set to an
817 * an empty range.
818 *
819 * @param start first character in the set, inclusive
820 * @rparam end last character in the set, inclusive
821 */
822UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
823    clear();
824    complement(start, end);
825    return *this;
826}
827
828/**
829 * Adds the specified range to this set if it is not already
830 * present.  If this set already contains the specified range,
831 * the call leaves this set unchanged.  If <code>end > start</code>
832 * then an empty range is added, leaving the set unchanged.
833 *
834 * @param start first character, inclusive, of range to be added
835 * to this set.
836 * @param end last character, inclusive, of range to be added
837 * to this set.
838 */
839UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
840    if (pinCodePoint(start) < pinCodePoint(end)) {
841        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
842        add(range, 2, 0);
843    } else if (start == end) {
844        add(start);
845    }
846    return *this;
847}
848
849// #define DEBUG_US_ADD
850
851#ifdef DEBUG_US_ADD
852#include <stdio.h>
853void dump(UChar32 c) {
854    if (c <= 0xFF) {
855        printf("%c", (char)c);
856    } else {
857        printf("U+%04X", c);
858    }
859}
860void dump(const UChar32* list, int32_t len) {
861    printf("[");
862    for (int32_t i=0; i<len; ++i) {
863        if (i != 0) printf(", ");
864        dump(list[i]);
865    }
866    printf("]");
867}
868#endif
869
870/**
871 * Adds the specified character to this set if it is not already
872 * present.  If this set already contains the specified character,
873 * the call leaves this set unchanged.
874 */
875UnicodeSet& UnicodeSet::add(UChar32 c) {
876    // find smallest i such that c < list[i]
877    // if odd, then it is IN the set
878    // if even, then it is OUT of the set
879    int32_t i = findCodePoint(pinCodePoint(c));
880
881    // already in set?
882    if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
883
884    // HIGH is 0x110000
885    // assert(list[len-1] == HIGH);
886
887    // empty = [HIGH]
888    // [start_0, limit_0, start_1, limit_1, HIGH]
889
890    // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
891    //                             ^
892    //                             list[i]
893
894    // i == 0 means c is before the first range
895
896#ifdef DEBUG_US_ADD
897    printf("Add of ");
898    dump(c);
899    printf(" found at %d", i);
900    printf(": ");
901    dump(list, len);
902    printf(" => ");
903#endif
904
905    if (c == list[i]-1) {
906        // c is before start of next range
907        list[i] = c;
908        // if we touched the HIGH mark, then add a new one
909        if (c == (UNICODESET_HIGH - 1)) {
910            UErrorCode status = U_ZERO_ERROR;
911            ensureCapacity(len+1, status);
912            if (U_FAILURE(status)) {
913                return *this; // There is no way to report this error :-(
914            }
915            list[len++] = UNICODESET_HIGH;
916        }
917        if (i > 0 && c == list[i-1]) {
918            // collapse adjacent ranges
919
920            // [..., start_k-1, c, c, limit_k, ..., HIGH]
921            //                     ^
922            //                     list[i]
923
924            //for (int32_t k=i-1; k<len-2; ++k) {
925            //    list[k] = list[k+2];
926            //}
927            UChar32* dst = list + i - 1;
928            UChar32* src = dst + 2;
929            UChar32* srclimit = list + len;
930            while (src < srclimit) *(dst++) = *(src++);
931
932            len -= 2;
933        }
934    }
935
936    else if (i > 0 && c == list[i-1]) {
937        // c is after end of prior range
938        list[i-1]++;
939        // no need to check for collapse here
940    }
941
942    else {
943        // At this point we know the new char is not adjacent to
944        // any existing ranges, and it is not 10FFFF.
945
946
947        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
948        //                             ^
949        //                             list[i]
950
951        // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
952        //                             ^
953        //                             list[i]
954
955        UErrorCode status = U_ZERO_ERROR;
956        ensureCapacity(len+2, status);
957        if (U_FAILURE(status)) {
958            return *this; // There is no way to report this error :-(
959        }
960
961        //for (int32_t k=len-1; k>=i; --k) {
962        //    list[k+2] = list[k];
963        //}
964        UChar32* src = list + len;
965        UChar32* dst = src + 2;
966        UChar32* srclimit = list + i;
967        while (src > srclimit) *(--dst) = *(--src);
968
969        list[i] = c;
970        list[i+1] = c+1;
971        len += 2;
972    }
973
974#ifdef DEBUG_US_ADD
975    dump(list, len);
976    printf("\n");
977
978    for (i=1; i<len; ++i) {
979        if (list[i] <= list[i-1]) {
980            // Corrupt array!
981            printf("ERROR: list has been corrupted\n");
982            exit(1);
983        }
984    }
985#endif
986
987    releasePattern();
988    return *this;
989}
990
991/**
992 * Adds the specified multicharacter to this set if it is not already
993 * present.  If this set already contains the multicharacter,
994 * the call leaves this set unchanged.
995 * Thus "ch" => {"ch"}
996 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
997 * @param s the source string
998 * @return the modified set, for chaining
999 */
1000UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1001    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1002    int32_t cp = getSingleCP(s);
1003    if (cp < 0) {
1004        if (!strings->contains((void*) &s)) {
1005            _add(s);
1006            releasePattern();
1007        }
1008    } else {
1009        add((UChar32)cp);
1010    }
1011    return *this;
1012}
1013
1014/**
1015 * Adds the given string, in order, to 'strings'.  The given string
1016 * must have been checked by the caller to not be empty and to not
1017 * already be in 'strings'.
1018 */
1019void UnicodeSet::_add(const UnicodeString& s) {
1020    if (isFrozen() || isBogus()) {
1021        return;
1022    }
1023    UnicodeString* t = new UnicodeString(s);
1024    if (t == NULL) { // Check for memory allocation error.
1025        setToBogus();
1026        return;
1027    }
1028    UErrorCode ec = U_ZERO_ERROR;
1029    strings->sortedInsert(t, compareUnicodeString, ec);
1030    if (U_FAILURE(ec)) {
1031        setToBogus();
1032        delete t;
1033    }
1034}
1035
1036/**
1037 * @return a code point IF the string consists of a single one.
1038 * otherwise returns -1.
1039 * @param string to test
1040 */
1041int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1042    //if (s.length() < 1) {
1043    //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1044    //}
1045    if (s.length() > 2) return -1;
1046    if (s.length() == 1) return s.charAt(0);
1047
1048    // at this point, len = 2
1049    UChar32 cp = s.char32At(0);
1050    if (cp > 0xFFFF) { // is surrogate pair
1051        return cp;
1052    }
1053    return -1;
1054}
1055
1056/**
1057 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1058 * If this set already any particular character, it has no effect on that character.
1059 * @param the source string
1060 * @return the modified set, for chaining
1061 */
1062UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1063    UChar32 cp;
1064    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1065        cp = s.char32At(i);
1066        add(cp);
1067    }
1068    return *this;
1069}
1070
1071/**
1072 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1073 * If this set already any particular character, it has no effect on that character.
1074 * @param the source string
1075 * @return the modified set, for chaining
1076 */
1077UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1078    UnicodeSet set;
1079    set.addAll(s);
1080    retainAll(set);
1081    return *this;
1082}
1083
1084/**
1085 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1086 * If this set already any particular character, it has no effect on that character.
1087 * @param the source string
1088 * @return the modified set, for chaining
1089 */
1090UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1091    UnicodeSet set;
1092    set.addAll(s);
1093    complementAll(set);
1094    return *this;
1095}
1096
1097/**
1098 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1099 * If this set already any particular character, it has no effect on that character.
1100 * @param the source string
1101 * @return the modified set, for chaining
1102 */
1103UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1104    UnicodeSet set;
1105    set.addAll(s);
1106    removeAll(set);
1107    return *this;
1108}
1109
1110UnicodeSet& UnicodeSet::removeAllStrings() {
1111    strings->removeAllElements();
1112    return *this;
1113}
1114
1115
1116/**
1117 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1118 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1119 * @param the source string
1120 * @return a newly created set containing the given string
1121 */
1122UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1123    UnicodeSet *set = new UnicodeSet();
1124    if (set != NULL) { // Check for memory allocation error.
1125        set->add(s);
1126    }
1127    return set;
1128}
1129
1130
1131/**
1132 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1133 * @param the source string
1134 * @return a newly created set containing the given characters
1135 */
1136UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1137    UnicodeSet *set = new UnicodeSet();
1138    if (set != NULL) { // Check for memory allocation error.
1139        set->addAll(s);
1140    }
1141    return set;
1142}
1143
1144/**
1145 * Retain only the elements in this set that are contained in the
1146 * specified range.  If <code>end > start</code> then an empty range is
1147 * retained, leaving the set empty.
1148 *
1149 * @param start first character, inclusive, of range to be retained
1150 * to this set.
1151 * @param end last character, inclusive, of range to be retained
1152 * to this set.
1153 */
1154UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1155    if (pinCodePoint(start) <= pinCodePoint(end)) {
1156        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1157        retain(range, 2, 0);
1158    } else {
1159        clear();
1160    }
1161    return *this;
1162}
1163
1164UnicodeSet& UnicodeSet::retain(UChar32 c) {
1165    return retain(c, c);
1166}
1167
1168/**
1169 * Removes the specified range from this set if it is present.
1170 * The set will not contain the specified range once the call
1171 * returns.  If <code>end > start</code> then an empty range is
1172 * removed, leaving the set unchanged.
1173 *
1174 * @param start first character, inclusive, of range to be removed
1175 * from this set.
1176 * @param end last character, inclusive, of range to be removed
1177 * from this set.
1178 */
1179UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1180    if (pinCodePoint(start) <= pinCodePoint(end)) {
1181        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1182        retain(range, 2, 2);
1183    }
1184    return *this;
1185}
1186
1187/**
1188 * Removes the specified character from this set if it is present.
1189 * The set will not contain the specified range once the call
1190 * returns.
1191 */
1192UnicodeSet& UnicodeSet::remove(UChar32 c) {
1193    return remove(c, c);
1194}
1195
1196/**
1197 * Removes the specified string from this set if it is present.
1198 * The set will not contain the specified character once the call
1199 * returns.
1200 * @param the source string
1201 * @return the modified set, for chaining
1202 */
1203UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1204    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1205    int32_t cp = getSingleCP(s);
1206    if (cp < 0) {
1207        strings->removeElement((void*) &s);
1208        releasePattern();
1209    } else {
1210        remove((UChar32)cp, (UChar32)cp);
1211    }
1212    return *this;
1213}
1214
1215/**
1216 * Complements the specified range in this set.  Any character in
1217 * the range will be removed if it is in this set, or will be
1218 * added if it is not in this set.  If <code>end > start</code>
1219 * then an empty range is xor'ed, leaving the set unchanged.
1220 *
1221 * @param start first character, inclusive, of range to be removed
1222 * from this set.
1223 * @param end last character, inclusive, of range to be removed
1224 * from this set.
1225 */
1226UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1227    if (isFrozen() || isBogus()) {
1228        return *this;
1229    }
1230    if (pinCodePoint(start) <= pinCodePoint(end)) {
1231        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1232        exclusiveOr(range, 2, 0);
1233    }
1234    releasePattern();
1235    return *this;
1236}
1237
1238UnicodeSet& UnicodeSet::complement(UChar32 c) {
1239    return complement(c, c);
1240}
1241
1242/**
1243 * This is equivalent to
1244 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1245 */
1246UnicodeSet& UnicodeSet::complement(void) {
1247    if (isFrozen() || isBogus()) {
1248        return *this;
1249    }
1250    UErrorCode status = U_ZERO_ERROR;
1251    if (list[0] == UNICODESET_LOW) {
1252        ensureBufferCapacity(len-1, status);
1253        if (U_FAILURE(status)) {
1254            return *this;
1255        }
1256        uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1257        --len;
1258    } else {
1259        ensureBufferCapacity(len+1, status);
1260        if (U_FAILURE(status)) {
1261            return *this;
1262        }
1263        uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1264        buffer[0] = UNICODESET_LOW;
1265        ++len;
1266    }
1267    swapBuffers();
1268    releasePattern();
1269    return *this;
1270}
1271
1272/**
1273 * Complement the specified string in this set.
1274 * The set will not contain the specified string once the call
1275 * returns.
1276 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1277 * @param s the string to complement
1278 * @return this object, for chaining
1279 */
1280UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1281    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1282    int32_t cp = getSingleCP(s);
1283    if (cp < 0) {
1284        if (strings->contains((void*) &s)) {
1285            strings->removeElement((void*) &s);
1286        } else {
1287            _add(s);
1288        }
1289        releasePattern();
1290    } else {
1291        complement((UChar32)cp, (UChar32)cp);
1292    }
1293    return *this;
1294}
1295
1296/**
1297 * Adds all of the elements in the specified set to this set if
1298 * they're not already present.  This operation effectively
1299 * modifies this set so that its value is the <i>union</i> of the two
1300 * sets.  The behavior of this operation is unspecified if the specified
1301 * collection is modified while the operation is in progress.
1302 *
1303 * @param c set whose elements are to be added to this set.
1304 * @see #add(char, char)
1305 */
1306UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1307    if ( c.len>0 && c.list!=NULL ) {
1308        add(c.list, c.len, 0);
1309    }
1310
1311    // Add strings in order
1312    if ( c.strings!=NULL ) {
1313        for (int32_t i=0; i<c.strings->size(); ++i) {
1314            const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1315            if (!strings->contains((void*) s)) {
1316                _add(*s);
1317            }
1318        }
1319    }
1320    return *this;
1321}
1322
1323/**
1324 * Retains only the elements in this set that are contained in the
1325 * specified set.  In other words, removes from this set all of
1326 * its elements that are not contained in the specified set.  This
1327 * operation effectively modifies this set so that its value is
1328 * the <i>intersection</i> of the two sets.
1329 *
1330 * @param c set that defines which elements this set will retain.
1331 */
1332UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1333    if (isFrozen() || isBogus()) {
1334        return *this;
1335    }
1336    retain(c.list, c.len, 0);
1337    strings->retainAll(*c.strings);
1338    return *this;
1339}
1340
1341/**
1342 * Removes from this set all of its elements that are contained in the
1343 * specified set.  This operation effectively modifies this
1344 * set so that its value is the <i>asymmetric set difference</i> of
1345 * the two sets.
1346 *
1347 * @param c set that defines which elements will be removed from
1348 *          this set.
1349 */
1350UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1351    if (isFrozen() || isBogus()) {
1352        return *this;
1353    }
1354    retain(c.list, c.len, 2);
1355    strings->removeAll(*c.strings);
1356    return *this;
1357}
1358
1359/**
1360 * Complements in this set all elements contained in the specified
1361 * set.  Any character in the other set will be removed if it is
1362 * in this set, or will be added if it is not in this set.
1363 *
1364 * @param c set that defines which elements will be xor'ed from
1365 *          this set.
1366 */
1367UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1368    if (isFrozen() || isBogus()) {
1369        return *this;
1370    }
1371    exclusiveOr(c.list, c.len, 0);
1372
1373    for (int32_t i=0; i<c.strings->size(); ++i) {
1374        void* e = c.strings->elementAt(i);
1375        if (!strings->removeElement(e)) {
1376            _add(*(const UnicodeString*)e);
1377        }
1378    }
1379    return *this;
1380}
1381
1382/**
1383 * Removes all of the elements from this set.  This set will be
1384 * empty after this call returns.
1385 */
1386UnicodeSet& UnicodeSet::clear(void) {
1387    if (isFrozen()) {
1388        return *this;
1389    }
1390    if (list != NULL) {
1391        list[0] = UNICODESET_HIGH;
1392    }
1393    len = 1;
1394    releasePattern();
1395    if (strings != NULL) {
1396        strings->removeAllElements();
1397    }
1398    if (list != NULL && strings != NULL) {
1399        // Remove bogus
1400        fFlags = 0;
1401    }
1402    return *this;
1403}
1404
1405/**
1406 * Iteration method that returns the number of ranges contained in
1407 * this set.
1408 * @see #getRangeStart
1409 * @see #getRangeEnd
1410 */
1411int32_t UnicodeSet::getRangeCount() const {
1412    return len/2;
1413}
1414
1415/**
1416 * Iteration method that returns the first character in the
1417 * specified range of this set.
1418 * @see #getRangeCount
1419 * @see #getRangeEnd
1420 */
1421UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1422    return list[index*2];
1423}
1424
1425/**
1426 * Iteration method that returns the last character in the
1427 * specified range of this set.
1428 * @see #getRangeStart
1429 * @see #getRangeEnd
1430 */
1431UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1432    return list[index*2 + 1] - 1;
1433}
1434
1435int32_t UnicodeSet::getStringCount() const {
1436    return strings->size();
1437}
1438
1439const UnicodeString* UnicodeSet::getString(int32_t index) const {
1440    return (const UnicodeString*) strings->elementAt(index);
1441}
1442
1443/**
1444 * Reallocate this objects internal structures to take up the least
1445 * possible space, without changing this object's value.
1446 */
1447UnicodeSet& UnicodeSet::compact() {
1448    if (isFrozen() || isBogus()) {
1449        return *this;
1450    }
1451    // Delete buffer first to defragment memory less.
1452    if (buffer != NULL) {
1453        uprv_free(buffer);
1454        buffer = NULL;
1455    }
1456    if (len < capacity) {
1457        // Make the capacity equal to len or 1.
1458        // We don't want to realloc of 0 size.
1459        int32_t newCapacity = len + (len == 0);
1460        UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1461        if (temp) {
1462            list = temp;
1463            capacity = newCapacity;
1464        }
1465        // else what the heck happened?! We allocated less memory!
1466        // Oh well. We'll keep our original array.
1467    }
1468    return *this;
1469}
1470
1471#ifdef DEBUG_SERIALIZE
1472#include <stdio.h>
1473#endif
1474
1475/**
1476 * Deserialize constructor.
1477 */
1478UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
1479  : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
1480    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
1481    fFlags(0) {
1482
1483  if(U_FAILURE(ec)) {
1484    setToBogus();
1485    return;
1486  }
1487
1488  if( (serialization != kSerialized)
1489      || (data==NULL)
1490      || (dataLen < 1)) {
1491    ec = U_ILLEGAL_ARGUMENT_ERROR;
1492    setToBogus();
1493    return;
1494  }
1495
1496  allocateStrings(ec);
1497  if (U_FAILURE(ec)) {
1498    setToBogus();
1499    return;
1500  }
1501
1502  // bmp?
1503  int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1504  int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1505
1506  len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1507#ifdef DEBUG_SERIALIZE
1508  printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
1509#endif
1510  capacity = len+1;
1511  list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1512  if(!list || U_FAILURE(ec)) {
1513    setToBogus();
1514    return;
1515  }
1516  // copy bmp
1517  int32_t i;
1518  for(i = 0; i< bmpLength;i++) {
1519    list[i] = data[i+headerSize];
1520#ifdef DEBUG_SERIALIZE
1521    printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1522#endif
1523  }
1524  // copy smp
1525  for(i=bmpLength;i<len;i++) {
1526    list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1527              ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1528#ifdef DEBUG_SERIALIZE
1529    printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1530#endif
1531  }
1532  // terminator
1533  list[len++]=UNICODESET_HIGH;
1534}
1535
1536
1537int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1538    int32_t bmpLength, length, destLength;
1539
1540    if (U_FAILURE(ec)) {
1541        return 0;
1542    }
1543
1544    if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1545        ec=U_ILLEGAL_ARGUMENT_ERROR;
1546        return 0;
1547    }
1548
1549    /* count necessary 16-bit units */
1550    length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1551    // assert(length>=0);
1552    if (length==0) {
1553        /* empty set */
1554        if (destCapacity>0) {
1555            *dest=0;
1556        } else {
1557            ec=U_BUFFER_OVERFLOW_ERROR;
1558        }
1559        return 1;
1560    }
1561    /* now length>0 */
1562
1563    if (this->list[length-1]<=0xffff) {
1564        /* all BMP */
1565        bmpLength=length;
1566    } else if (this->list[0]>=0x10000) {
1567        /* all supplementary */
1568        bmpLength=0;
1569        length*=2;
1570    } else {
1571        /* some BMP, some supplementary */
1572        for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1573        length=bmpLength+2*(length-bmpLength);
1574    }
1575#ifdef DEBUG_SERIALIZE
1576    printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1577#endif
1578    /* length: number of 16-bit array units */
1579    if (length>0x7fff) {
1580        /* there are only 15 bits for the length in the first serialized word */
1581        ec=U_INDEX_OUTOFBOUNDS_ERROR;
1582        return 0;
1583    }
1584
1585    /*
1586     * total serialized length:
1587     * number of 16-bit array units (length) +
1588     * 1 length unit (always) +
1589     * 1 bmpLength unit (if there are supplementary values)
1590     */
1591    destLength=length+((length>bmpLength)?2:1);
1592    if (destLength<=destCapacity) {
1593        const UChar32 *p;
1594        int32_t i;
1595
1596#ifdef DEBUG_SERIALIZE
1597        printf("writeHdr\n");
1598#endif
1599        *dest=(uint16_t)length;
1600        if (length>bmpLength) {
1601            *dest|=0x8000;
1602            *++dest=(uint16_t)bmpLength;
1603        }
1604        ++dest;
1605
1606        /* write the BMP part of the array */
1607        p=this->list;
1608        for (i=0; i<bmpLength; ++i) {
1609#ifdef DEBUG_SERIALIZE
1610          printf("writebmp: %x\n", (int)*p);
1611#endif
1612            *dest++=(uint16_t)*p++;
1613        }
1614
1615        /* write the supplementary part of the array */
1616        for (; i<length; i+=2) {
1617#ifdef DEBUG_SERIALIZE
1618          printf("write32: %x\n", (int)*p);
1619#endif
1620            *dest++=(uint16_t)(*p>>16);
1621            *dest++=(uint16_t)*p++;
1622        }
1623    } else {
1624        ec=U_BUFFER_OVERFLOW_ERROR;
1625    }
1626    return destLength;
1627}
1628
1629//----------------------------------------------------------------
1630// Implementation: Utility methods
1631//----------------------------------------------------------------
1632
1633/**
1634 * Allocate our strings vector and return TRUE if successful.
1635 */
1636UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1637    if (U_FAILURE(status)) {
1638        return FALSE;
1639    }
1640    strings = new UVector(uprv_deleteUObject,
1641                          uhash_compareUnicodeString, 1, status);
1642    if (strings == NULL) { // Check for memory allocation error.
1643        status = U_MEMORY_ALLOCATION_ERROR;
1644        return FALSE;
1645    }
1646    if (U_FAILURE(status)) {
1647        delete strings;
1648        strings = NULL;
1649        return FALSE;
1650    }
1651    return TRUE;
1652}
1653
1654void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1655    if (newLen <= capacity)
1656        return;
1657    UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1658    if (temp == NULL) {
1659        ec = U_MEMORY_ALLOCATION_ERROR;
1660        setToBogus();
1661        return;
1662    }
1663    list = temp;
1664    capacity = newLen + GROW_EXTRA;
1665    // else we keep the original contents on the memory failure.
1666}
1667
1668void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1669    if (buffer != NULL && newLen <= bufferCapacity)
1670        return;
1671    UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1672    if (temp == NULL) {
1673        ec = U_MEMORY_ALLOCATION_ERROR;
1674        setToBogus();
1675        return;
1676    }
1677    buffer = temp;
1678    bufferCapacity = newLen + GROW_EXTRA;
1679    // else we keep the original contents on the memory failure.
1680}
1681
1682/**
1683 * Swap list and buffer.
1684 */
1685void UnicodeSet::swapBuffers(void) {
1686    // swap list and buffer
1687    UChar32* temp = list;
1688    list = buffer;
1689    buffer = temp;
1690
1691    int32_t c = capacity;
1692    capacity = bufferCapacity;
1693    bufferCapacity = c;
1694}
1695
1696void UnicodeSet::setToBogus() {
1697    clear(); // Remove everything in the set.
1698    fFlags = kIsBogus;
1699}
1700
1701//----------------------------------------------------------------
1702// Implementation: Fundamental operators
1703//----------------------------------------------------------------
1704
1705static inline UChar32 max(UChar32 a, UChar32 b) {
1706    return (a > b) ? a : b;
1707}
1708
1709// polarity = 0, 3 is normal: x xor y
1710// polarity = 1, 2: x xor ~y == x === y
1711
1712void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1713    if (isFrozen() || isBogus()) {
1714        return;
1715    }
1716    UErrorCode status = U_ZERO_ERROR;
1717    ensureBufferCapacity(len + otherLen, status);
1718    if (U_FAILURE(status)) {
1719        return;
1720    }
1721
1722    int32_t i = 0, j = 0, k = 0;
1723    UChar32 a = list[i++];
1724    UChar32 b;
1725    if (polarity == 1 || polarity == 2) {
1726        b = UNICODESET_LOW;
1727        if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1728            ++j;
1729            b = other[j];
1730        }
1731    } else {
1732        b = other[j++];
1733    }
1734    // simplest of all the routines
1735    // sort the values, discarding identicals!
1736    for (;;) {
1737        if (a < b) {
1738            buffer[k++] = a;
1739            a = list[i++];
1740        } else if (b < a) {
1741            buffer[k++] = b;
1742            b = other[j++];
1743        } else if (a != UNICODESET_HIGH) { // at this point, a == b
1744            // discard both values!
1745            a = list[i++];
1746            b = other[j++];
1747        } else { // DONE!
1748            buffer[k++] = UNICODESET_HIGH;
1749            len = k;
1750            break;
1751        }
1752    }
1753    swapBuffers();
1754    releasePattern();
1755}
1756
1757// polarity = 0 is normal: x union y
1758// polarity = 2: x union ~y
1759// polarity = 1: ~x union y
1760// polarity = 3: ~x union ~y
1761
1762void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1763    if (isFrozen() || isBogus() || other==NULL) {
1764        return;
1765    }
1766    UErrorCode status = U_ZERO_ERROR;
1767    ensureBufferCapacity(len + otherLen, status);
1768    if (U_FAILURE(status)) {
1769        return;
1770    }
1771
1772    int32_t i = 0, j = 0, k = 0;
1773    UChar32 a = list[i++];
1774    UChar32 b = other[j++];
1775    // change from xor is that we have to check overlapping pairs
1776    // polarity bit 1 means a is second, bit 2 means b is.
1777    for (;;) {
1778        switch (polarity) {
1779          case 0: // both first; take lower if unequal
1780            if (a < b) { // take a
1781                // Back up over overlapping ranges in buffer[]
1782                if (k > 0 && a <= buffer[k-1]) {
1783                    // Pick latter end value in buffer[] vs. list[]
1784                    a = max(list[i], buffer[--k]);
1785                } else {
1786                    // No overlap
1787                    buffer[k++] = a;
1788                    a = list[i];
1789                }
1790                i++; // Common if/else code factored out
1791                polarity ^= 1;
1792            } else if (b < a) { // take b
1793                if (k > 0 && b <= buffer[k-1]) {
1794                    b = max(other[j], buffer[--k]);
1795                } else {
1796                    buffer[k++] = b;
1797                    b = other[j];
1798                }
1799                j++;
1800                polarity ^= 2;
1801            } else { // a == b, take a, drop b
1802                if (a == UNICODESET_HIGH) goto loop_end;
1803                // This is symmetrical; it doesn't matter if
1804                // we backtrack with a or b. - liu
1805                if (k > 0 && a <= buffer[k-1]) {
1806                    a = max(list[i], buffer[--k]);
1807                } else {
1808                    // No overlap
1809                    buffer[k++] = a;
1810                    a = list[i];
1811                }
1812                i++;
1813                polarity ^= 1;
1814                b = other[j++];
1815                polarity ^= 2;
1816            }
1817            break;
1818          case 3: // both second; take higher if unequal, and drop other
1819            if (b <= a) { // take a
1820                if (a == UNICODESET_HIGH) goto loop_end;
1821                buffer[k++] = a;
1822            } else { // take b
1823                if (b == UNICODESET_HIGH) goto loop_end;
1824                buffer[k++] = b;
1825            }
1826            a = list[i++];
1827            polarity ^= 1;   // factored common code
1828            b = other[j++];
1829            polarity ^= 2;
1830            break;
1831          case 1: // a second, b first; if b < a, overlap
1832            if (a < b) { // no overlap, take a
1833                buffer[k++] = a; a = list[i++]; polarity ^= 1;
1834            } else if (b < a) { // OVERLAP, drop b
1835                b = other[j++];
1836                polarity ^= 2;
1837            } else { // a == b, drop both!
1838                if (a == UNICODESET_HIGH) goto loop_end;
1839                a = list[i++];
1840                polarity ^= 1;
1841                b = other[j++];
1842                polarity ^= 2;
1843            }
1844            break;
1845          case 2: // a first, b second; if a < b, overlap
1846            if (b < a) { // no overlap, take b
1847                buffer[k++] = b;
1848                b = other[j++];
1849                polarity ^= 2;
1850            } else  if (a < b) { // OVERLAP, drop a
1851                a = list[i++];
1852                polarity ^= 1;
1853            } else { // a == b, drop both!
1854                if (a == UNICODESET_HIGH) goto loop_end;
1855                a = list[i++];
1856                polarity ^= 1;
1857                b = other[j++];
1858                polarity ^= 2;
1859            }
1860            break;
1861        }
1862    }
1863 loop_end:
1864    buffer[k++] = UNICODESET_HIGH;    // terminate
1865    len = k;
1866    swapBuffers();
1867    releasePattern();
1868}
1869
1870// polarity = 0 is normal: x intersect y
1871// polarity = 2: x intersect ~y == set-minus
1872// polarity = 1: ~x intersect y
1873// polarity = 3: ~x intersect ~y
1874
1875void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1876    if (isFrozen() || isBogus()) {
1877        return;
1878    }
1879    UErrorCode status = U_ZERO_ERROR;
1880    ensureBufferCapacity(len + otherLen, status);
1881    if (U_FAILURE(status)) {
1882        return;
1883    }
1884
1885    int32_t i = 0, j = 0, k = 0;
1886    UChar32 a = list[i++];
1887    UChar32 b = other[j++];
1888    // change from xor is that we have to check overlapping pairs
1889    // polarity bit 1 means a is second, bit 2 means b is.
1890    for (;;) {
1891        switch (polarity) {
1892          case 0: // both first; drop the smaller
1893            if (a < b) { // drop a
1894                a = list[i++];
1895                polarity ^= 1;
1896            } else if (b < a) { // drop b
1897                b = other[j++];
1898                polarity ^= 2;
1899            } else { // a == b, take one, drop other
1900                if (a == UNICODESET_HIGH) goto loop_end;
1901                buffer[k++] = a;
1902                a = list[i++];
1903                polarity ^= 1;
1904                b = other[j++];
1905                polarity ^= 2;
1906            }
1907            break;
1908          case 3: // both second; take lower if unequal
1909            if (a < b) { // take a
1910                buffer[k++] = a;
1911                a = list[i++];
1912                polarity ^= 1;
1913            } else if (b < a) { // take b
1914                buffer[k++] = b;
1915                b = other[j++];
1916                polarity ^= 2;
1917            } else { // a == b, take one, drop other
1918                if (a == UNICODESET_HIGH) goto loop_end;
1919                buffer[k++] = a;
1920                a = list[i++];
1921                polarity ^= 1;
1922                b = other[j++];
1923                polarity ^= 2;
1924            }
1925            break;
1926          case 1: // a second, b first;
1927            if (a < b) { // NO OVERLAP, drop a
1928                a = list[i++];
1929                polarity ^= 1;
1930            } else if (b < a) { // OVERLAP, take b
1931                buffer[k++] = b;
1932                b = other[j++];
1933                polarity ^= 2;
1934            } else { // a == b, drop both!
1935                if (a == UNICODESET_HIGH) goto loop_end;
1936                a = list[i++];
1937                polarity ^= 1;
1938                b = other[j++];
1939                polarity ^= 2;
1940            }
1941            break;
1942          case 2: // a first, b second; if a < b, overlap
1943            if (b < a) { // no overlap, drop b
1944                b = other[j++];
1945                polarity ^= 2;
1946            } else  if (a < b) { // OVERLAP, take a
1947                buffer[k++] = a;
1948                a = list[i++];
1949                polarity ^= 1;
1950            } else { // a == b, drop both!
1951                if (a == UNICODESET_HIGH) goto loop_end;
1952                a = list[i++];
1953                polarity ^= 1;
1954                b = other[j++];
1955                polarity ^= 2;
1956            }
1957            break;
1958        }
1959    }
1960 loop_end:
1961    buffer[k++] = UNICODESET_HIGH;    // terminate
1962    len = k;
1963    swapBuffers();
1964    releasePattern();
1965}
1966
1967/**
1968 * Append the <code>toPattern()</code> representation of a
1969 * string to the given <code>StringBuffer</code>.
1970 */
1971void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1972escapeUnprintable) {
1973    UChar32 cp;
1974    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1975        _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1976    }
1977}
1978
1979/**
1980 * Append the <code>toPattern()</code> representation of a
1981 * character to the given <code>StringBuffer</code>.
1982 */
1983void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1984escapeUnprintable) {
1985    if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1986        // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1987        // unprintable
1988        if (ICU_Utility::escapeUnprintable(buf, c)) {
1989            return;
1990        }
1991    }
1992    // Okay to let ':' pass through
1993    switch (c) {
1994    case SET_OPEN:
1995    case SET_CLOSE:
1996    case HYPHEN:
1997    case COMPLEMENT:
1998    case INTERSECTION:
1999    case BACKSLASH:
2000    case OPEN_BRACE:
2001    case CLOSE_BRACE:
2002    case COLON:
2003    case SymbolTable::SYMBOL_REF:
2004        buf.append(BACKSLASH);
2005        break;
2006    default:
2007        // Escape whitespace
2008        if (PatternProps::isWhiteSpace(c)) {
2009            buf.append(BACKSLASH);
2010        }
2011        break;
2012    }
2013    buf.append(c);
2014}
2015
2016/**
2017 * Append a string representation of this set to result.  This will be
2018 * a cleaned version of the string passed to applyPattern(), if there
2019 * is one.  Otherwise it will be generated.
2020 */
2021UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2022                                      UBool escapeUnprintable) const
2023{
2024    if (pat != NULL) {
2025        int32_t i;
2026        int32_t backslashCount = 0;
2027        for (i=0; i<patLen; ) {
2028            UChar32 c;
2029            U16_NEXT(pat, i, patLen, c);
2030            if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2031                // If the unprintable character is preceded by an odd
2032                // number of backslashes, then it has been escaped.
2033                // Before unescaping it, we delete the final
2034                // backslash.
2035                if ((backslashCount % 2) == 1) {
2036                    result.truncate(result.length() - 1);
2037                }
2038                ICU_Utility::escapeUnprintable(result, c);
2039                backslashCount = 0;
2040            } else {
2041                result.append(c);
2042                if (c == BACKSLASH) {
2043                    ++backslashCount;
2044                } else {
2045                    backslashCount = 0;
2046                }
2047            }
2048        }
2049        return result;
2050    }
2051
2052    return _generatePattern(result, escapeUnprintable);
2053}
2054
2055/**
2056 * Returns a string representation of this set.  If the result of
2057 * calling this function is passed to a UnicodeSet constructor, it
2058 * will produce another set that is equal to this one.
2059 */
2060UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2061                                     UBool escapeUnprintable) const
2062{
2063    result.truncate(0);
2064    return _toPattern(result, escapeUnprintable);
2065}
2066
2067/**
2068 * Generate and append a string representation of this set to result.
2069 * This does not use this.pat, the cleaned up copy of the string
2070 * passed to applyPattern().
2071 */
2072UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2073                                            UBool escapeUnprintable) const
2074{
2075    result.append(SET_OPEN);
2076
2077//  // Check against the predefined categories.  We implicitly build
2078//  // up ALL category sets the first time toPattern() is called.
2079//  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2080//      if (*this == getCategorySet(cat)) {
2081//          result.append(COLON);
2082//          result.append(CATEGORY_NAMES, cat*2, 2);
2083//          return result.append(CATEGORY_CLOSE);
2084//      }
2085//  }
2086
2087    int32_t count = getRangeCount();
2088
2089    // If the set contains at least 2 intervals and includes both
2090    // MIN_VALUE and MAX_VALUE, then the inverse representation will
2091    // be more economical.
2092    if (count > 1 &&
2093        getRangeStart(0) == MIN_VALUE &&
2094        getRangeEnd(count-1) == MAX_VALUE) {
2095
2096        // Emit the inverse
2097        result.append(COMPLEMENT);
2098
2099        for (int32_t i = 1; i < count; ++i) {
2100            UChar32 start = getRangeEnd(i-1)+1;
2101            UChar32 end = getRangeStart(i)-1;
2102            _appendToPat(result, start, escapeUnprintable);
2103            if (start != end) {
2104                if ((start+1) != end) {
2105                    result.append(HYPHEN);
2106                }
2107                _appendToPat(result, end, escapeUnprintable);
2108            }
2109        }
2110    }
2111
2112    // Default; emit the ranges as pairs
2113    else {
2114        for (int32_t i = 0; i < count; ++i) {
2115            UChar32 start = getRangeStart(i);
2116            UChar32 end = getRangeEnd(i);
2117            _appendToPat(result, start, escapeUnprintable);
2118            if (start != end) {
2119                if ((start+1) != end) {
2120                    result.append(HYPHEN);
2121                }
2122                _appendToPat(result, end, escapeUnprintable);
2123            }
2124        }
2125    }
2126
2127    for (int32_t i = 0; i<strings->size(); ++i) {
2128        result.append(OPEN_BRACE);
2129        _appendToPat(result,
2130                     *(const UnicodeString*) strings->elementAt(i),
2131                     escapeUnprintable);
2132        result.append(CLOSE_BRACE);
2133    }
2134    return result.append(SET_CLOSE);
2135}
2136
2137/**
2138* Release existing cached pattern
2139*/
2140void UnicodeSet::releasePattern() {
2141    if (pat) {
2142        uprv_free(pat);
2143        pat = NULL;
2144        patLen = 0;
2145    }
2146}
2147
2148/**
2149* Set the new pattern to cache.
2150*/
2151void UnicodeSet::setPattern(const UnicodeString& newPat) {
2152    releasePattern();
2153    int32_t newPatLen = newPat.length();
2154    pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2155    if (pat) {
2156        patLen = newPatLen;
2157        newPat.extractBetween(0, patLen, pat);
2158        pat[patLen] = 0;
2159    }
2160    // else we don't care if malloc failed. This was just a nice cache.
2161    // We can regenerate an equivalent pattern later when requested.
2162}
2163
2164UnicodeFunctor *UnicodeSet::freeze() {
2165    if(!isFrozen() && !isBogus()) {
2166        // Do most of what compact() does before freezing because
2167        // compact() will not work when the set is frozen.
2168        // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2169
2170        // Delete buffer first to defragment memory less.
2171        if (buffer != NULL) {
2172            uprv_free(buffer);
2173            buffer = NULL;
2174        }
2175        if (capacity > (len + GROW_EXTRA)) {
2176            // Make the capacity equal to len or 1.
2177            // We don't want to realloc of 0 size.
2178            capacity = len + (len == 0);
2179            list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2180            if (list == NULL) { // Check for memory allocation error.
2181                setToBogus();
2182                return this;
2183            }
2184        }
2185
2186        // Optimize contains() and span() and similar functions.
2187        if (!strings->isEmpty()) {
2188            stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2189            if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2190                // All strings are irrelevant for span() etc. because
2191                // all of each string's code points are contained in this set.
2192                // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2193                // many relevant strings as UTF-16.
2194                // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2195                delete stringSpan;
2196                stringSpan = NULL;
2197            }
2198        }
2199        if (stringSpan == NULL) {
2200            // No span-relevant strings: Optimize for code point spans.
2201            bmpSet=new BMPSet(list, len);
2202            if (bmpSet == NULL) { // Check for memory allocation error.
2203                setToBogus();
2204            }
2205        }
2206    }
2207    return this;
2208}
2209
2210int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2211    if(length>0 && bmpSet!=NULL) {
2212        return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2213    }
2214    if(length<0) {
2215        length=u_strlen(s);
2216    }
2217    if(length==0) {
2218        return 0;
2219    }
2220    if(stringSpan!=NULL) {
2221        return stringSpan->span(s, length, spanCondition);
2222    } else if(!strings->isEmpty()) {
2223        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2224                            UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2225                            UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2226        UnicodeSetStringSpan strSpan(*this, *strings, which);
2227        if(strSpan.needsStringSpanUTF16()) {
2228            return strSpan.span(s, length, spanCondition);
2229        }
2230    }
2231
2232    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2233        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2234    }
2235
2236    UChar32 c;
2237    int32_t start=0, prev=0;
2238    do {
2239        U16_NEXT(s, start, length, c);
2240        if(spanCondition!=contains(c)) {
2241            break;
2242        }
2243    } while((prev=start)<length);
2244    return prev;
2245}
2246
2247int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2248    if(length>0 && bmpSet!=NULL) {
2249        return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2250    }
2251    if(length<0) {
2252        length=u_strlen(s);
2253    }
2254    if(length==0) {
2255        return 0;
2256    }
2257    if(stringSpan!=NULL) {
2258        return stringSpan->spanBack(s, length, spanCondition);
2259    } else if(!strings->isEmpty()) {
2260        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2261                            UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2262                            UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2263        UnicodeSetStringSpan strSpan(*this, *strings, which);
2264        if(strSpan.needsStringSpanUTF16()) {
2265            return strSpan.spanBack(s, length, spanCondition);
2266        }
2267    }
2268
2269    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2270        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2271    }
2272
2273    UChar32 c;
2274    int32_t prev=length;
2275    do {
2276        U16_PREV(s, 0, length, c);
2277        if(spanCondition!=contains(c)) {
2278            break;
2279        }
2280    } while((prev=length)>0);
2281    return prev;
2282}
2283
2284int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2285    if(length>0 && bmpSet!=NULL) {
2286        const uint8_t *s0=(const uint8_t *)s;
2287        return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2288    }
2289    if(length<0) {
2290        length=(int32_t)uprv_strlen(s);
2291    }
2292    if(length==0) {
2293        return 0;
2294    }
2295    if(stringSpan!=NULL) {
2296        return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2297    } else if(!strings->isEmpty()) {
2298        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2299                            UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2300                            UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2301        UnicodeSetStringSpan strSpan(*this, *strings, which);
2302        if(strSpan.needsStringSpanUTF8()) {
2303            return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2304        }
2305    }
2306
2307    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2308        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2309    }
2310
2311    UChar32 c;
2312    int32_t start=0, prev=0;
2313    do {
2314        U8_NEXT_OR_FFFD(s, start, length, c);
2315        if(spanCondition!=contains(c)) {
2316            break;
2317        }
2318    } while((prev=start)<length);
2319    return prev;
2320}
2321
2322int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2323    if(length>0 && bmpSet!=NULL) {
2324        const uint8_t *s0=(const uint8_t *)s;
2325        return bmpSet->spanBackUTF8(s0, length, spanCondition);
2326    }
2327    if(length<0) {
2328        length=(int32_t)uprv_strlen(s);
2329    }
2330    if(length==0) {
2331        return 0;
2332    }
2333    if(stringSpan!=NULL) {
2334        return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2335    } else if(!strings->isEmpty()) {
2336        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2337                            UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2338                            UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2339        UnicodeSetStringSpan strSpan(*this, *strings, which);
2340        if(strSpan.needsStringSpanUTF8()) {
2341            return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2342        }
2343    }
2344
2345    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2346        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2347    }
2348
2349    UChar32 c;
2350    int32_t prev=length;
2351    do {
2352        U8_PREV_OR_FFFD(s, 0, length, c);
2353        if(spanCondition!=contains(c)) {
2354            break;
2355        }
2356    } while((prev=length)>0);
2357    return prev;
2358}
2359
2360U_NAMESPACE_END
2361