1/*
2********************************************************************************
3*   Copyright (C) 1999-2010 International Business Machines Corporation and
4*   others. All Rights Reserved.
5********************************************************************************
6*   Date        Name        Description
7*   10/20/99    alan        Creation.
8*   03/22/2000  Madhu       Added additional tests
9********************************************************************************
10*/
11
12#include <stdio.h>
13
14#include <string.h>
15#include "unicode/utypes.h"
16#include "usettest.h"
17#include "unicode/ucnv.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/usetiter.h"
21#include "unicode/ustring.h"
22#include "unicode/parsepos.h"
23#include "unicode/symtable.h"
24#include "unicode/uversion.h"
25#include "hash.h"
26
// Number of elements in a fixed-size array, cast to int32_t.
// The argument must be an actual array (not a pointer), since the count is
// derived from sizeof.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
28
// If `status` holds a failure code, reports it through dataerrln() together
// with the file name, line number and u_errorName(status); otherwise does
// nothing. Expands to a call to dataerrln(), so it can only be used where
// that name is in scope.
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
    dataerrln("fail in file \"%s\", line %d: \"%s\"", __FILE__, __LINE__, \
    u_errorName(status));}}
32
// If `expr` evaluates to false, reports the file name and line number through
// dataerrln(); execution then continues (the macro does not return or abort).
// Expands to a call to dataerrln(), so it can only be used where that name is
// in scope.
#define TEST_ASSERT(expr) {if (!(expr)) { \
    dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); }}
35
36UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
37    UnicodeString pat;
38    set.toPattern(pat);
39    return left + UnicodeSetTest::escape(pat);
40}
41
// Emits one `case` label for a test dispatch switch: stores the stringized
// test name in `name`, and when `exec` is true logs "<test>---" followed by an
// empty line and then calls test(). The expansion ends in `break` without a
// semicolon, so the use site supplies it. Relies on `name`, `exec` and logln()
// being in scope at the point of use.
#define CASE(id,test) case id:                          \
                          name = #test;                 \
                          if (exec) {                   \
                              logln(#test "---");       \
                              logln();                  \
                              test();                   \
                          }                             \
                          break
50
51UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52}
53
54UConverter *UnicodeSetTest::openUTF8Converter() {
55    if(utf8Cnv==NULL) {
56        UErrorCode errorCode=U_ZERO_ERROR;
57        utf8Cnv=ucnv_open("UTF-8", &errorCode);
58    }
59    return utf8Cnv;
60}
61
62UnicodeSetTest::~UnicodeSetTest() {
63    ucnv_close(utf8Cnv);
64}
65
/**
 * Test dispatcher: maps a zero-based index to one of this suite's test
 * methods.
 *
 * @param index which test to select (0..23 are defined below)
 * @param exec  when true the selected test is run; when false only `name`
 *              is filled in
 * @param name  receives the selected test's name, or "" when index does not
 *              match any case
 * The last parameter (par) is unused.
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
    switch (index) {
        // Each CASE expands to a complete `case N: ... break` (see the CASE macro).
        CASE(0,TestPatterns);
        CASE(1,TestAddRemove);
        CASE(2,TestCategories);
        CASE(3,TestCloneEqualHash);
        CASE(4,TestMinimalRep);
        CASE(5,TestAPI);
        CASE(6,TestScriptSet);
        CASE(7,TestPropertySet);
        CASE(8,TestClone);
        CASE(9,TestExhaustive);
        CASE(10,TestToPattern);
        CASE(11,TestIndexOf);
        CASE(12,TestStrings);
        CASE(13,Testj2268);
        CASE(14,TestCloseOver);
        CASE(15,TestEscapePattern);
        CASE(16,TestInvalidCodePoint);
        CASE(17,TestSymbolTable);
        CASE(18,TestSurrogate);
        CASE(19,TestPosixClasses);
        CASE(20,TestIteration);
        CASE(21,TestFreezable);
        CASE(22,TestSpan);
        CASE(23,TestStringSpan);
        // An empty name tells the caller that no test exists for this index.
        default: name = ""; break;
    }
}
98
// Marker entry placed inside the expected-string arrays handed to
// expectToPattern(). NOTE(review): presumably it separates strings the set
// must contain from strings it must not contain - confirm in expectToPattern().
static const char NOT[] = "%%%%";
100
101/**
102 * UVector was improperly copying contents
103 * This code will crash this is still true
104 */
105void UnicodeSetTest::Testj2268() {
106  UnicodeSet t;
107  t.add(UnicodeString("abc"));
108  UnicodeSet test(t);
109  UnicodeString ustrPat;
110  test.toPattern(ustrPat, TRUE);
111}
112
113/**
114 * Test toPattern().
115 */
116void UnicodeSetTest::TestToPattern() {
117    UErrorCode ec = U_ZERO_ERROR;
118
119    // Test that toPattern() round trips with syntax characters and
120    // whitespace.
121    {
122        static const char* OTHER_TOPATTERN_TESTS[] = {
123            "[[:latin:]&[:greek:]]",
124            "[[:latin:]-[:greek:]]",
125            "[:nonspacing mark:]",
126            NULL
127        };
128
129        for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
130            ec = U_ZERO_ERROR;
131            UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
132            if (U_FAILURE(ec)) {
133                dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
134                continue;
135            }
136            checkPat(OTHER_TOPATTERN_TESTS[j], s);
137        }
138
139        for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
140            if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
141
142                // check various combinations to make sure they all work.
143                if (i != 0 && !toPatternAux(i, i)){
144                    continue;
145                }
146                if (!toPatternAux(0, i)){
147                    continue;
148                }
149                if (!toPatternAux(i, 0xFFFF)){
150                    continue;
151                }
152            }
153        }
154    }
155
156    // Test pattern behavior of multicharacter strings.
157    {
158        ec = U_ZERO_ERROR;
159        UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
160
161        // This loop isn't a loop.  It's here to make the compiler happy.
162        // If you're curious, try removing it and changing the 'break'
163        // statements (except for the last) to goto's.
164        for (;;) {
165            if (U_FAILURE(ec)) break;
166            const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
167            expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
168
169            s->add("ac");
170            const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
171            expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
172
173            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
174            if (U_FAILURE(ec)) break;
175            const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
176            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
177
178            s->add("[]");
179            const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
180            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
181
182            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
183            if (U_FAILURE(ec)) break;
184            const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
185            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
186
187            // j2189
188            s->clear();
189            s->add(UnicodeString("abc", ""));
190            s->add(UnicodeString("abc", ""));
191            const char* exp6[] = {"abc", NOT, "ab", NULL};
192            expectToPattern(*s, "[{abc}]", exp6);
193
194            break;
195        }
196
197        if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
198        delete s;
199    }
200
201    // JB#3400: For 2 character ranges prefer [ab] to [a-b]
202    UnicodeSet s;
203    s.add((UChar)97, (UChar)98); // 'a', 'b'
204    expectToPattern(s, "[ab]", NULL);
205}
206
207UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
208
209    // use Integer.toString because Utility.hex doesn't handle ints
210    UnicodeString pat = "";
211    // TODO do these in hex
212    //String source = "0x" + Integer.toString(start,16).toUpperCase();
213    //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
214    UnicodeString source;
215    source = source + (uint32_t)start;
216    if (start != end)
217        source = source + ".." + (uint32_t)end;
218    UnicodeSet testSet;
219    testSet.add(start, end);
220    return checkPat(source, testSet);
221}
222
223UBool UnicodeSetTest::checkPat(const UnicodeString& source,
224                               const UnicodeSet& testSet) {
225    // What we want to make sure of is that a pattern generated
226    // by toPattern(), with or without escaped unprintables, can
227    // be passed back into the UnicodeSet constructor.
228    UnicodeString pat0;
229
230    testSet.toPattern(pat0, TRUE);
231
232    if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
233
234    //String pat1 = unescapeLeniently(pat0);
235    //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
236
237    UnicodeString pat2;
238    testSet.toPattern(pat2, FALSE);
239    if (!checkPat(source, testSet, pat2)) return FALSE;
240
241    //String pat3 = unescapeLeniently(pat2);
242    // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
243
244    //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
245    logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
246    return TRUE;
247}
248
249UBool UnicodeSetTest::checkPat(const UnicodeString& source,
250                               const UnicodeSet& testSet,
251                               const UnicodeString& pat) {
252    UErrorCode ec = U_ZERO_ERROR;
253    UnicodeSet testSet2(pat, ec);
254    if (testSet2 != testSet) {
255        errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
256        return FALSE;
257    }
258    return TRUE;
259}
260
261void
262UnicodeSetTest::TestPatterns(void) {
263    UnicodeSet set;
264    expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
265    expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
266    expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
267    expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
268    expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
269    expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
270
271    // Throw in a test of complement
272    set.complement();
273    UnicodeString exp;
274    exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
275    expectPairs(set, exp);
276}
277
278void
279UnicodeSetTest::TestCategories(void) {
280    UErrorCode status = U_ZERO_ERROR;
281    const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
282    UnicodeSet set(pat, status);
283    if (U_FAILURE(status)) {
284        dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
285        return;
286    } else {
287        expectContainment(set, pat, "ABC", "abc");
288    }
289
290    UChar32 i;
291    int32_t failures = 0;
292    // Make sure generation of L doesn't pollute cached Lu set
293    // First generate L, then Lu
294    set.applyPattern("[:L:]", status);
295    if (U_FAILURE(status)) { errln("FAIL"); return; }
296    for (i=0; i<0x200; ++i) {
297        UBool l = u_isalpha((UChar)i);
298        if (l != set.contains(i)) {
299            errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
300                  set.contains(i));
301            if (++failures == 10) break;
302        }
303    }
304
305    set.applyPattern("[:Lu:]", status);
306    if (U_FAILURE(status)) { errln("FAIL"); return; }
307    for (i=0; i<0x200; ++i) {
308        UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
309        if (lu != set.contains(i)) {
310            errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
311                  set.contains(i));
312            if (++failures == 20) break;
313        }
314    }
315}
316void
317UnicodeSetTest::TestCloneEqualHash(void) {
318    UErrorCode status = U_ZERO_ERROR;
319    // set1 and set2 used to be built with the obsolete constructor taking
320    // UCharCategory values; replaced with pattern constructors
321    // markus 20030502
322    UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
323    UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
324    if (U_FAILURE(status)){
325        dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
326        return;
327    }
328    UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
329    UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
330    if (U_FAILURE(status)){
331        errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
332        return;
333    }
334
335    if (*set1 != *set1a) {
336        errln("FAIL: category constructor for Ll broken");
337    }
338    if (*set2 != *set2a) {
339        errln("FAIL: category constructor for Nd broken");
340    }
341    delete set1a;
342    delete set2a;
343
344    logln("Testing copy construction");
345    UnicodeSet *set1copy=new UnicodeSet(*set1);
346    if(*set1 != *set1copy || *set1 == *set2 ||
347        getPairs(*set1) != getPairs(*set1copy) ||
348        set1->hashCode() != set1copy->hashCode()){
349        errln("FAIL : Error in copy construction");
350        return;
351    }
352
353    logln("Testing =operator");
354    UnicodeSet set1equal=*set1;
355    UnicodeSet set2equal=*set2;
356    if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
357        set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
358        errln("FAIL: Error in =operator");
359    }
360
361    logln("Testing clone()");
362    UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
363    UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
364    if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
365        *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
366        *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
367        errln("FAIL: Error in clone");
368    }
369
370    logln("Testing hashcode");
371    if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
372        set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
373        set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
374        set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
375        set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
376        errln("FAIL: Error in hashCode()");
377    }
378
379    delete set1;
380    delete set1copy;
381    delete set2;
382    delete set1clone;
383    delete set2clone;
384
385
386}
387void
388UnicodeSetTest::TestAddRemove(void) {
389    UnicodeSet set; // Construct empty set
390    doAssert(set.isEmpty() == TRUE, "set should be empty");
391    doAssert(set.size() == 0, "size should be 0");
392    set.complement();
393    doAssert(set.size() == 0x110000, "size should be 0x110000");
394    set.clear();
395    set.add(0x0061, 0x007a);
396    expectPairs(set, "az");
397    doAssert(set.isEmpty() == FALSE, "set should not be empty");
398    doAssert(set.size() != 0, "size should not be equal to 0");
399    doAssert(set.size() == 26, "size should be equal to 26");
400    set.remove(0x006d, 0x0070);
401    expectPairs(set, "alqz");
402    doAssert(set.size() == 22, "size should be equal to 22");
403    set.remove(0x0065, 0x0067);
404    expectPairs(set, "adhlqz");
405    doAssert(set.size() == 19, "size should be equal to 19");
406    set.remove(0x0064, 0x0069);
407    expectPairs(set, "acjlqz");
408    doAssert(set.size() == 16, "size should be equal to 16");
409    set.remove(0x0063, 0x0072);
410    expectPairs(set, "absz");
411    doAssert(set.size() == 10, "size should be equal to 10");
412    set.add(0x0066, 0x0071);
413    expectPairs(set, "abfqsz");
414    doAssert(set.size() == 22, "size should be equal to 22");
415    set.remove(0x0061, 0x0067);
416    expectPairs(set, "hqsz");
417    set.remove(0x0061, 0x007a);
418    expectPairs(set, "");
419    doAssert(set.isEmpty() == TRUE, "set should be empty");
420    doAssert(set.size() == 0, "size should be 0");
421    set.add(0x0061);
422    doAssert(set.isEmpty() == FALSE, "set should not be empty");
423    doAssert(set.size() == 1, "size should not be equal to 1");
424    set.add(0x0062);
425    set.add(0x0063);
426    expectPairs(set, "ac");
427    doAssert(set.size() == 3, "size should not be equal to 3");
428    set.add(0x0070);
429    set.add(0x0071);
430    expectPairs(set, "acpq");
431    doAssert(set.size() == 5, "size should not be equal to 5");
432    set.clear();
433    expectPairs(set, "");
434    doAssert(set.isEmpty() == TRUE, "set should be empty");
435    doAssert(set.size() == 0, "size should be 0");
436
437    // Try removing an entire set from another set
438    expectPattern(set, "[c-x]", "cx");
439    UnicodeSet set2;
440    expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
441    set.removeAll(set2);
442    expectPairs(set, "deluxx");
443
444    // Try adding an entire set to another set
445    expectPattern(set, "[jackiemclean]", "aacceein");
446    expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
447    set.addAll(set2);
448    expectPairs(set, "aacehort");
449    doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
450
451    // Try retaining an set of elements contained in another set (intersection)
452    UnicodeSet set3;
453    expectPattern(set3, "[a-c]", "ac");
454    doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
455    set3.remove(0x0062);
456    expectPairs(set3, "aacc");
457    doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
458    set.retainAll(set3);
459    expectPairs(set, "aacc");
460    doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
461    doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
462    set.clear();
463    doAssert(set.size() != set3.size(), "set.size() != set3.size()");
464
465    // Test commutativity
466    expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
467    expectPattern(set2, "[jackiemclean]", "aacceein");
468    set.addAll(set2);
469    expectPairs(set, "aacehort");
470    doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
471
472
473
474
475}
476
477/**
478 * Make sure minimal representation is maintained.
479 */
480void UnicodeSetTest::TestMinimalRep() {
481    UErrorCode status = U_ZERO_ERROR;
482    // This is pretty thoroughly tested by checkCanonicalRep()
483    // run against the exhaustive operation results.  Use the code
484    // here for debugging specific spot problems.
485
486    // 1 overlap against 2
487    UnicodeSet set("[h-km-q]", status);
488    if (U_FAILURE(status)) { errln("FAIL"); return; }
489    UnicodeSet set2("[i-o]", status);
490    if (U_FAILURE(status)) { errln("FAIL"); return; }
491    set.addAll(set2);
492    expectPairs(set, "hq");
493    // right
494    set.applyPattern("[a-m]", status);
495    if (U_FAILURE(status)) { errln("FAIL"); return; }
496    set2.applyPattern("[e-o]", status);
497    if (U_FAILURE(status)) { errln("FAIL"); return; }
498    set.addAll(set2);
499    expectPairs(set, "ao");
500    // left
501    set.applyPattern("[e-o]", status);
502    if (U_FAILURE(status)) { errln("FAIL"); return; }
503    set2.applyPattern("[a-m]", status);
504    if (U_FAILURE(status)) { errln("FAIL"); return; }
505    set.addAll(set2);
506    expectPairs(set, "ao");
507    // 1 overlap against 3
508    set.applyPattern("[a-eg-mo-w]", status);
509    if (U_FAILURE(status)) { errln("FAIL"); return; }
510    set2.applyPattern("[d-q]", status);
511    if (U_FAILURE(status)) { errln("FAIL"); return; }
512    set.addAll(set2);
513    expectPairs(set, "aw");
514}
515
/**
 * Broad API coverage test for UnicodeSet: emptiness and size, range
 * containment, set algebra ported from an InversionList test, the
 * string-taking overloads listed under JB#2118, serialize(), USet
 * conversions, and the span()/spanBack() UnicodeString overloads.
 *
 * The checks are sequential: `set` and `exp` are mutated step by step and
 * later expectations depend on the state left by earlier ones. Several
 * failures return early because the following checks would be meaningless.
 */
void UnicodeSetTest::TestAPI() {
    UErrorCode status = U_ZERO_ERROR;
    // default ct
    UnicodeSet set;
    if (!set.isEmpty() || set.getRangeCount() != 0) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // clear(), isEmpty()
    set.add(0x0061);
    if (set.isEmpty()) {
        errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
              set);
    }
    set.clear();
    if (!set.isEmpty()) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // size()
    set.clear();
    if (set.size() != 0) {
        errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0061);
    if (set.size() != 1) {
        errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
              ": " + set);
    }
    // 'a' plus the nine digits '1'..'9'
    set.add(0x0031, 0x0039);
    if (set.size() != 10) {
        errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
              ": " + set);
    }

    // contains(first, last)
    set.clear();
    set.applyPattern("[A-Y 1-8 b-d l-y]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    // Each stored range must be contained exactly: widening it by one code
    // point on either side must not be contained.
    for (int32_t i = 0; i<set.getRangeCount(); ++i) {
        UChar32 a = set.getRangeStart(i);
        UChar32 b = set.getRangeEnd(i);
        if (!set.contains(a, b)) {
            errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
                  " but doesn't: " + set);
        }
        if (set.contains((UChar32)(a-1), b)) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)(a-1) + '-' + (unsigned short)b +
                  " but does: " + set);
        }
        if (set.contains(a, (UChar32)(b+1))) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)a + '-' + (unsigned short)(b+1) +
                  " but does: " + set);
        }
    }

    // Ported InversionList test.
    UnicodeSet a((UChar32)3,(UChar32)10);
    UnicodeSet b((UChar32)7,(UChar32)15);
    UnicodeSet c;

    logln((UnicodeString)"a [3-10]: " + a);
    logln((UnicodeString)"b [7-15]: " + b);
    // Union: [3-10] + [7-15] == [3-15]
    c = a;
    c.addAll(b);
    UnicodeSet exp((UChar32)3,(UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).add(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
    }
    // Complement of [3-15] is [0-2] + [16-MAX_VALUE]
    c.complement();
    exp.set((UChar32)0, (UChar32)2);
    exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Complementing again must restore [3-15]
    c.complement();
    exp.set((UChar32)3, (UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Symmetric difference: [3-10] xor [7-15] == [3-6] + [11-15]
    c = a;
    c.complementAll(b);
    exp.set((UChar32)3,(UChar32)6);
    exp.add((UChar32)11,(UChar32) 15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
    }

    // Converting c to bits and back must leave it unchanged.
    exp = c;
    bitsToSet(setToBits(c), c);
    if (c == exp) {
        logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
    } else {
        errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
    }

    // Additional tests for coverage JB#2118
    //UnicodeSet::complement(class UnicodeString const &)
    //UnicodeSet::complementAll(class UnicodeString const &)
    //UnicodeSet::containsNone(class UnicodeSet const &)
    //UnicodeSet::containsNone(long,long)
    //UnicodeSet::containsSome(class UnicodeSet const &)
    //UnicodeSet::containsSome(long,long)
    //UnicodeSet::removeAll(class UnicodeString const &)
    //UnicodeSet::retain(long)
    //UnicodeSet::retainAll(class UnicodeString const &)
    //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
    //UnicodeSetIterator::getString(void)
    // complement(string) on an empty set must add the string itself.
    set.clear();
    set.complement("ab");
    exp.applyPattern("[{ab}]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }

    // The only element is the string "ab", so the first next() must yield it.
    UnicodeSetIterator iset(set);
    if (!iset.next() || !iset.isString()) {
        errln("FAIL: UnicodeSetIterator::next/isString");
    } else if (iset.getString() != "ab") {
        errln("FAIL: UnicodeSetIterator::getString");
    }

    // complementAll(string) must toggle each character of "alan": a, l and n
    // end up removed from a-z.
    set.add((UChar32)0x61, (UChar32)0x7A);
    set.complementAll("alan");
    exp.applyPattern("[{ab}b-kmo-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }

    // containsNone/containsSome against a set that overlaps ([a-z]) and one
    // that does not ([aln]).
    exp.applyPattern("[a-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    exp.applyPattern("[aln]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

    // Same checks with the range overloads: a-z overlaps, A-Z does not.
    if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }
    if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }

    // removeAll(string) must remove the characters i and u (l is already gone).
    set.removeAll("liu");
    exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }

    // retainAll(string) must keep only characters of "star" that are present;
    // the expected result no longer contains the string {ab}.
    set.retainAll("star");
    exp.applyPattern("[rst]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }

    // retain(code point) must reduce the set to just 's'.
    set.retain((UChar32)0x73);
    exp.applyPattern("[s]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retain('s')"); return; }

    // serialize() of the set {s} must produce exactly three units:
    // 2, 0x73, 0x74. NOTE(review): presumably a length word followed by the
    // range start and (exclusive) limit - confirm against the serialize() docs.
    uint16_t buf[32];
    int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
    if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
    if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
        errln("FAIL: serialize");
        return;
    }

    // Conversions to and from USet
    // Both directions, const and non-const, must hand back the same address.
    UnicodeSet *uniset = &set;
    USet *uset = uniset->toUSet();
    TEST_ASSERT((void *)uset == (void *)uniset);
    UnicodeSet *setx = UnicodeSet::fromUSet(uset);
    TEST_ASSERT((void *)setx == (void *)uset);
    const UnicodeSet *constSet = uniset;
    const USet *constUSet = constSet->toUSet();
    TEST_ASSERT((void *)constUSet == (void *)constSet);
    const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
    TEST_ASSERT((void *)constSetx == (void *)constUSet);

    // span(UnicodeString) and spanBack(UnicodeString) convenience methods
    // longString is 10 x 'a', 10 x 'b', 10 x 'c'; ac contains 'a' and 'c' but
    // not 'b'. Start/limit indexes outside 0..30 (-5, 35, INT32_MAX) are
    // expected to behave like the nearest valid index.
    UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
    UnicodeSet ac(0x61, 0x63);
    ac.remove(0x62).freeze();
    if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
    ) {
        errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
    }
    if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
    ) {
        errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
    }
}
744
745void UnicodeSetTest::TestIteration() {
746    UErrorCode ec = U_ZERO_ERROR;
747    int i = 0;
748    int outerLoop;
749
750    // 6 code points, 3 ranges, 2 strings, 8 total elements
751    //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
752    UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
753    TEST_ASSERT_SUCCESS(ec);
754    UnicodeSetIterator it(set);
755
756    for (outerLoop=0; outerLoop<3; outerLoop++) {
757        // Run the test multiple times, to check that iterator.reset() is working.
758        for (i=0; i<10; i++) {
759            UBool         nextv        = it.next();
760            UBool         isString     = it.isString();
761            int32_t       codePoint    = it.getCodepoint();
762            //int32_t       codePointEnd = it.getCodepointEnd();
763            UnicodeString s   = it.getString();
764            switch (i) {
765            case 0:
766                TEST_ASSERT(nextv == TRUE);
767                TEST_ASSERT(isString == FALSE);
768                TEST_ASSERT(codePoint==0x61);
769                TEST_ASSERT(s == "a");
770                break;
771            case 1:
772                TEST_ASSERT(nextv == TRUE);
773                TEST_ASSERT(isString == FALSE);
774                TEST_ASSERT(codePoint==0x62);
775                TEST_ASSERT(s == "b");
776                break;
777            case 2:
778                TEST_ASSERT(nextv == TRUE);
779                TEST_ASSERT(isString == FALSE);
780                TEST_ASSERT(codePoint==0x63);
781                TEST_ASSERT(s == "c");
782                break;
783            case 3:
784                TEST_ASSERT(nextv == TRUE);
785                TEST_ASSERT(isString == FALSE);
786                TEST_ASSERT(codePoint==0x79);
787                TEST_ASSERT(s == "y");
788                break;
789            case 4:
790                TEST_ASSERT(nextv == TRUE);
791                TEST_ASSERT(isString == FALSE);
792                TEST_ASSERT(codePoint==0x7a);
793                TEST_ASSERT(s == "z");
794                break;
795            case 5:
796                TEST_ASSERT(nextv == TRUE);
797                TEST_ASSERT(isString == FALSE);
798                TEST_ASSERT(codePoint==0x1abcd);
799                TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
800                break;
801            case 6:
802                TEST_ASSERT(nextv == TRUE);
803                TEST_ASSERT(isString == TRUE);
804                TEST_ASSERT(s == "str1");
805                break;
806            case 7:
807                TEST_ASSERT(nextv == TRUE);
808                TEST_ASSERT(isString == TRUE);
809                TEST_ASSERT(s == "str2");
810                break;
811            case 8:
812                TEST_ASSERT(nextv == FALSE);
813                break;
814            case 9:
815                TEST_ASSERT(nextv == FALSE);
816                break;
817            }
818        }
819        it.reset();  // prepare to run the iteration again.
820    }
821}
822
823
824
825
826void UnicodeSetTest::TestStrings() {
827    UErrorCode ec = U_ZERO_ERROR;
828
829    UnicodeSet* testList[] = {
830        UnicodeSet::createFromAll("abc"),
831        new UnicodeSet("[a-c]", ec),
832
833        &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
834        new UnicodeSet("[{ll}{ch}a-z]", ec),
835
836        UnicodeSet::createFrom("ab}c"),
837        new UnicodeSet("[{ab\\}c}]", ec),
838
839        &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
840        new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
841
842        NULL
843    };
844
845    if (U_FAILURE(ec)) {
846        errln("FAIL: couldn't construct test sets");
847    }
848
849    for (int32_t i = 0; testList[i] != NULL; i+=2) {
850        if (U_SUCCESS(ec)) {
851            UnicodeString pat0, pat1;
852            testList[i]->toPattern(pat0, TRUE);
853            testList[i+1]->toPattern(pat1, TRUE);
854            if (*testList[i] == *testList[i+1]) {
855                logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
856            } else {
857                logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
858            }
859        }
860        delete testList[i];
861        delete testList[i+1];
862    }
863}
864
865/**
866 * Test the [:Latin:] syntax.
867 */
868void UnicodeSetTest::TestScriptSet() {
869    expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
870
871    expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
872
873    /* Jitterbug 1423 */
874    expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
875
876}
877
878/**
879 * Test the [:Latin:] syntax.
880 */
881void UnicodeSetTest::TestPropertySet() {
882    static const char* const DATA[] = {
883        // Pattern, Chars IN, Chars NOT in
884
885        "[:Latin:]",
886        "aA",
887        "\\u0391\\u03B1",
888
889        "[\\p{Greek}]",
890        "\\u0391\\u03B1",
891        "aA",
892
893        "\\P{ GENERAL Category = upper case letter }",
894        "abc",
895        "ABC",
896
897#if !UCONFIG_NO_NORMALIZATION
898        // Combining class: @since ICU 2.2
899        // Check both symbolic and numeric
900        "\\p{ccc=Nukta}",
901        "\\u0ABC",
902        "abc",
903
904        "\\p{Canonical Combining Class = 11}",
905        "\\u05B1",
906        "\\u05B2",
907
908        "[:c c c = iota subscript :]",
909        "\\u0345",
910        "xyz",
911#endif
912
913        // Bidi class: @since ICU 2.2
914        "\\p{bidiclass=lefttoright}",
915        "abc",
916        "\\u0671\\u0672",
917
918        // Binary properties: @since ICU 2.2
919        "\\p{ideographic}",
920        "\\u4E0A",
921        "x",
922
923        "[:math=false:]",
924        "q)*(",
925        // weiv: )(and * were removed from math in Unicode 4.0.1
926        //"(*+)",
927        "+<>^",
928
929        // JB#1767 \N{}, \p{ASCII}
930        "[:Ascii:]",
931        "abc\\u0000\\u007F",
932        "\\u0080\\u4E00",
933
934        "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
935        "az",
936        "qrs",
937
938        // JB#2015
939        "[:any:]",
940        "a\\U0010FFFF",
941        "",
942
943        "[:nv=0.5:]",
944        "\\u00BD\\u0F2A",
945        "\\u00BC",
946
947        // JB#2653: Age
948        "[:Age=1.1:]",
949        "\\u03D6", // 1.1
950        "\\u03D8\\u03D9", // 3.2
951
952        "[:Age=3.1:]",
953        "\\u1800\\u3400\\U0002f800",
954        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
955
956        // JB#2350: Case_Sensitive
957        "[:Case Sensitive:]",
958        "A\\u1FFC\\U00010410",
959        ";\\u00B4\\U00010500",
960
961        // JB#2832: C99-compatibility props
962        "[:blank:]",
963        " \\u0009",
964        "1-9A-Z",
965
966        "[:graph:]",
967        "19AZ",
968        " \\u0003\\u0007\\u0009\\u000A\\u000D",
969
970        "[:punct:]",
971        "!@#%&*()[]{}-_\\/;:,.?'\"",
972        "09azAZ",
973
974        "[:xdigit:]",
975        "09afAF",
976        "gG!",
977
978        // Regex compatibility test
979        "[-b]", // leading '-' is literal
980        "-b",
981        "ac",
982
983        "[^-b]", // leading '-' is literal
984        "ac",
985        "-b",
986
987        "[b-]", // trailing '-' is literal
988        "-b",
989        "ac",
990
991        "[^b-]", // trailing '-' is literal
992        "ac",
993        "-b",
994
995        "[a-b-]", // trailing '-' is literal
996        "ab-",
997        "c=",
998
999        "[[a-q]&[p-z]-]", // trailing '-' is literal
1000        "pq-",
1001        "or=",
1002
1003        "[\\s|\\)|:|$|\\>]", // from regex tests
1004        "s|):$>",
1005        "abc",
1006
1007        "[\\uDC00cd]", // JB#2906: isolated trail at start
1008        "cd\\uDC00",
1009        "ab\\uD800\\U00010000",
1010
1011        "[ab\\uD800]", // JB#2906: isolated trail at start
1012        "ab\\uD800",
1013        "cd\\uDC00\\U00010000",
1014
1015        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016        "abcd\\uD800",
1017        "ef\\uDC00\\U00010000",
1018
1019        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020        "abcd\\uDC00",
1021        "ef\\uD800\\U00010000",
1022
1023#if !UCONFIG_NO_NORMALIZATION
1024        "[:^lccc=0:]", // Lead canonical class
1025        "\\u0300\\u0301",
1026        "abcd\\u00c0\\u00c5",
1027
1028        "[:^tccc=0:]", // Trail canonical class
1029        "\\u0300\\u0301\\u00c0\\u00c5",
1030        "abcd",
1031
1032        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033        "\\u0300\\u0301\\u00c0\\u00c5",
1034        "abcd",
1035
1036        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037        "",
1038        "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039
1040        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041        "\\u0F73\\u0F75\\u0F81",
1042        "abcd\\u0300\\u0301\\u00c0\\u00c5",
1043#endif /* !UCONFIG_NO_NORMALIZATION */
1044
1045        "[:Assigned:]",
1046        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1047        "\\u0888\\uFDD3\\uFFFE\\U00050005",
1048
1049        // Script_Extensions, new in Unicode 6.0
1050        "[:scx=Arab:]",
1051        "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1052        "\\u061D\\u065F\\uFDEF\\uFDFE",
1053
1054        // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1055        // so scx-sc is missing U+FDF2.
1056        "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1057        "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
1058        "\\uFDF2"
1059    };
1060
1061    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1062
1063    for (int32_t i=0; i<DATA_LEN; i+=3) {
1064        expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1065                          CharsToUnicodeString(DATA[i+2]));
1066    }
1067}
1068
1069/**
1070  * Test that Posix style character classes [:digit:], etc.
1071  *   have the Unicode definitions from TR 18.
1072  */
1073void UnicodeSetTest::TestPosixClasses() {
1074    {
1075        UErrorCode status = U_ZERO_ERROR;
1076        UnicodeSet s1("[:alpha:]", status);
1077        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1078        TEST_ASSERT_SUCCESS(status);
1079        TEST_ASSERT(s1==s2);
1080    }
1081    {
1082        UErrorCode status = U_ZERO_ERROR;
1083        UnicodeSet s1("[:lower:]", status);
1084        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1085        TEST_ASSERT_SUCCESS(status);
1086        TEST_ASSERT(s1==s2);
1087    }
1088    {
1089        UErrorCode status = U_ZERO_ERROR;
1090        UnicodeSet s1("[:upper:]", status);
1091        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1092        TEST_ASSERT_SUCCESS(status);
1093        TEST_ASSERT(s1==s2);
1094    }
1095    {
1096        UErrorCode status = U_ZERO_ERROR;
1097        UnicodeSet s1("[:punct:]", status);
1098        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1099        TEST_ASSERT_SUCCESS(status);
1100        TEST_ASSERT(s1==s2);
1101    }
1102    {
1103        UErrorCode status = U_ZERO_ERROR;
1104        UnicodeSet s1("[:digit:]", status);
1105        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1106        TEST_ASSERT_SUCCESS(status);
1107        TEST_ASSERT(s1==s2);
1108    }
1109    {
1110        UErrorCode status = U_ZERO_ERROR;
1111        UnicodeSet s1("[:xdigit:]", status);
1112        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1113        TEST_ASSERT_SUCCESS(status);
1114        TEST_ASSERT(s1==s2);
1115    }
1116    {
1117        UErrorCode status = U_ZERO_ERROR;
1118        UnicodeSet s1("[:alnum:]", status);
1119        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1120        TEST_ASSERT_SUCCESS(status);
1121        TEST_ASSERT(s1==s2);
1122    }
1123    {
1124        UErrorCode status = U_ZERO_ERROR;
1125        UnicodeSet s1("[:space:]", status);
1126        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1127        TEST_ASSERT_SUCCESS(status);
1128        TEST_ASSERT(s1==s2);
1129    }
1130    {
1131        UErrorCode status = U_ZERO_ERROR;
1132        UnicodeSet s1("[:blank:]", status);
1133        TEST_ASSERT_SUCCESS(status);
1134        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1135            status);
1136        TEST_ASSERT_SUCCESS(status);
1137        TEST_ASSERT(s1==s2);
1138    }
1139    {
1140        UErrorCode status = U_ZERO_ERROR;
1141        UnicodeSet s1("[:cntrl:]", status);
1142        TEST_ASSERT_SUCCESS(status);
1143        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1144        TEST_ASSERT_SUCCESS(status);
1145        TEST_ASSERT(s1==s2);
1146    }
1147    {
1148        UErrorCode status = U_ZERO_ERROR;
1149        UnicodeSet s1("[:graph:]", status);
1150        TEST_ASSERT_SUCCESS(status);
1151        UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1152        TEST_ASSERT_SUCCESS(status);
1153        TEST_ASSERT(s1==s2);
1154    }
1155    {
1156        UErrorCode status = U_ZERO_ERROR;
1157        UnicodeSet s1("[:print:]", status);
1158        TEST_ASSERT_SUCCESS(status);
1159        UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1160        TEST_ASSERT_SUCCESS(status);
1161        TEST_ASSERT(s1==s2);
1162    }
1163}
1164/**
1165 * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1166 */
1167void UnicodeSetTest::TestClone() {
1168    UErrorCode ec = U_ZERO_ERROR;
1169    UnicodeSet s("[abcxyz]", ec);
1170    UnicodeSet t(s);
1171    expectContainment(t, "abc", "def");
1172}
1173
1174/**
1175 * Test the indexOf() and charAt() methods.
1176 */
1177void UnicodeSetTest::TestIndexOf() {
1178    UErrorCode ec = U_ZERO_ERROR;
1179    UnicodeSet set("[a-cx-y3578]", ec);
1180    if (U_FAILURE(ec)) {
1181        errln("FAIL: UnicodeSet constructor");
1182        return;
1183    }
1184    for (int32_t i=0; i<set.size(); ++i) {
1185        UChar32 c = set.charAt(i);
1186        if (set.indexOf(c) != i) {
1187            errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1188                i, c, set.indexOf(c));
1189        }
1190    }
1191    UChar32 c = set.charAt(set.size());
1192    if (c != -1) {
1193        errln("FAIL: charAt(<out of range>) = %X", c);
1194    }
1195    int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1196    if (j != -1) {
1197        errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1198    }
1199}
1200
1201/**
1202 * Test closure API.
1203 */
1204void UnicodeSetTest::TestCloseOver() {
1205    UErrorCode ec = U_ZERO_ERROR;
1206
1207    char CASE[] = {(char)USET_CASE_INSENSITIVE};
1208    char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1209    const char* DATA[] = {
1210        // selector, input, output
1211        CASE,
1212        "[aq\\u00DF{Bc}{bC}{Fi}]",
1213        "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1214
1215        CASE,
1216        "[\\u01F1]", // 'DZ'
1217        "[\\u01F1\\u01F2\\u01F3]",
1218
1219        CASE,
1220        "[\\u1FB4]",
1221        "[\\u1FB4{\\u03AC\\u03B9}]",
1222
1223        CASE,
1224        "[{F\\uFB01}]",
1225        "[\\uFB03{ffi}]",
1226
1227        CASE, // make sure binary search finds limits
1228        "[a\\uFF3A]",
1229        "[aA\\uFF3A\\uFF5A]",
1230
1231        CASE,
1232        "[a-z]","[A-Za-z\\u017F\\u212A]",
1233        CASE,
1234        "[abc]","[A-Ca-c]",
1235        CASE,
1236        "[ABC]","[A-Ca-c]",
1237
1238        CASE, "[i]", "[iI]",
1239
1240        CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1241        CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1242
1243        CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1244
1245        CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1246
1247        CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1248
1249        CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1250
1251        CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1252
1253        CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1254
1255        CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1256        CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1257
1258        CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1259
1260        CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1261
1262        CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1263
1264#if !UCONFIG_NO_FILE_IO
1265        CASE_MAPPINGS,
1266        "[aq\\u00DF{Bc}{bC}{Fi}]",
1267        "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1268#endif
1269
1270        CASE_MAPPINGS,
1271        "[\\u01F1]", // 'DZ'
1272        "[\\u01F1\\u01F2\\u01F3]",
1273
1274        CASE_MAPPINGS,
1275        "[a-z]",
1276        "[A-Za-z]",
1277
1278        NULL
1279    };
1280
1281    UnicodeSet s;
1282    UnicodeSet t;
1283    UnicodeString buf;
1284    for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1285        int32_t selector = DATA[i][0];
1286        UnicodeString pat(DATA[i+1], -1, US_INV);
1287        UnicodeString exp(DATA[i+2], -1, US_INV);
1288        s.applyPattern(pat, ec);
1289        s.closeOver(selector);
1290        t.applyPattern(exp, ec);
1291        if (U_FAILURE(ec)) {
1292            errln("FAIL: applyPattern failed");
1293            continue;
1294        }
1295        if (s == t) {
1296            logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1297        } else {
1298            dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1299                  s.toPattern(buf, TRUE) + ", expected " + exp);
1300        }
1301    }
1302
1303#if 0
1304    /*
1305     * Unused test code.
1306     * This was used to compare the old implementation (using USET_CASE)
1307     * with the new one (using 0x100 temporarily)
1308     * while transitioning from hardcoded case closure tables in uniset.cpp
1309     * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1310     * and using ucase.c functions for closure.
1311     * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1312     *
1313     * Note: The old and new implementation never fully matched because
1314     * the old implementation turned out to not map U+0130 and U+0131 correctly
1315     * (dotted I and dotless i) and because the old implementation's data tables
1316     * were outdated compared to Unicode 4.0.1 at the time of the change to the
1317     * new implementation. (So sigmas and some other characters were not handled
1318     * according to the newer Unicode version.)
1319     */
1320    UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1321    UnicodeSetIterator si(sens);
1322    UnicodeString str, buf2;
1323    const UnicodeString *pStr;
1324    UChar32 c;
1325    while(si.next()) {
1326        if(!si.isString()) {
1327            c=si.getCodepoint();
1328            s.clear();
1329            s.add(c);
1330
1331            str.setTo(c);
1332            str.foldCase();
1333            sens2.add(str);
1334
1335            t=s;
1336            s.closeOver(USET_CASE);
1337            t.closeOver(0x100);
1338            if(s!=t) {
1339                errln("FAIL: closeOver(U+%04x) differs: ", c);
1340                errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1341            }
1342        }
1343    }
1344    // remove all code points
1345    // should contain all full case folding mapping strings
1346    sens2.remove(0, 0x10ffff);
1347    si.reset(sens2);
1348    while(si.next()) {
1349        if(si.isString()) {
1350            pStr=&si.getString();
1351            s.clear();
1352            s.add(*pStr);
1353            t=s2=s;
1354            s.closeOver(USET_CASE);
1355            t.closeOver(0x100);
1356            if(s!=t) {
1357                errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1358                errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1359            }
1360        }
1361    }
1362#endif
1363
1364    // Test the pattern API
1365    s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1366    if (U_FAILURE(ec)) {
1367        errln("FAIL: applyPattern failed");
1368    } else {
1369        expectContainment(s, "abcABC", "defDEF");
1370    }
1371    UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1372    if (U_FAILURE(ec)) {
1373        errln("FAIL: constructor failed");
1374    } else {
1375        expectContainment(v, "defDEF", "abcABC");
1376    }
1377    UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1378    if (U_FAILURE(ec)) {
1379        errln("FAIL: construct w/case mappings failed");
1380    } else {
1381        expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1382    }
1383}
1384
1385void UnicodeSetTest::TestEscapePattern() {
1386    const char pattern[] =
1387        "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1388    const char exp[] =
1389        "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1390    // We test this with two passes; in the second pass we
1391    // pre-unescape the pattern.  Since U+200E is rule whitespace,
1392    // this fails -- which is what we expect.
1393    for (int32_t pass=1; pass<=2; ++pass) {
1394        UErrorCode ec = U_ZERO_ERROR;
1395        UnicodeString pat(pattern, -1, US_INV);
1396        if (pass==2) {
1397            pat = pat.unescape();
1398        }
1399        // Pattern is only good for pass 1
1400        UBool isPatternValid = (pass==1);
1401
1402        UnicodeSet set(pat, ec);
1403        if (U_SUCCESS(ec) != isPatternValid){
1404            errln((UnicodeString)"FAIL: applyPattern(" +
1405                  escape(pat) + ") => " +
1406                  u_errorName(ec));
1407            continue;
1408        }
1409        if (U_FAILURE(ec)) {
1410            continue;
1411        }
1412        if (set.contains((UChar)0x0644)){
1413            errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1414        }
1415
1416        UnicodeString newpat;
1417        set.toPattern(newpat, TRUE);
1418        if (newpat == UnicodeString(exp, -1, US_INV)) {
1419            logln(escape(pat) + " => " + newpat);
1420        } else {
1421            errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1422        }
1423
1424        for (int32_t i=0; i<set.getRangeCount(); ++i) {
1425            UnicodeString str("Range ");
1426            str.append((UChar)(0x30 + i))
1427                .append(": ")
1428                .append((UChar32)set.getRangeStart(i))
1429                .append(" - ")
1430                .append((UChar32)set.getRangeEnd(i));
1431            str = str + " (" + set.getRangeStart(i) + " - " +
1432                set.getRangeEnd(i) + ")";
1433            if (set.getRangeStart(i) < 0) {
1434                errln((UnicodeString)"FAIL: " + escape(str));
1435            } else {
1436                logln(escape(str));
1437            }
1438        }
1439    }
1440}
1441
1442void UnicodeSetTest::expectRange(const UnicodeString& label,
1443                                 const UnicodeSet& set,
1444                                 UChar32 start, UChar32 end) {
1445    UnicodeSet exp(start, end);
1446    UnicodeString pat;
1447    if (set == exp) {
1448        logln(label + " => " + set.toPattern(pat, TRUE));
1449    } else {
1450        UnicodeString xpat;
1451        errln((UnicodeString)"FAIL: " + label + " => " +
1452              set.toPattern(pat, TRUE) +
1453              ", expected " + exp.toPattern(xpat, TRUE));
1454    }
1455}
1456
1457void UnicodeSetTest::TestInvalidCodePoint() {
1458
1459    const UChar32 DATA[] = {
1460        // Test range             Expected range
1461        0, 0x10FFFF,              0, 0x10FFFF,
1462        (UChar32)-1, 8,           0, 8,
1463        8, 0x110000,              8, 0x10FFFF
1464    };
1465    const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1466
1467    UnicodeString pat;
1468    int32_t i;
1469
1470    for (i=0; i<DATA_LENGTH; i+=4) {
1471        UChar32 start  = DATA[i];
1472        UChar32 end    = DATA[i+1];
1473        UChar32 xstart = DATA[i+2];
1474        UChar32 xend   = DATA[i+3];
1475
1476        // Try various API using the test code points
1477
1478        UnicodeSet set(start, end);
1479        expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1480                    set, xstart, xend);
1481
1482        set.clear();
1483        set.set(start, end);
1484        expectRange((UnicodeString)"set(" + start + "," + end + ")",
1485                    set, xstart, xend);
1486
1487        UBool b = set.contains(start);
1488        b = set.contains(start, end);
1489        b = set.containsNone(start, end);
1490        b = set.containsSome(start, end);
1491
1492        /*int32_t index = set.indexOf(start);*/
1493
1494        set.clear();
1495        set.add(start);
1496        set.add(start, end);
1497        expectRange((UnicodeString)"add(" + start + "," + end + ")",
1498                    set, xstart, xend);
1499
1500        set.set(0, 0x10FFFF);
1501        set.retain(start, end);
1502        expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1503                    set, xstart, xend);
1504        set.retain(start);
1505
1506        set.set(0, 0x10FFFF);
1507        set.remove(start);
1508        set.remove(start, end);
1509        set.complement();
1510        expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1511                    set, xstart, xend);
1512
1513        set.set(0, 0x10FFFF);
1514        set.complement(start, end);
1515        set.complement();
1516        expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1517                    set, xstart, xend);
1518        set.complement(start);
1519    }
1520
1521    const UChar32 DATA2[] = {
1522        0,
1523        0x10FFFF,
1524        (UChar32)-1,
1525        0x110000
1526    };
1527    const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1528
1529    for (i=0; i<DATA2_LENGTH; ++i) {
1530        UChar32 c = DATA2[i], end = 0x10FFFF;
1531        UBool valid = (c >= 0 && c <= 0x10FFFF);
1532
1533        UnicodeSet set(0, 0x10FFFF);
1534
1535        // For single-codepoint contains, invalid codepoints are NOT contained
1536        UBool b = set.contains(c);
1537        if (b == valid) {
1538            logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1539                  ") = " + b);
1540        } else {
1541            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1542                  ") = " + b);
1543        }
1544
1545        // For codepoint range contains, containsNone, and containsSome,
1546        // invalid or empty (start > end) ranges have UNDEFINED behavior.
1547        b = set.contains(c, end);
1548        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1549              "," + end + ") = " + b);
1550
1551        b = set.containsNone(c, end);
1552        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1553              "," + end + ") = " + b);
1554
1555        b = set.containsSome(c, end);
1556        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1557              "," + end + ") = " + b);
1558
1559        int32_t index = set.indexOf(c);
1560        if ((index >= 0) == valid) {
1561            logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1562                  ") = " + index);
1563        } else {
1564            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1565                  ") = " + index);
1566        }
1567    }
1568}
1569
1570// Used by TestSymbolTable
1571class TokenSymbolTable : public SymbolTable {
1572public:
1573    Hashtable contents;
1574
1575    TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1576        contents.setValueDeleter(uhash_deleteUnicodeString);
1577    }
1578
1579    ~TokenSymbolTable() {}
1580
1581    /**
1582     * (Non-SymbolTable API) Add the given variable and value to
1583     * the table.  Variable should NOT contain leading '$'.
1584     */
1585    void add(const UnicodeString& var, const UnicodeString& value,
1586             UErrorCode& ec) {
1587        if (U_SUCCESS(ec)) {
1588            contents.put(var, new UnicodeString(value), ec);
1589        }
1590    }
1591
1592    /**
1593     * SymbolTable API
1594     */
1595    virtual const UnicodeString* lookup(const UnicodeString& s) const {
1596        return (const UnicodeString*) contents.get(s);
1597    }
1598
1599    /**
1600     * SymbolTable API
1601     */
1602    virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1603        return NULL;
1604    }
1605
1606    /**
1607     * SymbolTable API
1608     */
1609    virtual UnicodeString parseReference(const UnicodeString& text,
1610                                         ParsePosition& pos, int32_t limit) const {
1611        int32_t start = pos.getIndex();
1612        int32_t i = start;
1613        UnicodeString result;
1614        while (i < limit) {
1615            UChar c = text.charAt(i);
1616            if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1617                break;
1618            }
1619            ++i;
1620        }
1621        if (i == start) { // No valid name chars
1622            return result; // Indicate failure with empty string
1623        }
1624        pos.setIndex(i);
1625        text.extractBetween(start, i, result);
1626        return result;
1627    }
1628};
1629
1630void UnicodeSetTest::TestSymbolTable() {
1631    // Multiple test cases can be set up here.  Each test case
1632    // is terminated by null:
1633    // var, value, var, value,..., input pat., exp. output pat., null
1634    const char* DATA[] = {
1635        "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1636        "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1637        "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1638        NULL
1639    };
1640
1641    for (int32_t i=0; DATA[i]!=NULL; ++i) {
1642        UErrorCode ec = U_ZERO_ERROR;
1643        TokenSymbolTable sym(ec);
1644        if (U_FAILURE(ec)) {
1645            errln("FAIL: couldn't construct TokenSymbolTable");
1646            continue;
1647        }
1648
1649        // Set up variables
1650        while (DATA[i+2] != NULL) {
1651            sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1652            if (U_FAILURE(ec)) {
1653                errln("FAIL: couldn't add to TokenSymbolTable");
1654                continue;
1655            }
1656            i += 2;
1657        }
1658
1659        // Input pattern and expected output pattern
1660        UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1661        i += 2;
1662
1663        ParsePosition pos(0);
1664        UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1665        if (U_FAILURE(ec)) {
1666            errln("FAIL: couldn't construct UnicodeSet");
1667            continue;
1668        }
1669
1670        // results
1671        if (pos.getIndex() != inpat.length()) {
1672            errln((UnicodeString)"Failed to read to end of string \""
1673                  + inpat + "\": read to "
1674                  + pos.getIndex() + ", length is "
1675                  + inpat.length());
1676        }
1677
1678        UnicodeSet us2(exppat, ec);
1679        if (U_FAILURE(ec)) {
1680            errln("FAIL: couldn't construct expected UnicodeSet");
1681            continue;
1682        }
1683
1684        UnicodeString a, b;
1685        if (us != us2) {
1686            errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1687                  ", expected " + us2.toPattern(b, TRUE));
1688        } else {
1689            logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1690        }
1691    }
1692}
1693
1694void UnicodeSetTest::TestSurrogate() {
1695    const char* DATA[] = {
1696        // These should all behave identically
1697        "[abc\\uD800\\uDC00]",
1698        // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1699        "[abc\\U00010000]",
1700        0
1701    };
1702    for (int i=0; DATA[i] != 0; ++i) {
1703        UErrorCode ec = U_ZERO_ERROR;
1704        logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1705        UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1706        UnicodeSet set(str, ec);
1707        if (U_FAILURE(ec)) {
1708            errln("FAIL: UnicodeSet constructor");
1709            continue;
1710        }
1711        expectContainment(set,
1712                          CharsToUnicodeString("abc\\U00010000"),
1713                          CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1714        if (set.size() != 4) {
1715            errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1716                  set.size() + ", expected 4");
1717        }
1718    }
1719}
1720
1721void UnicodeSetTest::TestExhaustive() {
1722    // exhaustive tests. Simulate UnicodeSets with integers.
1723    // That gives us very solid tests (except for large memory tests).
1724
1725    int32_t limit = 128;
1726
1727    UnicodeSet x, y, z, aa;
1728
1729    for (int32_t i = 0; i < limit; ++i) {
1730        bitsToSet(i, x);
1731        logln((UnicodeString)"Testing " + i + ", " + x);
1732        _testComplement(i, x, y);
1733
1734        // AS LONG AS WE ARE HERE, check roundtrip
1735        checkRoundTrip(bitsToSet(i, aa));
1736
1737        for (int32_t j = 0; j < limit; ++j) {
1738            _testAdd(i,j,  x,y,z);
1739            _testXor(i,j,  x,y,z);
1740            _testRetain(i,j,  x,y,z);
1741            _testRemove(i,j,  x,y,z);
1742        }
1743    }
1744}
1745
1746void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1747    bitsToSet(a, x);
1748    z = x;
1749    z.complement();
1750    int32_t c = setToBits(z);
1751    if (c != (~a)) {
1752        errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1753        errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1754    }
1755    checkCanonicalRep(z, (UnicodeString)"complement " + a);
1756}
1757
1758void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1759    bitsToSet(a, x);
1760    bitsToSet(b, y);
1761    z = x;
1762    z.addAll(y);
1763    int32_t c = setToBits(z);
1764    if (c != (a | b)) {
1765        errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1766        errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1767    }
1768    checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1769}
1770
1771void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1772    bitsToSet(a, x);
1773    bitsToSet(b, y);
1774    z = x;
1775    z.retainAll(y);
1776    int32_t c = setToBits(z);
1777    if (c != (a & b)) {
1778        errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1779        errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1780    }
1781    checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1782}
1783
1784void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1785    bitsToSet(a, x);
1786    bitsToSet(b, y);
1787    z = x;
1788    z.removeAll(y);
1789    int32_t c = setToBits(z);
1790    if (c != (a &~ b)) {
1791        errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1792        errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1793    }
1794    checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1795}
1796
1797void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1798    bitsToSet(a, x);
1799    bitsToSet(b, y);
1800    z = x;
1801    z.complementAll(y);
1802    int32_t c = setToBits(z);
1803    if (c != (a ^ b)) {
1804        errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1805        errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1806    }
1807    checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1808}
1809
1810/**
1811 * Check that ranges are monotonically increasing and non-
1812 * overlapping.
1813 */
1814void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1815    int32_t n = set.getRangeCount();
1816    if (n < 0) {
1817        errln((UnicodeString)"FAIL result of " + msg +
1818              ": range count should be >= 0 but is " +
1819              n /*+ " for " + set.toPattern())*/);
1820        return;
1821    }
1822    UChar32 last = 0;
1823    for (int32_t i=0; i<n; ++i) {
1824        UChar32 start = set.getRangeStart(i);
1825        UChar32 end = set.getRangeEnd(i);
1826        if (start > end) {
1827            errln((UnicodeString)"FAIL result of " + msg +
1828                  ": range " + (i+1) +
1829                  " start > end: " + (int)start + ", " + (int)end +
1830                  " for " + set);
1831        }
1832        if (i > 0 && start <= last) {
1833            errln((UnicodeString)"FAIL result of " + msg +
1834                  ": range " + (i+1) +
1835                  " overlaps previous range: " + (int)start + ", " + (int)end +
1836                  " for " + set);
1837        }
1838        last = end;
1839    }
1840}
1841
1842/**
1843 * Convert a bitmask to a UnicodeSet.
1844 */
1845UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1846    result.clear();
1847    for (UChar32 i = 0; i < 32; ++i) {
1848        if ((a & (1<<i)) != 0) {
1849            result.add(i);
1850        }
1851    }
1852    return result;
1853}
1854
1855/**
1856 * Convert a UnicodeSet to a bitmask.  Only the characters
1857 * U+0000 to U+0020 are represented in the bitmask.
1858 */
1859int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1860    int32_t result = 0;
1861    for (int32_t i = 0; i < 32; ++i) {
1862        if (x.contains((UChar32)i)) {
1863            result |= (1<<i);
1864        }
1865    }
1866    return result;
1867}
1868
1869/**
1870 * Return the representation of an inversion list based UnicodeSet
1871 * as a pairs list.  Ranges are listed in ascending Unicode order.
1872 * For example, the set [a-zA-M3] is represented as "33AMaz".
1873 */
1874UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1875    UnicodeString pairs;
1876    for (int32_t i=0; i<set.getRangeCount(); ++i) {
1877        UChar32 start = set.getRangeStart(i);
1878        UChar32 end = set.getRangeEnd(i);
1879        if (end > 0xFFFF) {
1880            end = 0xFFFF;
1881            i = set.getRangeCount(); // Should be unnecessary
1882        }
1883        pairs.append((UChar)start).append((UChar)end);
1884    }
1885    return pairs;
1886}
1887
1888/**
1889 * Basic consistency check for a few items.
1890 * That the iterator works, and that we can create a pattern and
1891 * get the same thing back
1892 */
1893void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1894    UErrorCode ec = U_ZERO_ERROR;
1895
1896    UnicodeSet t(s);
1897    checkEqual(s, t, "copy ct");
1898
1899    t = s;
1900    checkEqual(s, t, "operator=");
1901
1902    copyWithIterator(t, s, FALSE);
1903    checkEqual(s, t, "iterator roundtrip");
1904
1905    copyWithIterator(t, s, TRUE); // try range
1906    checkEqual(s, t, "iterator roundtrip");
1907
1908    UnicodeString pat; s.toPattern(pat, FALSE);
1909    t.applyPattern(pat, ec);
1910    if (U_FAILURE(ec)) {
1911        errln("FAIL: applyPattern");
1912        return;
1913    } else {
1914        checkEqual(s, t, "toPattern(false)");
1915    }
1916
1917    s.toPattern(pat, TRUE);
1918    t.applyPattern(pat, ec);
1919    if (U_FAILURE(ec)) {
1920        errln("FAIL: applyPattern");
1921        return;
1922    } else {
1923        checkEqual(s, t, "toPattern(true)");
1924    }
1925}
1926
1927void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1928    t.clear();
1929    UnicodeSetIterator it(s);
1930    if (withRange) {
1931        while (it.nextRange()) {
1932            if (it.isString()) {
1933                t.add(it.getString());
1934            } else {
1935                t.add(it.getCodepoint(), it.getCodepointEnd());
1936            }
1937        }
1938    } else {
1939        while (it.next()) {
1940            if (it.isString()) {
1941                t.add(it.getString());
1942            } else {
1943                t.add(it.getCodepoint());
1944            }
1945        }
1946    }
1947}
1948
1949UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1950    UnicodeString source; s.toPattern(source, TRUE);
1951    UnicodeString result; t.toPattern(result, TRUE);
1952    if (s != t) {
1953        errln((UnicodeString)"FAIL: " + message
1954              + "; source = " + source
1955              + "; result = " + result
1956              );
1957        return FALSE;
1958    } else {
1959        logln((UnicodeString)"Ok: " + message
1960              + "; source = " + source
1961              + "; result = " + result
1962              );
1963    }
1964    return TRUE;
1965}
1966
1967void
1968UnicodeSetTest::expectContainment(const UnicodeString& pat,
1969                                  const UnicodeString& charsIn,
1970                                  const UnicodeString& charsOut) {
1971    UErrorCode ec = U_ZERO_ERROR;
1972    UnicodeSet set(pat, ec);
1973    if (U_FAILURE(ec)) {
1974        dataerrln((UnicodeString)"FAIL: pattern \"" +
1975              pat + "\" => " + u_errorName(ec));
1976        return;
1977    }
1978    expectContainment(set, pat, charsIn, charsOut);
1979}
1980
1981void
1982UnicodeSetTest::expectContainment(const UnicodeSet& set,
1983                                  const UnicodeString& charsIn,
1984                                  const UnicodeString& charsOut) {
1985    UnicodeString pat;
1986    set.toPattern(pat);
1987    expectContainment(set, pat, charsIn, charsOut);
1988}
1989
1990void
1991UnicodeSetTest::expectContainment(const UnicodeSet& set,
1992                                  const UnicodeString& setName,
1993                                  const UnicodeString& charsIn,
1994                                  const UnicodeString& charsOut) {
1995    UnicodeString bad;
1996    UChar32 c;
1997    int32_t i;
1998
1999    for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2000        c = charsIn.char32At(i);
2001        if (!set.contains(c)) {
2002            bad.append(c);
2003        }
2004    }
2005    if (bad.length() > 0) {
2006        errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2007              ", expected containment of " + prettify(charsIn));
2008    } else {
2009        logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2010    }
2011
2012    bad.truncate(0);
2013    for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2014        c = charsOut.char32At(i);
2015        if (set.contains(c)) {
2016            bad.append(c);
2017        }
2018    }
2019    if (bad.length() > 0) {
2020        errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2021              ", expected non-containment of " + prettify(charsOut));
2022    } else {
2023        logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2024    }
2025}
2026
2027void
2028UnicodeSetTest::expectPattern(UnicodeSet& set,
2029                              const UnicodeString& pattern,
2030                              const UnicodeString& expectedPairs){
2031    UErrorCode status = U_ZERO_ERROR;
2032    set.applyPattern(pattern, status);
2033    if (U_FAILURE(status)) {
2034        errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2035              "\") failed");
2036        return;
2037    } else {
2038        if (getPairs(set) != expectedPairs ) {
2039            errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2040                  "\") => pairs \"" +
2041                  escape(getPairs(set)) + "\", expected \"" +
2042                  escape(expectedPairs) + "\"");
2043        } else {
2044            logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2045                  "\") => pairs \"" +
2046                  escape(getPairs(set)) + "\"");
2047        }
2048    }
2049    // the result of calling set.toPattern(), which is the string representation of
2050    // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2051    // will produce another set that is equal to this one.
2052    UnicodeString temppattern;
2053    set.toPattern(temppattern);
2054    UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2055    if (U_FAILURE(status)) {
2056        errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2057        return;
2058    }
2059    if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2060        errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2061            escape(getPairs(set)) + "\""));
2062    } else{
2063        logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2064    }
2065
2066    delete tempset;
2067
2068}
2069
2070void
2071UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2072    if (getPairs(set) != expectedPairs) {
2073        errln(UnicodeString("FAIL: Expected pair list \"") +
2074              escape(expectedPairs) + "\", got \"" +
2075              escape(getPairs(set)) + "\"");
2076    }
2077}
2078
2079void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2080                                     const UnicodeString& expPat,
2081                                     const char** expStrings) {
2082    UnicodeString pat;
2083    set.toPattern(pat, TRUE);
2084    if (pat == expPat) {
2085        logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2086    } else {
2087        errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2088        return;
2089    }
2090    if (expStrings == NULL) {
2091        return;
2092    }
2093    UBool in = TRUE;
2094    for (int32_t i=0; expStrings[i] != NULL; ++i) {
2095        if (expStrings[i] == NOT) { // sic; pointer comparison
2096            in = FALSE;
2097            continue;
2098        }
2099        UnicodeString s = CharsToUnicodeString(expStrings[i]);
2100        UBool contained = set.contains(s);
2101        if (contained == in) {
2102            logln((UnicodeString)"Ok: " + expPat +
2103                  (contained ? " contains {" : " does not contain {") +
2104                  escape(expStrings[i]) + "}");
2105        } else {
2106            errln((UnicodeString)"FAIL: " + expPat +
2107                  (contained ? " contains {" : " does not contain {") +
2108                  escape(expStrings[i]) + "}");
2109        }
2110    }
2111}
2112
2113static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2114
2115void
2116UnicodeSetTest::doAssert(UBool condition, const char *message)
2117{
2118    if (!condition) {
2119        errln(UnicodeString("ERROR : ") + message);
2120    }
2121}
2122
2123UnicodeString
2124UnicodeSetTest::escape(const UnicodeString& s) {
2125    UnicodeString buf;
2126    for (int32_t i=0; i<s.length(); )
2127    {
2128        UChar32 c = s.char32At(i);
2129        if (0x0020 <= c && c <= 0x007F) {
2130            buf += c;
2131        } else {
2132            if (c <= 0xFFFF) {
2133                buf += (UChar)0x5c; buf += (UChar)0x75;
2134            } else {
2135                buf += (UChar)0x5c; buf += (UChar)0x55;
2136                buf += toHexString((c & 0xF0000000) >> 28);
2137                buf += toHexString((c & 0x0F000000) >> 24);
2138                buf += toHexString((c & 0x00F00000) >> 20);
2139                buf += toHexString((c & 0x000F0000) >> 16);
2140            }
2141            buf += toHexString((c & 0xF000) >> 12);
2142            buf += toHexString((c & 0x0F00) >> 8);
2143            buf += toHexString((c & 0x00F0) >> 4);
2144            buf += toHexString(c & 0x000F);
2145        }
2146        i += U16_LENGTH(c);
2147    }
2148    return buf;
2149}
2150
2151void UnicodeSetTest::TestFreezable() {
2152    UErrorCode errorCode=U_ZERO_ERROR;
2153    UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2154    UnicodeSet idSet(idPattern, errorCode);
2155    if(U_FAILURE(errorCode)) {
2156        dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2157        return;
2158    }
2159
2160    UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2161    UnicodeSet wsSet(wsPattern, errorCode);
2162    if(U_FAILURE(errorCode)) {
2163        dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2164        return;
2165    }
2166
2167    idSet.add(idPattern);
2168    UnicodeSet frozen(idSet);
2169    frozen.freeze();
2170
2171    if(idSet.isFrozen() || !frozen.isFrozen()) {
2172        errln("FAIL: isFrozen() is wrong");
2173    }
2174    if(frozen!=idSet || !(frozen==idSet)) {
2175        errln("FAIL: a copy-constructed frozen set differs from its original");
2176    }
2177
2178    frozen=wsSet;
2179    if(frozen!=idSet || !(frozen==idSet)) {
2180        errln("FAIL: a frozen set was modified by operator=");
2181    }
2182
2183    UnicodeSet frozen2(frozen);
2184    if(frozen2!=frozen || frozen2!=idSet) {
2185        errln("FAIL: a copied frozen set differs from its frozen original");
2186    }
2187    if(!frozen2.isFrozen()) {
2188        errln("FAIL: copy-constructing a frozen set results in a thawed one");
2189    }
2190    UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2191    if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2192        errln("FAIL: UnicodeSet(5, 55) failed");
2193    }
2194    frozen3=frozen;
2195    if(!frozen3.isFrozen()) {
2196        errln("FAIL: copying a frozen set results in a thawed one");
2197    }
2198
2199    UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2200    if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2201        errln("FAIL: clone() failed");
2202    }
2203    cloned->add(0xd802, 0xd805);
2204    if(cloned->containsSome(0xd802, 0xd805)) {
2205        errln("FAIL: unable to modify clone");
2206    }
2207    delete cloned;
2208
2209    UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2210    if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2211        errln("FAIL: cloneAsThawed() failed");
2212    }
2213    thawed->add(0xd802, 0xd805);
2214    if(!thawed->contains(0xd802, 0xd805)) {
2215        errln("FAIL: unable to modify thawed clone");
2216    }
2217    delete thawed;
2218
2219    frozen.set(5, 55);
2220    if(frozen!=idSet || !(frozen==idSet)) {
2221        errln("FAIL: UnicodeSet::set() modified a frozen set");
2222    }
2223
2224    frozen.clear();
2225    if(frozen!=idSet || !(frozen==idSet)) {
2226        errln("FAIL: UnicodeSet::clear() modified a frozen set");
2227    }
2228
2229    frozen.closeOver(USET_CASE_INSENSITIVE);
2230    if(frozen!=idSet || !(frozen==idSet)) {
2231        errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2232    }
2233
2234    frozen.compact();
2235    if(frozen!=idSet || !(frozen==idSet)) {
2236        errln("FAIL: UnicodeSet::compact() modified a frozen set");
2237    }
2238
2239    ParsePosition pos;
2240    frozen.
2241        applyPattern(wsPattern, errorCode).
2242        applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2243        applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2244        applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2245        applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2246    if(frozen!=idSet || !(frozen==idSet)) {
2247        errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2248    }
2249
2250    frozen.
2251        add(0xd800).
2252        add(0xd802, 0xd805).
2253        add(wsPattern).
2254        addAll(idPattern).
2255        addAll(wsSet);
2256    if(frozen!=idSet || !(frozen==idSet)) {
2257        errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2258    }
2259
2260    frozen.
2261        retain(0x62).
2262        retain(0x64, 0x69).
2263        retainAll(wsPattern).
2264        retainAll(wsSet);
2265    if(frozen!=idSet || !(frozen==idSet)) {
2266        errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2267    }
2268
2269    frozen.
2270        remove(0x62).
2271        remove(0x64, 0x69).
2272        remove(idPattern).
2273        removeAll(idPattern).
2274        removeAll(idSet);
2275    if(frozen!=idSet || !(frozen==idSet)) {
2276        errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2277    }
2278
2279    frozen.
2280        complement().
2281        complement(0x62).
2282        complement(0x64, 0x69).
2283        complement(idPattern).
2284        complementAll(idPattern).
2285        complementAll(idSet);
2286    if(frozen!=idSet || !(frozen==idSet)) {
2287        errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2288    }
2289}
2290
2291// Test span() etc. -------------------------------------------------------- ***
2292
2293// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2294static int32_t
2295appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2296    UErrorCode errorCode=U_ZERO_ERROR;
2297    int32_t length8=0;
2298    u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2299    if(U_SUCCESS(errorCode)) {
2300        return length8;
2301    } else {
2302        // The string contains an unpaired surrogate.
2303        // Ignore this string.
2304        return 0;
2305    }
2306}
2307
2308class UnicodeSetWithStringsIterator;
2309
2310// Make the strings in a UnicodeSet easily accessible.
2311class UnicodeSetWithStrings {
2312public:
2313    UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2314            set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2315        int32_t size=set.size();
2316        if(size>0 && set.charAt(size-1)<0) {
2317            // If a set's last element is not a code point, then it must contain strings.
2318            // Iterate over the set, skip all code point ranges, and cache the strings.
2319            // Convert them to UTF-8 for spanUTF8().
2320            UnicodeSetIterator iter(set);
2321            const UnicodeString *s;
2322            char *s8=utf8;
2323            int32_t length8, utf8Count=0;
2324            while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2325                if(iter.isString()) {
2326                    // Store the pointer to the set's string element
2327                    // which we happen to know is a stable pointer.
2328                    strings[stringsLength]=s=&iter.getString();
2329                    utf8Count+=
2330                        utf8Lengths[stringsLength]=length8=
2331                        appendUTF8(s->getBuffer(), s->length(),
2332                                   s8, (int32_t)(sizeof(utf8)-utf8Count));
2333                    if(length8==0) {
2334                        hasSurrogates=TRUE;  // Contains unpaired surrogates.
2335                    }
2336                    s8+=length8;
2337                    ++stringsLength;
2338                }
2339            }
2340        }
2341    }
2342
2343    const UnicodeSet &getSet() const {
2344        return set;
2345    }
2346
2347    UBool hasStrings() const {
2348        return (UBool)(stringsLength>0);
2349    }
2350
2351    UBool hasStringsWithSurrogates() const {
2352        return hasSurrogates;
2353    }
2354
2355private:
2356    friend class UnicodeSetWithStringsIterator;
2357
2358    const UnicodeSet &set;
2359
2360    const UnicodeString *strings[20];
2361    int32_t stringsLength;
2362    UBool hasSurrogates;
2363
2364    char utf8[1024];
2365    int32_t utf8Lengths[20];
2366
2367    int32_t nextStringIndex;
2368    int32_t nextUTF8Start;
2369};
2370
2371class UnicodeSetWithStringsIterator {
2372public:
2373    UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2374            fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2375    }
2376
2377    void reset() {
2378        nextStringIndex=nextUTF8Start=0;
2379    }
2380
2381    const UnicodeString *nextString() {
2382        if(nextStringIndex<fSet.stringsLength) {
2383            return fSet.strings[nextStringIndex++];
2384        } else {
2385            return NULL;
2386        }
2387    }
2388
2389    // Do not mix with calls to nextString().
2390    const char *nextUTF8(int32_t &length) {
2391        if(nextStringIndex<fSet.stringsLength) {
2392            const char *s8=fSet.utf8+nextUTF8Start;
2393            nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2394            return s8;
2395        } else {
2396            length=0;
2397            return NULL;
2398        }
2399    }
2400
2401private:
2402    const UnicodeSetWithStrings &fSet;
2403    int32_t nextStringIndex;
2404    int32_t nextUTF8Start;
2405};
2406
2407// Compare 16-bit Unicode strings (which may be malformed UTF-16)
2408// at code point boundaries.
2409// That is, each edge of a match must not be in the middle of a surrogate pair.
2410static inline UBool
2411matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2412    s+=start;
2413    limit-=start;
2414    int32_t length=t.length();
2415    return 0==t.compare(s, length) &&
2416           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2417           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2418}
2419
// Implement span() with contains() for comparison.
// Returns the index in s[0..length[ where the span that starts at index 0 ends.
// There are three cases:
// - The set has no strings: Step through s by code point.
// - USET_SPAN_NOT_CONTAINED with strings: Also stop where a set string matches.
// - USET_SPAN_CONTAINED or USET_SPAN_SIMPLE with strings: Try both code points
//   and set strings at each position.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev is the index before the code point that is read in this iteration;
        // it is returned when that code point ends the span.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        // start is the current position, next is the index after the code point there.
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The span also ends where any of the set's strings matches.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit is the largest span limit found via the recursive calls below.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // Return the larger of the iterated span limit and the best recursive result.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2518
// Implement spanBack() with contains() for comparison.
// Works backward from the end of s[0..length[ and returns the index
// where the span that ends at length starts.
// Has the same three cases as containsSpanUTF16(), in the backward direction.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        // prev is the index after the code point that is read in this iteration;
        // length is moved backward by U16_PREV().
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original length for the boundary check in matches16CPB().
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The span also ends where any of the set's strings matches, ending at prev.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart is the smallest span start found via the recursive calls below.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // Return the smaller of the iterated span start and the best recursive result.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2617
/*
 * Simple, contains()-based forward span over UTF-8 text.
 * Used by getSpans() (types 0 and 1) to produce span limits that are compared
 * with the results of UnicodeSet::spanUTF8().
 *
 * set            the set plus access to its strings (and their UTF-8 forms)
 * s              start of the UTF-8 text
 * length         number of bytes in s; used directly as a byte count here
 * spanCondition  which kind of span to measure
 *
 * Returns the byte offset in s at which the span ends (0..length).
 * Byte sequences for which U8_NEXT yields a negative code point are treated as U+FFFD.
 */
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: Only single code points matter.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev is the start of the code point being examined;
        // it is the result when that code point ends the span.
        while((prev=start)<length) {
            U8_NEXT(s, start, length, c);
            if(c<0) {
                c=0xfffd;
            }
            // The pinned spanCondition is compared directly with the contains() result.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Span while neither the code point at start is in the set
        // nor any of the set's strings matches at start.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Strings with length8==0 are skipped.
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // next: where the iteration continues from start (next==start means "no match yet").
        // maxSpanLimit: the farthest limit reached by any of the recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Strings with length8==0 are skipped.
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        // A match that reaches the end of the text cannot be improved upon.
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the farther of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2726
/*
 * Simple, contains()-based backward span over UTF-8 text.
 * Used by getSpans() (types 4 and 5) to produce span limits that are compared
 * with the results of UnicodeSet::spanBackUTF8().
 *
 * set            the set plus access to its strings (and their UTF-8 forms)
 * s              start of the UTF-8 text
 * length         number of bytes in s; the span runs backward from this offset.
 *                Inside the function, length also serves as the moving read position.
 * spanCondition  which kind of span to measure
 *
 * Returns the byte offset in s at which the backward span starts (0..length).
 * Byte sequences for which U8_PREV yields a negative code point are treated as U+FFFD.
 */
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    // The do-while loops below read at least one code point, so handle empty text first.
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: Only single code points matter.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        // prev is the limit of the code point being examined;
        // it is the result when that code point ends the span.
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            // The pinned spanCondition is compared directly with the contains() result.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Span backward while neither the code point before prev is in the set
        // nor any of the set's strings matches the text ending at prev.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Strings with length8==0 are skipped.
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length: where the iteration continues from prev (length==prev means "no match yet").
        // minSpanStart: the lowest start reached by any of the recursive calls.
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Strings with length8==0 are skipped.
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        // A match that reaches the start of the text cannot be improved upon.
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the lower of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2836
// Bit flags that select which spans are performed and compared.
// Each group lists its individual option bits, followed by the mask that covers the group.
enum {
    // UTF form of the text
    SPAN_UTF16          =0x1,
    SPAN_UTF8           =0x2,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // original set and/or its complement
    SPAN_SET            =0x4,
    SPAN_COMPLEMENT     =0x8,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // direction
    SPAN_FWD            =0x10,
    SPAN_BACK           =0x20,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // span condition used for the "contained" spans
    SPAN_CONTAINED      =0x100,
    SPAN_SIMPLE         =0x200,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // every option of every group
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2857
2858static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2859    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2860}
2861
2862static inline int32_t slen(const void *s, UBool isUTF16) {
2863    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2864}
2865
2866/*
2867 * Count spans on a string with the method according to type and set the span limits.
2868 * The set may be the complement of the original.
2869 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2870 * according to the expected number of spans.
2871 * Sets typeName to an empty string if there is no such type.
2872 * Returns -1 if the span option is filtered out.
2873 */
2874static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2875                        const void *s, int32_t length, UBool isUTF16,
2876                        uint32_t whichSpans,
2877                        int type, const char *&typeName,
2878                        int32_t limits[], int32_t limitsCapacity,
2879                        int32_t expectCount) {
2880    const UnicodeSet &realSet(set.getSet());
2881    int32_t start, count;
2882    USetSpanCondition spanCondition, firstSpanCondition, contained;
2883    UBool isForward;
2884
2885    if(type<0 || 7<type) {
2886        typeName="";
2887        return 0;
2888    }
2889
2890    static const char *const typeNames16[]={
2891        "contains", "contains(LM)",
2892        "span", "span(LM)",
2893        "containsBack", "containsBack(LM)",
2894        "spanBack", "spanBack(LM)"
2895    };
2896
2897    static const char *const typeNames8[]={
2898        "containsUTF8", "containsUTF8(LM)",
2899        "spanUTF8", "spanUTF8(LM)",
2900        "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2901        "spanBackUTF8", "spanBackUTF8(LM)"
2902    };
2903
2904    typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2905
2906    // filter span options
2907    if(type<=3) {
2908        // span forward
2909        if((whichSpans&SPAN_FWD)==0) {
2910            return -1;
2911        }
2912        isForward=TRUE;
2913    } else {
2914        // span backward
2915        if((whichSpans&SPAN_BACK)==0) {
2916            return -1;
2917        }
2918        isForward=FALSE;
2919    }
2920    if((type&1)==0) {
2921        // use USET_SPAN_CONTAINED
2922        if((whichSpans&SPAN_CONTAINED)==0) {
2923            return -1;
2924        }
2925        contained=USET_SPAN_CONTAINED;
2926    } else {
2927        // use USET_SPAN_SIMPLE
2928        if((whichSpans&SPAN_SIMPLE)==0) {
2929            return -1;
2930        }
2931        contained=USET_SPAN_SIMPLE;
2932    }
2933
2934    // Default first span condition for going forward with an uncomplemented set.
2935    spanCondition=USET_SPAN_NOT_CONTAINED;
2936    if(isComplement) {
2937        spanCondition=invertSpanCondition(spanCondition, contained);
2938    }
2939
2940    // First span condition for span(), used to terminate the spanBack() iteration.
2941    firstSpanCondition=spanCondition;
2942
2943    // spanBack(): Its initial span condition is span()'s last span condition,
2944    // which is the opposite of span()'s first span condition
2945    // if we expect an even number of spans.
2946    // (The loop inverts spanCondition (expectCount-1) times
2947    // before the expectCount'th span() call.)
2948    // If we do not compare forward and backward directions, then we do not have an
2949    // expectCount and just start with firstSpanCondition.
2950    if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2951        spanCondition=invertSpanCondition(spanCondition, contained);
2952    }
2953
2954    count=0;
2955    switch(type) {
2956    case 0:
2957    case 1:
2958        start=0;
2959        if(length<0) {
2960            length=slen(s, isUTF16);
2961        }
2962        for(;;) {
2963            start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2964                              containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2965            if(count<limitsCapacity) {
2966                limits[count]=start;
2967            }
2968            ++count;
2969            if(start>=length) {
2970                break;
2971            }
2972            spanCondition=invertSpanCondition(spanCondition, contained);
2973        }
2974        break;
2975    case 2:
2976    case 3:
2977        start=0;
2978        for(;;) {
2979            start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2980                              realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2981            if(count<limitsCapacity) {
2982                limits[count]=start;
2983            }
2984            ++count;
2985            if(length>=0 ? start>=length :
2986                           isUTF16 ? ((const UChar *)s)[start]==0 :
2987                                     ((const char *)s)[start]==0
2988            ) {
2989                break;
2990            }
2991            spanCondition=invertSpanCondition(spanCondition, contained);
2992        }
2993        break;
2994    case 4:
2995    case 5:
2996        if(length<0) {
2997            length=slen(s, isUTF16);
2998        }
2999        for(;;) {
3000            ++count;
3001            if(count<=limitsCapacity) {
3002                limits[limitsCapacity-count]=length;
3003            }
3004            length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
3005                              containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
3006            if(length==0 && spanCondition==firstSpanCondition) {
3007                break;
3008            }
3009            spanCondition=invertSpanCondition(spanCondition, contained);
3010        }
3011        if(count<limitsCapacity) {
3012            memmove(limits, limits+(limitsCapacity-count), count*4);
3013        }
3014        break;
3015    case 6:
3016    case 7:
3017        for(;;) {
3018            ++count;
3019            if(count<=limitsCapacity) {
3020                limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3021            }
3022            // Note: Length<0 is tested only for the first spanBack().
3023            // If we wanted to keep length<0 for all spanBack()s, we would have to
3024            // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3025            length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3026                              realSet.spanBackUTF8((const char *)s, length, spanCondition);
3027            if(length==0 && spanCondition==firstSpanCondition) {
3028                break;
3029            }
3030            spanCondition=invertSpanCondition(spanCondition, contained);
3031        }
3032        if(count<limitsCapacity) {
3033            memmove(limits, limits+(limitsCapacity-count), count*4);
3034        }
3035        break;
3036    default:
3037        typeName="";
3038        return -1;
3039    }
3040
3041    return count;
3042}
3043
// Indexes of the sets to be tested.
// An odd index means that the set is a complement (isComplement).
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};

// Names of the sets for use in error messages, indexed by the constants above.
static const char *const setNames[SET_COUNT]={
    "slow", "slow.not",
    "fast", "fast.not"
};
3059
3060/*
3061 * Verify that we get the same results whether we look at text with contains(),
3062 * span() or spanBack(), using unfrozen or frozen versions of the set,
3063 * and using the set or its complement (switching the spanConditions accordingly).
3064 * The latter verifies that
3065 *   set.span(spanCondition) == set.complement().span(!spanCondition).
3066 *
3067 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3068 * or returned to the caller (with an input expectCount<0).
3069 */
3070void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3071                              const void *s, int32_t length, UBool isUTF16,
3072                              uint32_t whichSpans,
3073                              int32_t expectLimits[], int32_t &expectCount,
3074                              const char *testName, int32_t index) {
3075    int32_t limits[500];
3076    int32_t limitsCount;
3077    int i, j;
3078
3079    const char *typeName;
3080    int type;
3081
3082    for(i=0; i<SET_COUNT; ++i) {
3083        if((i&1)==0) {
3084            // Even-numbered sets are original, uncomplemented sets.
3085            if((whichSpans&SPAN_SET)==0) {
3086                continue;
3087            }
3088        } else {
3089            // Odd-numbered sets are complemented.
3090            if((whichSpans&SPAN_COMPLEMENT)==0) {
3091                continue;
3092            }
3093        }
3094        for(type=0;; ++type) {
3095            limitsCount=getSpans(*sets[i], (UBool)(i&1),
3096                                 s, length, isUTF16,
3097                                 whichSpans,
3098                                 type, typeName,
3099                                 limits, LENGTHOF(limits), expectCount);
3100            if(typeName[0]==0) {
3101                break; // All types tried.
3102            }
3103            if(limitsCount<0) {
3104                continue; // Span option filtered out.
3105            }
3106            if(expectCount<0) {
3107                expectCount=limitsCount;
3108                if(limitsCount>LENGTHOF(limits)) {
3109                    errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3110                          testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
3111                    return;
3112                }
3113                memcpy(expectLimits, limits, limitsCount*4);
3114            } else if(limitsCount!=expectCount) {
3115                errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3116                      testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3117            } else {
3118                for(j=0; j<limitsCount; ++j) {
3119                    if(limits[j]!=expectLimits[j]) {
3120                        errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3121                              testName, (long)index, setNames[i], typeName, (long)limitsCount,
3122                              j, (long)limits[j], (long)expectLimits[j]);
3123                        break;
3124                    }
3125                }
3126            }
3127        }
3128    }
3129
3130    // Compare span() with containsAll()/containsNone(),
3131    // but only if we have expectLimits[] from the uncomplemented set.
3132    if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3133        const UChar *s16=(const UChar *)s;
3134        UnicodeString string;
3135        int32_t prev=0, limit, length;
3136        for(i=0; i<expectCount; ++i) {
3137            limit=expectLimits[i];
3138            length=limit-prev;
3139            if(length>0) {
3140                string.setTo(FALSE, s16+prev, length);  // read-only alias
3141                if(i&1) {
3142                    if(!sets[SLOW]->getSet().containsAll(string)) {
3143                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3144                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3145                        return;
3146                    }
3147                    if(!sets[FAST]->getSet().containsAll(string)) {
3148                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3149                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3150                        return;
3151                    }
3152                } else {
3153                    if(!sets[SLOW]->getSet().containsNone(string)) {
3154                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3155                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3156                        return;
3157                    }
3158                    if(!sets[FAST]->getSet().containsNone(string)) {
3159                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3160                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3161                        return;
3162                    }
3163                }
3164            }
3165            prev=limit;
3166        }
3167    }
3168}
3169
3170// Specifically test either UTF-16 or UTF-8.
3171void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3172                              const void *s, int32_t length, UBool isUTF16,
3173                              uint32_t whichSpans,
3174                              const char *testName, int32_t index) {
3175    int32_t expectLimits[500];
3176    int32_t expectCount=-1;
3177    testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3178}
3179
3180UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3181    UChar c, c2;
3182
3183    if(length>=0) {
3184        while(length>0) {
3185            c=*s++;
3186            --length;
3187            if(0xd800<=c && c<0xe000) {
3188                if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3189                    return TRUE;
3190                }
3191                --length;
3192            }
3193        }
3194    } else {
3195        while((c=*s++)!=0) {
3196            if(0xd800<=c && c<0xe000) {
3197                if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3198                    return TRUE;
3199                }
3200            }
3201        }
3202    }
3203    return FALSE;
3204}
3205
3206// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3207// unless either UTF is turned off in whichSpans.
3208// Testing UTF-16 and UTF-8 together requires that surrogate code points
3209// have the same contains(c) value as U+FFFD.
3210void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3211                                      const UChar *s16, int32_t length16,
3212                                      uint32_t whichSpans,
3213                                      const char *testName, int32_t index) {
3214    int32_t expectLimits[500];
3215    int32_t expectCount;
3216
3217    expectCount=-1;  // Get expectLimits[] from testSpan().
3218
3219    if((whichSpans&SPAN_UTF16)!=0) {
3220        testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3221    }
3222    if((whichSpans&SPAN_UTF8)==0) {
3223        return;
3224    }
3225
3226    // Convert s16[] and expectLimits[] to UTF-8.
3227    uint8_t s8[3000];
3228    int32_t offsets[3000];
3229
3230    const UChar *s16Limit=s16+length16;
3231    char *t=(char *)s8;
3232    char *tLimit=t+sizeof(s8);
3233    int32_t *o=offsets;
3234    UErrorCode errorCode=U_ZERO_ERROR;
3235
3236    // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3237    ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3238    if(U_FAILURE(errorCode)) {
3239        errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3240              testName, (long)index, u_errorName(errorCode));
3241        ucnv_resetFromUnicode(utf8Cnv);
3242        return;
3243    }
3244    int32_t length8=(int32_t)(t-(char *)s8);
3245
3246    // Convert expectLimits[].
3247    int32_t i, j, expect;
3248    for(i=j=0; i<expectCount; ++i) {
3249        expect=expectLimits[i];
3250        if(expect==length16) {
3251            expectLimits[i]=length8;
3252        } else {
3253            while(offsets[j]<expect) {
3254                ++j;
3255            }
3256            expectLimits[i]=j;
3257        }
3258    }
3259
3260    testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3261}
3262
3263static UChar32 nextCodePoint(UChar32 c) {
3264    // Skip some large and boring ranges.
3265    switch(c) {
3266    case 0x3441:
3267        return 0x4d7f;
3268    case 0x5100:
3269        return 0x9f00;
3270    case 0xb040:
3271        return 0xd780;
3272    case 0xe041:
3273        return 0xf8fe;
3274    case 0x10100:
3275        return 0x20000;
3276    case 0x20041:
3277        return 0xe0000;
3278    case 0xe0101:
3279        return 0x10fffd;
3280    default:
3281        return c+1;
3282    }
3283}
3284
3285// Verify that all implementations represent the same set.
3286void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3287    // contains(U+FFFD) is inconsistent with contains(some surrogates),
3288    // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3289    // Skip the UTF-8 part of the test - if the string contains surrogates -
3290    // because it is likely to produce a different result.
3291    UBool inconsistentSurrogates=
3292            (!(sets[0]->getSet().contains(0xfffd) ?
3293               sets[0]->getSet().contains(0xd800, 0xdfff) :
3294               sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3295             sets[0]->hasStringsWithSurrogates());
3296
3297    UChar s[1000];
3298    int32_t length=0;
3299    uint32_t localWhichSpans;
3300
3301    UChar32 c, first;
3302    for(first=c=0;; c=nextCodePoint(c)) {
3303        if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3304            localWhichSpans=whichSpans;
3305            if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3306                localWhichSpans&=~SPAN_UTF8;
3307            }
3308            testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3309            if(c>0x10ffff) {
3310                break;
3311            }
3312            length=0;
3313            first=c;
3314        }
3315        U16_APPEND_UNSAFE(s, length, c);
3316    }
3317}
3318
3319// Test with a particular, interesting string.
3320// Specify length and try NUL-termination.
3321void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3322    static const UChar s[]={
3323        0x61, 0x62, 0x20,                       // Latin, space
3324        0x3b1, 0x3b2, 0x3b3,                    // Greek
3325        0xd900,                                 // lead surrogate
3326        0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3327        0xdc05,                                 // trail surrogate
3328        0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3329        0xd900, 0xdc05,                         // unassigned supplementary
3330        0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3331        0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3332        0                                       // NUL
3333    };
3334
3335    if((whichSpans&SPAN_UTF16)==0) {
3336        return;
3337    }
3338    testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3339    testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3340}
3341
// Test with a particular UTF-8 byte string that mixes well-formed text with
// many kinds of ill-formed sequences (stray trail bytes, truncated sequences,
// non-shortest forms, out-of-range values).
// The string is tested twice: once NUL-terminated (length -1, test index 0)
// and once with its explicit length (test index 1).
// Does nothing unless SPAN_UTF8 is selected in whichSpans.
void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
    // Note: The pieces are adjacent string literals that the compiler concatenates.
    // Keep them separate so that no hex escape runs into the following characters.
    static const char s[]={
        "abc"                                   // Latin

        /* trail byte in lead position */
        "\x80"

        " "                                     // space

        /* truncated multi-byte sequences */
        "\xd0"
        "\xe0"
        "\xe1"
        "\xed"
        "\xee"
        "\xf0"
        "\xf1"
        "\xf4"
        "\xf8"
        "\xfc"

        "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek

        /* trail byte in lead position */
        "\x80"

        /* truncated multi-byte sequences: lead byte plus one trail byte */
        "\xe0\x80"
        "\xe0\xa0"
        "\xe1\x80"
        "\xed\x80"
        "\xed\xa0"
        "\xee\x80"
        "\xf0\x80"
        "\xf0\x90"
        "\xf1\x80"
        "\xf4\x80"
        "\xf4\x90"
        "\xf8\x80"
        "\xfc\x80"

        "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana

        /* trail byte in lead position */
        "\x80"

        /* truncated multi-byte sequences: lead byte plus two trail bytes */
        "\xf0\x80\x80"
        "\xf0\x90\x80"
        "\xf1\x80\x80"
        "\xf4\x80\x80"
        "\xf4\x90\x80"
        "\xf8\x80\x80"
        "\xfc\x80\x80"

        "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul

        /* trail byte in lead position */
        "\x80"

        /* truncated multi-byte sequences: lead byte plus three trail bytes */
        "\xf8\x80\x80\x80"
        "\xfc\x80\x80\x80"

        "\xF1\x90\x80\x85"                      // unassigned supplementary

        /* trail byte in lead position */
        "\x80"

        /* truncated multi-byte sequence: lead byte plus four trail bytes */
        "\xfc\x80\x80\x80\x80"

        "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary

        /* trail byte in lead position */
        "\x80"

        /* complete sequences but non-shortest forms or out of range etc. */
        "\xc0\x80"
        "\xe0\x80\x80"
        "\xed\xa0\x80"
        "\xf0\x80\x80\x80"
        "\xf4\x90\x80\x80"
        "\xf8\x80\x80\x80\x80"
        "\xfc\x80\x80\x80\x80\x80"
        "\xfe"
        "\xff"

        /* trail byte in lead position */
        "\x80"

        "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
    };

    if((whichSpans&SPAN_UTF8)==0) {
        return;
    }
    // This text is UTF-8 only: Turn off the UTF-16 spans.
    // LENGTHOF(s)-1 excludes the terminating NUL of the string literal.
    testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
    testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
}
3438
// Fan out a list of span-option words into up to three variants.
//
// Every one of the first whichSpansCount entries is reduced to its bits under
// mask and then combined with a, in place. If b is nonzero, a second copy
// combined with b is stored in the next whichSpansCount slots; if c is nonzero
// as well, a third copy combined with c is stored in the slots after those.
// c is ignored when b is zero.
//
// The caller must provide room for three times whichSpansCount entries when
// both b and c are nonzero (twice when only b is).
//
// Returns the new number of entries: whichSpansCount times 1, 2 or 3.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    const bool addSecond=(b!=0);
    const bool addThird=addSecond && (c!=0);

    for(int32_t idx=0; idx<whichSpansCount; ++idx) {
        // Bits that survive the mask are shared by all variants of this entry.
        const uint32_t kept=whichSpans[idx]&mask;

        whichSpans[idx]=kept|a;
        if(!addSecond) {
            continue;
        }
        whichSpans[whichSpansCount+idx]=kept|b;
        if(addThird) {
            whichSpans[2*whichSpansCount+idx]=kept|c;
        }
    }

    int32_t factor=1;
    if(addSecond) {
        factor= addThird ? 3 : 2;
    }
    return factor*whichSpansCount;
}
3461
// String-literal building blocks: runs of the letter 'a' or 'b' whose length
// (63 or 64) is given by the macro name. TestSpan() concatenates them with
// adjacent literals to build set strings and test strings with very long
// single-character runs.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3466
3467void UnicodeSetTest::TestSpan() {
3468    // "[...]" is a UnicodeSet pattern.
3469    // "*" performs tests on all Unicode code points and on a selection of
3470    //   malformed UTF-8/16 strings.
3471    // "-options" limits the scope of testing for the current set.
3472    //   By default, the test verifies that equivalent boundaries are found
3473    //   for UTF-16 and UTF-8, going forward and backward,
3474    //   alternating USET_SPAN_NOT_CONTAINED with
3475    //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3476    //   Single-character options:
3477    //     8 -- UTF-16 and UTF-8 boundaries may differ.
3478    //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3479    //          or the set contains strings with unpaired surrogates
3480    //          which do not translate to valid UTF-8.
3481    //     c -- set.span() and set.complement().span() boundaries may differ.
3482    //          Cause: Set strings are not complemented.
3483    //     b -- span() and spanBack() boundaries may differ.
3484    //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3485    //          and spanBack(USET_SPAN_SIMPLE) are defined to
3486    //          match with non-overlapping substrings.
3487    //          For example, with a set containing "ab" and "ba",
3488    //          span() of "aba" yields boundaries { 0, 2, 3 }
3489    //          because the initial "ab" matches from 0 to 2,
3490    //          while spanBack() yields boundaries { 0, 1, 3 }
3491    //          because the final "ba" matches from 1 to 3.
3492    //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3493    //          Cause: Strings in the set overlap, and a longer match may
3494    //          require a sequence including non-longest substrings.
3495    //          For example, with a set containing "ab", "abc" and "cd",
3496    //          span(contained) of "abcd" spans the entire string
3497    //          but span(longest match) only spans the first 3 characters.
3498    //   Each "-options" first resets all options and then applies the specified options.
3499    //   A "-" without options resets the options.
3500    //   The options are also reset for each new set.
3501    // Other strings will be spanned.
3502    static const char *const testdata[]={
3503        "[:ID_Continue:]",
3504        "*",
3505        "[:White_Space:]",
3506        "*",
3507        "[]",
3508        "*",
3509        "[\\u0000-\\U0010FFFF]",
3510        "*",
3511        "[\\u0000\\u0080\\u0800\\U00010000]",
3512        "*",
3513        "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3514        "*",
3515        "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3516        "-c",
3517        "*",
3518        "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3519        "-c",
3520        "*",
3521
3522        // Overlapping strings cause overlapping attempts to match.
3523        "[x{xy}{xya}{axy}{ax}]",
3524        "-cl",
3525
3526        // More repetitions of "xya" would take too long with the recursive
3527        // reference implementation.
3528        // containsAll()=FALSE
3529        // test_string 0x14
3530        "xx"
3531        "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3532        "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3533        "xyaxyaxyaxya"
3534        "xx"
3535        "xyaxyaxyaxya"  // span() ends here.
3536        "aaa",
3537
3538        // containsAll()=TRUE
3539        // test_string 0x15
3540        "xx"
3541        "xyaxyaxyaxya"
3542        "xx"
3543        "xyaxyaxyaxya"
3544        "xx"
3545        "xyaxyaxyaxy",
3546
3547        "-bc",
3548        // test_string 0x17
3549        "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3550        "-c",
3551        "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3552        "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3553        "-",
3554        "byaya",     // span() -> { 5 }
3555        "byay",      // span() -> { 4 }
3556        "bya",       // span() -> { 3 }
3557
3558        // span(longest match) will not span the whole string.
3559        "[a{ab}{bc}]",
3560        "-cl",
3561        // test_string 0x21
3562        "abc",
3563
3564        "[a{ab}{abc}{cd}]",
3565        "-cl",
3566        "acdabcdabccd",
3567
3568        // spanBack(longest match) will not span the whole string.
3569        "[c{ab}{bc}]",
3570        "-cl",
3571        "abc",
3572
3573        "[d{cd}{bcd}{ab}]",
3574        "-cl",
3575        "abbcdabcdabd",
3576
3577        // Test with non-ASCII set strings - test proper handling of surrogate pairs
3578        // and UTF-8 trail bytes.
3579        // Copies of above test sets and strings, but transliterated to have
3580        // different code points with similar trail units.
3581        // Previous: a      b         c            d
3582        // Unicode:  042B   30AB      200AB        204AB
3583        // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3584        // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3585        "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3586        "-cl",
3587        "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3588
3589        "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3590        "-cl",
3591        "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3592
3593        // Stress bookkeeping and recursion.
3594        // The following strings are barely doable with the recursive
3595        // reference implementation.
3596        // The not-contained character at the end prevents an early exit from the span().
3597        "[b{bb}]",
3598        "-c",
3599        // test_string 0x33
3600        "bbbbbbbbbbbbbbbbbbbbbbbb-",
3601        // On complement sets, span() and spanBack() get different results
3602        // because b is not in the complement set and there is an odd number of b's
3603        // in the test string.
3604        "-bc",
3605        "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3606
3607        // Test with set strings with an initial or final code point span
3608        // longer than 254.
3609        "[a{" _64_a _64_a _64_a _64_a "b}"
3610          "{a" _64_b _64_b _64_b _64_b "}]",
3611        "-c",
3612        _64_a _64_a _64_a _63_a "b",
3613        _64_a _64_a _64_a _64_a "b",
3614        _64_a _64_a _64_a _64_a "aaaabbbb",
3615        "a" _64_b _64_b _64_b _63_b,
3616        "a" _64_b _64_b _64_b _64_b,
3617        "aaaabbbb" _64_b _64_b _64_b _64_b,
3618
3619        // Test with strings containing unpaired surrogates.
3620        // They are not representable in UTF-8, and a leading trail surrogate
3621        // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3622        // U+20001 == \\uD840\\uDC01
3623        // U+20400 == \\uD841\\uDC00
3624        "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3625        "-8cl",
3626        "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3627    };
3628    uint32_t whichSpans[96]={ SPAN_ALL };
3629    int32_t whichSpansCount=1;
3630
3631    UnicodeSet *sets[SET_COUNT]={ NULL };
3632    const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3633
3634    char testName[1024];
3635    char *testNameLimit=testName;
3636
3637    int32_t i, j;
3638    for(i=0; i<LENGTHOF(testdata); ++i) {
3639        const char *s=testdata[i];
3640        if(s[0]=='[') {
3641            // Create new test sets from this pattern.
3642            for(j=0; j<SET_COUNT; ++j) {
3643                delete sets_with_str[j];
3644                delete sets[j];
3645            }
3646            UErrorCode errorCode=U_ZERO_ERROR;
3647            sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3648            if(U_FAILURE(errorCode)) {
3649                dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3650                break;
3651            }
3652            sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3653            sets[SLOW_NOT]->complement();
3654            // Intermediate set: Test cloning of a frozen set.
3655            UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3656            fast->freeze();
3657            sets[FAST]=(UnicodeSet *)fast->clone();
3658            delete fast;
3659            UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3660            fastNot->freeze();
3661            sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3662            delete fastNot;
3663
3664            for(j=0; j<SET_COUNT; ++j) {
3665                sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3666            }
3667
3668            strcpy(testName, s);
3669            testNameLimit=strchr(testName, 0);
3670            *testNameLimit++=':';
3671            *testNameLimit=0;
3672
3673            whichSpans[0]=SPAN_ALL;
3674            whichSpansCount=1;
3675        } else if(s[0]=='-') {
3676            whichSpans[0]=SPAN_ALL;
3677            whichSpansCount=1;
3678
3679            while(*++s!=0) {
3680                switch(*s) {
3681                case 'c':
3682                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3683                                                   ~SPAN_POLARITY,
3684                                                   SPAN_SET,
3685                                                   SPAN_COMPLEMENT,
3686                                                   0);
3687                    break;
3688                case 'b':
3689                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3690                                                   ~SPAN_DIRS,
3691                                                   SPAN_FWD,
3692                                                   SPAN_BACK,
3693                                                   0);
3694                    break;
3695                case 'l':
3696                    // test USET_SPAN_CONTAINED FWD & BACK, and separately
3697                    // USET_SPAN_SIMPLE only FWD, and separately
3698                    // USET_SPAN_SIMPLE only BACK
3699                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3700                                                   ~(SPAN_DIRS|SPAN_CONDITION),
3701                                                   SPAN_DIRS|SPAN_CONTAINED,
3702                                                   SPAN_FWD|SPAN_SIMPLE,
3703                                                   SPAN_BACK|SPAN_SIMPLE);
3704                    break;
3705                case '8':
3706                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3707                                                   ~SPAN_UTFS,
3708                                                   SPAN_UTF16,
3709                                                   SPAN_UTF8,
3710                                                   0);
3711                    break;
3712                default:
3713                    errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3714                    break;
3715                }
3716            }
3717        } else if(0==strcmp(s, "*")) {
3718            strcpy(testNameLimit, "bad_string");
3719            for(j=0; j<whichSpansCount; ++j) {
3720                if(whichSpansCount>1) {
3721                    sprintf(testNameLimit+10 /* strlen("bad_string") */,
3722                            "%%0x%3x",
3723                            whichSpans[j]);
3724                }
3725                testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3726                testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3727            }
3728
3729            strcpy(testNameLimit, "contents");
3730            for(j=0; j<whichSpansCount; ++j) {
3731                if(whichSpansCount>1) {
3732                    sprintf(testNameLimit+8 /* strlen("contents") */,
3733                            "%%0x%3x",
3734                            whichSpans[j]);
3735                }
3736                testSpanContents(sets_with_str, whichSpans[j], testName);
3737            }
3738        } else {
3739            UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3740            strcpy(testNameLimit, "test_string");
3741            for(j=0; j<whichSpansCount; ++j) {
3742                if(whichSpansCount>1) {
3743                    sprintf(testNameLimit+11 /* strlen("test_string") */,
3744                            "%%0x%3x",
3745                            whichSpans[j]);
3746                }
3747                testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3748            }
3749        }
3750    }
3751    for(j=0; j<SET_COUNT; ++j) {
3752        delete sets_with_str[j];
3753        delete sets[j];
3754    }
3755}
3756
3757// Test select patterns and strings, and test USET_SPAN_SIMPLE.
3758void UnicodeSetTest::TestStringSpan() {
3759    static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3760    static const char *const string=
3761        "xx"
3762        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3763        "xx"
3764        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3765        "xx"
3766        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3767        "aaaa";
3768
3769    UErrorCode errorCode=U_ZERO_ERROR;
3770    UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3771    UnicodeSet set(pattern16, errorCode);
3772    if(U_FAILURE(errorCode)) {
3773        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3774        return;
3775    }
3776
3777    UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3778
3779    if(set.containsAll(string16)) {
3780        errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3781    }
3782
3783    // Remove trailing "aaaa".
3784    string16.truncate(string16.length()-4);
3785    if(!set.containsAll(string16)) {
3786        errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3787    }
3788
3789    string16=UNICODE_STRING_SIMPLE("byayaxya");
3790    const UChar *s16=string16.getBuffer();
3791    int32_t length16=string16.length();
3792    if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3793        set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3794        set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3795        set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3796        set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3797        set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3798    ) {
3799        errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3800    }
3801
3802    pattern="[a{ab}{abc}{cd}]";
3803    pattern16=UnicodeString(pattern, -1, US_INV);
3804    set.applyPattern(pattern16, errorCode);
3805    if(U_FAILURE(errorCode)) {
3806        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3807        return;
3808    }
3809    string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3810    s16=string16.getBuffer();
3811    length16=string16.length();
3812    if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3813        set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3814        set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3815    ) {
3816        errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3817    }
3818
3819    pattern="[d{cd}{bcd}{ab}]";
3820    pattern16=UnicodeString(pattern, -1, US_INV);
3821    set.applyPattern(pattern16, errorCode).freeze();
3822    if(U_FAILURE(errorCode)) {
3823        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3824        return;
3825    }
3826    string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3827    s16=string16.getBuffer();
3828    length16=string16.length();
3829    if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3830        set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3831        set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3832    ) {
3833        errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3834    }
3835}
3836