1/*
2********************************************************************************
3*   Copyright (C) 1999-2013 International Business Machines Corporation and
4*   others. All Rights Reserved.
5********************************************************************************
6*   Date        Name        Description
7*   10/20/99    alan        Creation.
8*   03/22/2000  Madhu       Added additional tests
9********************************************************************************
10*/
11
12#include <stdio.h>
13
14#include <string.h>
15#include "unicode/utypes.h"
16#include "usettest.h"
17#include "unicode/ucnv.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/usetiter.h"
21#include "unicode/ustring.h"
22#include "unicode/parsepos.h"
23#include "unicode/symtable.h"
24#include "unicode/uversion.h"
25#include "hash.h"
26
// Number of elements in a statically sized array, as an int32_t.
#define LENGTHOF(array) \
    (int32_t)(sizeof(array)/sizeof((array)[0]))

// Reports a data error (file, line and ICU error name) through dataerrln()
// when `status` holds a failure code. Does not return from the caller.
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
}

// Reports a data error (file and line) through dataerrln() when `expr`
// evaluates to false. Does not return from the caller.
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
35
36UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
37    UnicodeString pat;
38    set.toPattern(pat);
39    return left + UnicodeSetTest::escape(pat);
40}
41
// Expands to one `case` label of runIndexedTest(): publishes the test's name
// and, when `exec` is set, logs a header and runs the test method.
#define CASE(id,test)               \
    case id:                        \
        name = #test;               \
        if (exec) {                 \
            logln(#test "---");     \
            logln();                \
            test();                 \
        }                           \
        break
50
51UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52}
53
54UConverter *UnicodeSetTest::openUTF8Converter() {
55    if(utf8Cnv==NULL) {
56        UErrorCode errorCode=U_ZERO_ERROR;
57        utf8Cnv=ucnv_open("UTF-8", &errorCode);
58    }
59    return utf8Cnv;
60}
61
62UnicodeSetTest::~UnicodeSetTest() {
63    ucnv_close(utf8Cnv);
64}
65
66void
67UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68                               const char* &name, char* /*par*/) {
69    // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
70    switch (index) {
71        CASE(0,TestPatterns);
72        CASE(1,TestAddRemove);
73        CASE(2,TestCategories);
74        CASE(3,TestCloneEqualHash);
75        CASE(4,TestMinimalRep);
76        CASE(5,TestAPI);
77        CASE(6,TestScriptSet);
78        CASE(7,TestPropertySet);
79        CASE(8,TestClone);
80        CASE(9,TestExhaustive);
81        CASE(10,TestToPattern);
82        CASE(11,TestIndexOf);
83        CASE(12,TestStrings);
84        CASE(13,Testj2268);
85        CASE(14,TestCloseOver);
86        CASE(15,TestEscapePattern);
87        CASE(16,TestInvalidCodePoint);
88        CASE(17,TestSymbolTable);
89        CASE(18,TestSurrogate);
90        CASE(19,TestPosixClasses);
91        CASE(20,TestIteration);
92        CASE(21,TestFreezable);
93        CASE(22,TestSpan);
94        CASE(23,TestStringSpan);
95        default: name = ""; break;
96    }
97}
98
// Marker string placed inside the expected-string lists handed to
// expectToPattern(). NOTE(review): judging from the call sites, entries
// before the marker are strings the set should contain and entries after it
// are strings it should not contain -- confirm against expectToPattern().
static const char NOT[] = "%%%%";
100
101/**
102 * UVector was improperly copying contents
103 * This code will crash this is still true
104 */
105void UnicodeSetTest::Testj2268() {
106  UnicodeSet t;
107  t.add(UnicodeString("abc"));
108  UnicodeSet test(t);
109  UnicodeString ustrPat;
110  test.toPattern(ustrPat, TRUE);
111}
112
113/**
114 * Test toPattern().
115 */
116void UnicodeSetTest::TestToPattern() {
117    UErrorCode ec = U_ZERO_ERROR;
118
119    // Test that toPattern() round trips with syntax characters and
120    // whitespace.
121    {
122        static const char* OTHER_TOPATTERN_TESTS[] = {
123            "[[:latin:]&[:greek:]]",
124            "[[:latin:]-[:greek:]]",
125            "[:nonspacing mark:]",
126            NULL
127        };
128
129        for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
130            ec = U_ZERO_ERROR;
131            UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
132            if (U_FAILURE(ec)) {
133                dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
134                continue;
135            }
136            checkPat(OTHER_TOPATTERN_TESTS[j], s);
137        }
138
139        for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
140            if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
141
142                // check various combinations to make sure they all work.
143                if (i != 0 && !toPatternAux(i, i)){
144                    continue;
145                }
146                if (!toPatternAux(0, i)){
147                    continue;
148                }
149                if (!toPatternAux(i, 0xFFFF)){
150                    continue;
151                }
152            }
153        }
154    }
155
156    // Test pattern behavior of multicharacter strings.
157    {
158        ec = U_ZERO_ERROR;
159        UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
160
161        // This loop isn't a loop.  It's here to make the compiler happy.
162        // If you're curious, try removing it and changing the 'break'
163        // statements (except for the last) to goto's.
164        for (;;) {
165            if (U_FAILURE(ec)) break;
166            const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
167            expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
168
169            s->add("ac");
170            const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
171            expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
172
173            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
174            if (U_FAILURE(ec)) break;
175            const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
176            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
177
178            s->add("[]");
179            const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
180            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
181
182            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
183            if (U_FAILURE(ec)) break;
184            const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
185            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
186
187            // j2189
188            s->clear();
189            s->add(UnicodeString("abc", ""));
190            s->add(UnicodeString("abc", ""));
191            const char* exp6[] = {"abc", NOT, "ab", NULL};
192            expectToPattern(*s, "[{abc}]", exp6);
193
194            break;
195        }
196
197        if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
198        delete s;
199    }
200
201    // JB#3400: For 2 character ranges prefer [ab] to [a-b]
202    UnicodeSet s;
203    s.add((UChar)97, (UChar)98); // 'a', 'b'
204    expectToPattern(s, "[ab]", NULL);
205}
206
207UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
208
209    // use Integer.toString because Utility.hex doesn't handle ints
210    UnicodeString pat = "";
211    // TODO do these in hex
212    //String source = "0x" + Integer.toString(start,16).toUpperCase();
213    //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
214    UnicodeString source;
215    source = source + (uint32_t)start;
216    if (start != end)
217        source = source + ".." + (uint32_t)end;
218    UnicodeSet testSet;
219    testSet.add(start, end);
220    return checkPat(source, testSet);
221}
222
223UBool UnicodeSetTest::checkPat(const UnicodeString& source,
224                               const UnicodeSet& testSet) {
225    // What we want to make sure of is that a pattern generated
226    // by toPattern(), with or without escaped unprintables, can
227    // be passed back into the UnicodeSet constructor.
228    UnicodeString pat0;
229
230    testSet.toPattern(pat0, TRUE);
231
232    if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
233
234    //String pat1 = unescapeLeniently(pat0);
235    //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
236
237    UnicodeString pat2;
238    testSet.toPattern(pat2, FALSE);
239    if (!checkPat(source, testSet, pat2)) return FALSE;
240
241    //String pat3 = unescapeLeniently(pat2);
242    // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
243
244    //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
245    logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
246    return TRUE;
247}
248
249UBool UnicodeSetTest::checkPat(const UnicodeString& source,
250                               const UnicodeSet& testSet,
251                               const UnicodeString& pat) {
252    UErrorCode ec = U_ZERO_ERROR;
253    UnicodeSet testSet2(pat, ec);
254    if (testSet2 != testSet) {
255        errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
256        return FALSE;
257    }
258    return TRUE;
259}
260
261void
262UnicodeSetTest::TestPatterns(void) {
263    UnicodeSet set;
264    expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
265    expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
266    expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
267    expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
268    expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
269    expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
270
271    // Throw in a test of complement
272    set.complement();
273    UnicodeString exp;
274    exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
275    expectPairs(set, exp);
276}
277
278void
279UnicodeSetTest::TestCategories(void) {
280    UErrorCode status = U_ZERO_ERROR;
281    const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
282    UnicodeSet set(pat, status);
283    if (U_FAILURE(status)) {
284        dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
285        return;
286    } else {
287        expectContainment(set, pat, "ABC", "abc");
288    }
289
290    UChar32 i;
291    int32_t failures = 0;
292    // Make sure generation of L doesn't pollute cached Lu set
293    // First generate L, then Lu
294    set.applyPattern("[:L:]", status);
295    if (U_FAILURE(status)) { errln("FAIL"); return; }
296    for (i=0; i<0x200; ++i) {
297        UBool l = u_isalpha((UChar)i);
298        if (l != set.contains(i)) {
299            errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
300                  set.contains(i));
301            if (++failures == 10) break;
302        }
303    }
304
305    set.applyPattern("[:Lu:]", status);
306    if (U_FAILURE(status)) { errln("FAIL"); return; }
307    for (i=0; i<0x200; ++i) {
308        UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
309        if (lu != set.contains(i)) {
310            errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
311                  set.contains(i));
312            if (++failures == 20) break;
313        }
314    }
315}
316void
317UnicodeSetTest::TestCloneEqualHash(void) {
318    UErrorCode status = U_ZERO_ERROR;
319    // set1 and set2 used to be built with the obsolete constructor taking
320    // UCharCategory values; replaced with pattern constructors
321    // markus 20030502
322    UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
323    UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
324    if (U_FAILURE(status)){
325        dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
326        return;
327    }
328    UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
329    UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
330    if (U_FAILURE(status)){
331        errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
332        return;
333    }
334
335    if (*set1 != *set1a) {
336        errln("FAIL: category constructor for Ll broken");
337    }
338    if (*set2 != *set2a) {
339        errln("FAIL: category constructor for Nd broken");
340    }
341    delete set1a;
342    delete set2a;
343
344    logln("Testing copy construction");
345    UnicodeSet *set1copy=new UnicodeSet(*set1);
346    if(*set1 != *set1copy || *set1 == *set2 ||
347        getPairs(*set1) != getPairs(*set1copy) ||
348        set1->hashCode() != set1copy->hashCode()){
349        errln("FAIL : Error in copy construction");
350        return;
351    }
352
353    logln("Testing =operator");
354    UnicodeSet set1equal=*set1;
355    UnicodeSet set2equal=*set2;
356    if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
357        set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
358        errln("FAIL: Error in =operator");
359    }
360
361    logln("Testing clone()");
362    UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
363    UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
364    if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
365        *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
366        *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
367        errln("FAIL: Error in clone");
368    }
369
370    logln("Testing hashcode");
371    if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
372        set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
373        set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
374        set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
375        set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
376        errln("FAIL: Error in hashCode()");
377    }
378
379    delete set1;
380    delete set1copy;
381    delete set2;
382    delete set1clone;
383    delete set2clone;
384
385
386}
387void
388UnicodeSetTest::TestAddRemove(void) {
389    UnicodeSet set; // Construct empty set
390    doAssert(set.isEmpty() == TRUE, "set should be empty");
391    doAssert(set.size() == 0, "size should be 0");
392    set.complement();
393    doAssert(set.size() == 0x110000, "size should be 0x110000");
394    set.clear();
395    set.add(0x0061, 0x007a);
396    expectPairs(set, "az");
397    doAssert(set.isEmpty() == FALSE, "set should not be empty");
398    doAssert(set.size() != 0, "size should not be equal to 0");
399    doAssert(set.size() == 26, "size should be equal to 26");
400    set.remove(0x006d, 0x0070);
401    expectPairs(set, "alqz");
402    doAssert(set.size() == 22, "size should be equal to 22");
403    set.remove(0x0065, 0x0067);
404    expectPairs(set, "adhlqz");
405    doAssert(set.size() == 19, "size should be equal to 19");
406    set.remove(0x0064, 0x0069);
407    expectPairs(set, "acjlqz");
408    doAssert(set.size() == 16, "size should be equal to 16");
409    set.remove(0x0063, 0x0072);
410    expectPairs(set, "absz");
411    doAssert(set.size() == 10, "size should be equal to 10");
412    set.add(0x0066, 0x0071);
413    expectPairs(set, "abfqsz");
414    doAssert(set.size() == 22, "size should be equal to 22");
415    set.remove(0x0061, 0x0067);
416    expectPairs(set, "hqsz");
417    set.remove(0x0061, 0x007a);
418    expectPairs(set, "");
419    doAssert(set.isEmpty() == TRUE, "set should be empty");
420    doAssert(set.size() == 0, "size should be 0");
421    set.add(0x0061);
422    doAssert(set.isEmpty() == FALSE, "set should not be empty");
423    doAssert(set.size() == 1, "size should not be equal to 1");
424    set.add(0x0062);
425    set.add(0x0063);
426    expectPairs(set, "ac");
427    doAssert(set.size() == 3, "size should not be equal to 3");
428    set.add(0x0070);
429    set.add(0x0071);
430    expectPairs(set, "acpq");
431    doAssert(set.size() == 5, "size should not be equal to 5");
432    set.clear();
433    expectPairs(set, "");
434    doAssert(set.isEmpty() == TRUE, "set should be empty");
435    doAssert(set.size() == 0, "size should be 0");
436
437    // Try removing an entire set from another set
438    expectPattern(set, "[c-x]", "cx");
439    UnicodeSet set2;
440    expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
441    set.removeAll(set2);
442    expectPairs(set, "deluxx");
443
444    // Try adding an entire set to another set
445    expectPattern(set, "[jackiemclean]", "aacceein");
446    expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
447    set.addAll(set2);
448    expectPairs(set, "aacehort");
449    doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
450
451    // Try retaining an set of elements contained in another set (intersection)
452    UnicodeSet set3;
453    expectPattern(set3, "[a-c]", "ac");
454    doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
455    set3.remove(0x0062);
456    expectPairs(set3, "aacc");
457    doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
458    set.retainAll(set3);
459    expectPairs(set, "aacc");
460    doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
461    doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
462    set.clear();
463    doAssert(set.size() != set3.size(), "set.size() != set3.size()");
464
465    // Test commutativity
466    expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
467    expectPattern(set2, "[jackiemclean]", "aacceein");
468    set.addAll(set2);
469    expectPairs(set, "aacehort");
470    doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
471
472
473
474
475}
476
477/**
478 * Make sure minimal representation is maintained.
479 */
480void UnicodeSetTest::TestMinimalRep() {
481    UErrorCode status = U_ZERO_ERROR;
482    // This is pretty thoroughly tested by checkCanonicalRep()
483    // run against the exhaustive operation results.  Use the code
484    // here for debugging specific spot problems.
485
486    // 1 overlap against 2
487    UnicodeSet set("[h-km-q]", status);
488    if (U_FAILURE(status)) { errln("FAIL"); return; }
489    UnicodeSet set2("[i-o]", status);
490    if (U_FAILURE(status)) { errln("FAIL"); return; }
491    set.addAll(set2);
492    expectPairs(set, "hq");
493    // right
494    set.applyPattern("[a-m]", status);
495    if (U_FAILURE(status)) { errln("FAIL"); return; }
496    set2.applyPattern("[e-o]", status);
497    if (U_FAILURE(status)) { errln("FAIL"); return; }
498    set.addAll(set2);
499    expectPairs(set, "ao");
500    // left
501    set.applyPattern("[e-o]", status);
502    if (U_FAILURE(status)) { errln("FAIL"); return; }
503    set2.applyPattern("[a-m]", status);
504    if (U_FAILURE(status)) { errln("FAIL"); return; }
505    set.addAll(set2);
506    expectPairs(set, "ao");
507    // 1 overlap against 3
508    set.applyPattern("[a-eg-mo-w]", status);
509    if (U_FAILURE(status)) { errln("FAIL"); return; }
510    set2.applyPattern("[d-q]", status);
511    if (U_FAILURE(status)) { errln("FAIL"); return; }
512    set.addAll(set2);
513    expectPairs(set, "aw");
514}
515
516void UnicodeSetTest::TestAPI() {
517    UErrorCode status = U_ZERO_ERROR;
518    // default ct
519    UnicodeSet set;
520    if (!set.isEmpty() || set.getRangeCount() != 0) {
521        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
522              set);
523    }
524
525    // clear(), isEmpty()
526    set.add(0x0061);
527    if (set.isEmpty()) {
528        errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
529              set);
530    }
531    set.clear();
532    if (!set.isEmpty()) {
533        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
534              set);
535    }
536
537    // size()
538    set.clear();
539    if (set.size() != 0) {
540        errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
541              ": " + set);
542    }
543    set.add(0x0061);
544    if (set.size() != 1) {
545        errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
546              ": " + set);
547    }
548    set.add(0x0031, 0x0039);
549    if (set.size() != 10) {
550        errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
551              ": " + set);
552    }
553
554    // contains(first, last)
555    set.clear();
556    set.applyPattern("[A-Y 1-8 b-d l-y]", status);
557    if (U_FAILURE(status)) { errln("FAIL"); return; }
558    for (int32_t i = 0; i<set.getRangeCount(); ++i) {
559        UChar32 a = set.getRangeStart(i);
560        UChar32 b = set.getRangeEnd(i);
561        if (!set.contains(a, b)) {
562            errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
563                  " but doesn't: " + set);
564        }
565        if (set.contains((UChar32)(a-1), b)) {
566            errln((UnicodeString)"FAIL, shouldn't contain " +
567                  (unsigned short)(a-1) + '-' + (unsigned short)b +
568                  " but does: " + set);
569        }
570        if (set.contains(a, (UChar32)(b+1))) {
571            errln((UnicodeString)"FAIL, shouldn't contain " +
572                  (unsigned short)a + '-' + (unsigned short)(b+1) +
573                  " but does: " + set);
574        }
575    }
576
577    // Ported InversionList test.
578    UnicodeSet a((UChar32)3,(UChar32)10);
579    UnicodeSet b((UChar32)7,(UChar32)15);
580    UnicodeSet c;
581
582    logln((UnicodeString)"a [3-10]: " + a);
583    logln((UnicodeString)"b [7-15]: " + b);
584    c = a;
585    c.addAll(b);
586    UnicodeSet exp((UChar32)3,(UChar32)15);
587    if (c == exp) {
588        logln((UnicodeString)"c.set(a).add(b): " + c);
589    } else {
590        errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
591    }
592    c.complement();
593    exp.set((UChar32)0, (UChar32)2);
594    exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
595    if (c == exp) {
596        logln((UnicodeString)"c.complement(): " + c);
597    } else {
598        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
599    }
600    c.complement();
601    exp.set((UChar32)3, (UChar32)15);
602    if (c == exp) {
603        logln((UnicodeString)"c.complement(): " + c);
604    } else {
605        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
606    }
607    c = a;
608    c.complementAll(b);
609    exp.set((UChar32)3,(UChar32)6);
610    exp.add((UChar32)11,(UChar32) 15);
611    if (c == exp) {
612        logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
613    } else {
614        errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
615    }
616
617    exp = c;
618    bitsToSet(setToBits(c), c);
619    if (c == exp) {
620        logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
621    } else {
622        errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
623    }
624
625    // Additional tests for coverage JB#2118
626    //UnicodeSet::complement(class UnicodeString const &)
627    //UnicodeSet::complementAll(class UnicodeString const &)
628    //UnicodeSet::containsNone(class UnicodeSet const &)
629    //UnicodeSet::containsNone(long,long)
630    //UnicodeSet::containsSome(class UnicodeSet const &)
631    //UnicodeSet::containsSome(long,long)
632    //UnicodeSet::removeAll(class UnicodeString const &)
633    //UnicodeSet::retain(long)
634    //UnicodeSet::retainAll(class UnicodeString const &)
635    //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
636    //UnicodeSetIterator::getString(void)
637    set.clear();
638    set.complement("ab");
639    exp.applyPattern("[{ab}]", status);
640    if (U_FAILURE(status)) { errln("FAIL"); return; }
641    if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
642
643    UnicodeSetIterator iset(set);
644    if (!iset.next() || !iset.isString()) {
645        errln("FAIL: UnicodeSetIterator::next/isString");
646    } else if (iset.getString() != "ab") {
647        errln("FAIL: UnicodeSetIterator::getString");
648    }
649
650    set.add((UChar32)0x61, (UChar32)0x7A);
651    set.complementAll("alan");
652    exp.applyPattern("[{ab}b-kmo-z]", status);
653    if (U_FAILURE(status)) { errln("FAIL"); return; }
654    if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
655
656    exp.applyPattern("[a-z]", status);
657    if (U_FAILURE(status)) { errln("FAIL"); return; }
658    if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
659    if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
660    exp.applyPattern("[aln]", status);
661    if (U_FAILURE(status)) { errln("FAIL"); return; }
662    if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
663    if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
664
665    if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
666        errln("FAIL: containsNone(UChar32, UChar32)");
667    }
668    if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
669        errln("FAIL: containsSome(UChar32, UChar32)");
670    }
671    if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
672        errln("FAIL: containsNone(UChar32, UChar32)");
673    }
674    if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
675        errln("FAIL: containsSome(UChar32, UChar32)");
676    }
677
678    set.removeAll("liu");
679    exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
680    if (U_FAILURE(status)) { errln("FAIL"); return; }
681    if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
682
683    set.retainAll("star");
684    exp.applyPattern("[rst]", status);
685    if (U_FAILURE(status)) { errln("FAIL"); return; }
686    if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
687
688    set.retain((UChar32)0x73);
689    exp.applyPattern("[s]", status);
690    if (U_FAILURE(status)) { errln("FAIL"); return; }
691    if (set != exp) { errln("FAIL: retain('s')"); return; }
692
693    uint16_t buf[32];
694    int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
695    if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
696    if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
697        errln("FAIL: serialize");
698        return;
699    }
700
701    // Conversions to and from USet
702    UnicodeSet *uniset = &set;
703    USet *uset = uniset->toUSet();
704    TEST_ASSERT((void *)uset == (void *)uniset);
705    UnicodeSet *setx = UnicodeSet::fromUSet(uset);
706    TEST_ASSERT((void *)setx == (void *)uset);
707    const UnicodeSet *constSet = uniset;
708    const USet *constUSet = constSet->toUSet();
709    TEST_ASSERT((void *)constUSet == (void *)constSet);
710    const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
711    TEST_ASSERT((void *)constSetx == (void *)constUSet);
712
713    // span(UnicodeString) and spanBack(UnicodeString) convenience methods
714    UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
715    UnicodeSet ac(0x61, 0x63);
716    ac.remove(0x62).freeze();
717    if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
718        ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
719        ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
720        ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
721        ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
722        ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
723        ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
724        ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
725        ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
726        ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
727    ) {
728        errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
729    }
730    if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
731        ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
732        ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
733        ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
734        ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
735        ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
736        ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
737        ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
738        ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
739        ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
740    ) {
741        errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
742    }
743}
744
745void UnicodeSetTest::TestIteration() {
746    UErrorCode ec = U_ZERO_ERROR;
747    int i = 0;
748    int outerLoop;
749
750    // 6 code points, 3 ranges, 2 strings, 8 total elements
751    //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
752    UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
753    TEST_ASSERT_SUCCESS(ec);
754    UnicodeSetIterator it(set);
755
756    for (outerLoop=0; outerLoop<3; outerLoop++) {
757        // Run the test multiple times, to check that iterator.reset() is working.
758        for (i=0; i<10; i++) {
759            UBool         nextv        = it.next();
760            UBool         isString     = it.isString();
761            int32_t       codePoint    = it.getCodepoint();
762            //int32_t       codePointEnd = it.getCodepointEnd();
763            UnicodeString s   = it.getString();
764            switch (i) {
765            case 0:
766                TEST_ASSERT(nextv == TRUE);
767                TEST_ASSERT(isString == FALSE);
768                TEST_ASSERT(codePoint==0x61);
769                TEST_ASSERT(s == "a");
770                break;
771            case 1:
772                TEST_ASSERT(nextv == TRUE);
773                TEST_ASSERT(isString == FALSE);
774                TEST_ASSERT(codePoint==0x62);
775                TEST_ASSERT(s == "b");
776                break;
777            case 2:
778                TEST_ASSERT(nextv == TRUE);
779                TEST_ASSERT(isString == FALSE);
780                TEST_ASSERT(codePoint==0x63);
781                TEST_ASSERT(s == "c");
782                break;
783            case 3:
784                TEST_ASSERT(nextv == TRUE);
785                TEST_ASSERT(isString == FALSE);
786                TEST_ASSERT(codePoint==0x79);
787                TEST_ASSERT(s == "y");
788                break;
789            case 4:
790                TEST_ASSERT(nextv == TRUE);
791                TEST_ASSERT(isString == FALSE);
792                TEST_ASSERT(codePoint==0x7a);
793                TEST_ASSERT(s == "z");
794                break;
795            case 5:
796                TEST_ASSERT(nextv == TRUE);
797                TEST_ASSERT(isString == FALSE);
798                TEST_ASSERT(codePoint==0x1abcd);
799                TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
800                break;
801            case 6:
802                TEST_ASSERT(nextv == TRUE);
803                TEST_ASSERT(isString == TRUE);
804                TEST_ASSERT(s == "str1");
805                break;
806            case 7:
807                TEST_ASSERT(nextv == TRUE);
808                TEST_ASSERT(isString == TRUE);
809                TEST_ASSERT(s == "str2");
810                break;
811            case 8:
812                TEST_ASSERT(nextv == FALSE);
813                break;
814            case 9:
815                TEST_ASSERT(nextv == FALSE);
816                break;
817            }
818        }
819        it.reset();  // prepare to run the iteration again.
820    }
821}
822
823
824
825
826void UnicodeSetTest::TestStrings() {
827    UErrorCode ec = U_ZERO_ERROR;
828
829    UnicodeSet* testList[] = {
830        UnicodeSet::createFromAll("abc"),
831        new UnicodeSet("[a-c]", ec),
832
833        &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
834        new UnicodeSet("[{ll}{ch}a-z]", ec),
835
836        UnicodeSet::createFrom("ab}c"),
837        new UnicodeSet("[{ab\\}c}]", ec),
838
839        &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
840        new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
841
842        NULL
843    };
844
845    if (U_FAILURE(ec)) {
846        errln("FAIL: couldn't construct test sets");
847    }
848
849    for (int32_t i = 0; testList[i] != NULL; i+=2) {
850        if (U_SUCCESS(ec)) {
851            UnicodeString pat0, pat1;
852            testList[i]->toPattern(pat0, TRUE);
853            testList[i+1]->toPattern(pat1, TRUE);
854            if (*testList[i] == *testList[i+1]) {
855                logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
856            } else {
857                logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
858            }
859        }
860        delete testList[i];
861        delete testList[i+1];
862    }
863}
864
865/**
866 * Test the [:Latin:] syntax.
867 */
868void UnicodeSetTest::TestScriptSet() {
869    expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
870
871    expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
872
873    /* Jitterbug 1423 */
874    expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
875
876}
877
878/**
879 * Test the [:Latin:] syntax.
880 */
881void UnicodeSetTest::TestPropertySet() {
882    static const char* const DATA[] = {
883        // Pattern, Chars IN, Chars NOT in
884
885        "[:Latin:]",
886        "aA",
887        "\\u0391\\u03B1",
888
889        "[\\p{Greek}]",
890        "\\u0391\\u03B1",
891        "aA",
892
893        "\\P{ GENERAL Category = upper case letter }",
894        "abc",
895        "ABC",
896
897#if !UCONFIG_NO_NORMALIZATION
898        // Combining class: @since ICU 2.2
899        // Check both symbolic and numeric
900        "\\p{ccc=Nukta}",
901        "\\u0ABC",
902        "abc",
903
904        "\\p{Canonical Combining Class = 11}",
905        "\\u05B1",
906        "\\u05B2",
907
908        "[:c c c = iota subscript :]",
909        "\\u0345",
910        "xyz",
911#endif
912
913        // Bidi class: @since ICU 2.2
914        "\\p{bidiclass=lefttoright}",
915        "abc",
916        "\\u0671\\u0672",
917
918        // Binary properties: @since ICU 2.2
919        "\\p{ideographic}",
920        "\\u4E0A",
921        "x",
922
923        "[:math=false:]",
924        "q)*(",
925        // weiv: )(and * were removed from math in Unicode 4.0.1
926        //"(*+)",
927        "+<>^",
928
929        // JB#1767 \N{}, \p{ASCII}
930        "[:Ascii:]",
931        "abc\\u0000\\u007F",
932        "\\u0080\\u4E00",
933
934        "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
935        "az",
936        "qrs",
937
938        // JB#2015
939        "[:any:]",
940        "a\\U0010FFFF",
941        "",
942
943        "[:nv=0.5:]",
944        "\\u00BD\\u0F2A",
945        "\\u00BC",
946
947        // JB#2653: Age
948        "[:Age=1.1:]",
949        "\\u03D6", // 1.1
950        "\\u03D8\\u03D9", // 3.2
951
952        "[:Age=3.1:]",
953        "\\u1800\\u3400\\U0002f800",
954        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
955
956        // JB#2350: Case_Sensitive
957        "[:Case Sensitive:]",
958        "A\\u1FFC\\U00010410",
959        ";\\u00B4\\U00010500",
960
961        // JB#2832: C99-compatibility props
962        "[:blank:]",
963        " \\u0009",
964        "1-9A-Z",
965
966        "[:graph:]",
967        "19AZ",
968        " \\u0003\\u0007\\u0009\\u000A\\u000D",
969
970        "[:punct:]",
971        "!@#%&*()[]{}-_\\/;:,.?'\"",
972        "09azAZ",
973
974        "[:xdigit:]",
975        "09afAF",
976        "gG!",
977
978        // Regex compatibility test
979        "[-b]", // leading '-' is literal
980        "-b",
981        "ac",
982
983        "[^-b]", // leading '-' is literal
984        "ac",
985        "-b",
986
987        "[b-]", // trailing '-' is literal
988        "-b",
989        "ac",
990
991        "[^b-]", // trailing '-' is literal
992        "ac",
993        "-b",
994
995        "[a-b-]", // trailing '-' is literal
996        "ab-",
997        "c=",
998
999        "[[a-q]&[p-z]-]", // trailing '-' is literal
1000        "pq-",
1001        "or=",
1002
1003        "[\\s|\\)|:|$|\\>]", // from regex tests
1004        "s|):$>",
1005        "abc",
1006
1007        "[\\uDC00cd]", // JB#2906: isolated trail at start
1008        "cd\\uDC00",
1009        "ab\\uD800\\U00010000",
1010
1011        "[ab\\uD800]", // JB#2906: isolated trail at start
1012        "ab\\uD800",
1013        "cd\\uDC00\\U00010000",
1014
1015        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016        "abcd\\uD800",
1017        "ef\\uDC00\\U00010000",
1018
1019        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020        "abcd\\uDC00",
1021        "ef\\uD800\\U00010000",
1022
1023#if !UCONFIG_NO_NORMALIZATION
1024        "[:^lccc=0:]", // Lead canonical class
1025        "\\u0300\\u0301",
1026        "abcd\\u00c0\\u00c5",
1027
1028        "[:^tccc=0:]", // Trail canonical class
1029        "\\u0300\\u0301\\u00c0\\u00c5",
1030        "abcd",
1031
1032        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033        "\\u0300\\u0301\\u00c0\\u00c5",
1034        "abcd",
1035
1036        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037        "",
1038        "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039
1040        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041        "\\u0F73\\u0F75\\u0F81",
1042        "abcd\\u0300\\u0301\\u00c0\\u00c5",
1043#endif /* !UCONFIG_NO_NORMALIZATION */
1044
1045        "[:Assigned:]",
1046        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1047        "\\u0888\\uFDD3\\uFFFE\\U00050005",
1048
1049        // Script_Extensions, new in Unicode 6.0
1050        "[:scx=Arab:]",
1051        "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
1052        "\\u061D\\uFDEF\\uFDFE",
1053
1054        // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
1055        // so scx-sc is missing U+FDF2.
1056        "[[:Script_Extensions=Arabic:]-[:Arab:]]",
1057        "\\u0640\\u064B\\u0650\\u0655\\uFDFD",
1058        "\\uFDF2"
1059    };
1060
1061    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1062
1063    for (int32_t i=0; i<DATA_LEN; i+=3) {
1064        expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1065                          CharsToUnicodeString(DATA[i+2]));
1066    }
1067}
1068
1069/**
1070  * Test that Posix style character classes [:digit:], etc.
1071  *   have the Unicode definitions from TR 18.
1072  */
1073void UnicodeSetTest::TestPosixClasses() {
1074    {
1075        UErrorCode status = U_ZERO_ERROR;
1076        UnicodeSet s1("[:alpha:]", status);
1077        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1078        TEST_ASSERT_SUCCESS(status);
1079        TEST_ASSERT(s1==s2);
1080    }
1081    {
1082        UErrorCode status = U_ZERO_ERROR;
1083        UnicodeSet s1("[:lower:]", status);
1084        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1085        TEST_ASSERT_SUCCESS(status);
1086        TEST_ASSERT(s1==s2);
1087    }
1088    {
1089        UErrorCode status = U_ZERO_ERROR;
1090        UnicodeSet s1("[:upper:]", status);
1091        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1092        TEST_ASSERT_SUCCESS(status);
1093        TEST_ASSERT(s1==s2);
1094    }
1095    {
1096        UErrorCode status = U_ZERO_ERROR;
1097        UnicodeSet s1("[:punct:]", status);
1098        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1099        TEST_ASSERT_SUCCESS(status);
1100        TEST_ASSERT(s1==s2);
1101    }
1102    {
1103        UErrorCode status = U_ZERO_ERROR;
1104        UnicodeSet s1("[:digit:]", status);
1105        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1106        TEST_ASSERT_SUCCESS(status);
1107        TEST_ASSERT(s1==s2);
1108    }
1109    {
1110        UErrorCode status = U_ZERO_ERROR;
1111        UnicodeSet s1("[:xdigit:]", status);
1112        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1113        TEST_ASSERT_SUCCESS(status);
1114        TEST_ASSERT(s1==s2);
1115    }
1116    {
1117        UErrorCode status = U_ZERO_ERROR;
1118        UnicodeSet s1("[:alnum:]", status);
1119        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1120        TEST_ASSERT_SUCCESS(status);
1121        TEST_ASSERT(s1==s2);
1122    }
1123    {
1124        UErrorCode status = U_ZERO_ERROR;
1125        UnicodeSet s1("[:space:]", status);
1126        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1127        TEST_ASSERT_SUCCESS(status);
1128        TEST_ASSERT(s1==s2);
1129    }
1130    {
1131        UErrorCode status = U_ZERO_ERROR;
1132        UnicodeSet s1("[:blank:]", status);
1133        TEST_ASSERT_SUCCESS(status);
1134        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1135            status);
1136        TEST_ASSERT_SUCCESS(status);
1137        TEST_ASSERT(s1==s2);
1138    }
1139    {
1140        UErrorCode status = U_ZERO_ERROR;
1141        UnicodeSet s1("[:cntrl:]", status);
1142        TEST_ASSERT_SUCCESS(status);
1143        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1144        TEST_ASSERT_SUCCESS(status);
1145        TEST_ASSERT(s1==s2);
1146    }
1147    {
1148        UErrorCode status = U_ZERO_ERROR;
1149        UnicodeSet s1("[:graph:]", status);
1150        TEST_ASSERT_SUCCESS(status);
1151        UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1152        TEST_ASSERT_SUCCESS(status);
1153        TEST_ASSERT(s1==s2);
1154    }
1155    {
1156        UErrorCode status = U_ZERO_ERROR;
1157        UnicodeSet s1("[:print:]", status);
1158        TEST_ASSERT_SUCCESS(status);
1159        UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1160        TEST_ASSERT_SUCCESS(status);
1161        TEST_ASSERT(s1==s2);
1162    }
1163}
1164/**
1165 * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1166 */
1167void UnicodeSetTest::TestClone() {
1168    UErrorCode ec = U_ZERO_ERROR;
1169    UnicodeSet s("[abcxyz]", ec);
1170    UnicodeSet t(s);
1171    expectContainment(t, "abc", "def");
1172}
1173
1174/**
1175 * Test the indexOf() and charAt() methods.
1176 */
1177void UnicodeSetTest::TestIndexOf() {
1178    UErrorCode ec = U_ZERO_ERROR;
1179    UnicodeSet set("[a-cx-y3578]", ec);
1180    if (U_FAILURE(ec)) {
1181        errln("FAIL: UnicodeSet constructor");
1182        return;
1183    }
1184    for (int32_t i=0; i<set.size(); ++i) {
1185        UChar32 c = set.charAt(i);
1186        if (set.indexOf(c) != i) {
1187            errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1188                i, c, set.indexOf(c));
1189        }
1190    }
1191    UChar32 c = set.charAt(set.size());
1192    if (c != -1) {
1193        errln("FAIL: charAt(<out of range>) = %X", c);
1194    }
1195    int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1196    if (j != -1) {
1197        errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1198    }
1199}
1200
1201/**
1202 * Test closure API.
1203 */
1204void UnicodeSetTest::TestCloseOver() {
1205    UErrorCode ec = U_ZERO_ERROR;
1206
1207    char CASE[] = {(char)USET_CASE_INSENSITIVE};
1208    char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1209    const char* DATA[] = {
1210        // selector, input, output
1211        CASE,
1212        "[aq\\u00DF{Bc}{bC}{Fi}]",
1213        "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1214
1215        CASE,
1216        "[\\u01F1]", // 'DZ'
1217        "[\\u01F1\\u01F2\\u01F3]",
1218
1219        CASE,
1220        "[\\u1FB4]",
1221        "[\\u1FB4{\\u03AC\\u03B9}]",
1222
1223        CASE,
1224        "[{F\\uFB01}]",
1225        "[\\uFB03{ffi}]",
1226
1227        CASE, // make sure binary search finds limits
1228        "[a\\uFF3A]",
1229        "[aA\\uFF3A\\uFF5A]",
1230
1231        CASE,
1232        "[a-z]","[A-Za-z\\u017F\\u212A]",
1233        CASE,
1234        "[abc]","[A-Ca-c]",
1235        CASE,
1236        "[ABC]","[A-Ca-c]",
1237
1238        CASE, "[i]", "[iI]",
1239
1240        CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1241        CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1242
1243        CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1244
1245        CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1246
1247        CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1248
1249        CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1250
1251        CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1252
1253        CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1254
1255        CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1256        CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1257
1258        CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1259
1260        CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1261
1262        CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1263
1264#if !UCONFIG_NO_FILE_IO
1265        CASE_MAPPINGS,
1266        "[aq\\u00DF{Bc}{bC}{Fi}]",
1267        "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1268#endif
1269
1270        CASE_MAPPINGS,
1271        "[\\u01F1]", // 'DZ'
1272        "[\\u01F1\\u01F2\\u01F3]",
1273
1274        CASE_MAPPINGS,
1275        "[a-z]",
1276        "[A-Za-z]",
1277
1278        NULL
1279    };
1280
1281    UnicodeSet s;
1282    UnicodeSet t;
1283    UnicodeString buf;
1284    for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1285        int32_t selector = DATA[i][0];
1286        UnicodeString pat(DATA[i+1], -1, US_INV);
1287        UnicodeString exp(DATA[i+2], -1, US_INV);
1288        s.applyPattern(pat, ec);
1289        s.closeOver(selector);
1290        t.applyPattern(exp, ec);
1291        if (U_FAILURE(ec)) {
1292            errln("FAIL: applyPattern failed");
1293            continue;
1294        }
1295        if (s == t) {
1296            logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1297        } else {
1298            dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1299                  s.toPattern(buf, TRUE) + ", expected " + exp);
1300        }
1301    }
1302
1303#if 0
1304    /*
1305     * Unused test code.
1306     * This was used to compare the old implementation (using USET_CASE)
1307     * with the new one (using 0x100 temporarily)
1308     * while transitioning from hardcoded case closure tables in uniset.cpp
1309     * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1310     * and using ucase.c functions for closure.
1311     * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1312     *
1313     * Note: The old and new implementation never fully matched because
1314     * the old implementation turned out to not map U+0130 and U+0131 correctly
1315     * (dotted I and dotless i) and because the old implementation's data tables
1316     * were outdated compared to Unicode 4.0.1 at the time of the change to the
1317     * new implementation. (So sigmas and some other characters were not handled
1318     * according to the newer Unicode version.)
1319     */
1320    UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1321    UnicodeSetIterator si(sens);
1322    UnicodeString str, buf2;
1323    const UnicodeString *pStr;
1324    UChar32 c;
1325    while(si.next()) {
1326        if(!si.isString()) {
1327            c=si.getCodepoint();
1328            s.clear();
1329            s.add(c);
1330
1331            str.setTo(c);
1332            str.foldCase();
1333            sens2.add(str);
1334
1335            t=s;
1336            s.closeOver(USET_CASE);
1337            t.closeOver(0x100);
1338            if(s!=t) {
1339                errln("FAIL: closeOver(U+%04x) differs: ", c);
1340                errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1341            }
1342        }
1343    }
1344    // remove all code points
1345    // should contain all full case folding mapping strings
1346    sens2.remove(0, 0x10ffff);
1347    si.reset(sens2);
1348    while(si.next()) {
1349        if(si.isString()) {
1350            pStr=&si.getString();
1351            s.clear();
1352            s.add(*pStr);
1353            t=s2=s;
1354            s.closeOver(USET_CASE);
1355            t.closeOver(0x100);
1356            if(s!=t) {
1357                errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1358                errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1359            }
1360        }
1361    }
1362#endif
1363
1364    // Test the pattern API
1365    s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1366    if (U_FAILURE(ec)) {
1367        errln("FAIL: applyPattern failed");
1368    } else {
1369        expectContainment(s, "abcABC", "defDEF");
1370    }
1371    UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1372    if (U_FAILURE(ec)) {
1373        errln("FAIL: constructor failed");
1374    } else {
1375        expectContainment(v, "defDEF", "abcABC");
1376    }
1377    UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1378    if (U_FAILURE(ec)) {
1379        errln("FAIL: construct w/case mappings failed");
1380    } else {
1381        expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1382    }
1383}
1384
1385void UnicodeSetTest::TestEscapePattern() {
1386    const char pattern[] =
1387        "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1388    const char exp[] =
1389        "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1390    // We test this with two passes; in the second pass we
1391    // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1392    // this fails -- which is what we expect.
1393    for (int32_t pass=1; pass<=2; ++pass) {
1394        UErrorCode ec = U_ZERO_ERROR;
1395        UnicodeString pat(pattern, -1, US_INV);
1396        if (pass==2) {
1397            pat = pat.unescape();
1398        }
1399        // Pattern is only good for pass 1
1400        UBool isPatternValid = (pass==1);
1401
1402        UnicodeSet set(pat, ec);
1403        if (U_SUCCESS(ec) != isPatternValid){
1404            errln((UnicodeString)"FAIL: applyPattern(" +
1405                  escape(pat) + ") => " +
1406                  u_errorName(ec));
1407            continue;
1408        }
1409        if (U_FAILURE(ec)) {
1410            continue;
1411        }
1412        if (set.contains((UChar)0x0644)){
1413            errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1414        }
1415
1416        UnicodeString newpat;
1417        set.toPattern(newpat, TRUE);
1418        if (newpat == UnicodeString(exp, -1, US_INV)) {
1419            logln(escape(pat) + " => " + newpat);
1420        } else {
1421            errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1422        }
1423
1424        for (int32_t i=0; i<set.getRangeCount(); ++i) {
1425            UnicodeString str("Range ");
1426            str.append((UChar)(0x30 + i))
1427                .append(": ")
1428                .append((UChar32)set.getRangeStart(i))
1429                .append(" - ")
1430                .append((UChar32)set.getRangeEnd(i));
1431            str = str + " (" + set.getRangeStart(i) + " - " +
1432                set.getRangeEnd(i) + ")";
1433            if (set.getRangeStart(i) < 0) {
1434                errln((UnicodeString)"FAIL: " + escape(str));
1435            } else {
1436                logln(escape(str));
1437            }
1438        }
1439    }
1440}
1441
1442void UnicodeSetTest::expectRange(const UnicodeString& label,
1443                                 const UnicodeSet& set,
1444                                 UChar32 start, UChar32 end) {
1445    UnicodeSet exp(start, end);
1446    UnicodeString pat;
1447    if (set == exp) {
1448        logln(label + " => " + set.toPattern(pat, TRUE));
1449    } else {
1450        UnicodeString xpat;
1451        errln((UnicodeString)"FAIL: " + label + " => " +
1452              set.toPattern(pat, TRUE) +
1453              ", expected " + exp.toPattern(xpat, TRUE));
1454    }
1455}
1456
1457void UnicodeSetTest::TestInvalidCodePoint() {
1458
1459    const UChar32 DATA[] = {
1460        // Test range             Expected range
1461        0, 0x10FFFF,              0, 0x10FFFF,
1462        (UChar32)-1, 8,           0, 8,
1463        8, 0x110000,              8, 0x10FFFF
1464    };
1465    const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1466
1467    UnicodeString pat;
1468    int32_t i;
1469
1470    for (i=0; i<DATA_LENGTH; i+=4) {
1471        UChar32 start  = DATA[i];
1472        UChar32 end    = DATA[i+1];
1473        UChar32 xstart = DATA[i+2];
1474        UChar32 xend   = DATA[i+3];
1475
1476        // Try various API using the test code points
1477
1478        UnicodeSet set(start, end);
1479        expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1480                    set, xstart, xend);
1481
1482        set.clear();
1483        set.set(start, end);
1484        expectRange((UnicodeString)"set(" + start + "," + end + ")",
1485                    set, xstart, xend);
1486
1487        UBool b = set.contains(start);
1488        b = set.contains(start, end);
1489        b = set.containsNone(start, end);
1490        b = set.containsSome(start, end);
1491        (void)b;   // Suppress set but not used warning.
1492
1493        /*int32_t index = set.indexOf(start);*/
1494
1495        set.clear();
1496        set.add(start);
1497        set.add(start, end);
1498        expectRange((UnicodeString)"add(" + start + "," + end + ")",
1499                    set, xstart, xend);
1500
1501        set.set(0, 0x10FFFF);
1502        set.retain(start, end);
1503        expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1504                    set, xstart, xend);
1505        set.retain(start);
1506
1507        set.set(0, 0x10FFFF);
1508        set.remove(start);
1509        set.remove(start, end);
1510        set.complement();
1511        expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1512                    set, xstart, xend);
1513
1514        set.set(0, 0x10FFFF);
1515        set.complement(start, end);
1516        set.complement();
1517        expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1518                    set, xstart, xend);
1519        set.complement(start);
1520    }
1521
1522    const UChar32 DATA2[] = {
1523        0,
1524        0x10FFFF,
1525        (UChar32)-1,
1526        0x110000
1527    };
1528    const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1529
1530    for (i=0; i<DATA2_LENGTH; ++i) {
1531        UChar32 c = DATA2[i], end = 0x10FFFF;
1532        UBool valid = (c >= 0 && c <= 0x10FFFF);
1533
1534        UnicodeSet set(0, 0x10FFFF);
1535
1536        // For single-codepoint contains, invalid codepoints are NOT contained
1537        UBool b = set.contains(c);
1538        if (b == valid) {
1539            logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1540                  ") = " + b);
1541        } else {
1542            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1543                  ") = " + b);
1544        }
1545
1546        // For codepoint range contains, containsNone, and containsSome,
1547        // invalid or empty (start > end) ranges have UNDEFINED behavior.
1548        b = set.contains(c, end);
1549        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1550              "," + end + ") = " + b);
1551
1552        b = set.containsNone(c, end);
1553        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1554              "," + end + ") = " + b);
1555
1556        b = set.containsSome(c, end);
1557        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1558              "," + end + ") = " + b);
1559
1560        int32_t index = set.indexOf(c);
1561        if ((index >= 0) == valid) {
1562            logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1563                  ") = " + index);
1564        } else {
1565            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1566                  ") = " + index);
1567        }
1568    }
1569}
1570
1571// Used by TestSymbolTable
1572class TokenSymbolTable : public SymbolTable {
1573public:
1574    Hashtable contents;
1575
1576    TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1577        contents.setValueDeleter(uprv_deleteUObject);
1578    }
1579
1580    ~TokenSymbolTable() {}
1581
1582    /**
1583     * (Non-SymbolTable API) Add the given variable and value to
1584     * the table.  Variable should NOT contain leading '$'.
1585     */
1586    void add(const UnicodeString& var, const UnicodeString& value,
1587             UErrorCode& ec) {
1588        if (U_SUCCESS(ec)) {
1589            contents.put(var, new UnicodeString(value), ec);
1590        }
1591    }
1592
1593    /**
1594     * SymbolTable API
1595     */
1596    virtual const UnicodeString* lookup(const UnicodeString& s) const {
1597        return (const UnicodeString*) contents.get(s);
1598    }
1599
1600    /**
1601     * SymbolTable API
1602     */
1603    virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1604        return NULL;
1605    }
1606
1607    /**
1608     * SymbolTable API
1609     */
1610    virtual UnicodeString parseReference(const UnicodeString& text,
1611                                         ParsePosition& pos, int32_t limit) const {
1612        int32_t start = pos.getIndex();
1613        int32_t i = start;
1614        UnicodeString result;
1615        while (i < limit) {
1616            UChar c = text.charAt(i);
1617            if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1618                break;
1619            }
1620            ++i;
1621        }
1622        if (i == start) { // No valid name chars
1623            return result; // Indicate failure with empty string
1624        }
1625        pos.setIndex(i);
1626        text.extractBetween(start, i, result);
1627        return result;
1628    }
1629};
1630
1631void UnicodeSetTest::TestSymbolTable() {
1632    // Multiple test cases can be set up here.  Each test case
1633    // is terminated by null:
1634    // var, value, var, value,..., input pat., exp. output pat., null
1635    const char* DATA[] = {
1636        "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1637        "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1638        "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1639        NULL
1640    };
1641
1642    for (int32_t i=0; DATA[i]!=NULL; ++i) {
1643        UErrorCode ec = U_ZERO_ERROR;
1644        TokenSymbolTable sym(ec);
1645        if (U_FAILURE(ec)) {
1646            errln("FAIL: couldn't construct TokenSymbolTable");
1647            continue;
1648        }
1649
1650        // Set up variables
1651        while (DATA[i+2] != NULL) {
1652            sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1653            if (U_FAILURE(ec)) {
1654                errln("FAIL: couldn't add to TokenSymbolTable");
1655                continue;
1656            }
1657            i += 2;
1658        }
1659
1660        // Input pattern and expected output pattern
1661        UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1662        i += 2;
1663
1664        ParsePosition pos(0);
1665        UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1666        if (U_FAILURE(ec)) {
1667            errln("FAIL: couldn't construct UnicodeSet");
1668            continue;
1669        }
1670
1671        // results
1672        if (pos.getIndex() != inpat.length()) {
1673            errln((UnicodeString)"Failed to read to end of string \""
1674                  + inpat + "\": read to "
1675                  + pos.getIndex() + ", length is "
1676                  + inpat.length());
1677        }
1678
1679        UnicodeSet us2(exppat, ec);
1680        if (U_FAILURE(ec)) {
1681            errln("FAIL: couldn't construct expected UnicodeSet");
1682            continue;
1683        }
1684
1685        UnicodeString a, b;
1686        if (us != us2) {
1687            errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1688                  ", expected " + us2.toPattern(b, TRUE));
1689        } else {
1690            logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1691        }
1692    }
1693}
1694
1695void UnicodeSetTest::TestSurrogate() {
1696    const char* DATA[] = {
1697        // These should all behave identically
1698        "[abc\\uD800\\uDC00]",
1699        // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1700        "[abc\\U00010000]",
1701        0
1702    };
1703    for (int i=0; DATA[i] != 0; ++i) {
1704        UErrorCode ec = U_ZERO_ERROR;
1705        logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1706        UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1707        UnicodeSet set(str, ec);
1708        if (U_FAILURE(ec)) {
1709            errln("FAIL: UnicodeSet constructor");
1710            continue;
1711        }
1712        expectContainment(set,
1713                          CharsToUnicodeString("abc\\U00010000"),
1714                          CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1715        if (set.size() != 4) {
1716            errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1717                  set.size() + ", expected 4");
1718        }
1719    }
1720}
1721
1722void UnicodeSetTest::TestExhaustive() {
1723    // exhaustive tests. Simulate UnicodeSets with integers.
1724    // That gives us very solid tests (except for large memory tests).
1725
1726    int32_t limit = 128;
1727
1728    UnicodeSet x, y, z, aa;
1729
1730    for (int32_t i = 0; i < limit; ++i) {
1731        bitsToSet(i, x);
1732        logln((UnicodeString)"Testing " + i + ", " + x);
1733        _testComplement(i, x, y);
1734
1735        // AS LONG AS WE ARE HERE, check roundtrip
1736        checkRoundTrip(bitsToSet(i, aa));
1737
1738        for (int32_t j = 0; j < limit; ++j) {
1739            _testAdd(i,j,  x,y,z);
1740            _testXor(i,j,  x,y,z);
1741            _testRetain(i,j,  x,y,z);
1742            _testRemove(i,j,  x,y,z);
1743        }
1744    }
1745}
1746
1747void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1748    bitsToSet(a, x);
1749    z = x;
1750    z.complement();
1751    int32_t c = setToBits(z);
1752    if (c != (~a)) {
1753        errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1754        errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1755    }
1756    checkCanonicalRep(z, (UnicodeString)"complement " + a);
1757}
1758
1759void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1760    bitsToSet(a, x);
1761    bitsToSet(b, y);
1762    z = x;
1763    z.addAll(y);
1764    int32_t c = setToBits(z);
1765    if (c != (a | b)) {
1766        errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1767        errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1768    }
1769    checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1770}
1771
1772void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1773    bitsToSet(a, x);
1774    bitsToSet(b, y);
1775    z = x;
1776    z.retainAll(y);
1777    int32_t c = setToBits(z);
1778    if (c != (a & b)) {
1779        errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1780        errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1781    }
1782    checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1783}
1784
1785void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1786    bitsToSet(a, x);
1787    bitsToSet(b, y);
1788    z = x;
1789    z.removeAll(y);
1790    int32_t c = setToBits(z);
1791    if (c != (a &~ b)) {
1792        errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1793        errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1794    }
1795    checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1796}
1797
1798void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1799    bitsToSet(a, x);
1800    bitsToSet(b, y);
1801    z = x;
1802    z.complementAll(y);
1803    int32_t c = setToBits(z);
1804    if (c != (a ^ b)) {
1805        errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1806        errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1807    }
1808    checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1809}
1810
1811/**
1812 * Check that ranges are monotonically increasing and non-
1813 * overlapping.
1814 */
1815void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1816    int32_t n = set.getRangeCount();
1817    if (n < 0) {
1818        errln((UnicodeString)"FAIL result of " + msg +
1819              ": range count should be >= 0 but is " +
1820              n /*+ " for " + set.toPattern())*/);
1821        return;
1822    }
1823    UChar32 last = 0;
1824    for (int32_t i=0; i<n; ++i) {
1825        UChar32 start = set.getRangeStart(i);
1826        UChar32 end = set.getRangeEnd(i);
1827        if (start > end) {
1828            errln((UnicodeString)"FAIL result of " + msg +
1829                  ": range " + (i+1) +
1830                  " start > end: " + (int)start + ", " + (int)end +
1831                  " for " + set);
1832        }
1833        if (i > 0 && start <= last) {
1834            errln((UnicodeString)"FAIL result of " + msg +
1835                  ": range " + (i+1) +
1836                  " overlaps previous range: " + (int)start + ", " + (int)end +
1837                  " for " + set);
1838        }
1839        last = end;
1840    }
1841}
1842
1843/**
1844 * Convert a bitmask to a UnicodeSet.
1845 */
1846UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1847    result.clear();
1848    for (UChar32 i = 0; i < 32; ++i) {
1849        if ((a & (1<<i)) != 0) {
1850            result.add(i);
1851        }
1852    }
1853    return result;
1854}
1855
1856/**
1857 * Convert a UnicodeSet to a bitmask.  Only the characters
1858 * U+0000 to U+0020 are represented in the bitmask.
1859 */
1860int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1861    int32_t result = 0;
1862    for (int32_t i = 0; i < 32; ++i) {
1863        if (x.contains((UChar32)i)) {
1864            result |= (1<<i);
1865        }
1866    }
1867    return result;
1868}
1869
1870/**
1871 * Return the representation of an inversion list based UnicodeSet
1872 * as a pairs list.  Ranges are listed in ascending Unicode order.
1873 * For example, the set [a-zA-M3] is represented as "33AMaz".
1874 */
1875UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1876    UnicodeString pairs;
1877    for (int32_t i=0; i<set.getRangeCount(); ++i) {
1878        UChar32 start = set.getRangeStart(i);
1879        UChar32 end = set.getRangeEnd(i);
1880        if (end > 0xFFFF) {
1881            end = 0xFFFF;
1882            i = set.getRangeCount(); // Should be unnecessary
1883        }
1884        pairs.append((UChar)start).append((UChar)end);
1885    }
1886    return pairs;
1887}
1888
1889/**
1890 * Basic consistency check for a few items.
1891 * That the iterator works, and that we can create a pattern and
1892 * get the same thing back
1893 */
1894void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1895    UErrorCode ec = U_ZERO_ERROR;
1896
1897    UnicodeSet t(s);
1898    checkEqual(s, t, "copy ct");
1899
1900    t = s;
1901    checkEqual(s, t, "operator=");
1902
1903    copyWithIterator(t, s, FALSE);
1904    checkEqual(s, t, "iterator roundtrip");
1905
1906    copyWithIterator(t, s, TRUE); // try range
1907    checkEqual(s, t, "iterator roundtrip");
1908
1909    UnicodeString pat; s.toPattern(pat, FALSE);
1910    t.applyPattern(pat, ec);
1911    if (U_FAILURE(ec)) {
1912        errln("FAIL: applyPattern");
1913        return;
1914    } else {
1915        checkEqual(s, t, "toPattern(false)");
1916    }
1917
1918    s.toPattern(pat, TRUE);
1919    t.applyPattern(pat, ec);
1920    if (U_FAILURE(ec)) {
1921        errln("FAIL: applyPattern");
1922        return;
1923    } else {
1924        checkEqual(s, t, "toPattern(true)");
1925    }
1926}
1927
1928void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1929    t.clear();
1930    UnicodeSetIterator it(s);
1931    if (withRange) {
1932        while (it.nextRange()) {
1933            if (it.isString()) {
1934                t.add(it.getString());
1935            } else {
1936                t.add(it.getCodepoint(), it.getCodepointEnd());
1937            }
1938        }
1939    } else {
1940        while (it.next()) {
1941            if (it.isString()) {
1942                t.add(it.getString());
1943            } else {
1944                t.add(it.getCodepoint());
1945            }
1946        }
1947    }
1948}
1949
1950UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1951    UnicodeString source; s.toPattern(source, TRUE);
1952    UnicodeString result; t.toPattern(result, TRUE);
1953    if (s != t) {
1954        errln((UnicodeString)"FAIL: " + message
1955              + "; source = " + source
1956              + "; result = " + result
1957              );
1958        return FALSE;
1959    } else {
1960        logln((UnicodeString)"Ok: " + message
1961              + "; source = " + source
1962              + "; result = " + result
1963              );
1964    }
1965    return TRUE;
1966}
1967
1968void
1969UnicodeSetTest::expectContainment(const UnicodeString& pat,
1970                                  const UnicodeString& charsIn,
1971                                  const UnicodeString& charsOut) {
1972    UErrorCode ec = U_ZERO_ERROR;
1973    UnicodeSet set(pat, ec);
1974    if (U_FAILURE(ec)) {
1975        dataerrln((UnicodeString)"FAIL: pattern \"" +
1976              pat + "\" => " + u_errorName(ec));
1977        return;
1978    }
1979    expectContainment(set, pat, charsIn, charsOut);
1980}
1981
1982void
1983UnicodeSetTest::expectContainment(const UnicodeSet& set,
1984                                  const UnicodeString& charsIn,
1985                                  const UnicodeString& charsOut) {
1986    UnicodeString pat;
1987    set.toPattern(pat);
1988    expectContainment(set, pat, charsIn, charsOut);
1989}
1990
1991void
1992UnicodeSetTest::expectContainment(const UnicodeSet& set,
1993                                  const UnicodeString& setName,
1994                                  const UnicodeString& charsIn,
1995                                  const UnicodeString& charsOut) {
1996    UnicodeString bad;
1997    UChar32 c;
1998    int32_t i;
1999
2000    for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
2001        c = charsIn.char32At(i);
2002        if (!set.contains(c)) {
2003            bad.append(c);
2004        }
2005    }
2006    if (bad.length() > 0) {
2007        errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2008              ", expected containment of " + prettify(charsIn));
2009    } else {
2010        logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2011    }
2012
2013    bad.truncate(0);
2014    for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2015        c = charsOut.char32At(i);
2016        if (set.contains(c)) {
2017            bad.append(c);
2018        }
2019    }
2020    if (bad.length() > 0) {
2021        errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2022              ", expected non-containment of " + prettify(charsOut));
2023    } else {
2024        logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2025    }
2026}
2027
2028void
2029UnicodeSetTest::expectPattern(UnicodeSet& set,
2030                              const UnicodeString& pattern,
2031                              const UnicodeString& expectedPairs){
2032    UErrorCode status = U_ZERO_ERROR;
2033    set.applyPattern(pattern, status);
2034    if (U_FAILURE(status)) {
2035        errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2036              "\") failed");
2037        return;
2038    } else {
2039        if (getPairs(set) != expectedPairs ) {
2040            errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2041                  "\") => pairs \"" +
2042                  escape(getPairs(set)) + "\", expected \"" +
2043                  escape(expectedPairs) + "\"");
2044        } else {
2045            logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2046                  "\") => pairs \"" +
2047                  escape(getPairs(set)) + "\"");
2048        }
2049    }
2050    // the result of calling set.toPattern(), which is the string representation of
2051    // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2052    // will produce another set that is equal to this one.
2053    UnicodeString temppattern;
2054    set.toPattern(temppattern);
2055    UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2056    if (U_FAILURE(status)) {
2057        errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2058        return;
2059    }
2060    if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2061        errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2062            escape(getPairs(set)) + "\""));
2063    } else{
2064        logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2065    }
2066
2067    delete tempset;
2068
2069}
2070
2071void
2072UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2073    if (getPairs(set) != expectedPairs) {
2074        errln(UnicodeString("FAIL: Expected pair list \"") +
2075              escape(expectedPairs) + "\", got \"" +
2076              escape(getPairs(set)) + "\"");
2077    }
2078}
2079
2080void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2081                                     const UnicodeString& expPat,
2082                                     const char** expStrings) {
2083    UnicodeString pat;
2084    set.toPattern(pat, TRUE);
2085    if (pat == expPat) {
2086        logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2087    } else {
2088        errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2089        return;
2090    }
2091    if (expStrings == NULL) {
2092        return;
2093    }
2094    UBool in = TRUE;
2095    for (int32_t i=0; expStrings[i] != NULL; ++i) {
2096        if (expStrings[i] == NOT) { // sic; pointer comparison
2097            in = FALSE;
2098            continue;
2099        }
2100        UnicodeString s = CharsToUnicodeString(expStrings[i]);
2101        UBool contained = set.contains(s);
2102        if (contained == in) {
2103            logln((UnicodeString)"Ok: " + expPat +
2104                  (contained ? " contains {" : " does not contain {") +
2105                  escape(expStrings[i]) + "}");
2106        } else {
2107            errln((UnicodeString)"FAIL: " + expPat +
2108                  (contained ? " contains {" : " does not contain {") +
2109                  escape(expStrings[i]) + "}");
2110        }
2111    }
2112}
2113
2114static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2115
2116void
2117UnicodeSetTest::doAssert(UBool condition, const char *message)
2118{
2119    if (!condition) {
2120        errln(UnicodeString("ERROR : ") + message);
2121    }
2122}
2123
2124UnicodeString
2125UnicodeSetTest::escape(const UnicodeString& s) {
2126    UnicodeString buf;
2127    for (int32_t i=0; i<s.length(); )
2128    {
2129        UChar32 c = s.char32At(i);
2130        if (0x0020 <= c && c <= 0x007F) {
2131            buf += c;
2132        } else {
2133            if (c <= 0xFFFF) {
2134                buf += (UChar)0x5c; buf += (UChar)0x75;
2135            } else {
2136                buf += (UChar)0x5c; buf += (UChar)0x55;
2137                buf += toHexString((c & 0xF0000000) >> 28);
2138                buf += toHexString((c & 0x0F000000) >> 24);
2139                buf += toHexString((c & 0x00F00000) >> 20);
2140                buf += toHexString((c & 0x000F0000) >> 16);
2141            }
2142            buf += toHexString((c & 0xF000) >> 12);
2143            buf += toHexString((c & 0x0F00) >> 8);
2144            buf += toHexString((c & 0x00F0) >> 4);
2145            buf += toHexString(c & 0x000F);
2146        }
2147        i += U16_LENGTH(c);
2148    }
2149    return buf;
2150}
2151
2152void UnicodeSetTest::TestFreezable() {
2153    UErrorCode errorCode=U_ZERO_ERROR;
2154    UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2155    UnicodeSet idSet(idPattern, errorCode);
2156    if(U_FAILURE(errorCode)) {
2157        dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2158        return;
2159    }
2160
2161    UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2162    UnicodeSet wsSet(wsPattern, errorCode);
2163    if(U_FAILURE(errorCode)) {
2164        dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2165        return;
2166    }
2167
2168    idSet.add(idPattern);
2169    UnicodeSet frozen(idSet);
2170    frozen.freeze();
2171
2172    if(idSet.isFrozen() || !frozen.isFrozen()) {
2173        errln("FAIL: isFrozen() is wrong");
2174    }
2175    if(frozen!=idSet || !(frozen==idSet)) {
2176        errln("FAIL: a copy-constructed frozen set differs from its original");
2177    }
2178
2179    frozen=wsSet;
2180    if(frozen!=idSet || !(frozen==idSet)) {
2181        errln("FAIL: a frozen set was modified by operator=");
2182    }
2183
2184    UnicodeSet frozen2(frozen);
2185    if(frozen2!=frozen || frozen2!=idSet) {
2186        errln("FAIL: a copied frozen set differs from its frozen original");
2187    }
2188    if(!frozen2.isFrozen()) {
2189        errln("FAIL: copy-constructing a frozen set results in a thawed one");
2190    }
2191    UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2192    if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2193        errln("FAIL: UnicodeSet(5, 55) failed");
2194    }
2195    frozen3=frozen;
2196    if(!frozen3.isFrozen()) {
2197        errln("FAIL: copying a frozen set results in a thawed one");
2198    }
2199
2200    UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2201    if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2202        errln("FAIL: clone() failed");
2203    }
2204    cloned->add(0xd802, 0xd805);
2205    if(cloned->containsSome(0xd802, 0xd805)) {
2206        errln("FAIL: unable to modify clone");
2207    }
2208    delete cloned;
2209
2210    UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2211    if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2212        errln("FAIL: cloneAsThawed() failed");
2213    }
2214    thawed->add(0xd802, 0xd805);
2215    if(!thawed->contains(0xd802, 0xd805)) {
2216        errln("FAIL: unable to modify thawed clone");
2217    }
2218    delete thawed;
2219
2220    frozen.set(5, 55);
2221    if(frozen!=idSet || !(frozen==idSet)) {
2222        errln("FAIL: UnicodeSet::set() modified a frozen set");
2223    }
2224
2225    frozen.clear();
2226    if(frozen!=idSet || !(frozen==idSet)) {
2227        errln("FAIL: UnicodeSet::clear() modified a frozen set");
2228    }
2229
2230    frozen.closeOver(USET_CASE_INSENSITIVE);
2231    if(frozen!=idSet || !(frozen==idSet)) {
2232        errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2233    }
2234
2235    frozen.compact();
2236    if(frozen!=idSet || !(frozen==idSet)) {
2237        errln("FAIL: UnicodeSet::compact() modified a frozen set");
2238    }
2239
2240    ParsePosition pos;
2241    frozen.
2242        applyPattern(wsPattern, errorCode).
2243        applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2244        applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2245        applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2246        applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2247    if(frozen!=idSet || !(frozen==idSet)) {
2248        errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2249    }
2250
2251    frozen.
2252        add(0xd800).
2253        add(0xd802, 0xd805).
2254        add(wsPattern).
2255        addAll(idPattern).
2256        addAll(wsSet);
2257    if(frozen!=idSet || !(frozen==idSet)) {
2258        errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2259    }
2260
2261    frozen.
2262        retain(0x62).
2263        retain(0x64, 0x69).
2264        retainAll(wsPattern).
2265        retainAll(wsSet);
2266    if(frozen!=idSet || !(frozen==idSet)) {
2267        errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2268    }
2269
2270    frozen.
2271        remove(0x62).
2272        remove(0x64, 0x69).
2273        remove(idPattern).
2274        removeAll(idPattern).
2275        removeAll(idSet);
2276    if(frozen!=idSet || !(frozen==idSet)) {
2277        errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2278    }
2279
2280    frozen.
2281        complement().
2282        complement(0x62).
2283        complement(0x64, 0x69).
2284        complement(idPattern).
2285        complementAll(idPattern).
2286        complementAll(idSet);
2287    if(frozen!=idSet || !(frozen==idSet)) {
2288        errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2289    }
2290}
2291
2292// Test span() etc. -------------------------------------------------------- ***
2293
2294// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2295static int32_t
2296appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2297    UErrorCode errorCode=U_ZERO_ERROR;
2298    int32_t length8=0;
2299    u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2300    if(U_SUCCESS(errorCode)) {
2301        return length8;
2302    } else {
2303        // The string contains an unpaired surrogate.
2304        // Ignore this string.
2305        return 0;
2306    }
2307}
2308
2309class UnicodeSetWithStringsIterator;
2310
2311// Make the strings in a UnicodeSet easily accessible.
2312class UnicodeSetWithStrings {
2313public:
2314    UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2315            set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2316        int32_t size=set.size();
2317        if(size>0 && set.charAt(size-1)<0) {
2318            // If a set's last element is not a code point, then it must contain strings.
2319            // Iterate over the set, skip all code point ranges, and cache the strings.
2320            // Convert them to UTF-8 for spanUTF8().
2321            UnicodeSetIterator iter(set);
2322            const UnicodeString *s;
2323            char *s8=utf8;
2324            int32_t length8, utf8Count=0;
2325            while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2326                if(iter.isString()) {
2327                    // Store the pointer to the set's string element
2328                    // which we happen to know is a stable pointer.
2329                    strings[stringsLength]=s=&iter.getString();
2330                    utf8Count+=
2331                        utf8Lengths[stringsLength]=length8=
2332                        appendUTF8(s->getBuffer(), s->length(),
2333                                   s8, (int32_t)(sizeof(utf8)-utf8Count));
2334                    if(length8==0) {
2335                        hasSurrogates=TRUE;  // Contains unpaired surrogates.
2336                    }
2337                    s8+=length8;
2338                    ++stringsLength;
2339                }
2340            }
2341        }
2342    }
2343
2344    const UnicodeSet &getSet() const {
2345        return set;
2346    }
2347
2348    UBool hasStrings() const {
2349        return (UBool)(stringsLength>0);
2350    }
2351
2352    UBool hasStringsWithSurrogates() const {
2353        return hasSurrogates;
2354    }
2355
2356private:
2357    friend class UnicodeSetWithStringsIterator;
2358
2359    const UnicodeSet &set;
2360
2361    const UnicodeString *strings[20];
2362    int32_t stringsLength;
2363    UBool hasSurrogates;
2364
2365    char utf8[1024];
2366    int32_t utf8Lengths[20];
2367};
2368
2369class UnicodeSetWithStringsIterator {
2370public:
2371    UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2372            fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2373    }
2374
2375    void reset() {
2376        nextStringIndex=nextUTF8Start=0;
2377    }
2378
2379    const UnicodeString *nextString() {
2380        if(nextStringIndex<fSet.stringsLength) {
2381            return fSet.strings[nextStringIndex++];
2382        } else {
2383            return NULL;
2384        }
2385    }
2386
2387    // Do not mix with calls to nextString().
2388    const char *nextUTF8(int32_t &length) {
2389        if(nextStringIndex<fSet.stringsLength) {
2390            const char *s8=fSet.utf8+nextUTF8Start;
2391            nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2392            return s8;
2393        } else {
2394            length=0;
2395            return NULL;
2396        }
2397    }
2398
2399private:
2400    const UnicodeSetWithStrings &fSet;
2401    int32_t nextStringIndex;
2402    int32_t nextUTF8Start;
2403};
2404
2405// Compare 16-bit Unicode strings (which may be malformed UTF-16)
2406// at code point boundaries.
2407// That is, each edge of a match must not be in the middle of a surrogate pair.
2408static inline UBool
2409matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2410    s+=start;
2411    limit-=start;
2412    int32_t length=t.length();
2413    return 0==t.compare(s, length) &&
2414           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2415           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2416}
2417
// Implement span() with contains() for comparison.
//
// Reference implementation built only on UnicodeSet::contains() and direct
// string matching. Returns the length of the initial span of s[0..length[
// according to spanCondition.
// Three cases are handled separately:
// - the set has no cached strings: a plain code point loop;
// - USET_SPAN_NOT_CONTAINED with strings: stop at the first position where
//   a set code point or a set string starts;
// - USET_SPAN_CONTAINED / USET_SPAN_SIMPLE with strings: see the branch below.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev always holds the index where the current code point starts,
        // which is the span length if that code point ends the span.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            // A contained code point at start ends the span.
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // So does any set string that matches at start.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit tracks the furthest limit reached via recursive calls.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the larger of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2516
// Backward counterpart of containsSpanUTF16(): a reference implementation
// built only on UnicodeSet::contains() and direct string matching.
// Works from the end of s[0..length[ toward the beginning and returns the
// start index of the trailing span according to spanCondition
// (0 for an empty input).
// The same three cases are handled separately: no cached strings,
// USET_SPAN_NOT_CONTAINED with strings, and
// USET_SPAN_CONTAINED / USET_SPAN_SIMPLE with strings.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        // prev holds the index just after the code point being examined,
        // which is the result if that code point ends the span.
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // length0 keeps the original length for the boundary checks in
        // matches16CPB(), because length itself is moved backward.
        int32_t prev=length, length0=length;
        do {
            // A contained code point ending at prev ends the span.
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // So does any set string that ends at prev.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart tracks the earliest start reached via recursive calls.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2615
/*
 * Forward span over UTF-8 text, computed only from contains(c) on single code points
 * and from byte-wise comparison against the strings that the set iterator returns.
 *
 * set            code point set plus strings to span with
 * s, length      UTF-8 text; the loops only run while the byte index is below length
 * spanCondition  USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
 *
 * Returns the byte offset in s at which the span that starts at s[0] ends.
 */
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Code points only: step through the text while contains(c) agrees with the condition.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            // With the condition pinned to 0/1 it can be compared directly with the contains() result.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        // prev is the start of the first code point that did not match, or the end of the loop range.
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position where a contained code point or any set string begins.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Empty strings are ignored; a string matches if its bytes occur at s+start.
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: farthest limit reached by any recursive call (USET_SPAN_CONTAINED only).
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "nothing matched at start yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        return length;  // This match reaches the end of the text.
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the farther of the iterated position and the best recursive result.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2715
/*
 * Backward span over UTF-8 text, computed only from contains(c) on single code points
 * and from byte-wise comparison against the strings that the set iterator returns.
 * Mirror image of the forward version: the text is examined from its end toward index 0.
 *
 * set            code point set plus strings to span with
 * s, length      UTF-8 text; length is reused as the moving cursor inside the function
 * spanCondition  USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
 *
 * Returns the byte offset in s at which the span that ends at s[length] starts;
 * returns 0 immediately for length==0.
 */
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Code points only: step backward while contains(c) agrees with the condition.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            // With the condition pinned to 0/1 it can be compared directly with the contains() result.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        // prev is the limit of the last code point examined that did not match, or 0.
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position (from the end) where a contained code point or any set string ends.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Empty strings are ignored; a string matches if its bytes end exactly at s+prev.
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: lowest start reached by any recursive call (USET_SPAN_CONTAINED only).
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "nothing matched ending at prev yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        return 0;  // This match reaches the start of the text.
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the lower of the iterated position and the best recursive result.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2816
// Bit flags that select which spans are performed and compared.
// Each group has one bit per alternative plus a mask that covers the whole group.
enum {
    // Which UTF form of the text is spanned.
    SPAN_UTF16          =1<<0,
    SPAN_UTF8           =1<<1,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // Original set and/or its complement.
    SPAN_SET            =1<<2,
    SPAN_COMPLEMENT     =1<<3,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // Direction of the span.
    SPAN_FWD            =1<<4,
    SPAN_BACK           =1<<5,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // Which "contained" span condition is used.
    SPAN_CONTAINED      =1<<8,
    SPAN_SIMPLE         =1<<9,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // Every option of every group.
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2837
2838static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2839    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2840}
2841
2842static inline int32_t slen(const void *s, UBool isUTF16) {
2843    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2844}
2845
2846/*
2847 * Count spans on a string with the method according to type and set the span limits.
2848 * The set may be the complement of the original.
2849 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2850 * according to the expected number of spans.
2851 * Sets typeName to an empty string if there is no such type.
2852 * Returns -1 if the span option is filtered out.
2853 */
2854static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2855                        const void *s, int32_t length, UBool isUTF16,
2856                        uint32_t whichSpans,
2857                        int type, const char *&typeName,
2858                        int32_t limits[], int32_t limitsCapacity,
2859                        int32_t expectCount) {
2860    const UnicodeSet &realSet(set.getSet());
2861    int32_t start, count;
2862    USetSpanCondition spanCondition, firstSpanCondition, contained;
2863    UBool isForward;
2864
2865    if(type<0 || 7<type) {
2866        typeName="";
2867        return 0;
2868    }
2869
2870    static const char *const typeNames16[]={
2871        "contains", "contains(LM)",
2872        "span", "span(LM)",
2873        "containsBack", "containsBack(LM)",
2874        "spanBack", "spanBack(LM)"
2875    };
2876
2877    static const char *const typeNames8[]={
2878        "containsUTF8", "containsUTF8(LM)",
2879        "spanUTF8", "spanUTF8(LM)",
2880        "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2881        "spanBackUTF8", "spanBackUTF8(LM)"
2882    };
2883
2884    typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2885
2886    // filter span options
2887    if(type<=3) {
2888        // span forward
2889        if((whichSpans&SPAN_FWD)==0) {
2890            return -1;
2891        }
2892        isForward=TRUE;
2893    } else {
2894        // span backward
2895        if((whichSpans&SPAN_BACK)==0) {
2896            return -1;
2897        }
2898        isForward=FALSE;
2899    }
2900    if((type&1)==0) {
2901        // use USET_SPAN_CONTAINED
2902        if((whichSpans&SPAN_CONTAINED)==0) {
2903            return -1;
2904        }
2905        contained=USET_SPAN_CONTAINED;
2906    } else {
2907        // use USET_SPAN_SIMPLE
2908        if((whichSpans&SPAN_SIMPLE)==0) {
2909            return -1;
2910        }
2911        contained=USET_SPAN_SIMPLE;
2912    }
2913
2914    // Default first span condition for going forward with an uncomplemented set.
2915    spanCondition=USET_SPAN_NOT_CONTAINED;
2916    if(isComplement) {
2917        spanCondition=invertSpanCondition(spanCondition, contained);
2918    }
2919
2920    // First span condition for span(), used to terminate the spanBack() iteration.
2921    firstSpanCondition=spanCondition;
2922
2923    // spanBack(): Its initial span condition is span()'s last span condition,
2924    // which is the opposite of span()'s first span condition
2925    // if we expect an even number of spans.
2926    // (The loop inverts spanCondition (expectCount-1) times
2927    // before the expectCount'th span() call.)
2928    // If we do not compare forward and backward directions, then we do not have an
2929    // expectCount and just start with firstSpanCondition.
2930    if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2931        spanCondition=invertSpanCondition(spanCondition, contained);
2932    }
2933
2934    count=0;
2935    switch(type) {
2936    case 0:
2937    case 1:
2938        start=0;
2939        if(length<0) {
2940            length=slen(s, isUTF16);
2941        }
2942        for(;;) {
2943            start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2944                              containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2945            if(count<limitsCapacity) {
2946                limits[count]=start;
2947            }
2948            ++count;
2949            if(start>=length) {
2950                break;
2951            }
2952            spanCondition=invertSpanCondition(spanCondition, contained);
2953        }
2954        break;
2955    case 2:
2956    case 3:
2957        start=0;
2958        for(;;) {
2959            start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2960                              realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2961            if(count<limitsCapacity) {
2962                limits[count]=start;
2963            }
2964            ++count;
2965            if(length>=0 ? start>=length :
2966                           isUTF16 ? ((const UChar *)s)[start]==0 :
2967                                     ((const char *)s)[start]==0
2968            ) {
2969                break;
2970            }
2971            spanCondition=invertSpanCondition(spanCondition, contained);
2972        }
2973        break;
2974    case 4:
2975    case 5:
2976        if(length<0) {
2977            length=slen(s, isUTF16);
2978        }
2979        for(;;) {
2980            ++count;
2981            if(count<=limitsCapacity) {
2982                limits[limitsCapacity-count]=length;
2983            }
2984            length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
2985                              containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
2986            if(length==0 && spanCondition==firstSpanCondition) {
2987                break;
2988            }
2989            spanCondition=invertSpanCondition(spanCondition, contained);
2990        }
2991        if(count<limitsCapacity) {
2992            memmove(limits, limits+(limitsCapacity-count), count*4);
2993        }
2994        break;
2995    case 6:
2996    case 7:
2997        for(;;) {
2998            ++count;
2999            if(count<=limitsCapacity) {
3000                limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
3001            }
3002            // Note: Length<0 is tested only for the first spanBack().
3003            // If we wanted to keep length<0 for all spanBack()s, we would have to
3004            // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3005            length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3006                              realSet.spanBackUTF8((const char *)s, length, spanCondition);
3007            if(length==0 && spanCondition==firstSpanCondition) {
3008                break;
3009            }
3010            spanCondition=invertSpanCondition(spanCondition, contained);
3011        }
3012        if(count<limitsCapacity) {
3013            memmove(limits, limits+(limitsCapacity-count), count*4);
3014        }
3015        break;
3016    default:
3017        typeName="";
3018        return -1;
3019    }
3020
3021    return count;
3022}
3023
// Indexes of the sets under test. Bit 0 of the index is the isComplement flag,
// so each complemented set directly follows its original.
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};

// Names for failure messages, in the same order as the set indexes.
static const char *const setNames[SET_COUNT]={
    /* SLOW     */ "slow",
    /* SLOW_NOT */ "slow.not",
    /* FAST     */ "fast",
    /* FAST_NOT */ "fast.not"
};
3039
3040/*
3041 * Verify that we get the same results whether we look at text with contains(),
3042 * span() or spanBack(), using unfrozen or frozen versions of the set,
3043 * and using the set or its complement (switching the spanConditions accordingly).
3044 * The latter verifies that
3045 *   set.span(spanCondition) == set.complement().span(!spanCondition).
3046 *
3047 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3048 * or returned to the caller (with an input expectCount<0).
3049 */
3050void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3051                              const void *s, int32_t length, UBool isUTF16,
3052                              uint32_t whichSpans,
3053                              int32_t expectLimits[], int32_t &expectCount,
3054                              const char *testName, int32_t index) {
3055    int32_t limits[500];
3056    int32_t limitsCount;
3057    int i, j;
3058
3059    const char *typeName;
3060    int type;
3061
3062    for(i=0; i<SET_COUNT; ++i) {
3063        if((i&1)==0) {
3064            // Even-numbered sets are original, uncomplemented sets.
3065            if((whichSpans&SPAN_SET)==0) {
3066                continue;
3067            }
3068        } else {
3069            // Odd-numbered sets are complemented.
3070            if((whichSpans&SPAN_COMPLEMENT)==0) {
3071                continue;
3072            }
3073        }
3074        for(type=0;; ++type) {
3075            limitsCount=getSpans(*sets[i], (UBool)(i&1),
3076                                 s, length, isUTF16,
3077                                 whichSpans,
3078                                 type, typeName,
3079                                 limits, LENGTHOF(limits), expectCount);
3080            if(typeName[0]==0) {
3081                break; // All types tried.
3082            }
3083            if(limitsCount<0) {
3084                continue; // Span option filtered out.
3085            }
3086            if(expectCount<0) {
3087                expectCount=limitsCount;
3088                if(limitsCount>LENGTHOF(limits)) {
3089                    errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3090                          testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
3091                    return;
3092                }
3093                memcpy(expectLimits, limits, limitsCount*4);
3094            } else if(limitsCount!=expectCount) {
3095                errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3096                      testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3097            } else {
3098                for(j=0; j<limitsCount; ++j) {
3099                    if(limits[j]!=expectLimits[j]) {
3100                        errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3101                              testName, (long)index, setNames[i], typeName, (long)limitsCount,
3102                              j, (long)limits[j], (long)expectLimits[j]);
3103                        break;
3104                    }
3105                }
3106            }
3107        }
3108    }
3109
3110    // Compare span() with containsAll()/containsNone(),
3111    // but only if we have expectLimits[] from the uncomplemented set.
3112    if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3113        const UChar *s16=(const UChar *)s;
3114        UnicodeString string;
3115        int32_t prev=0, limit, length;
3116        for(i=0; i<expectCount; ++i) {
3117            limit=expectLimits[i];
3118            length=limit-prev;
3119            if(length>0) {
3120                string.setTo(FALSE, s16+prev, length);  // read-only alias
3121                if(i&1) {
3122                    if(!sets[SLOW]->getSet().containsAll(string)) {
3123                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3124                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3125                        return;
3126                    }
3127                    if(!sets[FAST]->getSet().containsAll(string)) {
3128                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3129                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3130                        return;
3131                    }
3132                } else {
3133                    if(!sets[SLOW]->getSet().containsNone(string)) {
3134                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3135                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3136                        return;
3137                    }
3138                    if(!sets[FAST]->getSet().containsNone(string)) {
3139                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3140                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3141                        return;
3142                    }
3143                }
3144            }
3145            prev=limit;
3146        }
3147    }
3148}
3149
3150// Specifically test either UTF-16 or UTF-8.
3151void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3152                              const void *s, int32_t length, UBool isUTF16,
3153                              uint32_t whichSpans,
3154                              const char *testName, int32_t index) {
3155    int32_t expectLimits[500];
3156    int32_t expectCount=-1;
3157    testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3158}
3159
3160UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3161    UChar c, c2;
3162
3163    if(length>=0) {
3164        while(length>0) {
3165            c=*s++;
3166            --length;
3167            if(0xd800<=c && c<0xe000) {
3168                if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3169                    return TRUE;
3170                }
3171                --length;
3172            }
3173        }
3174    } else {
3175        while((c=*s++)!=0) {
3176            if(0xd800<=c && c<0xe000) {
3177                if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3178                    return TRUE;
3179                }
3180            }
3181        }
3182    }
3183    return FALSE;
3184}
3185
3186// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3187// unless either UTF is turned off in whichSpans.
3188// Testing UTF-16 and UTF-8 together requires that surrogate code points
3189// have the same contains(c) value as U+FFFD.
3190void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3191                                      const UChar *s16, int32_t length16,
3192                                      uint32_t whichSpans,
3193                                      const char *testName, int32_t index) {
3194    int32_t expectLimits[500];
3195    int32_t expectCount;
3196
3197    expectCount=-1;  // Get expectLimits[] from testSpan().
3198
3199    if((whichSpans&SPAN_UTF16)!=0) {
3200        testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3201    }
3202    if((whichSpans&SPAN_UTF8)==0) {
3203        return;
3204    }
3205
3206    // Convert s16[] and expectLimits[] to UTF-8.
3207    uint8_t s8[3000];
3208    int32_t offsets[3000];
3209
3210    const UChar *s16Limit=s16+length16;
3211    char *t=(char *)s8;
3212    char *tLimit=t+sizeof(s8);
3213    int32_t *o=offsets;
3214    UErrorCode errorCode=U_ZERO_ERROR;
3215
3216    // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3217    ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3218    if(U_FAILURE(errorCode)) {
3219        errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3220              testName, (long)index, u_errorName(errorCode));
3221        ucnv_resetFromUnicode(utf8Cnv);
3222        return;
3223    }
3224    int32_t length8=(int32_t)(t-(char *)s8);
3225
3226    // Convert expectLimits[].
3227    int32_t i, j, expect;
3228    for(i=j=0; i<expectCount; ++i) {
3229        expect=expectLimits[i];
3230        if(expect==length16) {
3231            expectLimits[i]=length8;
3232        } else {
3233            while(offsets[j]<expect) {
3234                ++j;
3235            }
3236            expectLimits[i]=j;
3237        }
3238    }
3239
3240    testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3241}
3242
3243static UChar32 nextCodePoint(UChar32 c) {
3244    // Skip some large and boring ranges.
3245    switch(c) {
3246    case 0x3441:
3247        return 0x4d7f;
3248    case 0x5100:
3249        return 0x9f00;
3250    case 0xb040:
3251        return 0xd780;
3252    case 0xe041:
3253        return 0xf8fe;
3254    case 0x10100:
3255        return 0x20000;
3256    case 0x20041:
3257        return 0xe0000;
3258    case 0xe0101:
3259        return 0x10fffd;
3260    default:
3261        return c+1;
3262    }
3263}
3264
3265// Verify that all implementations represent the same set.
3266void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3267    // contains(U+FFFD) is inconsistent with contains(some surrogates),
3268    // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3269    // Skip the UTF-8 part of the test - if the string contains surrogates -
3270    // because it is likely to produce a different result.
3271    UBool inconsistentSurrogates=
3272            (!(sets[0]->getSet().contains(0xfffd) ?
3273               sets[0]->getSet().contains(0xd800, 0xdfff) :
3274               sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3275             sets[0]->hasStringsWithSurrogates());
3276
3277    UChar s[1000];
3278    int32_t length=0;
3279    uint32_t localWhichSpans;
3280
3281    UChar32 c, first;
3282    for(first=c=0;; c=nextCodePoint(c)) {
3283        if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3284            localWhichSpans=whichSpans;
3285            if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3286                localWhichSpans&=~SPAN_UTF8;
3287            }
3288            testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3289            if(c>0x10ffff) {
3290                break;
3291            }
3292            length=0;
3293            first=c;
3294        }
3295        U16_APPEND_UNSAFE(s, length, c);
3296    }
3297}
3298
3299// Test with a particular, interesting string.
3300// Specify length and try NUL-termination.
3301void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3302    static const UChar s[]={
3303        0x61, 0x62, 0x20,                       // Latin, space
3304        0x3b1, 0x3b2, 0x3b3,                    // Greek
3305        0xd900,                                 // lead surrogate
3306        0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3307        0xdc05,                                 // trail surrogate
3308        0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3309        0xd900, 0xdc05,                         // unassigned supplementary
3310        0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3311        0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3312        0                                       // NUL
3313    };
3314
3315    if((whichSpans&SPAN_UTF16)==0) {
3316        return;
3317    }
3318    testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3319    testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3320}
3321
3322void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3323    static const char s[]={
3324        "abc"                                   // Latin
3325
3326        /* trail byte in lead position */
3327        "\x80"
3328
3329        " "                                     // space
3330
3331        /* truncated multi-byte sequences */
3332        "\xd0"
3333        "\xe0"
3334        "\xe1"
3335        "\xed"
3336        "\xee"
3337        "\xf0"
3338        "\xf1"
3339        "\xf4"
3340        "\xf8"
3341        "\xfc"
3342
3343        "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3344
3345        /* trail byte in lead position */
3346        "\x80"
3347
3348        "\xe0\x80"
3349        "\xe0\xa0"
3350        "\xe1\x80"
3351        "\xed\x80"
3352        "\xed\xa0"
3353        "\xee\x80"
3354        "\xf0\x80"
3355        "\xf0\x90"
3356        "\xf1\x80"
3357        "\xf4\x80"
3358        "\xf4\x90"
3359        "\xf8\x80"
3360        "\xfc\x80"
3361
3362        "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3363
3364        /* trail byte in lead position */
3365        "\x80"
3366
3367        "\xf0\x80\x80"
3368        "\xf0\x90\x80"
3369        "\xf1\x80\x80"
3370        "\xf4\x80\x80"
3371        "\xf4\x90\x80"
3372        "\xf8\x80\x80"
3373        "\xfc\x80\x80"
3374
3375        "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3376
3377        /* trail byte in lead position */
3378        "\x80"
3379
3380        "\xf8\x80\x80\x80"
3381        "\xfc\x80\x80\x80"
3382
3383        "\xF1\x90\x80\x85"                      // unassigned supplementary
3384
3385        /* trail byte in lead position */
3386        "\x80"
3387
3388        "\xfc\x80\x80\x80\x80"
3389
3390        "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3391
3392        /* trail byte in lead position */
3393        "\x80"
3394
3395        /* complete sequences but non-shortest forms or out of range etc. */
3396        "\xc0\x80"
3397        "\xe0\x80\x80"
3398        "\xed\xa0\x80"
3399        "\xf0\x80\x80\x80"
3400        "\xf4\x90\x80\x80"
3401        "\xf8\x80\x80\x80\x80"
3402        "\xfc\x80\x80\x80\x80\x80"
3403        "\xfe"
3404        "\xff"
3405
3406        /* trail byte in lead position */
3407        "\x80"
3408
3409        "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3410    };
3411
3412    if((whichSpans&SPAN_UTF8)==0) {
3413        return;
3414    }
3415    testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3416    testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3417}
3418
// Multiply a list of span-option words by up to three alternatives.
// Each existing entry is first reduced with mask; the result combined with a
// replaces the entry in place. If b!=0, a second copy combined with b is
// written at offset whichSpansCount, and if additionally c!=0, a third copy
// combined with c is written at offset 2*whichSpansCount.
// c is ignored when b==0.
// The caller must provide room for all resulting entries.
// Returns the new number of entries (1x, 2x or 3x the old count).
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // How many variants of each entry are produced.
    int32_t variants=1;
    if(b!=0) {
        variants= (c!=0) ? 3 : 2;
    }

    for(int32_t idx=0; idx<whichSpansCount; ++idx) {
        const uint32_t base=whichSpans[idx]&mask;
        whichSpans[idx]=base|a;
        if(variants>1) {
            whichSpans[whichSpansCount+idx]=base|b;
        }
        if(variants>2) {
            whichSpans[2*whichSpansCount+idx]=base|c;
        }
    }
    return variants*whichSpansCount;
}
3441
// String-literal building blocks: runs of the letter 'a' or 'b' whose length
// is given by the macro name (63 or 64, per the names). They are concatenated
// in the TestSpan() data to build set strings and test strings with
// code point spans longer than 254.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3446
3447void UnicodeSetTest::TestSpan() {
3448    // "[...]" is a UnicodeSet pattern.
3449    // "*" performs tests on all Unicode code points and on a selection of
3450    //   malformed UTF-8/16 strings.
3451    // "-options" limits the scope of testing for the current set.
3452    //   By default, the test verifies that equivalent boundaries are found
3453    //   for UTF-16 and UTF-8, going forward and backward,
3454    //   alternating USET_SPAN_NOT_CONTAINED with
3455    //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3456    //   Single-character options:
3457    //     8 -- UTF-16 and UTF-8 boundaries may differ.
3458    //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3459    //          or the set contains strings with unpaired surrogates
3460    //          which do not translate to valid UTF-8.
3461    //     c -- set.span() and set.complement().span() boundaries may differ.
3462    //          Cause: Set strings are not complemented.
3463    //     b -- span() and spanBack() boundaries may differ.
3464    //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3465    //          and spanBack(USET_SPAN_SIMPLE) are defined to
3466    //          match with non-overlapping substrings.
3467    //          For example, with a set containing "ab" and "ba",
3468    //          span() of "aba" yields boundaries { 0, 2, 3 }
3469    //          because the initial "ab" matches from 0 to 2,
3470    //          while spanBack() yields boundaries { 0, 1, 3 }
3471    //          because the final "ba" matches from 1 to 3.
3472    //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3473    //          Cause: Strings in the set overlap, and a longer match may
3474    //          require a sequence including non-longest substrings.
3475    //          For example, with a set containing "ab", "abc" and "cd",
3476    //          span(contained) of "abcd" spans the entire string
3477    //          but span(longest match) only spans the first 3 characters.
3478    //   Each "-options" first resets all options and then applies the specified options.
3479    //   A "-" without options resets the options.
3480    //   The options are also reset for each new set.
3481    // Other strings will be spanned.
3482    static const char *const testdata[]={
3483        "[:ID_Continue:]",
3484        "*",
3485        "[:White_Space:]",
3486        "*",
3487        "[]",
3488        "*",
3489        "[\\u0000-\\U0010FFFF]",
3490        "*",
3491        "[\\u0000\\u0080\\u0800\\U00010000]",
3492        "*",
3493        "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3494        "*",
3495        "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3496        "-c",
3497        "*",
3498        "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3499        "-c",
3500        "*",
3501
3502        // Overlapping strings cause overlapping attempts to match.
3503        "[x{xy}{xya}{axy}{ax}]",
3504        "-cl",
3505
3506        // More repetitions of "xya" would take too long with the recursive
3507        // reference implementation.
3508        // containsAll()=FALSE
3509        // test_string 0x14
3510        "xx"
3511        "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3512        "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3513        "xyaxyaxyaxya"
3514        "xx"
3515        "xyaxyaxyaxya"  // span() ends here.
3516        "aaa",
3517
3518        // containsAll()=TRUE
3519        // test_string 0x15
3520        "xx"
3521        "xyaxyaxyaxya"
3522        "xx"
3523        "xyaxyaxyaxya"
3524        "xx"
3525        "xyaxyaxyaxy",
3526
3527        "-bc",
3528        // test_string 0x17
3529        "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3530        "-c",
3531        "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3532        "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3533        "-",
3534        "byaya",     // span() -> { 5 }
3535        "byay",      // span() -> { 4 }
3536        "bya",       // span() -> { 3 }
3537
3538        // span(longest match) will not span the whole string.
3539        "[a{ab}{bc}]",
3540        "-cl",
3541        // test_string 0x21
3542        "abc",
3543
3544        "[a{ab}{abc}{cd}]",
3545        "-cl",
3546        "acdabcdabccd",
3547
3548        // spanBack(longest match) will not span the whole string.
3549        "[c{ab}{bc}]",
3550        "-cl",
3551        "abc",
3552
3553        "[d{cd}{bcd}{ab}]",
3554        "-cl",
3555        "abbcdabcdabd",
3556
3557        // Test with non-ASCII set strings - test proper handling of surrogate pairs
3558        // and UTF-8 trail bytes.
3559        // Copies of above test sets and strings, but transliterated to have
3560        // different code points with similar trail units.
3561        // Previous: a      b         c            d
3562        // Unicode:  042B   30AB      200AB        204AB
3563        // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3564        // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3565        "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3566        "-cl",
3567        "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3568
3569        "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3570        "-cl",
3571        "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3572
3573        // Stress bookkeeping and recursion.
3574        // The following strings are barely doable with the recursive
3575        // reference implementation.
3576        // The not-contained character at the end prevents an early exit from the span().
3577        "[b{bb}]",
3578        "-c",
3579        // test_string 0x33
3580        "bbbbbbbbbbbbbbbbbbbbbbbb-",
3581        // On complement sets, span() and spanBack() get different results
3582        // because b is not in the complement set and there is an odd number of b's
3583        // in the test string.
3584        "-bc",
3585        "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3586
3587        // Test with set strings with an initial or final code point span
3588        // longer than 254.
3589        "[a{" _64_a _64_a _64_a _64_a "b}"
3590          "{a" _64_b _64_b _64_b _64_b "}]",
3591        "-c",
3592        _64_a _64_a _64_a _63_a "b",
3593        _64_a _64_a _64_a _64_a "b",
3594        _64_a _64_a _64_a _64_a "aaaabbbb",
3595        "a" _64_b _64_b _64_b _63_b,
3596        "a" _64_b _64_b _64_b _64_b,
3597        "aaaabbbb" _64_b _64_b _64_b _64_b,
3598
3599        // Test with strings containing unpaired surrogates.
3600        // They are not representable in UTF-8, and a leading trail surrogate
3601        // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3602        // U+20001 == \\uD840\\uDC01
3603        // U+20400 == \\uD841\\uDC00
3604        "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3605        "-8cl",
3606        "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3607    };
3608    uint32_t whichSpans[96]={ SPAN_ALL };
3609    int32_t whichSpansCount=1;
3610
3611    UnicodeSet *sets[SET_COUNT]={ NULL };
3612    const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3613
3614    char testName[1024];
3615    char *testNameLimit=testName;
3616
3617    int32_t i, j;
3618    for(i=0; i<LENGTHOF(testdata); ++i) {
3619        const char *s=testdata[i];
3620        if(s[0]=='[') {
3621            // Create new test sets from this pattern.
3622            for(j=0; j<SET_COUNT; ++j) {
3623                delete sets_with_str[j];
3624                delete sets[j];
3625            }
3626            UErrorCode errorCode=U_ZERO_ERROR;
3627            sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3628            if(U_FAILURE(errorCode)) {
3629                dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3630                break;
3631            }
3632            sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3633            sets[SLOW_NOT]->complement();
3634            // Intermediate set: Test cloning of a frozen set.
3635            UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3636            fast->freeze();
3637            sets[FAST]=(UnicodeSet *)fast->clone();
3638            delete fast;
3639            UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3640            fastNot->freeze();
3641            sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3642            delete fastNot;
3643
3644            for(j=0; j<SET_COUNT; ++j) {
3645                sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3646            }
3647
3648            strcpy(testName, s);
3649            testNameLimit=strchr(testName, 0);
3650            *testNameLimit++=':';
3651            *testNameLimit=0;
3652
3653            whichSpans[0]=SPAN_ALL;
3654            whichSpansCount=1;
3655        } else if(s[0]=='-') {
3656            whichSpans[0]=SPAN_ALL;
3657            whichSpansCount=1;
3658
3659            while(*++s!=0) {
3660                switch(*s) {
3661                case 'c':
3662                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3663                                                   ~SPAN_POLARITY,
3664                                                   SPAN_SET,
3665                                                   SPAN_COMPLEMENT,
3666                                                   0);
3667                    break;
3668                case 'b':
3669                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3670                                                   ~SPAN_DIRS,
3671                                                   SPAN_FWD,
3672                                                   SPAN_BACK,
3673                                                   0);
3674                    break;
3675                case 'l':
3676                    // test USET_SPAN_CONTAINED FWD & BACK, and separately
3677                    // USET_SPAN_SIMPLE only FWD, and separately
3678                    // USET_SPAN_SIMPLE only BACK
3679                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3680                                                   ~(SPAN_DIRS|SPAN_CONDITION),
3681                                                   SPAN_DIRS|SPAN_CONTAINED,
3682                                                   SPAN_FWD|SPAN_SIMPLE,
3683                                                   SPAN_BACK|SPAN_SIMPLE);
3684                    break;
3685                case '8':
3686                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3687                                                   ~SPAN_UTFS,
3688                                                   SPAN_UTF16,
3689                                                   SPAN_UTF8,
3690                                                   0);
3691                    break;
3692                default:
3693                    errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3694                    break;
3695                }
3696            }
3697        } else if(0==strcmp(s, "*")) {
3698            strcpy(testNameLimit, "bad_string");
3699            for(j=0; j<whichSpansCount; ++j) {
3700                if(whichSpansCount>1) {
3701                    sprintf(testNameLimit+10 /* strlen("bad_string") */,
3702                            "%%0x%3x",
3703                            whichSpans[j]);
3704                }
3705                testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3706                testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3707            }
3708
3709            strcpy(testNameLimit, "contents");
3710            for(j=0; j<whichSpansCount; ++j) {
3711                if(whichSpansCount>1) {
3712                    sprintf(testNameLimit+8 /* strlen("contents") */,
3713                            "%%0x%3x",
3714                            whichSpans[j]);
3715                }
3716                testSpanContents(sets_with_str, whichSpans[j], testName);
3717            }
3718        } else {
3719            UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3720            strcpy(testNameLimit, "test_string");
3721            for(j=0; j<whichSpansCount; ++j) {
3722                if(whichSpansCount>1) {
3723                    sprintf(testNameLimit+11 /* strlen("test_string") */,
3724                            "%%0x%3x",
3725                            whichSpans[j]);
3726                }
3727                testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3728            }
3729        }
3730    }
3731    for(j=0; j<SET_COUNT; ++j) {
3732        delete sets_with_str[j];
3733        delete sets[j];
3734    }
3735}
3736
3737// Test select patterns and strings, and test USET_SPAN_SIMPLE.
3738void UnicodeSetTest::TestStringSpan() {
3739    static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3740    static const char *const string=
3741        "xx"
3742        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3743        "xx"
3744        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3745        "xx"
3746        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3747        "aaaa";
3748
3749    UErrorCode errorCode=U_ZERO_ERROR;
3750    UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3751    UnicodeSet set(pattern16, errorCode);
3752    if(U_FAILURE(errorCode)) {
3753        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3754        return;
3755    }
3756
3757    UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3758
3759    if(set.containsAll(string16)) {
3760        errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3761    }
3762
3763    // Remove trailing "aaaa".
3764    string16.truncate(string16.length()-4);
3765    if(!set.containsAll(string16)) {
3766        errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3767    }
3768
3769    string16=UNICODE_STRING_SIMPLE("byayaxya");
3770    const UChar *s16=string16.getBuffer();
3771    int32_t length16=string16.length();
3772    (void)length16;   // Suppress set but not used warning.
3773    if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3774        set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3775        set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3776        set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3777        set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3778        set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3779    ) {
3780        errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3781    }
3782
3783    pattern="[a{ab}{abc}{cd}]";
3784    pattern16=UnicodeString(pattern, -1, US_INV);
3785    set.applyPattern(pattern16, errorCode);
3786    if(U_FAILURE(errorCode)) {
3787        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3788        return;
3789    }
3790    string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3791    s16=string16.getBuffer();
3792    length16=string16.length();
3793    if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3794        set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3795        set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3796    ) {
3797        errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3798    }
3799
3800    pattern="[d{cd}{bcd}{ab}]";
3801    pattern16=UnicodeString(pattern, -1, US_INV);
3802    set.applyPattern(pattern16, errorCode).freeze();
3803    if(U_FAILURE(errorCode)) {
3804        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3805        return;
3806    }
3807    string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3808    s16=string16.getBuffer();
3809    length16=string16.length();
3810    if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3811        set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3812        set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3813    ) {
3814        errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3815    }
3816}
3817