1/*
2********************************************************************************
3*   Copyright (C) 1999-2014 International Business Machines Corporation and
4*   others. All Rights Reserved.
5********************************************************************************
6*   Date        Name        Description
7*   10/20/99    alan        Creation.
8*   03/22/2000  Madhu       Added additional tests
9********************************************************************************
10*/
11
12#include <stdio.h>
13
14#include <string.h>
15#include "unicode/utypes.h"
16#include "usettest.h"
17#include "unicode/ucnv.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/usetiter.h"
21#include "unicode/ustring.h"
22#include "unicode/parsepos.h"
23#include "unicode/symtable.h"
24#include "unicode/uversion.h"
25#include "hash.h"
26
// When `status` holds a failure code, calls dataerrln() with the current
// file name, line number and the ICU name of the error; otherwise does nothing.
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
}
30
// When `expr` evaluates to false, calls dataerrln() with the current file
// name and line number; otherwise does nothing.
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
33
34UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
35    UnicodeString pat;
36    set.toPattern(pat);
37    return left + UnicodeSetTest::escape(pat);
38}
39
// Expands to one `case` of a test-dispatch switch: stores the stringized test
// name in `name` and, when `exec` is set, logs a header and calls the test.
// Relies on `name` and `exec` being in scope at the expansion site.
#define CASE(id,test)               \
    case id:                        \
        name = #test;               \
        if (exec) {                 \
            logln(#test "---");     \
            logln();                \
            test();                 \
        }                           \
        break
48
49UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
50}
51
52UConverter *UnicodeSetTest::openUTF8Converter() {
53    if(utf8Cnv==NULL) {
54        UErrorCode errorCode=U_ZERO_ERROR;
55        utf8Cnv=ucnv_open("UTF-8", &errorCode);
56    }
57    return utf8Cnv;
58}
59
60UnicodeSetTest::~UnicodeSetTest() {
61    ucnv_close(utf8Cnv);
62}
63
/**
 * Dispatches to the test method registered under `index`.
 *
 * Each CASE(...) line expands to a `case` label that stores the test's name
 * in `name` and, only when `exec` is true, logs a header and runs the test.
 * For an index with no registered test, `name` is set to the empty string.
 *
 * @param index zero-based test index
 * @param exec  whether to actually run the selected test
 * @param name  receives the name of the selected test ("" if none)
 */
void
UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
                               const char* &name, char* /*par*/) {
    // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
    switch (index) {
        CASE(0,TestPatterns);
        CASE(1,TestAddRemove);
        CASE(2,TestCategories);
        CASE(3,TestCloneEqualHash);
        CASE(4,TestMinimalRep);
        CASE(5,TestAPI);
        CASE(6,TestScriptSet);
        CASE(7,TestPropertySet);
        CASE(8,TestClone);
        CASE(9,TestExhaustive);
        CASE(10,TestToPattern);
        CASE(11,TestIndexOf);
        CASE(12,TestStrings);
        CASE(13,Testj2268);
        CASE(14,TestCloseOver);
        CASE(15,TestEscapePattern);
        CASE(16,TestInvalidCodePoint);
        CASE(17,TestSymbolTable);
        CASE(18,TestSurrogate);
        CASE(19,TestPosixClasses);
        CASE(20,TestIteration);
        CASE(21,TestFreezable);
        CASE(22,TestSpan);
        CASE(23,TestStringSpan);
        // Unknown index: report "no test" through an empty name.
        default: name = ""; break;
    }
}
96
// Sentinel placed inside the expected-string arrays passed to
// expectToPattern(). NOTE(review): presumably it separates strings the set
// must contain from strings it must not contain - confirm against
// expectToPattern(), which is not visible here.
static const char NOT[] = "%%%%";
98
99/**
100 * UVector was improperly copying contents
101 * This code will crash this is still true
102 */
103void UnicodeSetTest::Testj2268() {
104  UnicodeSet t;
105  t.add(UnicodeString("abc"));
106  UnicodeSet test(t);
107  UnicodeString ustrPat;
108  test.toPattern(ustrPat, TRUE);
109}
110
111/**
112 * Test toPattern().
113 */
114void UnicodeSetTest::TestToPattern() {
115    UErrorCode ec = U_ZERO_ERROR;
116
117    // Test that toPattern() round trips with syntax characters and
118    // whitespace.
119    {
120        static const char* OTHER_TOPATTERN_TESTS[] = {
121            "[[:latin:]&[:greek:]]",
122            "[[:latin:]-[:greek:]]",
123            "[:nonspacing mark:]",
124            NULL
125        };
126
127        for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
128            ec = U_ZERO_ERROR;
129            UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
130            if (U_FAILURE(ec)) {
131                dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
132                continue;
133            }
134            checkPat(OTHER_TOPATTERN_TESTS[j], s);
135        }
136
137        for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
138            if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
139
140                // check various combinations to make sure they all work.
141                if (i != 0 && !toPatternAux(i, i)){
142                    continue;
143                }
144                if (!toPatternAux(0, i)){
145                    continue;
146                }
147                if (!toPatternAux(i, 0xFFFF)){
148                    continue;
149                }
150            }
151        }
152    }
153
154    // Test pattern behavior of multicharacter strings.
155    {
156        ec = U_ZERO_ERROR;
157        UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
158
159        // This loop isn't a loop.  It's here to make the compiler happy.
160        // If you're curious, try removing it and changing the 'break'
161        // statements (except for the last) to goto's.
162        for (;;) {
163            if (U_FAILURE(ec)) break;
164            const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
165            expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
166
167            s->add("ac");
168            const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
169            expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
170
171            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
172            if (U_FAILURE(ec)) break;
173            const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
174            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
175
176            s->add("[]");
177            const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
178            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
179
180            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
181            if (U_FAILURE(ec)) break;
182            const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
183            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
184
185            // j2189
186            s->clear();
187            s->add(UnicodeString("abc", ""));
188            s->add(UnicodeString("abc", ""));
189            const char* exp6[] = {"abc", NOT, "ab", NULL};
190            expectToPattern(*s, "[{abc}]", exp6);
191
192            break;
193        }
194
195        if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
196        delete s;
197    }
198
199    // JB#3400: For 2 character ranges prefer [ab] to [a-b]
200    UnicodeSet s;
201    s.add((UChar)97, (UChar)98); // 'a', 'b'
202    expectToPattern(s, "[ab]", NULL);
203}
204
205UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
206
207    // use Integer.toString because Utility.hex doesn't handle ints
208    UnicodeString pat = "";
209    // TODO do these in hex
210    //String source = "0x" + Integer.toString(start,16).toUpperCase();
211    //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
212    UnicodeString source;
213    source = source + (uint32_t)start;
214    if (start != end)
215        source = source + ".." + (uint32_t)end;
216    UnicodeSet testSet;
217    testSet.add(start, end);
218    return checkPat(source, testSet);
219}
220
221UBool UnicodeSetTest::checkPat(const UnicodeString& source,
222                               const UnicodeSet& testSet) {
223    // What we want to make sure of is that a pattern generated
224    // by toPattern(), with or without escaped unprintables, can
225    // be passed back into the UnicodeSet constructor.
226    UnicodeString pat0;
227
228    testSet.toPattern(pat0, TRUE);
229
230    if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
231
232    //String pat1 = unescapeLeniently(pat0);
233    //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
234
235    UnicodeString pat2;
236    testSet.toPattern(pat2, FALSE);
237    if (!checkPat(source, testSet, pat2)) return FALSE;
238
239    //String pat3 = unescapeLeniently(pat2);
240    // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
241
242    //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
243    logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
244    return TRUE;
245}
246
247UBool UnicodeSetTest::checkPat(const UnicodeString& source,
248                               const UnicodeSet& testSet,
249                               const UnicodeString& pat) {
250    UErrorCode ec = U_ZERO_ERROR;
251    UnicodeSet testSet2(pat, ec);
252    if (testSet2 != testSet) {
253        errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
254        return FALSE;
255    }
256    return TRUE;
257}
258
259void
260UnicodeSetTest::TestPatterns(void) {
261    UnicodeSet set;
262    expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
263    expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
264    expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
265    expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
266    expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
267    expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
268
269    // Throw in a test of complement
270    set.complement();
271    UnicodeString exp;
272    exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
273    expectPairs(set, exp);
274}
275
276void
277UnicodeSetTest::TestCategories(void) {
278    UErrorCode status = U_ZERO_ERROR;
279    const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
280    UnicodeSet set(pat, status);
281    if (U_FAILURE(status)) {
282        dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
283        return;
284    } else {
285        expectContainment(set, pat, "ABC", "abc");
286    }
287
288    UChar32 i;
289    int32_t failures = 0;
290    // Make sure generation of L doesn't pollute cached Lu set
291    // First generate L, then Lu
292    set.applyPattern("[:L:]", status);
293    if (U_FAILURE(status)) { errln("FAIL"); return; }
294    for (i=0; i<0x200; ++i) {
295        UBool l = u_isalpha((UChar)i);
296        if (l != set.contains(i)) {
297            errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
298                  set.contains(i));
299            if (++failures == 10) break;
300        }
301    }
302
303    set.applyPattern("[:Lu:]", status);
304    if (U_FAILURE(status)) { errln("FAIL"); return; }
305    for (i=0; i<0x200; ++i) {
306        UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
307        if (lu != set.contains(i)) {
308            errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
309                  set.contains(i));
310            if (++failures == 20) break;
311        }
312    }
313}
314void
315UnicodeSetTest::TestCloneEqualHash(void) {
316    UErrorCode status = U_ZERO_ERROR;
317    // set1 and set2 used to be built with the obsolete constructor taking
318    // UCharCategory values; replaced with pattern constructors
319    // markus 20030502
320    UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
321    UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
322    if (U_FAILURE(status)){
323        dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
324        return;
325    }
326    UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
327    UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
328    if (U_FAILURE(status)){
329        errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
330        return;
331    }
332
333    if (*set1 != *set1a) {
334        errln("FAIL: category constructor for Ll broken");
335    }
336    if (*set2 != *set2a) {
337        errln("FAIL: category constructor for Nd broken");
338    }
339    delete set1a;
340    delete set2a;
341
342    logln("Testing copy construction");
343    UnicodeSet *set1copy=new UnicodeSet(*set1);
344    if(*set1 != *set1copy || *set1 == *set2 ||
345        getPairs(*set1) != getPairs(*set1copy) ||
346        set1->hashCode() != set1copy->hashCode()){
347        errln("FAIL : Error in copy construction");
348        return;
349    }
350
351    logln("Testing =operator");
352    UnicodeSet set1equal=*set1;
353    UnicodeSet set2equal=*set2;
354    if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
355        set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
356        errln("FAIL: Error in =operator");
357    }
358
359    logln("Testing clone()");
360    UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
361    UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
362    if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
363        *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
364        *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
365        errln("FAIL: Error in clone");
366    }
367
368    logln("Testing hashcode");
369    if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
370        set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
371        set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
372        set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
373        set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
374        errln("FAIL: Error in hashCode()");
375    }
376
377    delete set1;
378    delete set1copy;
379    delete set2;
380    delete set1clone;
381    delete set2clone;
382
383
384}
385void
386UnicodeSetTest::TestAddRemove(void) {
387    UnicodeSet set; // Construct empty set
388    doAssert(set.isEmpty() == TRUE, "set should be empty");
389    doAssert(set.size() == 0, "size should be 0");
390    set.complement();
391    doAssert(set.size() == 0x110000, "size should be 0x110000");
392    set.clear();
393    set.add(0x0061, 0x007a);
394    expectPairs(set, "az");
395    doAssert(set.isEmpty() == FALSE, "set should not be empty");
396    doAssert(set.size() != 0, "size should not be equal to 0");
397    doAssert(set.size() == 26, "size should be equal to 26");
398    set.remove(0x006d, 0x0070);
399    expectPairs(set, "alqz");
400    doAssert(set.size() == 22, "size should be equal to 22");
401    set.remove(0x0065, 0x0067);
402    expectPairs(set, "adhlqz");
403    doAssert(set.size() == 19, "size should be equal to 19");
404    set.remove(0x0064, 0x0069);
405    expectPairs(set, "acjlqz");
406    doAssert(set.size() == 16, "size should be equal to 16");
407    set.remove(0x0063, 0x0072);
408    expectPairs(set, "absz");
409    doAssert(set.size() == 10, "size should be equal to 10");
410    set.add(0x0066, 0x0071);
411    expectPairs(set, "abfqsz");
412    doAssert(set.size() == 22, "size should be equal to 22");
413    set.remove(0x0061, 0x0067);
414    expectPairs(set, "hqsz");
415    set.remove(0x0061, 0x007a);
416    expectPairs(set, "");
417    doAssert(set.isEmpty() == TRUE, "set should be empty");
418    doAssert(set.size() == 0, "size should be 0");
419    set.add(0x0061);
420    doAssert(set.isEmpty() == FALSE, "set should not be empty");
421    doAssert(set.size() == 1, "size should not be equal to 1");
422    set.add(0x0062);
423    set.add(0x0063);
424    expectPairs(set, "ac");
425    doAssert(set.size() == 3, "size should not be equal to 3");
426    set.add(0x0070);
427    set.add(0x0071);
428    expectPairs(set, "acpq");
429    doAssert(set.size() == 5, "size should not be equal to 5");
430    set.clear();
431    expectPairs(set, "");
432    doAssert(set.isEmpty() == TRUE, "set should be empty");
433    doAssert(set.size() == 0, "size should be 0");
434
435    // Try removing an entire set from another set
436    expectPattern(set, "[c-x]", "cx");
437    UnicodeSet set2;
438    expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
439    set.removeAll(set2);
440    expectPairs(set, "deluxx");
441
442    // Try adding an entire set to another set
443    expectPattern(set, "[jackiemclean]", "aacceein");
444    expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
445    set.addAll(set2);
446    expectPairs(set, "aacehort");
447    doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
448
449    // Try retaining an set of elements contained in another set (intersection)
450    UnicodeSet set3;
451    expectPattern(set3, "[a-c]", "ac");
452    doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
453    set3.remove(0x0062);
454    expectPairs(set3, "aacc");
455    doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
456    set.retainAll(set3);
457    expectPairs(set, "aacc");
458    doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
459    doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
460    set.clear();
461    doAssert(set.size() != set3.size(), "set.size() != set3.size()");
462
463    // Test commutativity
464    expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
465    expectPattern(set2, "[jackiemclean]", "aacceein");
466    set.addAll(set2);
467    expectPairs(set, "aacehort");
468    doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
469
470
471
472
473}
474
475/**
476 * Make sure minimal representation is maintained.
477 */
478void UnicodeSetTest::TestMinimalRep() {
479    UErrorCode status = U_ZERO_ERROR;
480    // This is pretty thoroughly tested by checkCanonicalRep()
481    // run against the exhaustive operation results.  Use the code
482    // here for debugging specific spot problems.
483
484    // 1 overlap against 2
485    UnicodeSet set("[h-km-q]", status);
486    if (U_FAILURE(status)) { errln("FAIL"); return; }
487    UnicodeSet set2("[i-o]", status);
488    if (U_FAILURE(status)) { errln("FAIL"); return; }
489    set.addAll(set2);
490    expectPairs(set, "hq");
491    // right
492    set.applyPattern("[a-m]", status);
493    if (U_FAILURE(status)) { errln("FAIL"); return; }
494    set2.applyPattern("[e-o]", status);
495    if (U_FAILURE(status)) { errln("FAIL"); return; }
496    set.addAll(set2);
497    expectPairs(set, "ao");
498    // left
499    set.applyPattern("[e-o]", status);
500    if (U_FAILURE(status)) { errln("FAIL"); return; }
501    set2.applyPattern("[a-m]", status);
502    if (U_FAILURE(status)) { errln("FAIL"); return; }
503    set.addAll(set2);
504    expectPairs(set, "ao");
505    // 1 overlap against 3
506    set.applyPattern("[a-eg-mo-w]", status);
507    if (U_FAILURE(status)) { errln("FAIL"); return; }
508    set2.applyPattern("[d-q]", status);
509    if (U_FAILURE(status)) { errln("FAIL"); return; }
510    set.addAll(set2);
511    expectPairs(set, "aw");
512}
513
/**
 * Broad API coverage test for UnicodeSet.
 *
 * Runs as one long sequence in which later checks depend on the state left
 * behind by earlier mutations of `set`, `c` and `exp`. Covered in order:
 * default construction, clear()/isEmpty(), size(), contains(first,last),
 * set algebra ported from an InversionList test, the string-taking
 * complement/complementAll/removeAll/retainAll overloads, containsNone/
 * containsSome, retain(), serialize(), USet conversions, and the
 * span()/spanBack() overloads taking a UnicodeString.
 */
void UnicodeSetTest::TestAPI() {
    UErrorCode status = U_ZERO_ERROR;
    // default ct
    UnicodeSet set;
    if (!set.isEmpty() || set.getRangeCount() != 0) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // clear(), isEmpty()
    set.add(0x0061);
    if (set.isEmpty()) {
        errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
              set);
    }
    set.clear();
    if (!set.isEmpty()) {
        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
              set);
    }

    // size()
    set.clear();
    if (set.size() != 0) {
        errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
              ": " + set);
    }
    set.add(0x0061);
    if (set.size() != 1) {
        errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
              ": " + set);
    }
    // 'a' plus the nine digits '1'..'9' gives 10 elements.
    set.add(0x0031, 0x0039);
    if (set.size() != 10) {
        errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
              ": " + set);
    }

    // contains(first, last)
    set.clear();
    set.applyPattern("[A-Y 1-8 b-d l-y]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    // For every range of the set: the exact range must be contained, and
    // the range widened by one code point on either side must not be.
    for (int32_t i = 0; i<set.getRangeCount(); ++i) {
        UChar32 a = set.getRangeStart(i);
        UChar32 b = set.getRangeEnd(i);
        if (!set.contains(a, b)) {
            errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
                  " but doesn't: " + set);
        }
        if (set.contains((UChar32)(a-1), b)) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)(a-1) + '-' + (unsigned short)b +
                  " but does: " + set);
        }
        if (set.contains(a, (UChar32)(b+1))) {
            errln((UnicodeString)"FAIL, shouldn't contain " +
                  (unsigned short)a + '-' + (unsigned short)(b+1) +
                  " but does: " + set);
        }
    }

    // Ported InversionList test.
    UnicodeSet a((UChar32)3,(UChar32)10);
    UnicodeSet b((UChar32)7,(UChar32)15);
    UnicodeSet c;

    logln((UnicodeString)"a [3-10]: " + a);
    logln((UnicodeString)"b [7-15]: " + b);
    // Union: [3-10] + [7-15] == [3-15]
    c = a;
    c.addAll(b);
    UnicodeSet exp((UChar32)3,(UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).add(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
    }
    // Complement of [3-15] is [0-2] plus [16-MAX_VALUE]
    c.complement();
    exp.set((UChar32)0, (UChar32)2);
    exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Complementing again must restore [3-15]
    c.complement();
    exp.set((UChar32)3, (UChar32)15);
    if (c == exp) {
        logln((UnicodeString)"c.complement(): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
    }
    // Symmetric difference: [3-10] xor [7-15] == [3-6] + [11-15]
    c = a;
    c.complementAll(b);
    exp.set((UChar32)3,(UChar32)6);
    exp.add((UChar32)11,(UChar32) 15);
    if (c == exp) {
        logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
    } else {
        errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
    }

    // Round trip through the setToBits()/bitsToSet() helpers must leave c unchanged.
    exp = c;
    bitsToSet(setToBits(c), c);
    if (c == exp) {
        logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
    } else {
        errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
    }

    // Additional tests for coverage JB#2118
    //UnicodeSet::complement(class UnicodeString const &)
    //UnicodeSet::complementAll(class UnicodeString const &)
    //UnicodeSet::containsNone(class UnicodeSet const &)
    //UnicodeSet::containsNone(long,long)
    //UnicodeSet::containsSome(class UnicodeSet const &)
    //UnicodeSet::containsSome(long,long)
    //UnicodeSet::removeAll(class UnicodeString const &)
    //UnicodeSet::retain(long)
    //UnicodeSet::retainAll(class UnicodeString const &)
    //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
    //UnicodeSetIterator::getString(void)
    // complement(string) on an empty set adds the string "ab" as an element.
    set.clear();
    set.complement("ab");
    exp.applyPattern("[{ab}]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }

    // The only element of the set is the string "ab".
    UnicodeSetIterator iset(set);
    if (!iset.next() || !iset.isString()) {
        errln("FAIL: UnicodeSetIterator::next/isString");
    } else if (iset.getString() != "ab") {
        errln("FAIL: UnicodeSetIterator::getString");
    }

    // complementAll("alan") toggles the characters a, l and n out of [a-z].
    set.add((UChar32)0x61, (UChar32)0x7A);
    set.complementAll("alan");
    exp.applyPattern("[{ab}b-kmo-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }

    // containsNone()/containsSome() against overlapping and disjoint sets
    exp.applyPattern("[a-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
    exp.applyPattern("[aln]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
    if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }

    // Range overloads: a-z overlaps the set, A-Z does not.
    if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }
    if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsNone(UChar32, UChar32)");
    }
    if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
        errln("FAIL: containsSome(UChar32, UChar32)");
    }

    set.removeAll("liu");
    exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }

    set.retainAll("star");
    exp.applyPattern("[rst]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }

    set.retain((UChar32)0x73);
    exp.applyPattern("[s]", status);
    if (U_FAILURE(status)) { errln("FAIL"); return; }
    if (set != exp) { errln("FAIL: retain('s')"); return; }

    // serialize() of the set [s]: expect 3 units, namely 2, 0x73, 0x74.
    uint16_t buf[32];
    int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
    if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
    if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
        errln("FAIL: serialize");
        return;
    }

    // Conversions to and from USet
    // Each conversion is expected to yield the same address it was given.
    UnicodeSet *uniset = &set;
    USet *uset = uniset->toUSet();
    TEST_ASSERT((void *)uset == (void *)uniset);
    UnicodeSet *setx = UnicodeSet::fromUSet(uset);
    TEST_ASSERT((void *)setx == (void *)uset);
    const UnicodeSet *constSet = uniset;
    const USet *constUSet = constSet->toUSet();
    TEST_ASSERT((void *)constUSet == (void *)constSet);
    const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
    TEST_ASSERT((void *)constSetx == (void *)constUSet);

    // span(UnicodeString) and spanBack(UnicodeString) convenience methods
    // longString is 10 'a's, 10 'b's, 10 'c's; the frozen set ac is {a, c}.
    // Start/limit indexes outside 0..30 are included in the checks below.
    UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
    UnicodeSet ac(0x61, 0x63);
    ac.remove(0x62).freeze();
    if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
        ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
        ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
    ) {
        errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
    }
    if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
        ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
        ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
        ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
    ) {
        errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
    }
}
742
743void UnicodeSetTest::TestIteration() {
744    UErrorCode ec = U_ZERO_ERROR;
745    int i = 0;
746    int outerLoop;
747
748    // 6 code points, 3 ranges, 2 strings, 8 total elements
749    //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
750    UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
751    TEST_ASSERT_SUCCESS(ec);
752    UnicodeSetIterator it(set);
753
754    for (outerLoop=0; outerLoop<3; outerLoop++) {
755        // Run the test multiple times, to check that iterator.reset() is working.
756        for (i=0; i<10; i++) {
757            UBool         nextv        = it.next();
758            UBool         isString     = it.isString();
759            int32_t       codePoint    = it.getCodepoint();
760            //int32_t       codePointEnd = it.getCodepointEnd();
761            UnicodeString s   = it.getString();
762            switch (i) {
763            case 0:
764                TEST_ASSERT(nextv == TRUE);
765                TEST_ASSERT(isString == FALSE);
766                TEST_ASSERT(codePoint==0x61);
767                TEST_ASSERT(s == "a");
768                break;
769            case 1:
770                TEST_ASSERT(nextv == TRUE);
771                TEST_ASSERT(isString == FALSE);
772                TEST_ASSERT(codePoint==0x62);
773                TEST_ASSERT(s == "b");
774                break;
775            case 2:
776                TEST_ASSERT(nextv == TRUE);
777                TEST_ASSERT(isString == FALSE);
778                TEST_ASSERT(codePoint==0x63);
779                TEST_ASSERT(s == "c");
780                break;
781            case 3:
782                TEST_ASSERT(nextv == TRUE);
783                TEST_ASSERT(isString == FALSE);
784                TEST_ASSERT(codePoint==0x79);
785                TEST_ASSERT(s == "y");
786                break;
787            case 4:
788                TEST_ASSERT(nextv == TRUE);
789                TEST_ASSERT(isString == FALSE);
790                TEST_ASSERT(codePoint==0x7a);
791                TEST_ASSERT(s == "z");
792                break;
793            case 5:
794                TEST_ASSERT(nextv == TRUE);
795                TEST_ASSERT(isString == FALSE);
796                TEST_ASSERT(codePoint==0x1abcd);
797                TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
798                break;
799            case 6:
800                TEST_ASSERT(nextv == TRUE);
801                TEST_ASSERT(isString == TRUE);
802                TEST_ASSERT(s == "str1");
803                break;
804            case 7:
805                TEST_ASSERT(nextv == TRUE);
806                TEST_ASSERT(isString == TRUE);
807                TEST_ASSERT(s == "str2");
808                break;
809            case 8:
810                TEST_ASSERT(nextv == FALSE);
811                break;
812            case 9:
813                TEST_ASSERT(nextv == FALSE);
814                break;
815            }
816        }
817        it.reset();  // prepare to run the iteration again.
818    }
819}
820
821
822
823
824void UnicodeSetTest::TestStrings() {
825    UErrorCode ec = U_ZERO_ERROR;
826
827    UnicodeSet* testList[] = {
828        UnicodeSet::createFromAll("abc"),
829        new UnicodeSet("[a-c]", ec),
830
831        &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
832        new UnicodeSet("[{ll}{ch}a-z]", ec),
833
834        UnicodeSet::createFrom("ab}c"),
835        new UnicodeSet("[{ab\\}c}]", ec),
836
837        &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
838        new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
839
840        NULL
841    };
842
843    if (U_FAILURE(ec)) {
844        errln("FAIL: couldn't construct test sets");
845    }
846
847    for (int32_t i = 0; testList[i] != NULL; i+=2) {
848        if (U_SUCCESS(ec)) {
849            UnicodeString pat0, pat1;
850            testList[i]->toPattern(pat0, TRUE);
851            testList[i+1]->toPattern(pat1, TRUE);
852            if (*testList[i] == *testList[i+1]) {
853                logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
854            } else {
855                logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
856            }
857        }
858        delete testList[i];
859        delete testList[i+1];
860    }
861}
862
863/**
864 * Test the [:Latin:] syntax.
865 */
866void UnicodeSetTest::TestScriptSet() {
867    expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
868
869    expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
870
871    /* Jitterbug 1423 */
872    expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
873
874}
875
/**
 * Test property patterns (\p{...}, [:...:], \N{...}) and a few related
 * pattern-syntax cases, driven by a data table.
 *
 * DATA is a flat list of triples: the pattern, a string of characters
 * expected to be in the resulting set, and a string of characters expected
 * not to be in it.  The second and third strings are passed through
 * CharsToUnicodeString() before being handed to expectContainment().
 */
void UnicodeSetTest::TestPropertySet() {
    static const char* const DATA[] = {
        // Pattern, Chars IN, Chars NOT in

        "[:Latin:]",
        "aA",
        "\\u0391\\u03B1",

        "[\\p{Greek}]",
        "\\u0391\\u03B1",
        "aA",

        "\\P{ GENERAL Category = upper case letter }",
        "abc",
        "ABC",

#if !UCONFIG_NO_NORMALIZATION
        // Combining class: @since ICU 2.2
        // Check both symbolic and numeric
        "\\p{ccc=Nukta}",
        "\\u0ABC",
        "abc",

        "\\p{Canonical Combining Class = 11}",
        "\\u05B1",
        "\\u05B2",

        "[:c c c = iota subscript :]",
        "\\u0345",
        "xyz",
#endif

        // Bidi class: @since ICU 2.2
        "\\p{bidiclass=lefttoright}",
        "abc",
        "\\u0671\\u0672",

        // Binary properties: @since ICU 2.2
        "\\p{ideographic}",
        "\\u4E0A",
        "x",

        "[:math=false:]",
        "q)*(",
        // weiv: )(and * were removed from math in Unicode 4.0.1
        //"(*+)",
        "+<>^",

        // JB#1767 \N{}, \p{ASCII}
        "[:Ascii:]",
        "abc\\u0000\\u007F",
        "\\u0080\\u4E00",

        "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
        "az",
        "qrs",

        // JB#2015
        "[:any:]",
        "a\\U0010FFFF",
        "",

        "[:nv=0.5:]",
        "\\u00BD\\u0F2A",
        "\\u00BC",

        // JB#2653: Age
        "[:Age=1.1:]",
        "\\u03D6", // 1.1
        "\\u03D8\\u03D9", // 3.2

        "[:Age=3.1:]",
        "\\u1800\\u3400\\U0002f800",
        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",

        // JB#2350: Case_Sensitive
        "[:Case Sensitive:]",
        "A\\u1FFC\\U00010410",
        ";\\u00B4\\U00010500",

        // JB#2832: C99-compatibility props
        "[:blank:]",
        " \\u0009",
        "1-9A-Z",

        "[:graph:]",
        "19AZ",
        " \\u0003\\u0007\\u0009\\u000A\\u000D",

        "[:punct:]",
        "!@#%&*()[]{}-_\\/;:,.?'\"",
        "09azAZ",

        "[:xdigit:]",
        "09afAF",
        "gG!",

        // Regex compatibility test
        "[-b]", // leading '-' is literal
        "-b",
        "ac",

        "[^-b]", // leading '-' is literal
        "ac",
        "-b",

        "[b-]", // trailing '-' is literal
        "-b",
        "ac",

        "[^b-]", // trailing '-' is literal
        "ac",
        "-b",

        "[a-b-]", // trailing '-' is literal
        "ab-",
        "c=",

        "[[a-q]&[p-z]-]", // trailing '-' is literal
        "pq-",
        "or=",

        "[\\s|\\)|:|$|\\>]", // from regex tests
        "s|):$>",
        "abc",

        "[\\uDC00cd]", // JB#2906: isolated trail at start
        "cd\\uDC00",
        "ab\\uD800\\U00010000",

        "[ab\\uD800]", // JB#2906: isolated lead at end
        "ab\\uD800",
        "cd\\uDC00\\U00010000",

        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
        "abcd\\uD800",
        "ef\\uDC00\\U00010000",

        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
        "abcd\\uDC00",
        "ef\\uD800\\U00010000",

#if !UCONFIG_NO_NORMALIZATION
        "[:^lccc=0:]", // Lead canonical class
        "\\u0300\\u0301",
        "abcd\\u00c0\\u00c5",

        "[:^tccc=0:]", // Trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
        "\\u0300\\u0301\\u00c0\\u00c5",
        "abcd",

        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
        "",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",

        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
        "\\u0F73\\u0F75\\u0F81",
        "abcd\\u0300\\u0301\\u00c0\\u00c5",
#endif /* !UCONFIG_NO_NORMALIZATION */

        "[:Assigned:]",
        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
        "\\u0888\\uFDD3\\uFFFE\\U00050005",

        // Script_Extensions, new in Unicode 6.0
        "[:scx=Arab:]",
        "\\u061E\\u061F\\u0620\\u0621\\u063F\\u0640\\u0650\\u065E\\uFDF1\\uFDF2\\uFDF3",
        "\\u061D\\uFDEF\\uFDFE",

        // U+FDF2 has Script=Arabic and also Arab in its Script_Extensions,
        // so scx-sc is missing U+FDF2.
        "[[:Script_Extensions=Arabic:]-[:Arab:]]",
        "\\u0640\\u064B\\u0650\\u0655",
        "\\uFDF2"
    };

    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);

    // Walk the table one triple at a time.  The loop reads DATA[i+1] and
    // DATA[i+2], so the table length must stay a multiple of three.
    for (int32_t i=0; i<DATA_LEN; i+=3) {
        expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
                          CharsToUnicodeString(DATA[i+2]));
    }
}
1066
1067/**
1068  * Test that Posix style character classes [:digit:], etc.
1069  *   have the Unicode definitions from TR 18.
1070  */
1071void UnicodeSetTest::TestPosixClasses() {
1072    {
1073        UErrorCode status = U_ZERO_ERROR;
1074        UnicodeSet s1("[:alpha:]", status);
1075        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1076        TEST_ASSERT_SUCCESS(status);
1077        TEST_ASSERT(s1==s2);
1078    }
1079    {
1080        UErrorCode status = U_ZERO_ERROR;
1081        UnicodeSet s1("[:lower:]", status);
1082        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1083        TEST_ASSERT_SUCCESS(status);
1084        TEST_ASSERT(s1==s2);
1085    }
1086    {
1087        UErrorCode status = U_ZERO_ERROR;
1088        UnicodeSet s1("[:upper:]", status);
1089        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1090        TEST_ASSERT_SUCCESS(status);
1091        TEST_ASSERT(s1==s2);
1092    }
1093    {
1094        UErrorCode status = U_ZERO_ERROR;
1095        UnicodeSet s1("[:punct:]", status);
1096        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1097        TEST_ASSERT_SUCCESS(status);
1098        TEST_ASSERT(s1==s2);
1099    }
1100    {
1101        UErrorCode status = U_ZERO_ERROR;
1102        UnicodeSet s1("[:digit:]", status);
1103        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1104        TEST_ASSERT_SUCCESS(status);
1105        TEST_ASSERT(s1==s2);
1106    }
1107    {
1108        UErrorCode status = U_ZERO_ERROR;
1109        UnicodeSet s1("[:xdigit:]", status);
1110        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1111        TEST_ASSERT_SUCCESS(status);
1112        TEST_ASSERT(s1==s2);
1113    }
1114    {
1115        UErrorCode status = U_ZERO_ERROR;
1116        UnicodeSet s1("[:alnum:]", status);
1117        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1118        TEST_ASSERT_SUCCESS(status);
1119        TEST_ASSERT(s1==s2);
1120    }
1121    {
1122        UErrorCode status = U_ZERO_ERROR;
1123        UnicodeSet s1("[:space:]", status);
1124        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1125        TEST_ASSERT_SUCCESS(status);
1126        TEST_ASSERT(s1==s2);
1127    }
1128    {
1129        UErrorCode status = U_ZERO_ERROR;
1130        UnicodeSet s1("[:blank:]", status);
1131        TEST_ASSERT_SUCCESS(status);
1132        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1133            status);
1134        TEST_ASSERT_SUCCESS(status);
1135        TEST_ASSERT(s1==s2);
1136    }
1137    {
1138        UErrorCode status = U_ZERO_ERROR;
1139        UnicodeSet s1("[:cntrl:]", status);
1140        TEST_ASSERT_SUCCESS(status);
1141        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1142        TEST_ASSERT_SUCCESS(status);
1143        TEST_ASSERT(s1==s2);
1144    }
1145    {
1146        UErrorCode status = U_ZERO_ERROR;
1147        UnicodeSet s1("[:graph:]", status);
1148        TEST_ASSERT_SUCCESS(status);
1149        UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1150        TEST_ASSERT_SUCCESS(status);
1151        TEST_ASSERT(s1==s2);
1152    }
1153    {
1154        UErrorCode status = U_ZERO_ERROR;
1155        UnicodeSet s1("[:print:]", status);
1156        TEST_ASSERT_SUCCESS(status);
1157        UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1158        TEST_ASSERT_SUCCESS(status);
1159        TEST_ASSERT(s1==s2);
1160    }
1161}
1162/**
1163 * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1164 */
1165void UnicodeSetTest::TestClone() {
1166    UErrorCode ec = U_ZERO_ERROR;
1167    UnicodeSet s("[abcxyz]", ec);
1168    UnicodeSet t(s);
1169    expectContainment(t, "abc", "def");
1170}
1171
1172/**
1173 * Test the indexOf() and charAt() methods.
1174 */
1175void UnicodeSetTest::TestIndexOf() {
1176    UErrorCode ec = U_ZERO_ERROR;
1177    UnicodeSet set("[a-cx-y3578]", ec);
1178    if (U_FAILURE(ec)) {
1179        errln("FAIL: UnicodeSet constructor");
1180        return;
1181    }
1182    for (int32_t i=0; i<set.size(); ++i) {
1183        UChar32 c = set.charAt(i);
1184        if (set.indexOf(c) != i) {
1185            errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1186                i, c, set.indexOf(c));
1187        }
1188    }
1189    UChar32 c = set.charAt(set.size());
1190    if (c != -1) {
1191        errln("FAIL: charAt(<out of range>) = %X", c);
1192    }
1193    int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1194    if (j != -1) {
1195        errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1196    }
1197}
1198
1199/**
1200 * Test closure API.
1201 */
1202void UnicodeSetTest::TestCloseOver() {
1203    UErrorCode ec = U_ZERO_ERROR;
1204
1205    char CASE[] = {(char)USET_CASE_INSENSITIVE};
1206    char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1207    const char* DATA[] = {
1208        // selector, input, output
1209        CASE,
1210        "[aq\\u00DF{Bc}{bC}{Fi}]",
1211        "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1212
1213        CASE,
1214        "[\\u01F1]", // 'DZ'
1215        "[\\u01F1\\u01F2\\u01F3]",
1216
1217        CASE,
1218        "[\\u1FB4]",
1219        "[\\u1FB4{\\u03AC\\u03B9}]",
1220
1221        CASE,
1222        "[{F\\uFB01}]",
1223        "[\\uFB03{ffi}]",
1224
1225        CASE, // make sure binary search finds limits
1226        "[a\\uFF3A]",
1227        "[aA\\uFF3A\\uFF5A]",
1228
1229        CASE,
1230        "[a-z]","[A-Za-z\\u017F\\u212A]",
1231        CASE,
1232        "[abc]","[A-Ca-c]",
1233        CASE,
1234        "[ABC]","[A-Ca-c]",
1235
1236        CASE, "[i]", "[iI]",
1237
1238        CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1239        CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1240
1241        CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1242
1243        CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1244
1245        CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1246
1247        CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1248
1249        CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1250
1251        CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1252
1253        CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1254        CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1255
1256        CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1257
1258        CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1259
1260        CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1261
1262#if !UCONFIG_NO_FILE_IO
1263        CASE_MAPPINGS,
1264        "[aq\\u00DF{Bc}{bC}{Fi}]",
1265        "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1266#endif
1267
1268        CASE_MAPPINGS,
1269        "[\\u01F1]", // 'DZ'
1270        "[\\u01F1\\u01F2\\u01F3]",
1271
1272        CASE_MAPPINGS,
1273        "[a-z]",
1274        "[A-Za-z]",
1275
1276        NULL
1277    };
1278
1279    UnicodeSet s;
1280    UnicodeSet t;
1281    UnicodeString buf;
1282    for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1283        int32_t selector = DATA[i][0];
1284        UnicodeString pat(DATA[i+1], -1, US_INV);
1285        UnicodeString exp(DATA[i+2], -1, US_INV);
1286        s.applyPattern(pat, ec);
1287        s.closeOver(selector);
1288        t.applyPattern(exp, ec);
1289        if (U_FAILURE(ec)) {
1290            errln("FAIL: applyPattern failed");
1291            continue;
1292        }
1293        if (s == t) {
1294            logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1295        } else {
1296            dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1297                  s.toPattern(buf, TRUE) + ", expected " + exp);
1298        }
1299    }
1300
1301#if 0
1302    /*
1303     * Unused test code.
1304     * This was used to compare the old implementation (using USET_CASE)
1305     * with the new one (using 0x100 temporarily)
1306     * while transitioning from hardcoded case closure tables in uniset.cpp
1307     * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1308     * and using ucase.c functions for closure.
1309     * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1310     *
1311     * Note: The old and new implementation never fully matched because
1312     * the old implementation turned out to not map U+0130 and U+0131 correctly
1313     * (dotted I and dotless i) and because the old implementation's data tables
1314     * were outdated compared to Unicode 4.0.1 at the time of the change to the
1315     * new implementation. (So sigmas and some other characters were not handled
1316     * according to the newer Unicode version.)
1317     */
1318    UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1319    UnicodeSetIterator si(sens);
1320    UnicodeString str, buf2;
1321    const UnicodeString *pStr;
1322    UChar32 c;
1323    while(si.next()) {
1324        if(!si.isString()) {
1325            c=si.getCodepoint();
1326            s.clear();
1327            s.add(c);
1328
1329            str.setTo(c);
1330            str.foldCase();
1331            sens2.add(str);
1332
1333            t=s;
1334            s.closeOver(USET_CASE);
1335            t.closeOver(0x100);
1336            if(s!=t) {
1337                errln("FAIL: closeOver(U+%04x) differs: ", c);
1338                errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1339            }
1340        }
1341    }
1342    // remove all code points
1343    // should contain all full case folding mapping strings
1344    sens2.remove(0, 0x10ffff);
1345    si.reset(sens2);
1346    while(si.next()) {
1347        if(si.isString()) {
1348            pStr=&si.getString();
1349            s.clear();
1350            s.add(*pStr);
1351            t=s2=s;
1352            s.closeOver(USET_CASE);
1353            t.closeOver(0x100);
1354            if(s!=t) {
1355                errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1356                errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1357            }
1358        }
1359    }
1360#endif
1361
1362    // Test the pattern API
1363    s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1364    if (U_FAILURE(ec)) {
1365        errln("FAIL: applyPattern failed");
1366    } else {
1367        expectContainment(s, "abcABC", "defDEF");
1368    }
1369    UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1370    if (U_FAILURE(ec)) {
1371        errln("FAIL: constructor failed");
1372    } else {
1373        expectContainment(v, "defDEF", "abcABC");
1374    }
1375    UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1376    if (U_FAILURE(ec)) {
1377        errln("FAIL: construct w/case mappings failed");
1378    } else {
1379        expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1380    }
1381}
1382
1383void UnicodeSetTest::TestEscapePattern() {
1384    const char pattern[] =
1385        "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1386    const char exp[] =
1387        "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1388    // We test this with two passes; in the second pass we
1389    // pre-unescape the pattern.  Since U+200E is Pattern_White_Space,
1390    // this fails -- which is what we expect.
1391    for (int32_t pass=1; pass<=2; ++pass) {
1392        UErrorCode ec = U_ZERO_ERROR;
1393        UnicodeString pat(pattern, -1, US_INV);
1394        if (pass==2) {
1395            pat = pat.unescape();
1396        }
1397        // Pattern is only good for pass 1
1398        UBool isPatternValid = (pass==1);
1399
1400        UnicodeSet set(pat, ec);
1401        if (U_SUCCESS(ec) != isPatternValid){
1402            errln((UnicodeString)"FAIL: applyPattern(" +
1403                  escape(pat) + ") => " +
1404                  u_errorName(ec));
1405            continue;
1406        }
1407        if (U_FAILURE(ec)) {
1408            continue;
1409        }
1410        if (set.contains((UChar)0x0644)){
1411            errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1412        }
1413
1414        UnicodeString newpat;
1415        set.toPattern(newpat, TRUE);
1416        if (newpat == UnicodeString(exp, -1, US_INV)) {
1417            logln(escape(pat) + " => " + newpat);
1418        } else {
1419            errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1420        }
1421
1422        for (int32_t i=0; i<set.getRangeCount(); ++i) {
1423            UnicodeString str("Range ");
1424            str.append((UChar)(0x30 + i))
1425                .append(": ")
1426                .append((UChar32)set.getRangeStart(i))
1427                .append(" - ")
1428                .append((UChar32)set.getRangeEnd(i));
1429            str = str + " (" + set.getRangeStart(i) + " - " +
1430                set.getRangeEnd(i) + ")";
1431            if (set.getRangeStart(i) < 0) {
1432                errln((UnicodeString)"FAIL: " + escape(str));
1433            } else {
1434                logln(escape(str));
1435            }
1436        }
1437    }
1438}
1439
1440void UnicodeSetTest::expectRange(const UnicodeString& label,
1441                                 const UnicodeSet& set,
1442                                 UChar32 start, UChar32 end) {
1443    UnicodeSet exp(start, end);
1444    UnicodeString pat;
1445    if (set == exp) {
1446        logln(label + " => " + set.toPattern(pat, TRUE));
1447    } else {
1448        UnicodeString xpat;
1449        errln((UnicodeString)"FAIL: " + label + " => " +
1450              set.toPattern(pat, TRUE) +
1451              ", expected " + exp.toPattern(xpat, TRUE));
1452    }
1453}
1454
1455void UnicodeSetTest::TestInvalidCodePoint() {
1456
1457    const UChar32 DATA[] = {
1458        // Test range             Expected range
1459        0, 0x10FFFF,              0, 0x10FFFF,
1460        (UChar32)-1, 8,           0, 8,
1461        8, 0x110000,              8, 0x10FFFF
1462    };
1463    const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1464
1465    UnicodeString pat;
1466    int32_t i;
1467
1468    for (i=0; i<DATA_LENGTH; i+=4) {
1469        UChar32 start  = DATA[i];
1470        UChar32 end    = DATA[i+1];
1471        UChar32 xstart = DATA[i+2];
1472        UChar32 xend   = DATA[i+3];
1473
1474        // Try various API using the test code points
1475
1476        UnicodeSet set(start, end);
1477        expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1478                    set, xstart, xend);
1479
1480        set.clear();
1481        set.set(start, end);
1482        expectRange((UnicodeString)"set(" + start + "," + end + ")",
1483                    set, xstart, xend);
1484
1485        UBool b = set.contains(start);
1486        b = set.contains(start, end);
1487        b = set.containsNone(start, end);
1488        b = set.containsSome(start, end);
1489        (void)b;   // Suppress set but not used warning.
1490
1491        /*int32_t index = set.indexOf(start);*/
1492
1493        set.clear();
1494        set.add(start);
1495        set.add(start, end);
1496        expectRange((UnicodeString)"add(" + start + "," + end + ")",
1497                    set, xstart, xend);
1498
1499        set.set(0, 0x10FFFF);
1500        set.retain(start, end);
1501        expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1502                    set, xstart, xend);
1503        set.retain(start);
1504
1505        set.set(0, 0x10FFFF);
1506        set.remove(start);
1507        set.remove(start, end);
1508        set.complement();
1509        expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1510                    set, xstart, xend);
1511
1512        set.set(0, 0x10FFFF);
1513        set.complement(start, end);
1514        set.complement();
1515        expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1516                    set, xstart, xend);
1517        set.complement(start);
1518    }
1519
1520    const UChar32 DATA2[] = {
1521        0,
1522        0x10FFFF,
1523        (UChar32)-1,
1524        0x110000
1525    };
1526    const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1527
1528    for (i=0; i<DATA2_LENGTH; ++i) {
1529        UChar32 c = DATA2[i], end = 0x10FFFF;
1530        UBool valid = (c >= 0 && c <= 0x10FFFF);
1531
1532        UnicodeSet set(0, 0x10FFFF);
1533
1534        // For single-codepoint contains, invalid codepoints are NOT contained
1535        UBool b = set.contains(c);
1536        if (b == valid) {
1537            logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1538                  ") = " + b);
1539        } else {
1540            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1541                  ") = " + b);
1542        }
1543
1544        // For codepoint range contains, containsNone, and containsSome,
1545        // invalid or empty (start > end) ranges have UNDEFINED behavior.
1546        b = set.contains(c, end);
1547        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1548              "," + end + ") = " + b);
1549
1550        b = set.containsNone(c, end);
1551        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1552              "," + end + ") = " + b);
1553
1554        b = set.containsSome(c, end);
1555        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1556              "," + end + ") = " + b);
1557
1558        int32_t index = set.indexOf(c);
1559        if ((index >= 0) == valid) {
1560            logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1561                  ") = " + index);
1562        } else {
1563            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1564                  ") = " + index);
1565        }
1566    }
1567}
1568
1569// Used by TestSymbolTable
1570class TokenSymbolTable : public SymbolTable {
1571public:
1572    Hashtable contents;
1573
1574    TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1575        contents.setValueDeleter(uprv_deleteUObject);
1576    }
1577
1578    ~TokenSymbolTable() {}
1579
1580    /**
1581     * (Non-SymbolTable API) Add the given variable and value to
1582     * the table.  Variable should NOT contain leading '$'.
1583     */
1584    void add(const UnicodeString& var, const UnicodeString& value,
1585             UErrorCode& ec) {
1586        if (U_SUCCESS(ec)) {
1587            contents.put(var, new UnicodeString(value), ec);
1588        }
1589    }
1590
1591    /**
1592     * SymbolTable API
1593     */
1594    virtual const UnicodeString* lookup(const UnicodeString& s) const {
1595        return (const UnicodeString*) contents.get(s);
1596    }
1597
1598    /**
1599     * SymbolTable API
1600     */
1601    virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1602        return NULL;
1603    }
1604
1605    /**
1606     * SymbolTable API
1607     */
1608    virtual UnicodeString parseReference(const UnicodeString& text,
1609                                         ParsePosition& pos, int32_t limit) const {
1610        int32_t start = pos.getIndex();
1611        int32_t i = start;
1612        UnicodeString result;
1613        while (i < limit) {
1614            UChar c = text.charAt(i);
1615            if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1616                break;
1617            }
1618            ++i;
1619        }
1620        if (i == start) { // No valid name chars
1621            return result; // Indicate failure with empty string
1622        }
1623        pos.setIndex(i);
1624        text.extractBetween(start, i, result);
1625        return result;
1626    }
1627};
1628
1629void UnicodeSetTest::TestSymbolTable() {
1630    // Multiple test cases can be set up here.  Each test case
1631    // is terminated by null:
1632    // var, value, var, value,..., input pat., exp. output pat., null
1633    const char* DATA[] = {
1634        "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1635        "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1636        "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1637        NULL
1638    };
1639
1640    for (int32_t i=0; DATA[i]!=NULL; ++i) {
1641        UErrorCode ec = U_ZERO_ERROR;
1642        TokenSymbolTable sym(ec);
1643        if (U_FAILURE(ec)) {
1644            errln("FAIL: couldn't construct TokenSymbolTable");
1645            continue;
1646        }
1647
1648        // Set up variables
1649        while (DATA[i+2] != NULL) {
1650            sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1651            if (U_FAILURE(ec)) {
1652                errln("FAIL: couldn't add to TokenSymbolTable");
1653                continue;
1654            }
1655            i += 2;
1656        }
1657
1658        // Input pattern and expected output pattern
1659        UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1660        i += 2;
1661
1662        ParsePosition pos(0);
1663        UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1664        if (U_FAILURE(ec)) {
1665            errln("FAIL: couldn't construct UnicodeSet");
1666            continue;
1667        }
1668
1669        // results
1670        if (pos.getIndex() != inpat.length()) {
1671            errln((UnicodeString)"Failed to read to end of string \""
1672                  + inpat + "\": read to "
1673                  + pos.getIndex() + ", length is "
1674                  + inpat.length());
1675        }
1676
1677        UnicodeSet us2(exppat, ec);
1678        if (U_FAILURE(ec)) {
1679            errln("FAIL: couldn't construct expected UnicodeSet");
1680            continue;
1681        }
1682
1683        UnicodeString a, b;
1684        if (us != us2) {
1685            errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1686                  ", expected " + us2.toPattern(b, TRUE));
1687        } else {
1688            logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1689        }
1690    }
1691}
1692
1693void UnicodeSetTest::TestSurrogate() {
1694    const char* DATA[] = {
1695        // These should all behave identically
1696        "[abc\\uD800\\uDC00]",
1697        // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1698        "[abc\\U00010000]",
1699        0
1700    };
1701    for (int i=0; DATA[i] != 0; ++i) {
1702        UErrorCode ec = U_ZERO_ERROR;
1703        logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1704        UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1705        UnicodeSet set(str, ec);
1706        if (U_FAILURE(ec)) {
1707            errln("FAIL: UnicodeSet constructor");
1708            continue;
1709        }
1710        expectContainment(set,
1711                          CharsToUnicodeString("abc\\U00010000"),
1712                          CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1713        if (set.size() != 4) {
1714            errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1715                  set.size() + ", expected 4");
1716        }
1717    }
1718}
1719
1720void UnicodeSetTest::TestExhaustive() {
1721    // exhaustive tests. Simulate UnicodeSets with integers.
1722    // That gives us very solid tests (except for large memory tests).
1723
1724    int32_t limit = 128;
1725
1726    UnicodeSet x, y, z, aa;
1727
1728    for (int32_t i = 0; i < limit; ++i) {
1729        bitsToSet(i, x);
1730        logln((UnicodeString)"Testing " + i + ", " + x);
1731        _testComplement(i, x, y);
1732
1733        // AS LONG AS WE ARE HERE, check roundtrip
1734        checkRoundTrip(bitsToSet(i, aa));
1735
1736        for (int32_t j = 0; j < limit; ++j) {
1737            _testAdd(i,j,  x,y,z);
1738            _testXor(i,j,  x,y,z);
1739            _testRetain(i,j,  x,y,z);
1740            _testRemove(i,j,  x,y,z);
1741        }
1742    }
1743}
1744
1745void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1746    bitsToSet(a, x);
1747    z = x;
1748    z.complement();
1749    int32_t c = setToBits(z);
1750    if (c != (~a)) {
1751        errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1752        errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1753    }
1754    checkCanonicalRep(z, (UnicodeString)"complement " + a);
1755}
1756
1757void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1758    bitsToSet(a, x);
1759    bitsToSet(b, y);
1760    z = x;
1761    z.addAll(y);
1762    int32_t c = setToBits(z);
1763    if (c != (a | b)) {
1764        errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1765        errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1766    }
1767    checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1768}
1769
1770void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1771    bitsToSet(a, x);
1772    bitsToSet(b, y);
1773    z = x;
1774    z.retainAll(y);
1775    int32_t c = setToBits(z);
1776    if (c != (a & b)) {
1777        errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1778        errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1779    }
1780    checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1781}
1782
1783void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1784    bitsToSet(a, x);
1785    bitsToSet(b, y);
1786    z = x;
1787    z.removeAll(y);
1788    int32_t c = setToBits(z);
1789    if (c != (a &~ b)) {
1790        errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1791        errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1792    }
1793    checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1794}
1795
1796void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1797    bitsToSet(a, x);
1798    bitsToSet(b, y);
1799    z = x;
1800    z.complementAll(y);
1801    int32_t c = setToBits(z);
1802    if (c != (a ^ b)) {
1803        errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1804        errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1805    }
1806    checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1807}
1808
1809/**
1810 * Check that ranges are monotonically increasing and non-
1811 * overlapping.
1812 */
1813void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1814    int32_t n = set.getRangeCount();
1815    if (n < 0) {
1816        errln((UnicodeString)"FAIL result of " + msg +
1817              ": range count should be >= 0 but is " +
1818              n /*+ " for " + set.toPattern())*/);
1819        return;
1820    }
1821    UChar32 last = 0;
1822    for (int32_t i=0; i<n; ++i) {
1823        UChar32 start = set.getRangeStart(i);
1824        UChar32 end = set.getRangeEnd(i);
1825        if (start > end) {
1826            errln((UnicodeString)"FAIL result of " + msg +
1827                  ": range " + (i+1) +
1828                  " start > end: " + (int)start + ", " + (int)end +
1829                  " for " + set);
1830        }
1831        if (i > 0 && start <= last) {
1832            errln((UnicodeString)"FAIL result of " + msg +
1833                  ": range " + (i+1) +
1834                  " overlaps previous range: " + (int)start + ", " + (int)end +
1835                  " for " + set);
1836        }
1837        last = end;
1838    }
1839}
1840
1841/**
1842 * Convert a bitmask to a UnicodeSet.
1843 */
1844UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1845    result.clear();
1846    for (UChar32 i = 0; i < 32; ++i) {
1847        if ((a & (1<<i)) != 0) {
1848            result.add(i);
1849        }
1850    }
1851    return result;
1852}
1853
1854/**
1855 * Convert a UnicodeSet to a bitmask.  Only the characters
1856 * U+0000 to U+0020 are represented in the bitmask.
1857 */
1858int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1859    int32_t result = 0;
1860    for (int32_t i = 0; i < 32; ++i) {
1861        if (x.contains((UChar32)i)) {
1862            result |= (1<<i);
1863        }
1864    }
1865    return result;
1866}
1867
1868/**
1869 * Return the representation of an inversion list based UnicodeSet
1870 * as a pairs list.  Ranges are listed in ascending Unicode order.
1871 * For example, the set [a-zA-M3] is represented as "33AMaz".
1872 */
1873UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1874    UnicodeString pairs;
1875    for (int32_t i=0; i<set.getRangeCount(); ++i) {
1876        UChar32 start = set.getRangeStart(i);
1877        UChar32 end = set.getRangeEnd(i);
1878        if (end > 0xFFFF) {
1879            end = 0xFFFF;
1880            i = set.getRangeCount(); // Should be unnecessary
1881        }
1882        pairs.append((UChar)start).append((UChar)end);
1883    }
1884    return pairs;
1885}
1886
1887/**
1888 * Basic consistency check for a few items.
1889 * That the iterator works, and that we can create a pattern and
1890 * get the same thing back
1891 */
1892void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1893    UErrorCode ec = U_ZERO_ERROR;
1894
1895    UnicodeSet t(s);
1896    checkEqual(s, t, "copy ct");
1897
1898    t = s;
1899    checkEqual(s, t, "operator=");
1900
1901    copyWithIterator(t, s, FALSE);
1902    checkEqual(s, t, "iterator roundtrip");
1903
1904    copyWithIterator(t, s, TRUE); // try range
1905    checkEqual(s, t, "iterator roundtrip");
1906
1907    UnicodeString pat; s.toPattern(pat, FALSE);
1908    t.applyPattern(pat, ec);
1909    if (U_FAILURE(ec)) {
1910        errln("FAIL: applyPattern");
1911        return;
1912    } else {
1913        checkEqual(s, t, "toPattern(false)");
1914    }
1915
1916    s.toPattern(pat, TRUE);
1917    t.applyPattern(pat, ec);
1918    if (U_FAILURE(ec)) {
1919        errln("FAIL: applyPattern");
1920        return;
1921    } else {
1922        checkEqual(s, t, "toPattern(true)");
1923    }
1924}
1925
1926void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1927    t.clear();
1928    UnicodeSetIterator it(s);
1929    if (withRange) {
1930        while (it.nextRange()) {
1931            if (it.isString()) {
1932                t.add(it.getString());
1933            } else {
1934                t.add(it.getCodepoint(), it.getCodepointEnd());
1935            }
1936        }
1937    } else {
1938        while (it.next()) {
1939            if (it.isString()) {
1940                t.add(it.getString());
1941            } else {
1942                t.add(it.getCodepoint());
1943            }
1944        }
1945    }
1946}
1947
1948UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1949    UnicodeString source; s.toPattern(source, TRUE);
1950    UnicodeString result; t.toPattern(result, TRUE);
1951    if (s != t) {
1952        errln((UnicodeString)"FAIL: " + message
1953              + "; source = " + source
1954              + "; result = " + result
1955              );
1956        return FALSE;
1957    } else {
1958        logln((UnicodeString)"Ok: " + message
1959              + "; source = " + source
1960              + "; result = " + result
1961              );
1962    }
1963    return TRUE;
1964}
1965
1966void
1967UnicodeSetTest::expectContainment(const UnicodeString& pat,
1968                                  const UnicodeString& charsIn,
1969                                  const UnicodeString& charsOut) {
1970    UErrorCode ec = U_ZERO_ERROR;
1971    UnicodeSet set(pat, ec);
1972    if (U_FAILURE(ec)) {
1973        dataerrln((UnicodeString)"FAIL: pattern \"" +
1974              pat + "\" => " + u_errorName(ec));
1975        return;
1976    }
1977    expectContainment(set, pat, charsIn, charsOut);
1978}
1979
1980void
1981UnicodeSetTest::expectContainment(const UnicodeSet& set,
1982                                  const UnicodeString& charsIn,
1983                                  const UnicodeString& charsOut) {
1984    UnicodeString pat;
1985    set.toPattern(pat);
1986    expectContainment(set, pat, charsIn, charsOut);
1987}
1988
1989void
1990UnicodeSetTest::expectContainment(const UnicodeSet& set,
1991                                  const UnicodeString& setName,
1992                                  const UnicodeString& charsIn,
1993                                  const UnicodeString& charsOut) {
1994    UnicodeString bad;
1995    UChar32 c;
1996    int32_t i;
1997
1998    for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1999        c = charsIn.char32At(i);
2000        if (!set.contains(c)) {
2001            bad.append(c);
2002        }
2003    }
2004    if (bad.length() > 0) {
2005        errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
2006              ", expected containment of " + prettify(charsIn));
2007    } else {
2008        logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
2009    }
2010
2011    bad.truncate(0);
2012    for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2013        c = charsOut.char32At(i);
2014        if (set.contains(c)) {
2015            bad.append(c);
2016        }
2017    }
2018    if (bad.length() > 0) {
2019        errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2020              ", expected non-containment of " + prettify(charsOut));
2021    } else {
2022        logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2023    }
2024}
2025
2026void
2027UnicodeSetTest::expectPattern(UnicodeSet& set,
2028                              const UnicodeString& pattern,
2029                              const UnicodeString& expectedPairs){
2030    UErrorCode status = U_ZERO_ERROR;
2031    set.applyPattern(pattern, status);
2032    if (U_FAILURE(status)) {
2033        errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2034              "\") failed");
2035        return;
2036    } else {
2037        if (getPairs(set) != expectedPairs ) {
2038            errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2039                  "\") => pairs \"" +
2040                  escape(getPairs(set)) + "\", expected \"" +
2041                  escape(expectedPairs) + "\"");
2042        } else {
2043            logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2044                  "\") => pairs \"" +
2045                  escape(getPairs(set)) + "\"");
2046        }
2047    }
2048    // the result of calling set.toPattern(), which is the string representation of
2049    // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2050    // will produce another set that is equal to this one.
2051    UnicodeString temppattern;
2052    set.toPattern(temppattern);
2053    UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2054    if (U_FAILURE(status)) {
2055        errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2056        return;
2057    }
2058    if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2059        errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2060            escape(getPairs(set)) + "\""));
2061    } else{
2062        logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2063    }
2064
2065    delete tempset;
2066
2067}
2068
2069void
2070UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2071    if (getPairs(set) != expectedPairs) {
2072        errln(UnicodeString("FAIL: Expected pair list \"") +
2073              escape(expectedPairs) + "\", got \"" +
2074              escape(getPairs(set)) + "\"");
2075    }
2076}
2077
2078void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2079                                     const UnicodeString& expPat,
2080                                     const char** expStrings) {
2081    UnicodeString pat;
2082    set.toPattern(pat, TRUE);
2083    if (pat == expPat) {
2084        logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2085    } else {
2086        errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2087        return;
2088    }
2089    if (expStrings == NULL) {
2090        return;
2091    }
2092    UBool in = TRUE;
2093    for (int32_t i=0; expStrings[i] != NULL; ++i) {
2094        if (expStrings[i] == NOT) { // sic; pointer comparison
2095            in = FALSE;
2096            continue;
2097        }
2098        UnicodeString s = CharsToUnicodeString(expStrings[i]);
2099        UBool contained = set.contains(s);
2100        if (contained == in) {
2101            logln((UnicodeString)"Ok: " + expPat +
2102                  (contained ? " contains {" : " does not contain {") +
2103                  escape(expStrings[i]) + "}");
2104        } else {
2105            errln((UnicodeString)"FAIL: " + expPat +
2106                  (contained ? " contains {" : " does not contain {") +
2107                  escape(expStrings[i]) + "}");
2108        }
2109    }
2110}
2111
2112static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2113
2114void
2115UnicodeSetTest::doAssert(UBool condition, const char *message)
2116{
2117    if (!condition) {
2118        errln(UnicodeString("ERROR : ") + message);
2119    }
2120}
2121
2122UnicodeString
2123UnicodeSetTest::escape(const UnicodeString& s) {
2124    UnicodeString buf;
2125    for (int32_t i=0; i<s.length(); )
2126    {
2127        UChar32 c = s.char32At(i);
2128        if (0x0020 <= c && c <= 0x007F) {
2129            buf += c;
2130        } else {
2131            if (c <= 0xFFFF) {
2132                buf += (UChar)0x5c; buf += (UChar)0x75;
2133            } else {
2134                buf += (UChar)0x5c; buf += (UChar)0x55;
2135                buf += toHexString((c & 0xF0000000) >> 28);
2136                buf += toHexString((c & 0x0F000000) >> 24);
2137                buf += toHexString((c & 0x00F00000) >> 20);
2138                buf += toHexString((c & 0x000F0000) >> 16);
2139            }
2140            buf += toHexString((c & 0xF000) >> 12);
2141            buf += toHexString((c & 0x0F00) >> 8);
2142            buf += toHexString((c & 0x00F0) >> 4);
2143            buf += toHexString(c & 0x000F);
2144        }
2145        i += U16_LENGTH(c);
2146    }
2147    return buf;
2148}
2149
2150void UnicodeSetTest::TestFreezable() {
2151    UErrorCode errorCode=U_ZERO_ERROR;
2152    UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2153    UnicodeSet idSet(idPattern, errorCode);
2154    if(U_FAILURE(errorCode)) {
2155        dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2156        return;
2157    }
2158
2159    UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2160    UnicodeSet wsSet(wsPattern, errorCode);
2161    if(U_FAILURE(errorCode)) {
2162        dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2163        return;
2164    }
2165
2166    idSet.add(idPattern);
2167    UnicodeSet frozen(idSet);
2168    frozen.freeze();
2169
2170    if(idSet.isFrozen() || !frozen.isFrozen()) {
2171        errln("FAIL: isFrozen() is wrong");
2172    }
2173    if(frozen!=idSet || !(frozen==idSet)) {
2174        errln("FAIL: a copy-constructed frozen set differs from its original");
2175    }
2176
2177    frozen=wsSet;
2178    if(frozen!=idSet || !(frozen==idSet)) {
2179        errln("FAIL: a frozen set was modified by operator=");
2180    }
2181
2182    UnicodeSet frozen2(frozen);
2183    if(frozen2!=frozen || frozen2!=idSet) {
2184        errln("FAIL: a copied frozen set differs from its frozen original");
2185    }
2186    if(!frozen2.isFrozen()) {
2187        errln("FAIL: copy-constructing a frozen set results in a thawed one");
2188    }
2189    UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2190    if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2191        errln("FAIL: UnicodeSet(5, 55) failed");
2192    }
2193    frozen3=frozen;
2194    if(!frozen3.isFrozen()) {
2195        errln("FAIL: copying a frozen set results in a thawed one");
2196    }
2197
2198    UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2199    if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2200        errln("FAIL: clone() failed");
2201    }
2202    cloned->add(0xd802, 0xd805);
2203    if(cloned->containsSome(0xd802, 0xd805)) {
2204        errln("FAIL: unable to modify clone");
2205    }
2206    delete cloned;
2207
2208    UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2209    if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2210        errln("FAIL: cloneAsThawed() failed");
2211    }
2212    thawed->add(0xd802, 0xd805);
2213    if(!thawed->contains(0xd802, 0xd805)) {
2214        errln("FAIL: unable to modify thawed clone");
2215    }
2216    delete thawed;
2217
2218    frozen.set(5, 55);
2219    if(frozen!=idSet || !(frozen==idSet)) {
2220        errln("FAIL: UnicodeSet::set() modified a frozen set");
2221    }
2222
2223    frozen.clear();
2224    if(frozen!=idSet || !(frozen==idSet)) {
2225        errln("FAIL: UnicodeSet::clear() modified a frozen set");
2226    }
2227
2228    frozen.closeOver(USET_CASE_INSENSITIVE);
2229    if(frozen!=idSet || !(frozen==idSet)) {
2230        errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2231    }
2232
2233    frozen.compact();
2234    if(frozen!=idSet || !(frozen==idSet)) {
2235        errln("FAIL: UnicodeSet::compact() modified a frozen set");
2236    }
2237
2238    ParsePosition pos;
2239    frozen.
2240        applyPattern(wsPattern, errorCode).
2241        applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2242        applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2243        applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2244        applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2245    if(frozen!=idSet || !(frozen==idSet)) {
2246        errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2247    }
2248
2249    frozen.
2250        add(0xd800).
2251        add(0xd802, 0xd805).
2252        add(wsPattern).
2253        addAll(idPattern).
2254        addAll(wsSet);
2255    if(frozen!=idSet || !(frozen==idSet)) {
2256        errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2257    }
2258
2259    frozen.
2260        retain(0x62).
2261        retain(0x64, 0x69).
2262        retainAll(wsPattern).
2263        retainAll(wsSet);
2264    if(frozen!=idSet || !(frozen==idSet)) {
2265        errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2266    }
2267
2268    frozen.
2269        remove(0x62).
2270        remove(0x64, 0x69).
2271        remove(idPattern).
2272        removeAll(idPattern).
2273        removeAll(idSet);
2274    if(frozen!=idSet || !(frozen==idSet)) {
2275        errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2276    }
2277
2278    frozen.
2279        complement().
2280        complement(0x62).
2281        complement(0x64, 0x69).
2282        complement(idPattern).
2283        complementAll(idPattern).
2284        complementAll(idSet);
2285    if(frozen!=idSet || !(frozen==idSet)) {
2286        errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2287    }
2288}
2289
2290// Test span() etc. -------------------------------------------------------- ***
2291
2292// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2293static int32_t
2294appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2295    UErrorCode errorCode=U_ZERO_ERROR;
2296    int32_t length8=0;
2297    u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2298    if(U_SUCCESS(errorCode)) {
2299        return length8;
2300    } else {
2301        // The string contains an unpaired surrogate.
2302        // Ignore this string.
2303        return 0;
2304    }
2305}
2306
2307class UnicodeSetWithStringsIterator;
2308
2309// Make the strings in a UnicodeSet easily accessible.
2310class UnicodeSetWithStrings {
2311public:
2312    UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2313            set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2314        int32_t size=set.size();
2315        if(size>0 && set.charAt(size-1)<0) {
2316            // If a set's last element is not a code point, then it must contain strings.
2317            // Iterate over the set, skip all code point ranges, and cache the strings.
2318            // Convert them to UTF-8 for spanUTF8().
2319            UnicodeSetIterator iter(set);
2320            const UnicodeString *s;
2321            char *s8=utf8;
2322            int32_t length8, utf8Count=0;
2323            while(iter.nextRange() && stringsLength<UPRV_LENGTHOF(strings)) {
2324                if(iter.isString()) {
2325                    // Store the pointer to the set's string element
2326                    // which we happen to know is a stable pointer.
2327                    strings[stringsLength]=s=&iter.getString();
2328                    utf8Count+=
2329                        utf8Lengths[stringsLength]=length8=
2330                        appendUTF8(s->getBuffer(), s->length(),
2331                                   s8, (int32_t)(sizeof(utf8)-utf8Count));
2332                    if(length8==0) {
2333                        hasSurrogates=TRUE;  // Contains unpaired surrogates.
2334                    }
2335                    s8+=length8;
2336                    ++stringsLength;
2337                }
2338            }
2339        }
2340    }
2341
2342    const UnicodeSet &getSet() const {
2343        return set;
2344    }
2345
2346    UBool hasStrings() const {
2347        return (UBool)(stringsLength>0);
2348    }
2349
2350    UBool hasStringsWithSurrogates() const {
2351        return hasSurrogates;
2352    }
2353
2354private:
2355    friend class UnicodeSetWithStringsIterator;
2356
2357    const UnicodeSet &set;
2358
2359    const UnicodeString *strings[20];
2360    int32_t stringsLength;
2361    UBool hasSurrogates;
2362
2363    char utf8[1024];
2364    int32_t utf8Lengths[20];
2365};
2366
2367class UnicodeSetWithStringsIterator {
2368public:
2369    UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2370            fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2371    }
2372
2373    void reset() {
2374        nextStringIndex=nextUTF8Start=0;
2375    }
2376
2377    const UnicodeString *nextString() {
2378        if(nextStringIndex<fSet.stringsLength) {
2379            return fSet.strings[nextStringIndex++];
2380        } else {
2381            return NULL;
2382        }
2383    }
2384
2385    // Do not mix with calls to nextString().
2386    const char *nextUTF8(int32_t &length) {
2387        if(nextStringIndex<fSet.stringsLength) {
2388            const char *s8=fSet.utf8+nextUTF8Start;
2389            nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2390            return s8;
2391        } else {
2392            length=0;
2393            return NULL;
2394        }
2395    }
2396
2397private:
2398    const UnicodeSetWithStrings &fSet;
2399    int32_t nextStringIndex;
2400    int32_t nextUTF8Start;
2401};
2402
2403// Compare 16-bit Unicode strings (which may be malformed UTF-16)
2404// at code point boundaries.
2405// That is, each edge of a match must not be in the middle of a surrogate pair.
2406static inline UBool
2407matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2408    s+=start;
2409    limit-=start;
2410    int32_t length=t.length();
2411    return 0==t.compare(s, length) &&
2412           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2413           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2414}
2415
// Implement span() with contains() for comparison.
//
// Returns the limit (end index) of the span that starts at index 0 of s.
//   set           - the set, together with its cached strings
//   s, length     - the UTF-16 text to span
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or
//                   USET_SPAN_SIMPLE
// Three cases are handled separately:
//   - the set has no strings: plain per-code-point contains() test;
//   - NOT_CONTAINED with strings: stop at the first position where the code
//     point is in the set or one of the set's strings matches;
//   - CONTAINED/SIMPLE with strings: advance over contained code points and
//     matching strings; CONTAINED recurses to try alternative matches.
static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                 USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev always holds the start of the code point being examined, so
        // it is the span limit when the loop stops.
        while((prev=start)<length) {
            U16_NEXT(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any of the
            // set's strings matches at this position.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit: furthest limit reached through the recursive calls below.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U16_NEXT(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // next==start now means "no way to advance from start found yet".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+str->length();
                    if(matchLimit==length) {
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
                                                                 USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the larger of the limit reached by iterating and the
        // limit reached through recursion.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2514
// Backward counterpart of containsSpanUTF16(): implements spanBack() with
// contains() for comparison.
//
// Spans backward from index length toward index 0 and returns the start
// index of the span (0 for empty input).
//   set           - the set, together with its cached strings
//   s, length     - the UTF-16 text; the span ends at length
//   spanCondition - USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or
//                   USET_SPAN_SIMPLE
// Note that the parameter length is reused as the moving "current position";
// where the original length is still needed it is saved in length0.
static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
                                     USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        // prev holds the end of the code point being examined, so it is the
        // span start when the loop stops.
        int32_t prev=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            // The code point is not in the set; also stop if any of the
            // set's strings matches ending at prev.
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart: smallest start reached through the recursive calls below.
        int32_t prev=length, minSpanStart=length, length0=length;
        do {
            U16_PREV(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // length==prev now means "no way to move back from prev found yet".
            const UnicodeString *str;
            iter.reset();
            while((str=iter.nextString())!=NULL) {
                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-str->length();
                    if(matchStart==0) {
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
                                                                    USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the smaller of the start reached by iterating and
        // the start reached through recursion.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2613
/*
 * Forward span over UTF-8 text, implemented directly with contains() on
 * single code points and byte-wise comparison with the set's strings.
 * getSpans() calls this for its "containsUTF8" types.
 *
 * @param set           set, plus access to its strings via an iterator
 * @param s             UTF-8 text
 * @param length        number of bytes in s (getSpans() resolves a negative
 *                      length before calling)
 * @param spanCondition USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or
 *                      USET_SPAN_SIMPLE
 * @return byte offset in s where the span ends
 */
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Only code points matter: step through the text one code point at a time.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev always trails start by one code point, so that it marks the
        // beginning of the code point that ended the span.
        while((prev=start)<length) {
            U8_NEXT_OR_FFFD(s, start, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first position where either the code point is in the set
        // or one of the set's strings matches the text.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Empty UTF-8 forms are skipped; the string must fit into the rest of the text.
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit is the furthest limit reached by any recursive call (CONTAINED only).
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT_OR_FFFD(s, next, length, c);
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "nothing matched at start yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        // A match that reaches the end of the text cannot be improved upon.
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the further of the iteration's end and the best recursive result.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2713
/*
 * Backward span over UTF-8 text, implemented directly with contains() on
 * single code points and byte-wise comparison with the set's strings.
 * getSpans() calls this for its "containsBackUTF8" types.
 *
 * @param set           set, plus access to its strings via an iterator
 * @param s             UTF-8 text
 * @param length        number of bytes in s; the span starts at this limit
 *                      and moves towards index 0
 * @param spanCondition USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or
 *                      USET_SPAN_SIMPLE
 * @return byte offset in s where the backward span stops (0 for empty text)
 */
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // Only code points matter: step backwards one code point at a time.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        // prev is the limit of the code point that is examined next;
        // it is returned when that code point ends the span.
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Stop at the first limit (going backwards) where either the preceding code point
        // is in the set or one of the set's strings ends.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Empty UTF-8 forms are skipped; the string must fit in front of prev.
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart is the lowest start reached by any recursive call (CONTAINED only).
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV_OR_FFFD(s, 0, length, c);
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "nothing matched before prev yet".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        // A match that reaches the start of the text cannot be improved upon.
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the lower of the iteration's end and the best recursive result.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2814
// Bit flags that select which spans are performed and compared.
// Every group of options ends with a mask that combines all of its members.
enum {
    // text encodings
    SPAN_UTF16          =1<<0,
    SPAN_UTF8           =1<<1,
    SPAN_UTFS           =SPAN_UTF16|SPAN_UTF8,

    // original set and/or its complement
    SPAN_SET            =1<<2,
    SPAN_COMPLEMENT     =1<<3,
    SPAN_POLARITY       =SPAN_SET|SPAN_COMPLEMENT,

    // directions
    SPAN_FWD            =1<<4,
    SPAN_BACK           =1<<5,
    SPAN_DIRS           =SPAN_FWD|SPAN_BACK,

    // span conditions used for the "contained" spans
    SPAN_CONTAINED      =1<<8,
    SPAN_SIMPLE         =1<<9,
    SPAN_CONDITION      =SPAN_CONTAINED|SPAN_SIMPLE,

    // everything
    SPAN_ALL            =SPAN_UTFS|SPAN_POLARITY|SPAN_DIRS|SPAN_CONDITION
};
2835
2836static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2837    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2838}
2839
2840static inline int32_t slen(const void *s, UBool isUTF16) {
2841    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2842}
2843
2844/*
2845 * Count spans on a string with the method according to type and set the span limits.
2846 * The set may be the complement of the original.
2847 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
2848 * according to the expected number of spans.
2849 * Sets typeName to an empty string if there is no such type.
2850 * Returns -1 if the span option is filtered out.
2851 */
2852static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
2853                        const void *s, int32_t length, UBool isUTF16,
2854                        uint32_t whichSpans,
2855                        int type, const char *&typeName,
2856                        int32_t limits[], int32_t limitsCapacity,
2857                        int32_t expectCount) {
2858    const UnicodeSet &realSet(set.getSet());
2859    int32_t start, count;
2860    USetSpanCondition spanCondition, firstSpanCondition, contained;
2861    UBool isForward;
2862
2863    if(type<0 || 7<type) {
2864        typeName="";
2865        return 0;
2866    }
2867
2868    static const char *const typeNames16[]={
2869        "contains", "contains(LM)",
2870        "span", "span(LM)",
2871        "containsBack", "containsBack(LM)",
2872        "spanBack", "spanBack(LM)"
2873    };
2874
2875    static const char *const typeNames8[]={
2876        "containsUTF8", "containsUTF8(LM)",
2877        "spanUTF8", "spanUTF8(LM)",
2878        "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
2879        "spanBackUTF8", "spanBackUTF8(LM)"
2880    };
2881
2882    typeName= isUTF16 ? typeNames16[type] : typeNames8[type];
2883
2884    // filter span options
2885    if(type<=3) {
2886        // span forward
2887        if((whichSpans&SPAN_FWD)==0) {
2888            return -1;
2889        }
2890        isForward=TRUE;
2891    } else {
2892        // span backward
2893        if((whichSpans&SPAN_BACK)==0) {
2894            return -1;
2895        }
2896        isForward=FALSE;
2897    }
2898    if((type&1)==0) {
2899        // use USET_SPAN_CONTAINED
2900        if((whichSpans&SPAN_CONTAINED)==0) {
2901            return -1;
2902        }
2903        contained=USET_SPAN_CONTAINED;
2904    } else {
2905        // use USET_SPAN_SIMPLE
2906        if((whichSpans&SPAN_SIMPLE)==0) {
2907            return -1;
2908        }
2909        contained=USET_SPAN_SIMPLE;
2910    }
2911
2912    // Default first span condition for going forward with an uncomplemented set.
2913    spanCondition=USET_SPAN_NOT_CONTAINED;
2914    if(isComplement) {
2915        spanCondition=invertSpanCondition(spanCondition, contained);
2916    }
2917
2918    // First span condition for span(), used to terminate the spanBack() iteration.
2919    firstSpanCondition=spanCondition;
2920
2921    // spanBack(): Its initial span condition is span()'s last span condition,
2922    // which is the opposite of span()'s first span condition
2923    // if we expect an even number of spans.
2924    // (The loop inverts spanCondition (expectCount-1) times
2925    // before the expectCount'th span() call.)
2926    // If we do not compare forward and backward directions, then we do not have an
2927    // expectCount and just start with firstSpanCondition.
2928    if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
2929        spanCondition=invertSpanCondition(spanCondition, contained);
2930    }
2931
2932    count=0;
2933    switch(type) {
2934    case 0:
2935    case 1:
2936        start=0;
2937        if(length<0) {
2938            length=slen(s, isUTF16);
2939        }
2940        for(;;) {
2941            start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
2942                              containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
2943            if(count<limitsCapacity) {
2944                limits[count]=start;
2945            }
2946            ++count;
2947            if(start>=length) {
2948                break;
2949            }
2950            spanCondition=invertSpanCondition(spanCondition, contained);
2951        }
2952        break;
2953    case 2:
2954    case 3:
2955        start=0;
2956        for(;;) {
2957            start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
2958                              realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
2959            if(count<limitsCapacity) {
2960                limits[count]=start;
2961            }
2962            ++count;
2963            if(length>=0 ? start>=length :
2964                           isUTF16 ? ((const UChar *)s)[start]==0 :
2965                                     ((const char *)s)[start]==0
2966            ) {
2967                break;
2968            }
2969            spanCondition=invertSpanCondition(spanCondition, contained);
2970        }
2971        break;
2972    case 4:
2973    case 5:
2974        if(length<0) {
2975            length=slen(s, isUTF16);
2976        }
2977        for(;;) {
2978            ++count;
2979            if(count<=limitsCapacity) {
2980                limits[limitsCapacity-count]=length;
2981            }
2982            length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
2983                              containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
2984            if(length==0 && spanCondition==firstSpanCondition) {
2985                break;
2986            }
2987            spanCondition=invertSpanCondition(spanCondition, contained);
2988        }
2989        if(count<limitsCapacity) {
2990            memmove(limits, limits+(limitsCapacity-count), count*4);
2991        }
2992        break;
2993    case 6:
2994    case 7:
2995        for(;;) {
2996            ++count;
2997            if(count<=limitsCapacity) {
2998                limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
2999            }
3000            // Note: Length<0 is tested only for the first spanBack().
3001            // If we wanted to keep length<0 for all spanBack()s, we would have to
3002            // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
3003            length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
3004                              realSet.spanBackUTF8((const char *)s, length, spanCondition);
3005            if(length==0 && spanCondition==firstSpanCondition) {
3006                break;
3007            }
3008            spanCondition=invertSpanCondition(spanCondition, contained);
3009        }
3010        if(count<limitsCapacity) {
3011            memmove(limits, limits+(limitsCapacity-count), count*4);
3012        }
3013        break;
3014    default:
3015        typeName="";
3016        return -1;
3017    }
3018
3019    return count;
3020}
3021
// Indexes of the sets to be tested.
// An odd index means isComplement.
enum {
    SLOW        =0,
    SLOW_NOT    =1,
    FAST        =2,
    FAST_NOT    =3,
    SET_COUNT   =4
};

// Names of the sets for messages, in the order of the indexes above.
static const char *const setNames[SET_COUNT]={
    /* SLOW     */ "slow",
    /* SLOW_NOT */ "slow.not",
    /* FAST     */ "fast",
    /* FAST_NOT */ "fast.not"
};
3037
3038/*
3039 * Verify that we get the same results whether we look at text with contains(),
3040 * span() or spanBack(), using unfrozen or frozen versions of the set,
3041 * and using the set or its complement (switching the spanConditions accordingly).
3042 * The latter verifies that
3043 *   set.span(spanCondition) == set.complement().span(!spanCondition).
3044 *
3045 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3046 * or returned to the caller (with an input expectCount<0).
3047 */
3048void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3049                              const void *s, int32_t length, UBool isUTF16,
3050                              uint32_t whichSpans,
3051                              int32_t expectLimits[], int32_t &expectCount,
3052                              const char *testName, int32_t index) {
3053    int32_t limits[500];
3054    int32_t limitsCount;
3055    int i, j;
3056
3057    const char *typeName;
3058    int type;
3059
3060    for(i=0; i<SET_COUNT; ++i) {
3061        if((i&1)==0) {
3062            // Even-numbered sets are original, uncomplemented sets.
3063            if((whichSpans&SPAN_SET)==0) {
3064                continue;
3065            }
3066        } else {
3067            // Odd-numbered sets are complemented.
3068            if((whichSpans&SPAN_COMPLEMENT)==0) {
3069                continue;
3070            }
3071        }
3072        for(type=0;; ++type) {
3073            limitsCount=getSpans(*sets[i], (UBool)(i&1),
3074                                 s, length, isUTF16,
3075                                 whichSpans,
3076                                 type, typeName,
3077                                 limits, UPRV_LENGTHOF(limits), expectCount);
3078            if(typeName[0]==0) {
3079                break; // All types tried.
3080            }
3081            if(limitsCount<0) {
3082                continue; // Span option filtered out.
3083            }
3084            if(expectCount<0) {
3085                expectCount=limitsCount;
3086                if(limitsCount>UPRV_LENGTHOF(limits)) {
3087                    errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3088                          testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)UPRV_LENGTHOF(limits));
3089                    return;
3090                }
3091                memcpy(expectLimits, limits, limitsCount*4);
3092            } else if(limitsCount!=expectCount) {
3093                errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3094                      testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3095            } else {
3096                for(j=0; j<limitsCount; ++j) {
3097                    if(limits[j]!=expectLimits[j]) {
3098                        errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3099                              testName, (long)index, setNames[i], typeName, (long)limitsCount,
3100                              j, (long)limits[j], (long)expectLimits[j]);
3101                        break;
3102                    }
3103                }
3104            }
3105        }
3106    }
3107
3108    // Compare span() with containsAll()/containsNone(),
3109    // but only if we have expectLimits[] from the uncomplemented set.
3110    if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3111        const UChar *s16=(const UChar *)s;
3112        UnicodeString string;
3113        int32_t prev=0, limit, length;
3114        for(i=0; i<expectCount; ++i) {
3115            limit=expectLimits[i];
3116            length=limit-prev;
3117            if(length>0) {
3118                string.setTo(FALSE, s16+prev, length);  // read-only alias
3119                if(i&1) {
3120                    if(!sets[SLOW]->getSet().containsAll(string)) {
3121                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3122                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3123                        return;
3124                    }
3125                    if(!sets[FAST]->getSet().containsAll(string)) {
3126                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3127                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3128                        return;
3129                    }
3130                } else {
3131                    if(!sets[SLOW]->getSet().containsNone(string)) {
3132                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3133                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3134                        return;
3135                    }
3136                    if(!sets[FAST]->getSet().containsNone(string)) {
3137                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3138                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3139                        return;
3140                    }
3141                }
3142            }
3143            prev=limit;
3144        }
3145    }
3146}
3147
3148// Specifically test either UTF-16 or UTF-8.
3149void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3150                              const void *s, int32_t length, UBool isUTF16,
3151                              uint32_t whichSpans,
3152                              const char *testName, int32_t index) {
3153    int32_t expectLimits[500];
3154    int32_t expectCount=-1;
3155    testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3156}
3157
3158UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3159    UChar c, c2;
3160
3161    if(length>=0) {
3162        while(length>0) {
3163            c=*s++;
3164            --length;
3165            if(0xd800<=c && c<0xe000) {
3166                if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3167                    return TRUE;
3168                }
3169                --length;
3170            }
3171        }
3172    } else {
3173        while((c=*s++)!=0) {
3174            if(0xd800<=c && c<0xe000) {
3175                if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3176                    return TRUE;
3177                }
3178            }
3179        }
3180    }
3181    return FALSE;
3182}
3183
3184// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3185// unless either UTF is turned off in whichSpans.
3186// Testing UTF-16 and UTF-8 together requires that surrogate code points
3187// have the same contains(c) value as U+FFFD.
3188void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3189                                      const UChar *s16, int32_t length16,
3190                                      uint32_t whichSpans,
3191                                      const char *testName, int32_t index) {
3192    int32_t expectLimits[500];
3193    int32_t expectCount;
3194
3195    expectCount=-1;  // Get expectLimits[] from testSpan().
3196
3197    if((whichSpans&SPAN_UTF16)!=0) {
3198        testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3199    }
3200    if((whichSpans&SPAN_UTF8)==0) {
3201        return;
3202    }
3203
3204    // Convert s16[] and expectLimits[] to UTF-8.
3205    uint8_t s8[3000];
3206    int32_t offsets[3000];
3207
3208    const UChar *s16Limit=s16+length16;
3209    char *t=(char *)s8;
3210    char *tLimit=t+sizeof(s8);
3211    int32_t *o=offsets;
3212    UErrorCode errorCode=U_ZERO_ERROR;
3213
3214    // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3215    ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3216    if(U_FAILURE(errorCode)) {
3217        errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3218              testName, (long)index, u_errorName(errorCode));
3219        ucnv_resetFromUnicode(utf8Cnv);
3220        return;
3221    }
3222    int32_t length8=(int32_t)(t-(char *)s8);
3223
3224    // Convert expectLimits[].
3225    int32_t i, j, expect;
3226    for(i=j=0; i<expectCount; ++i) {
3227        expect=expectLimits[i];
3228        if(expect==length16) {
3229            expectLimits[i]=length8;
3230        } else {
3231            while(offsets[j]<expect) {
3232                ++j;
3233            }
3234            expectLimits[i]=j;
3235        }
3236    }
3237
3238    testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3239}
3240
3241static UChar32 nextCodePoint(UChar32 c) {
3242    // Skip some large and boring ranges.
3243    switch(c) {
3244    case 0x3441:
3245        return 0x4d7f;
3246    case 0x5100:
3247        return 0x9f00;
3248    case 0xb040:
3249        return 0xd780;
3250    case 0xe041:
3251        return 0xf8fe;
3252    case 0x10100:
3253        return 0x20000;
3254    case 0x20041:
3255        return 0xe0000;
3256    case 0xe0101:
3257        return 0x10fffd;
3258    default:
3259        return c+1;
3260    }
3261}
3262
3263// Verify that all implementations represent the same set.
3264void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3265    // contains(U+FFFD) is inconsistent with contains(some surrogates),
3266    // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3267    // Skip the UTF-8 part of the test - if the string contains surrogates -
3268    // because it is likely to produce a different result.
3269    UBool inconsistentSurrogates=
3270            (!(sets[0]->getSet().contains(0xfffd) ?
3271               sets[0]->getSet().contains(0xd800, 0xdfff) :
3272               sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3273             sets[0]->hasStringsWithSurrogates());
3274
3275    UChar s[1000];
3276    int32_t length=0;
3277    uint32_t localWhichSpans;
3278
3279    UChar32 c, first;
3280    for(first=c=0;; c=nextCodePoint(c)) {
3281        if(c>0x10ffff || length>(UPRV_LENGTHOF(s)-U16_MAX_LENGTH)) {
3282            localWhichSpans=whichSpans;
3283            if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3284                localWhichSpans&=~SPAN_UTF8;
3285            }
3286            testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3287            if(c>0x10ffff) {
3288                break;
3289            }
3290            length=0;
3291            first=c;
3292        }
3293        U16_APPEND_UNSAFE(s, length, c);
3294    }
3295}
3296
3297// Test with a particular, interesting string.
3298// Specify length and try NUL-termination.
3299void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3300    static const UChar s[]={
3301        0x61, 0x62, 0x20,                       // Latin, space
3302        0x3b1, 0x3b2, 0x3b3,                    // Greek
3303        0xd900,                                 // lead surrogate
3304        0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3305        0xdc05,                                 // trail surrogate
3306        0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3307        0xd900, 0xdc05,                         // unassigned supplementary
3308        0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3309        0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3310        0                                       // NUL
3311    };
3312
3313    if((whichSpans&SPAN_UTF16)==0) {
3314        return;
3315    }
3316    testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3317    testSpan(sets, s, UPRV_LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3318}
3319
3320void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3321    static const char s[]={
3322        "abc"                                   // Latin
3323
3324        /* trail byte in lead position */
3325        "\x80"
3326
3327        " "                                     // space
3328
3329        /* truncated multi-byte sequences */
3330        "\xd0"
3331        "\xe0"
3332        "\xe1"
3333        "\xed"
3334        "\xee"
3335        "\xf0"
3336        "\xf1"
3337        "\xf4"
3338        "\xf8"
3339        "\xfc"
3340
3341        "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3342
3343        /* trail byte in lead position */
3344        "\x80"
3345
3346        "\xe0\x80"
3347        "\xe0\xa0"
3348        "\xe1\x80"
3349        "\xed\x80"
3350        "\xed\xa0"
3351        "\xee\x80"
3352        "\xf0\x80"
3353        "\xf0\x90"
3354        "\xf1\x80"
3355        "\xf4\x80"
3356        "\xf4\x90"
3357        "\xf8\x80"
3358        "\xfc\x80"
3359
3360        "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3361
3362        /* trail byte in lead position */
3363        "\x80"
3364
3365        "\xf0\x80\x80"
3366        "\xf0\x90\x80"
3367        "\xf1\x80\x80"
3368        "\xf4\x80\x80"
3369        "\xf4\x90\x80"
3370        "\xf8\x80\x80"
3371        "\xfc\x80\x80"
3372
3373        "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3374
3375        /* trail byte in lead position */
3376        "\x80"
3377
3378        "\xf8\x80\x80\x80"
3379        "\xfc\x80\x80\x80"
3380
3381        "\xF1\x90\x80\x85"                      // unassigned supplementary
3382
3383        /* trail byte in lead position */
3384        "\x80"
3385
3386        "\xfc\x80\x80\x80\x80"
3387
3388        "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3389
3390        /* trail byte in lead position */
3391        "\x80"
3392
3393        /* complete sequences but non-shortest forms or out of range etc. */
3394        "\xc0\x80"
3395        "\xe0\x80\x80"
3396        "\xed\xa0\x80"
3397        "\xf0\x80\x80\x80"
3398        "\xf4\x90\x80\x80"
3399        "\xf8\x80\x80\x80\x80"
3400        "\xfc\x80\x80\x80\x80\x80"
3401        "\xfe"
3402        "\xff"
3403
3404        /* trail byte in lead position */
3405        "\x80"
3406
3407        "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3408    };
3409
3410    if((whichSpans&SPAN_UTF8)==0) {
3411        return;
3412    }
3413    testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3414    testSpan(sets, s, UPRV_LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3415}
3416
// Expands a list of span-option words so that every resulting word carries
// exactly one of the alternatives a, b and c.
//
// Each of the first whichSpansCount entries is reduced to (entry & mask) and
// then:
//   - entry i                        receives the a variant,
//   - entry whichSpansCount+i        receives the b variant (only if b!=0),
//   - entry 2*whichSpansCount+i      receives the c variant (only if b!=0 and c!=0).
//
// Returns the new number of entries: unchanged if b==0, doubled if only b is
// given, tripled if both b and c are given. The function performs no bounds
// check, so the caller must provide an array large enough for the result.
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    const bool haveB=(b!=0);
    const bool haveC=haveB && (c!=0);   // c is ignored when b is absent

    for(int32_t idx=0; idx<whichSpansCount; ++idx) {
        const uint32_t kept=whichSpans[idx]&mask;
        whichSpans[idx]=kept|a;
        if(haveB) {
            whichSpans[whichSpansCount+idx]=kept|b;
        }
        if(haveC) {
            whichSpans[2*whichSpansCount+idx]=kept|c;
        }
    }

    if(!haveB) {
        return whichSpansCount;
    }
    return (haveC ? 3 : 2)*whichSpansCount;
}
3439
// Runs of a single repeated letter ('a' or 'b'), named for their intended
// lengths (63 and 64). TestSpan() concatenates them via adjacent string
// literals to build set strings and test strings with long code point spans.
#define _63_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _64_a "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
#define _63_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
#define _64_b "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
3444
3445void UnicodeSetTest::TestSpan() {
3446    // "[...]" is a UnicodeSet pattern.
3447    // "*" performs tests on all Unicode code points and on a selection of
3448    //   malformed UTF-8/16 strings.
3449    // "-options" limits the scope of testing for the current set.
3450    //   By default, the test verifies that equivalent boundaries are found
3451    //   for UTF-16 and UTF-8, going forward and backward,
3452    //   alternating USET_SPAN_NOT_CONTAINED with
3453    //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3454    //   Single-character options:
3455    //     8 -- UTF-16 and UTF-8 boundaries may differ.
3456    //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3457    //          or the set contains strings with unpaired surrogates
3458    //          which do not translate to valid UTF-8.
3459    //     c -- set.span() and set.complement().span() boundaries may differ.
3460    //          Cause: Set strings are not complemented.
3461    //     b -- span() and spanBack() boundaries may differ.
3462    //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3463    //          and spanBack(USET_SPAN_SIMPLE) are defined to
3464    //          match with non-overlapping substrings.
3465    //          For example, with a set containing "ab" and "ba",
3466    //          span() of "aba" yields boundaries { 0, 2, 3 }
3467    //          because the initial "ab" matches from 0 to 2,
3468    //          while spanBack() yields boundaries { 0, 1, 3 }
3469    //          because the final "ba" matches from 1 to 3.
3470    //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3471    //          Cause: Strings in the set overlap, and a longer match may
3472    //          require a sequence including non-longest substrings.
3473    //          For example, with a set containing "ab", "abc" and "cd",
3474    //          span(contained) of "abcd" spans the entire string
3475    //          but span(longest match) only spans the first 3 characters.
3476    //   Each "-options" first resets all options and then applies the specified options.
3477    //   A "-" without options resets the options.
3478    //   The options are also reset for each new set.
3479    // Other strings will be spanned.
3480    static const char *const testdata[]={
3481        "[:ID_Continue:]",
3482        "*",
3483        "[:White_Space:]",
3484        "*",
3485        "[]",
3486        "*",
3487        "[\\u0000-\\U0010FFFF]",
3488        "*",
3489        "[\\u0000\\u0080\\u0800\\U00010000]",
3490        "*",
3491        "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3492        "*",
3493        "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3494        "-c",
3495        "*",
3496        "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3497        "-c",
3498        "*",
3499
3500        // Overlapping strings cause overlapping attempts to match.
3501        "[x{xy}{xya}{axy}{ax}]",
3502        "-cl",
3503
3504        // More repetitions of "xya" would take too long with the recursive
3505        // reference implementation.
3506        // containsAll()=FALSE
3507        // test_string 0x14
3508        "xx"
3509        "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3510        "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3511        "xyaxyaxyaxya"
3512        "xx"
3513        "xyaxyaxyaxya"  // span() ends here.
3514        "aaa",
3515
3516        // containsAll()=TRUE
3517        // test_string 0x15
3518        "xx"
3519        "xyaxyaxyaxya"
3520        "xx"
3521        "xyaxyaxyaxya"
3522        "xx"
3523        "xyaxyaxyaxy",
3524
3525        "-bc",
3526        // test_string 0x17
3527        "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3528        "-c",
3529        "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3530        "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3531        "-",
3532        "byaya",     // span() -> { 5 }
3533        "byay",      // span() -> { 4 }
3534        "bya",       // span() -> { 3 }
3535
3536        // span(longest match) will not span the whole string.
3537        "[a{ab}{bc}]",
3538        "-cl",
3539        // test_string 0x21
3540        "abc",
3541
3542        "[a{ab}{abc}{cd}]",
3543        "-cl",
3544        "acdabcdabccd",
3545
3546        // spanBack(longest match) will not span the whole string.
3547        "[c{ab}{bc}]",
3548        "-cl",
3549        "abc",
3550
3551        "[d{cd}{bcd}{ab}]",
3552        "-cl",
3553        "abbcdabcdabd",
3554
3555        // Test with non-ASCII set strings - test proper handling of surrogate pairs
3556        // and UTF-8 trail bytes.
3557        // Copies of above test sets and strings, but transliterated to have
3558        // different code points with similar trail units.
3559        // Previous: a      b         c            d
3560        // Unicode:  042B   30AB      200AB        204AB
3561        // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3562        // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3563        "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3564        "-cl",
3565        "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3566
3567        "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3568        "-cl",
3569        "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3570
3571        // Stress bookkeeping and recursion.
3572        // The following strings are barely doable with the recursive
3573        // reference implementation.
3574        // The not-contained character at the end prevents an early exit from the span().
3575        "[b{bb}]",
3576        "-c",
3577        // test_string 0x33
3578        "bbbbbbbbbbbbbbbbbbbbbbbb-",
3579        // On complement sets, span() and spanBack() get different results
3580        // because b is not in the complement set and there is an odd number of b's
3581        // in the test string.
3582        "-bc",
3583        "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3584
3585        // Test with set strings with an initial or final code point span
3586        // longer than 254.
3587        "[a{" _64_a _64_a _64_a _64_a "b}"
3588          "{a" _64_b _64_b _64_b _64_b "}]",
3589        "-c",
3590        _64_a _64_a _64_a _63_a "b",
3591        _64_a _64_a _64_a _64_a "b",
3592        _64_a _64_a _64_a _64_a "aaaabbbb",
3593        "a" _64_b _64_b _64_b _63_b,
3594        "a" _64_b _64_b _64_b _64_b,
3595        "aaaabbbb" _64_b _64_b _64_b _64_b,
3596
3597        // Test with strings containing unpaired surrogates.
3598        // They are not representable in UTF-8, and a leading trail surrogate
3599        // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3600        // U+20001 == \\uD840\\uDC01
3601        // U+20400 == \\uD841\\uDC00
3602        "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3603        "-8cl",
3604        "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3605    };
3606    uint32_t whichSpans[96]={ SPAN_ALL };
3607    int32_t whichSpansCount=1;
3608
3609    UnicodeSet *sets[SET_COUNT]={ NULL };
3610    const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3611
3612    char testName[1024];
3613    char *testNameLimit=testName;
3614
3615    int32_t i, j;
3616    for(i=0; i<UPRV_LENGTHOF(testdata); ++i) {
3617        const char *s=testdata[i];
3618        if(s[0]=='[') {
3619            // Create new test sets from this pattern.
3620            for(j=0; j<SET_COUNT; ++j) {
3621                delete sets_with_str[j];
3622                delete sets[j];
3623            }
3624            UErrorCode errorCode=U_ZERO_ERROR;
3625            sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3626            if(U_FAILURE(errorCode)) {
3627                dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3628                break;
3629            }
3630            sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3631            sets[SLOW_NOT]->complement();
3632            // Intermediate set: Test cloning of a frozen set.
3633            UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3634            fast->freeze();
3635            sets[FAST]=(UnicodeSet *)fast->clone();
3636            delete fast;
3637            UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3638            fastNot->freeze();
3639            sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3640            delete fastNot;
3641
3642            for(j=0; j<SET_COUNT; ++j) {
3643                sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3644            }
3645
3646            strcpy(testName, s);
3647            testNameLimit=strchr(testName, 0);
3648            *testNameLimit++=':';
3649            *testNameLimit=0;
3650
3651            whichSpans[0]=SPAN_ALL;
3652            whichSpansCount=1;
3653        } else if(s[0]=='-') {
3654            whichSpans[0]=SPAN_ALL;
3655            whichSpansCount=1;
3656
3657            while(*++s!=0) {
3658                switch(*s) {
3659                case 'c':
3660                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3661                                                   ~SPAN_POLARITY,
3662                                                   SPAN_SET,
3663                                                   SPAN_COMPLEMENT,
3664                                                   0);
3665                    break;
3666                case 'b':
3667                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3668                                                   ~SPAN_DIRS,
3669                                                   SPAN_FWD,
3670                                                   SPAN_BACK,
3671                                                   0);
3672                    break;
3673                case 'l':
3674                    // test USET_SPAN_CONTAINED FWD & BACK, and separately
3675                    // USET_SPAN_SIMPLE only FWD, and separately
3676                    // USET_SPAN_SIMPLE only BACK
3677                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3678                                                   ~(SPAN_DIRS|SPAN_CONDITION),
3679                                                   SPAN_DIRS|SPAN_CONTAINED,
3680                                                   SPAN_FWD|SPAN_SIMPLE,
3681                                                   SPAN_BACK|SPAN_SIMPLE);
3682                    break;
3683                case '8':
3684                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3685                                                   ~SPAN_UTFS,
3686                                                   SPAN_UTF16,
3687                                                   SPAN_UTF8,
3688                                                   0);
3689                    break;
3690                default:
3691                    errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3692                    break;
3693                }
3694            }
3695        } else if(0==strcmp(s, "*")) {
3696            strcpy(testNameLimit, "bad_string");
3697            for(j=0; j<whichSpansCount; ++j) {
3698                if(whichSpansCount>1) {
3699                    sprintf(testNameLimit+10 /* strlen("bad_string") */,
3700                            "%%0x%3x",
3701                            whichSpans[j]);
3702                }
3703                testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3704                testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3705            }
3706
3707            strcpy(testNameLimit, "contents");
3708            for(j=0; j<whichSpansCount; ++j) {
3709                if(whichSpansCount>1) {
3710                    sprintf(testNameLimit+8 /* strlen("contents") */,
3711                            "%%0x%3x",
3712                            whichSpans[j]);
3713                }
3714                testSpanContents(sets_with_str, whichSpans[j], testName);
3715            }
3716        } else {
3717            UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3718            strcpy(testNameLimit, "test_string");
3719            for(j=0; j<whichSpansCount; ++j) {
3720                if(whichSpansCount>1) {
3721                    sprintf(testNameLimit+11 /* strlen("test_string") */,
3722                            "%%0x%3x",
3723                            whichSpans[j]);
3724                }
3725                testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3726            }
3727        }
3728    }
3729    for(j=0; j<SET_COUNT; ++j) {
3730        delete sets_with_str[j];
3731        delete sets[j];
3732    }
3733}
3734
3735// Test select patterns and strings, and test USET_SPAN_SIMPLE.
3736void UnicodeSetTest::TestStringSpan() {
3737    static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3738    static const char *const string=
3739        "xx"
3740        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3741        "xx"
3742        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3743        "xx"
3744        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3745        "aaaa";
3746
3747    UErrorCode errorCode=U_ZERO_ERROR;
3748    UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3749    UnicodeSet set(pattern16, errorCode);
3750    if(U_FAILURE(errorCode)) {
3751        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3752        return;
3753    }
3754
3755    UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3756
3757    if(set.containsAll(string16)) {
3758        errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3759    }
3760
3761    // Remove trailing "aaaa".
3762    string16.truncate(string16.length()-4);
3763    if(!set.containsAll(string16)) {
3764        errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3765    }
3766
3767    string16=UNICODE_STRING_SIMPLE("byayaxya");
3768    const UChar *s16=string16.getBuffer();
3769    int32_t length16=string16.length();
3770    (void)length16;   // Suppress set but not used warning.
3771    if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3772        set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3773        set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3774        set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3775        set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3776        set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3777    ) {
3778        errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3779    }
3780
3781    pattern="[a{ab}{abc}{cd}]";
3782    pattern16=UnicodeString(pattern, -1, US_INV);
3783    set.applyPattern(pattern16, errorCode);
3784    if(U_FAILURE(errorCode)) {
3785        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3786        return;
3787    }
3788    string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3789    s16=string16.getBuffer();
3790    length16=string16.length();
3791    if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3792        set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3793        set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3794    ) {
3795        errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3796    }
3797
3798    pattern="[d{cd}{bcd}{ab}]";
3799    pattern16=UnicodeString(pattern, -1, US_INV);
3800    set.applyPattern(pattern16, errorCode).freeze();
3801    if(U_FAILURE(errorCode)) {
3802        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3803        return;
3804    }
3805    string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3806    s16=string16.getBuffer();
3807    length16=string16.length();
3808    if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3809        set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3810        set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3811    ) {
3812        errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3813    }
3814}
3815