1/*
2********************************************************************************
3*   Copyright (C) 1999-2010 International Business Machines Corporation and
4*   others. All Rights Reserved.
5********************************************************************************
6*   Date        Name        Description
7*   10/20/99    alan        Creation.
8*   03/22/2000  Madhu       Added additional tests
9********************************************************************************
10*/
11
12#include <stdio.h>
13
14#include <string.h>
15#include "unicode/utypes.h"
16#include "usettest.h"
17#include "unicode/ucnv.h"
18#include "unicode/uniset.h"
19#include "unicode/uchar.h"
20#include "unicode/usetiter.h"
21#include "unicode/ustring.h"
22#include "unicode/parsepos.h"
23#include "unicode/symtable.h"
24#include "unicode/uversion.h"
25#include "hash.h"
26
/**
 * Number of elements in a statically sized array, as an int32_t.
 * Only meaningful for true arrays; a pointer argument gives a wrong answer.
 * The whole expansion is parenthesized so that it behaves as a single
 * primary expression wherever it is used (e.g. as an operand of sizeof).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
28
/**
 * Reports a data error naming the current file, line and ICU error name
 * when `status` holds a failure code. Expands to a braced block and relies
 * on dataerrln() being callable at the point of use.
 */
#define TEST_ASSERT_SUCCESS(status) { \
    if (U_FAILURE(status)) { \
        dataerrln("fail in file \"%s\", line %d: \"%s\"", \
                  __FILE__, __LINE__, u_errorName(status)); \
    } \
}
32
/**
 * Reports a data error naming the current file and line when `expr`
 * evaluates to false. Expands to a braced block and relies on dataerrln()
 * being callable at the point of use.
 */
#define TEST_ASSERT(expr) { \
    if (!(expr)) { \
        dataerrln("fail in file \"%s\", line %d", __FILE__, __LINE__); \
    } \
}
35
36UnicodeString operator+(const UnicodeString& left, const UnicodeSet& set) {
37    UnicodeString pat;
38    set.toPattern(pat);
39    return left + UnicodeSetTest::escape(pat);
40}
41
/**
 * Expands to one `case` label of the runIndexedTest() switch: stores the
 * stringized test name in `name` and, when `exec` is set, logs a banner and
 * an empty line and then calls the test. Relies on `name`, `exec`, logln()
 * and the test method being in scope; the caller supplies the trailing ';'.
 */
#define CASE(id,test) \
    case id: \
        name = #test; \
        if (exec) { \
            logln(#test "---"); \
            logln(); \
            test(); \
        } \
        break
50
51UnicodeSetTest::UnicodeSetTest() : utf8Cnv(NULL) {
52}
53
54UConverter *UnicodeSetTest::openUTF8Converter() {
55    if(utf8Cnv==NULL) {
56        UErrorCode errorCode=U_ZERO_ERROR;
57        utf8Cnv=ucnv_open("UTF-8", &errorCode);
58    }
59    return utf8Cnv;
60}
61
62UnicodeSetTest::~UnicodeSetTest() {
63    ucnv_close(utf8Cnv);
64}
65
66void
67UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
68                               const char* &name, char* /*par*/) {
69    // if (exec) logln((UnicodeString)"TestSuite UnicodeSetTest");
70    switch (index) {
71        CASE(0,TestPatterns);
72        CASE(1,TestAddRemove);
73        CASE(2,TestCategories);
74        CASE(3,TestCloneEqualHash);
75        CASE(4,TestMinimalRep);
76        CASE(5,TestAPI);
77        CASE(6,TestScriptSet);
78        CASE(7,TestPropertySet);
79        CASE(8,TestClone);
80        CASE(9,TestExhaustive);
81        CASE(10,TestToPattern);
82        CASE(11,TestIndexOf);
83        CASE(12,TestStrings);
84        CASE(13,Testj2268);
85        CASE(14,TestCloseOver);
86        CASE(15,TestEscapePattern);
87        CASE(16,TestInvalidCodePoint);
88        CASE(17,TestSymbolTable);
89        CASE(18,TestSurrogate);
90        CASE(19,TestPosixClasses);
91        CASE(20,TestIteration);
92        CASE(21,TestFreezable);
93        CASE(22,TestSpan);
94        CASE(23,TestStringSpan);
95        default: name = ""; break;
96    }
97}
98
// Marker entry used inside the expected-string lists handed to
// expectToPattern(). NOTE(review): presumably it separates strings the set
// must contain from strings it must not contain -- confirm against
// expectToPattern().
static const char NOT[] = "%%%%";
100
101/**
102 * UVector was improperly copying contents
103 * This code will crash this is still true
104 */
105void UnicodeSetTest::Testj2268() {
106  UnicodeSet t;
107  t.add(UnicodeString("abc"));
108  UnicodeSet test(t);
109  UnicodeString ustrPat;
110  test.toPattern(ustrPat, TRUE);
111}
112
113/**
114 * Test toPattern().
115 */
116void UnicodeSetTest::TestToPattern() {
117    UErrorCode ec = U_ZERO_ERROR;
118
119    // Test that toPattern() round trips with syntax characters and
120    // whitespace.
121    {
122        static const char* OTHER_TOPATTERN_TESTS[] = {
123            "[[:latin:]&[:greek:]]",
124            "[[:latin:]-[:greek:]]",
125            "[:nonspacing mark:]",
126            NULL
127        };
128
129        for (int32_t j=0; OTHER_TOPATTERN_TESTS[j]!=NULL; ++j) {
130            ec = U_ZERO_ERROR;
131            UnicodeSet s(OTHER_TOPATTERN_TESTS[j], ec);
132            if (U_FAILURE(ec)) {
133                dataerrln((UnicodeString)"FAIL: bad pattern " + OTHER_TOPATTERN_TESTS[j] + " - " + UnicodeString(u_errorName(ec)));
134                continue;
135            }
136            checkPat(OTHER_TOPATTERN_TESTS[j], s);
137        }
138
139        for (UChar32 i = 0; i <= 0x10FFFF; ++i) {
140            if ((i <= 0xFF && !u_isalpha(i)) || u_isspace(i)) {
141
142                // check various combinations to make sure they all work.
143                if (i != 0 && !toPatternAux(i, i)){
144                    continue;
145                }
146                if (!toPatternAux(0, i)){
147                    continue;
148                }
149                if (!toPatternAux(i, 0xFFFF)){
150                    continue;
151                }
152            }
153        }
154    }
155
156    // Test pattern behavior of multicharacter strings.
157    {
158        ec = U_ZERO_ERROR;
159        UnicodeSet* s = new UnicodeSet("[a-z {aa} {ab}]", ec);
160
161        // This loop isn't a loop.  It's here to make the compiler happy.
162        // If you're curious, try removing it and changing the 'break'
163        // statements (except for the last) to goto's.
164        for (;;) {
165            if (U_FAILURE(ec)) break;
166            const char* exp1[] = {"aa", "ab", NOT, "ac", NULL};
167            expectToPattern(*s, "[a-z{aa}{ab}]", exp1);
168
169            s->add("ac");
170            const char* exp2[] = {"aa", "ab", "ac", NOT, "xy", NULL};
171            expectToPattern(*s, "[a-z{aa}{ab}{ac}]", exp2);
172
173            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\{l} {r\\}}]"), ec);
174            if (U_FAILURE(ec)) break;
175            const char* exp3[] = {"{l", "r}", NOT, "xy", NULL};
176            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{r\\}}{\\{l}]"), exp3);
177
178            s->add("[]");
179            const char* exp4[] = {"{l", "r}", "[]", NOT, "xy", NULL};
180            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\[\\]}{r\\}}{\\{l}]"), exp4);
181
182            s->applyPattern(UNICODE_STRING_SIMPLE("[a-z {\\u4E01\\u4E02}{\\n\\r}]"), ec);
183            if (U_FAILURE(ec)) break;
184            const char* exp5[] = {"\\u4E01\\u4E02", "\n\r", NULL};
185            expectToPattern(*s, UNICODE_STRING_SIMPLE("[a-z{\\u000A\\u000D}{\\u4E01\\u4E02}]"), exp5);
186
187            // j2189
188            s->clear();
189            s->add(UnicodeString("abc", ""));
190            s->add(UnicodeString("abc", ""));
191            const char* exp6[] = {"abc", NOT, "ab", NULL};
192            expectToPattern(*s, "[{abc}]", exp6);
193
194            break;
195        }
196
197        if (U_FAILURE(ec)) errln("FAIL: pattern parse error");
198        delete s;
199    }
200
201    // JB#3400: For 2 character ranges prefer [ab] to [a-b]
202    UnicodeSet s;
203    s.add((UChar)97, (UChar)98); // 'a', 'b'
204    expectToPattern(s, "[ab]", NULL);
205}
206
207UBool UnicodeSetTest::toPatternAux(UChar32 start, UChar32 end) {
208
209    // use Integer.toString because Utility.hex doesn't handle ints
210    UnicodeString pat = "";
211    // TODO do these in hex
212    //String source = "0x" + Integer.toString(start,16).toUpperCase();
213    //if (start != end) source += "..0x" + Integer.toString(end,16).toUpperCase();
214    UnicodeString source;
215    source = source + (uint32_t)start;
216    if (start != end)
217        source = source + ".." + (uint32_t)end;
218    UnicodeSet testSet;
219    testSet.add(start, end);
220    return checkPat(source, testSet);
221}
222
223UBool UnicodeSetTest::checkPat(const UnicodeString& source,
224                               const UnicodeSet& testSet) {
225    // What we want to make sure of is that a pattern generated
226    // by toPattern(), with or without escaped unprintables, can
227    // be passed back into the UnicodeSet constructor.
228    UnicodeString pat0;
229
230    testSet.toPattern(pat0, TRUE);
231
232    if (!checkPat(source + " (escaped)", testSet, pat0)) return FALSE;
233
234    //String pat1 = unescapeLeniently(pat0);
235    //if (!checkPat(source + " (in code)", testSet, pat1)) return false;
236
237    UnicodeString pat2;
238    testSet.toPattern(pat2, FALSE);
239    if (!checkPat(source, testSet, pat2)) return FALSE;
240
241    //String pat3 = unescapeLeniently(pat2);
242    // if (!checkPat(source + " (in code)", testSet, pat3)) return false;
243
244    //logln(source + " => " + pat0 + ", " + pat1 + ", " + pat2 + ", " + pat3);
245    logln((UnicodeString)source + " => " + pat0 + ", " + pat2);
246    return TRUE;
247}
248
249UBool UnicodeSetTest::checkPat(const UnicodeString& source,
250                               const UnicodeSet& testSet,
251                               const UnicodeString& pat) {
252    UErrorCode ec = U_ZERO_ERROR;
253    UnicodeSet testSet2(pat, ec);
254    if (testSet2 != testSet) {
255        errln((UnicodeString)"Fail toPattern: " + source + " => " + pat);
256        return FALSE;
257    }
258    return TRUE;
259}
260
261void
262UnicodeSetTest::TestPatterns(void) {
263    UnicodeSet set;
264    expectPattern(set, UnicodeString("[[a-m]&[d-z]&[k-y]]", ""),  "km");
265    expectPattern(set, UnicodeString("[[a-z]-[m-y]-[d-r]]", ""),  "aczz");
266    expectPattern(set, UnicodeString("[a\\-z]", ""),  "--aazz");
267    expectPattern(set, UnicodeString("[-az]", ""),  "--aazz");
268    expectPattern(set, UnicodeString("[az-]", ""),  "--aazz");
269    expectPattern(set, UnicodeString("[[[a-z]-[aeiou]i]]", ""), "bdfnptvz");
270
271    // Throw in a test of complement
272    set.complement();
273    UnicodeString exp;
274    exp.append((UChar)0x0000).append("aeeoouu").append((UChar)(0x007a+1)).append((UChar)0xFFFF);
275    expectPairs(set, exp);
276}
277
278void
279UnicodeSetTest::TestCategories(void) {
280    UErrorCode status = U_ZERO_ERROR;
281    const char* pat = " [:Lu:] "; // Whitespace ok outside [:..:]
282    UnicodeSet set(pat, status);
283    if (U_FAILURE(status)) {
284        dataerrln((UnicodeString)"Fail: Can't construct set with " + pat + " - " + UnicodeString(u_errorName(status)));
285        return;
286    } else {
287        expectContainment(set, pat, "ABC", "abc");
288    }
289
290    UChar32 i;
291    int32_t failures = 0;
292    // Make sure generation of L doesn't pollute cached Lu set
293    // First generate L, then Lu
294    set.applyPattern("[:L:]", status);
295    if (U_FAILURE(status)) { errln("FAIL"); return; }
296    for (i=0; i<0x200; ++i) {
297        UBool l = u_isalpha((UChar)i);
298        if (l != set.contains(i)) {
299            errln((UnicodeString)"FAIL: L contains " + (unsigned short)i + " = " +
300                  set.contains(i));
301            if (++failures == 10) break;
302        }
303    }
304
305    set.applyPattern("[:Lu:]", status);
306    if (U_FAILURE(status)) { errln("FAIL"); return; }
307    for (i=0; i<0x200; ++i) {
308        UBool lu = (u_charType((UChar)i) == U_UPPERCASE_LETTER);
309        if (lu != set.contains(i)) {
310            errln((UnicodeString)"FAIL: Lu contains " + (unsigned short)i + " = " +
311                  set.contains(i));
312            if (++failures == 20) break;
313        }
314    }
315}
316void
317UnicodeSetTest::TestCloneEqualHash(void) {
318    UErrorCode status = U_ZERO_ERROR;
319    // set1 and set2 used to be built with the obsolete constructor taking
320    // UCharCategory values; replaced with pattern constructors
321    // markus 20030502
322    UnicodeSet *set1=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Lowercase Letter}"), status); //  :Ll: Letter, lowercase
323    UnicodeSet *set1a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Ll:]"), status); //  Letter, lowercase
324    if (U_FAILURE(status)){
325        dataerrln((UnicodeString)"FAIL: Can't construst set with category->Ll" + " - " + UnicodeString(u_errorName(status)));
326        return;
327    }
328    UnicodeSet *set2=new UnicodeSet(UNICODE_STRING_SIMPLE("\\p{Decimal Number}"), status);   //Number, Decimal digit
329    UnicodeSet *set2a=new UnicodeSet(UNICODE_STRING_SIMPLE("[:Nd:]"), status);   //Number, Decimal digit
330    if (U_FAILURE(status)){
331        errln((UnicodeString)"FAIL: Can't construct set with category->Nd");
332        return;
333    }
334
335    if (*set1 != *set1a) {
336        errln("FAIL: category constructor for Ll broken");
337    }
338    if (*set2 != *set2a) {
339        errln("FAIL: category constructor for Nd broken");
340    }
341    delete set1a;
342    delete set2a;
343
344    logln("Testing copy construction");
345    UnicodeSet *set1copy=new UnicodeSet(*set1);
346    if(*set1 != *set1copy || *set1 == *set2 ||
347        getPairs(*set1) != getPairs(*set1copy) ||
348        set1->hashCode() != set1copy->hashCode()){
349        errln("FAIL : Error in copy construction");
350        return;
351    }
352
353    logln("Testing =operator");
354    UnicodeSet set1equal=*set1;
355    UnicodeSet set2equal=*set2;
356    if(set1equal != *set1 || set1equal != *set1copy || set2equal != *set2 ||
357        set2equal == *set1 || set2equal == *set1copy || set2equal == set1equal){
358        errln("FAIL: Error in =operator");
359    }
360
361    logln("Testing clone()");
362    UnicodeSet *set1clone=(UnicodeSet*)set1->clone();
363    UnicodeSet *set2clone=(UnicodeSet*)set2->clone();
364    if(*set1clone != *set1 || *set1clone != *set1copy || *set1clone != set1equal ||
365        *set2clone != *set2 || *set2clone == *set1copy || *set2clone != set2equal ||
366        *set2clone == *set1 || *set2clone == set1equal || *set2clone == *set1clone){
367        errln("FAIL: Error in clone");
368    }
369
370    logln("Testing hashcode");
371    if(set1->hashCode() != set1equal.hashCode() || set1->hashCode() != set1clone->hashCode() ||
372        set2->hashCode() != set2equal.hashCode() || set2->hashCode() != set2clone->hashCode() ||
373        set1copy->hashCode() != set1equal.hashCode() || set1copy->hashCode() != set1clone->hashCode() ||
374        set1->hashCode() == set2->hashCode()  || set1copy->hashCode() == set2->hashCode() ||
375        set2->hashCode() == set1clone->hashCode() || set2->hashCode() == set1equal.hashCode() ){
376        errln("FAIL: Error in hashCode()");
377    }
378
379    delete set1;
380    delete set1copy;
381    delete set2;
382    delete set1clone;
383    delete set2clone;
384
385
386}
387void
388UnicodeSetTest::TestAddRemove(void) {
389    UnicodeSet set; // Construct empty set
390    doAssert(set.isEmpty() == TRUE, "set should be empty");
391    doAssert(set.size() == 0, "size should be 0");
392    set.complement();
393    doAssert(set.size() == 0x110000, "size should be 0x110000");
394    set.clear();
395    set.add(0x0061, 0x007a);
396    expectPairs(set, "az");
397    doAssert(set.isEmpty() == FALSE, "set should not be empty");
398    doAssert(set.size() != 0, "size should not be equal to 0");
399    doAssert(set.size() == 26, "size should be equal to 26");
400    set.remove(0x006d, 0x0070);
401    expectPairs(set, "alqz");
402    doAssert(set.size() == 22, "size should be equal to 22");
403    set.remove(0x0065, 0x0067);
404    expectPairs(set, "adhlqz");
405    doAssert(set.size() == 19, "size should be equal to 19");
406    set.remove(0x0064, 0x0069);
407    expectPairs(set, "acjlqz");
408    doAssert(set.size() == 16, "size should be equal to 16");
409    set.remove(0x0063, 0x0072);
410    expectPairs(set, "absz");
411    doAssert(set.size() == 10, "size should be equal to 10");
412    set.add(0x0066, 0x0071);
413    expectPairs(set, "abfqsz");
414    doAssert(set.size() == 22, "size should be equal to 22");
415    set.remove(0x0061, 0x0067);
416    expectPairs(set, "hqsz");
417    set.remove(0x0061, 0x007a);
418    expectPairs(set, "");
419    doAssert(set.isEmpty() == TRUE, "set should be empty");
420    doAssert(set.size() == 0, "size should be 0");
421    set.add(0x0061);
422    doAssert(set.isEmpty() == FALSE, "set should not be empty");
423    doAssert(set.size() == 1, "size should not be equal to 1");
424    set.add(0x0062);
425    set.add(0x0063);
426    expectPairs(set, "ac");
427    doAssert(set.size() == 3, "size should not be equal to 3");
428    set.add(0x0070);
429    set.add(0x0071);
430    expectPairs(set, "acpq");
431    doAssert(set.size() == 5, "size should not be equal to 5");
432    set.clear();
433    expectPairs(set, "");
434    doAssert(set.isEmpty() == TRUE, "set should be empty");
435    doAssert(set.size() == 0, "size should be 0");
436
437    // Try removing an entire set from another set
438    expectPattern(set, "[c-x]", "cx");
439    UnicodeSet set2;
440    expectPattern(set2, "[f-ky-za-bc[vw]]", "acfkvwyz");
441    set.removeAll(set2);
442    expectPairs(set, "deluxx");
443
444    // Try adding an entire set to another set
445    expectPattern(set, "[jackiemclean]", "aacceein");
446    expectPattern(set2, "[hitoshinamekatajamesanderson]", "aadehkmort");
447    set.addAll(set2);
448    expectPairs(set, "aacehort");
449    doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
450
451    // Try retaining an set of elements contained in another set (intersection)
452    UnicodeSet set3;
453    expectPattern(set3, "[a-c]", "ac");
454    doAssert(set.containsAll(set3) == FALSE, "set doesn't contain all the elements in set3");
455    set3.remove(0x0062);
456    expectPairs(set3, "aacc");
457    doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
458    set.retainAll(set3);
459    expectPairs(set, "aacc");
460    doAssert(set.size() == set3.size(), "set.size() should be set3.size()");
461    doAssert(set.containsAll(set3) == TRUE, "set should contain all the elements in set3");
462    set.clear();
463    doAssert(set.size() != set3.size(), "set.size() != set3.size()");
464
465    // Test commutativity
466    expectPattern(set, "[hitoshinamekatajamesanderson]", "aadehkmort");
467    expectPattern(set2, "[jackiemclean]", "aacceein");
468    set.addAll(set2);
469    expectPairs(set, "aacehort");
470    doAssert(set.containsAll(set2) == TRUE, "set should contain all the elements in set2");
471
472
473
474
475}
476
477/**
478 * Make sure minimal representation is maintained.
479 */
480void UnicodeSetTest::TestMinimalRep() {
481    UErrorCode status = U_ZERO_ERROR;
482    // This is pretty thoroughly tested by checkCanonicalRep()
483    // run against the exhaustive operation results.  Use the code
484    // here for debugging specific spot problems.
485
486    // 1 overlap against 2
487    UnicodeSet set("[h-km-q]", status);
488    if (U_FAILURE(status)) { errln("FAIL"); return; }
489    UnicodeSet set2("[i-o]", status);
490    if (U_FAILURE(status)) { errln("FAIL"); return; }
491    set.addAll(set2);
492    expectPairs(set, "hq");
493    // right
494    set.applyPattern("[a-m]", status);
495    if (U_FAILURE(status)) { errln("FAIL"); return; }
496    set2.applyPattern("[e-o]", status);
497    if (U_FAILURE(status)) { errln("FAIL"); return; }
498    set.addAll(set2);
499    expectPairs(set, "ao");
500    // left
501    set.applyPattern("[e-o]", status);
502    if (U_FAILURE(status)) { errln("FAIL"); return; }
503    set2.applyPattern("[a-m]", status);
504    if (U_FAILURE(status)) { errln("FAIL"); return; }
505    set.addAll(set2);
506    expectPairs(set, "ao");
507    // 1 overlap against 3
508    set.applyPattern("[a-eg-mo-w]", status);
509    if (U_FAILURE(status)) { errln("FAIL"); return; }
510    set2.applyPattern("[d-q]", status);
511    if (U_FAILURE(status)) { errln("FAIL"); return; }
512    set.addAll(set2);
513    expectPairs(set, "aw");
514}
515
516void UnicodeSetTest::TestAPI() {
517    UErrorCode status = U_ZERO_ERROR;
518    // default ct
519    UnicodeSet set;
520    if (!set.isEmpty() || set.getRangeCount() != 0) {
521        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
522              set);
523    }
524
525    // clear(), isEmpty()
526    set.add(0x0061);
527    if (set.isEmpty()) {
528        errln((UnicodeString)"FAIL, set shouldn't be empty but is: " +
529              set);
530    }
531    set.clear();
532    if (!set.isEmpty()) {
533        errln((UnicodeString)"FAIL, set should be empty but isn't: " +
534              set);
535    }
536
537    // size()
538    set.clear();
539    if (set.size() != 0) {
540        errln((UnicodeString)"FAIL, size should be 0, but is " + set.size() +
541              ": " + set);
542    }
543    set.add(0x0061);
544    if (set.size() != 1) {
545        errln((UnicodeString)"FAIL, size should be 1, but is " + set.size() +
546              ": " + set);
547    }
548    set.add(0x0031, 0x0039);
549    if (set.size() != 10) {
550        errln((UnicodeString)"FAIL, size should be 10, but is " + set.size() +
551              ": " + set);
552    }
553
554    // contains(first, last)
555    set.clear();
556    set.applyPattern("[A-Y 1-8 b-d l-y]", status);
557    if (U_FAILURE(status)) { errln("FAIL"); return; }
558    for (int32_t i = 0; i<set.getRangeCount(); ++i) {
559        UChar32 a = set.getRangeStart(i);
560        UChar32 b = set.getRangeEnd(i);
561        if (!set.contains(a, b)) {
562            errln((UnicodeString)"FAIL, should contain " + (unsigned short)a + '-' + (unsigned short)b +
563                  " but doesn't: " + set);
564        }
565        if (set.contains((UChar32)(a-1), b)) {
566            errln((UnicodeString)"FAIL, shouldn't contain " +
567                  (unsigned short)(a-1) + '-' + (unsigned short)b +
568                  " but does: " + set);
569        }
570        if (set.contains(a, (UChar32)(b+1))) {
571            errln((UnicodeString)"FAIL, shouldn't contain " +
572                  (unsigned short)a + '-' + (unsigned short)(b+1) +
573                  " but does: " + set);
574        }
575    }
576
577    // Ported InversionList test.
578    UnicodeSet a((UChar32)3,(UChar32)10);
579    UnicodeSet b((UChar32)7,(UChar32)15);
580    UnicodeSet c;
581
582    logln((UnicodeString)"a [3-10]: " + a);
583    logln((UnicodeString)"b [7-15]: " + b);
584    c = a;
585    c.addAll(b);
586    UnicodeSet exp((UChar32)3,(UChar32)15);
587    if (c == exp) {
588        logln((UnicodeString)"c.set(a).add(b): " + c);
589    } else {
590        errln((UnicodeString)"FAIL: c.set(a).add(b) = " + c + ", expect " + exp);
591    }
592    c.complement();
593    exp.set((UChar32)0, (UChar32)2);
594    exp.add((UChar32)16, UnicodeSet::MAX_VALUE);
595    if (c == exp) {
596        logln((UnicodeString)"c.complement(): " + c);
597    } else {
598        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
599    }
600    c.complement();
601    exp.set((UChar32)3, (UChar32)15);
602    if (c == exp) {
603        logln((UnicodeString)"c.complement(): " + c);
604    } else {
605        errln((UnicodeString)"FAIL: c.complement() = " + c + ", expect " + exp);
606    }
607    c = a;
608    c.complementAll(b);
609    exp.set((UChar32)3,(UChar32)6);
610    exp.add((UChar32)11,(UChar32) 15);
611    if (c == exp) {
612        logln((UnicodeString)"c.set(a).exclusiveOr(b): " + c);
613    } else {
614        errln((UnicodeString)"FAIL: c.set(a).exclusiveOr(b) = " + c + ", expect " + exp);
615    }
616
617    exp = c;
618    bitsToSet(setToBits(c), c);
619    if (c == exp) {
620        logln((UnicodeString)"bitsToSet(setToBits(c)): " + c);
621    } else {
622        errln((UnicodeString)"FAIL: bitsToSet(setToBits(c)) = " + c + ", expect " + exp);
623    }
624
625    // Additional tests for coverage JB#2118
626    //UnicodeSet::complement(class UnicodeString const &)
627    //UnicodeSet::complementAll(class UnicodeString const &)
628    //UnicodeSet::containsNone(class UnicodeSet const &)
629    //UnicodeSet::containsNone(long,long)
630    //UnicodeSet::containsSome(class UnicodeSet const &)
631    //UnicodeSet::containsSome(long,long)
632    //UnicodeSet::removeAll(class UnicodeString const &)
633    //UnicodeSet::retain(long)
634    //UnicodeSet::retainAll(class UnicodeString const &)
635    //UnicodeSet::serialize(unsigned short *,long,enum UErrorCode &)
636    //UnicodeSetIterator::getString(void)
637    set.clear();
638    set.complement("ab");
639    exp.applyPattern("[{ab}]", status);
640    if (U_FAILURE(status)) { errln("FAIL"); return; }
641    if (set != exp) { errln("FAIL: complement(\"ab\")"); return; }
642
643    UnicodeSetIterator iset(set);
644    if (!iset.next() || !iset.isString()) {
645        errln("FAIL: UnicodeSetIterator::next/isString");
646    } else if (iset.getString() != "ab") {
647        errln("FAIL: UnicodeSetIterator::getString");
648    }
649
650    set.add((UChar32)0x61, (UChar32)0x7A);
651    set.complementAll("alan");
652    exp.applyPattern("[{ab}b-kmo-z]", status);
653    if (U_FAILURE(status)) { errln("FAIL"); return; }
654    if (set != exp) { errln("FAIL: complementAll(\"alan\")"); return; }
655
656    exp.applyPattern("[a-z]", status);
657    if (U_FAILURE(status)) { errln("FAIL"); return; }
658    if (set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
659    if (!set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
660    exp.applyPattern("[aln]", status);
661    if (U_FAILURE(status)) { errln("FAIL"); return; }
662    if (!set.containsNone(exp)) { errln("FAIL: containsNone(UnicodeSet)"); }
663    if (set.containsSome(exp)) { errln("FAIL: containsSome(UnicodeSet)"); }
664
665    if (set.containsNone((UChar32)0x61, (UChar32)0x7A)) {
666        errln("FAIL: containsNone(UChar32, UChar32)");
667    }
668    if (!set.containsSome((UChar32)0x61, (UChar32)0x7A)) {
669        errln("FAIL: containsSome(UChar32, UChar32)");
670    }
671    if (!set.containsNone((UChar32)0x41, (UChar32)0x5A)) {
672        errln("FAIL: containsNone(UChar32, UChar32)");
673    }
674    if (set.containsSome((UChar32)0x41, (UChar32)0x5A)) {
675        errln("FAIL: containsSome(UChar32, UChar32)");
676    }
677
678    set.removeAll("liu");
679    exp.applyPattern("[{ab}b-hj-kmo-tv-z]", status);
680    if (U_FAILURE(status)) { errln("FAIL"); return; }
681    if (set != exp) { errln("FAIL: removeAll(\"liu\")"); return; }
682
683    set.retainAll("star");
684    exp.applyPattern("[rst]", status);
685    if (U_FAILURE(status)) { errln("FAIL"); return; }
686    if (set != exp) { errln("FAIL: retainAll(\"star\")"); return; }
687
688    set.retain((UChar32)0x73);
689    exp.applyPattern("[s]", status);
690    if (U_FAILURE(status)) { errln("FAIL"); return; }
691    if (set != exp) { errln("FAIL: retain('s')"); return; }
692
693    uint16_t buf[32];
694    int32_t slen = set.serialize(buf, sizeof(buf)/sizeof(buf[0]), status);
695    if (U_FAILURE(status)) { errln("FAIL: serialize"); return; }
696    if (slen != 3 || buf[0] != 2 || buf[1] != 0x73 || buf[2] != 0x74) {
697        errln("FAIL: serialize");
698        return;
699    }
700
701    // Conversions to and from USet
702    UnicodeSet *uniset = &set;
703    USet *uset = uniset->toUSet();
704    TEST_ASSERT((void *)uset == (void *)uniset);
705    UnicodeSet *setx = UnicodeSet::fromUSet(uset);
706    TEST_ASSERT((void *)setx == (void *)uset);
707    const UnicodeSet *constSet = uniset;
708    const USet *constUSet = constSet->toUSet();
709    TEST_ASSERT((void *)constUSet == (void *)constSet);
710    const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
711    TEST_ASSERT((void *)constSetx == (void *)constUSet);
712
713    // span(UnicodeString) and spanBack(UnicodeString) convenience methods
714    UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
715    UnicodeSet ac(0x61, 0x63);
716    ac.remove(0x62).freeze();
717    if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
718        ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
719        ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
720        ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
721        ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
722        ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
723        ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
724        ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
725        ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
726        ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
727    ) {
728        errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
729    }
730    if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
731        ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
732        ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
733        ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
734        ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
735        ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
736        ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
737        ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
738        ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
739        ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
740    ) {
741        errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
742    }
743}
744
745void UnicodeSetTest::TestIteration() {
746    UErrorCode ec = U_ZERO_ERROR;
747    int i = 0;
748    int outerLoop;
749
750    // 6 code points, 3 ranges, 2 strings, 8 total elements
751    //   Iteration will access them in sorted order -  a, b, c, y, z, U0001abcd, "str1", "str2"
752    UnicodeSet set(UNICODE_STRING_SIMPLE("[zabyc\\U0001abcd{str1}{str2}]"), ec);
753    TEST_ASSERT_SUCCESS(ec);
754    UnicodeSetIterator it(set);
755
756    for (outerLoop=0; outerLoop<3; outerLoop++) {
757        // Run the test multiple times, to check that iterator.reset() is working.
758        for (i=0; i<10; i++) {
759            UBool         nextv        = it.next();
760            UBool         isString     = it.isString();
761            int32_t       codePoint    = it.getCodepoint();
762            //int32_t       codePointEnd = it.getCodepointEnd();
763            UnicodeString s   = it.getString();
764            switch (i) {
765            case 0:
766                TEST_ASSERT(nextv == TRUE);
767                TEST_ASSERT(isString == FALSE);
768                TEST_ASSERT(codePoint==0x61);
769                TEST_ASSERT(s == "a");
770                break;
771            case 1:
772                TEST_ASSERT(nextv == TRUE);
773                TEST_ASSERT(isString == FALSE);
774                TEST_ASSERT(codePoint==0x62);
775                TEST_ASSERT(s == "b");
776                break;
777            case 2:
778                TEST_ASSERT(nextv == TRUE);
779                TEST_ASSERT(isString == FALSE);
780                TEST_ASSERT(codePoint==0x63);
781                TEST_ASSERT(s == "c");
782                break;
783            case 3:
784                TEST_ASSERT(nextv == TRUE);
785                TEST_ASSERT(isString == FALSE);
786                TEST_ASSERT(codePoint==0x79);
787                TEST_ASSERT(s == "y");
788                break;
789            case 4:
790                TEST_ASSERT(nextv == TRUE);
791                TEST_ASSERT(isString == FALSE);
792                TEST_ASSERT(codePoint==0x7a);
793                TEST_ASSERT(s == "z");
794                break;
795            case 5:
796                TEST_ASSERT(nextv == TRUE);
797                TEST_ASSERT(isString == FALSE);
798                TEST_ASSERT(codePoint==0x1abcd);
799                TEST_ASSERT(s == UnicodeString((UChar32)0x1abcd));
800                break;
801            case 6:
802                TEST_ASSERT(nextv == TRUE);
803                TEST_ASSERT(isString == TRUE);
804                TEST_ASSERT(s == "str1");
805                break;
806            case 7:
807                TEST_ASSERT(nextv == TRUE);
808                TEST_ASSERT(isString == TRUE);
809                TEST_ASSERT(s == "str2");
810                break;
811            case 8:
812                TEST_ASSERT(nextv == FALSE);
813                break;
814            case 9:
815                TEST_ASSERT(nextv == FALSE);
816                break;
817            }
818        }
819        it.reset();  // prepare to run the iteration again.
820    }
821}
822
823
824
825
826void UnicodeSetTest::TestStrings() {
827    UErrorCode ec = U_ZERO_ERROR;
828
829    UnicodeSet* testList[] = {
830        UnicodeSet::createFromAll("abc"),
831        new UnicodeSet("[a-c]", ec),
832
833        &(UnicodeSet::createFrom("ch")->add('a','z').add("ll")),
834        new UnicodeSet("[{ll}{ch}a-z]", ec),
835
836        UnicodeSet::createFrom("ab}c"),
837        new UnicodeSet("[{ab\\}c}]", ec),
838
839        &((new UnicodeSet('a','z'))->add('A', 'Z').retain('M','m').complement('X')),
840        new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]", ec),
841
842        NULL
843    };
844
845    if (U_FAILURE(ec)) {
846        errln("FAIL: couldn't construct test sets");
847    }
848
849    for (int32_t i = 0; testList[i] != NULL; i+=2) {
850        if (U_SUCCESS(ec)) {
851            UnicodeString pat0, pat1;
852            testList[i]->toPattern(pat0, TRUE);
853            testList[i+1]->toPattern(pat1, TRUE);
854            if (*testList[i] == *testList[i+1]) {
855                logln((UnicodeString)"Ok: " + pat0 + " == " + pat1);
856            } else {
857                logln((UnicodeString)"FAIL: " + pat0 + " != " + pat1);
858            }
859        }
860        delete testList[i];
861        delete testList[i+1];
862    }
863}
864
865/**
866 * Test the [:Latin:] syntax.
867 */
868void UnicodeSetTest::TestScriptSet() {
869    expectContainment(UNICODE_STRING_SIMPLE("[:Latin:]"), "aA", CharsToUnicodeString("\\u0391\\u03B1"));
870
871    expectContainment(UNICODE_STRING_SIMPLE("[:Greek:]"), CharsToUnicodeString("\\u0391\\u03B1"), "aA");
872
873    /* Jitterbug 1423 */
874    expectContainment(UNICODE_STRING_SIMPLE("[[:Common:][:Inherited:]]"), CharsToUnicodeString("\\U00003099\\U0001D169\\u0000"), "aA");
875
876}
877
878/**
879 * Test the [:Latin:] syntax.
880 */
881void UnicodeSetTest::TestPropertySet() {
882    static const char* const DATA[] = {
883        // Pattern, Chars IN, Chars NOT in
884
885        "[:Latin:]",
886        "aA",
887        "\\u0391\\u03B1",
888
889        "[\\p{Greek}]",
890        "\\u0391\\u03B1",
891        "aA",
892
893        "\\P{ GENERAL Category = upper case letter }",
894        "abc",
895        "ABC",
896
897#if !UCONFIG_NO_NORMALIZATION
898        // Combining class: @since ICU 2.2
899        // Check both symbolic and numeric
900        "\\p{ccc=Nukta}",
901        "\\u0ABC",
902        "abc",
903
904        "\\p{Canonical Combining Class = 11}",
905        "\\u05B1",
906        "\\u05B2",
907
908        "[:c c c = iota subscript :]",
909        "\\u0345",
910        "xyz",
911#endif
912
913        // Bidi class: @since ICU 2.2
914        "\\p{bidiclass=lefttoright}",
915        "abc",
916        "\\u0671\\u0672",
917
918        // Binary properties: @since ICU 2.2
919        "\\p{ideographic}",
920        "\\u4E0A",
921        "x",
922
923        "[:math=false:]",
924        "q)*(",
925        // weiv: )(and * were removed from math in Unicode 4.0.1
926        //"(*+)",
927        "+<>^",
928
929        // JB#1767 \N{}, \p{ASCII}
930        "[:Ascii:]",
931        "abc\\u0000\\u007F",
932        "\\u0080\\u4E00",
933
934        "[\\N{ latin small letter  a  }[:name= latin small letter z:]]",
935        "az",
936        "qrs",
937
938        // JB#2015
939        "[:any:]",
940        "a\\U0010FFFF",
941        "",
942
943        "[:nv=0.5:]",
944        "\\u00BD\\u0F2A",
945        "\\u00BC",
946
947        // JB#2653: Age
948        "[:Age=1.1:]",
949        "\\u03D6", // 1.1
950        "\\u03D8\\u03D9", // 3.2
951
952        "[:Age=3.1:]",
953        "\\u1800\\u3400\\U0002f800",
954        "\\u0220\\u034f\\u30ff\\u33ff\\ufe73\\U00010000\\U00050000",
955
956        // JB#2350: Case_Sensitive
957        "[:Case Sensitive:]",
958        "A\\u1FFC\\U00010410",
959        ";\\u00B4\\U00010500",
960
961        // JB#2832: C99-compatibility props
962        "[:blank:]",
963        " \\u0009",
964        "1-9A-Z",
965
966        "[:graph:]",
967        "19AZ",
968        " \\u0003\\u0007\\u0009\\u000A\\u000D",
969
970        "[:punct:]",
971        "!@#%&*()[]{}-_\\/;:,.?'\"",
972        "09azAZ",
973
974        "[:xdigit:]",
975        "09afAF",
976        "gG!",
977
978        // Regex compatibility test
979        "[-b]", // leading '-' is literal
980        "-b",
981        "ac",
982
983        "[^-b]", // leading '-' is literal
984        "ac",
985        "-b",
986
987        "[b-]", // trailing '-' is literal
988        "-b",
989        "ac",
990
991        "[^b-]", // trailing '-' is literal
992        "ac",
993        "-b",
994
995        "[a-b-]", // trailing '-' is literal
996        "ab-",
997        "c=",
998
999        "[[a-q]&[p-z]-]", // trailing '-' is literal
1000        "pq-",
1001        "or=",
1002
1003        "[\\s|\\)|:|$|\\>]", // from regex tests
1004        "s|):$>",
1005        "abc",
1006
1007        "[\\uDC00cd]", // JB#2906: isolated trail at start
1008        "cd\\uDC00",
1009        "ab\\uD800\\U00010000",
1010
1011        "[ab\\uD800]", // JB#2906: isolated trail at start
1012        "ab\\uD800",
1013        "cd\\uDC00\\U00010000",
1014
1015        "[ab\\uD800cd]", // JB#2906: isolated lead in middle
1016        "abcd\\uD800",
1017        "ef\\uDC00\\U00010000",
1018
1019        "[ab\\uDC00cd]", // JB#2906: isolated trail in middle
1020        "abcd\\uDC00",
1021        "ef\\uD800\\U00010000",
1022
1023#if !UCONFIG_NO_NORMALIZATION
1024        "[:^lccc=0:]", // Lead canonical class
1025        "\\u0300\\u0301",
1026        "abcd\\u00c0\\u00c5",
1027
1028        "[:^tccc=0:]", // Trail canonical class
1029        "\\u0300\\u0301\\u00c0\\u00c5",
1030        "abcd",
1031
1032        "[[:^lccc=0:][:^tccc=0:]]", // Lead and trail canonical class
1033        "\\u0300\\u0301\\u00c0\\u00c5",
1034        "abcd",
1035
1036        "[[:^lccc=0:]-[:^tccc=0:]]", // Stuff that starts with an accent but ends with a base (none right now)
1037        "",
1038        "abcd\\u0300\\u0301\\u00c0\\u00c5",
1039
1040        "[[:ccc=0:]-[:lccc=0:]-[:tccc=0:]]", // Weirdos. Complete canonical class is zero, but both lead and trail are not
1041        "\\u0F73\\u0F75\\u0F81",
1042        "abcd\\u0300\\u0301\\u00c0\\u00c5",
1043#endif /* !UCONFIG_NO_NORMALIZATION */
1044
1045        "[:Assigned:]",
1046        "A\\uE000\\uF8FF\\uFDC7\\U00010000\\U0010FFFD",
1047        "\\u0888\\uFDD3\\uFFFE\\U00050005"
1048    };
1049
1050    static const int32_t DATA_LEN = sizeof(DATA)/sizeof(DATA[0]);
1051
1052    for (int32_t i=0; i<DATA_LEN; i+=3) {
1053        expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]),
1054                          CharsToUnicodeString(DATA[i+2]));
1055    }
1056}
1057
1058/**
1059  * Test that Posix style character classes [:digit:], etc.
1060  *   have the Unicode definitions from TR 18.
1061  */
1062void UnicodeSetTest::TestPosixClasses() {
1063    {
1064        UErrorCode status = U_ZERO_ERROR;
1065        UnicodeSet s1("[:alpha:]", status);
1066        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Alphabetic}"), status);
1067        TEST_ASSERT_SUCCESS(status);
1068        TEST_ASSERT(s1==s2);
1069    }
1070    {
1071        UErrorCode status = U_ZERO_ERROR;
1072        UnicodeSet s1("[:lower:]", status);
1073        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{lowercase}"), status);
1074        TEST_ASSERT_SUCCESS(status);
1075        TEST_ASSERT(s1==s2);
1076    }
1077    {
1078        UErrorCode status = U_ZERO_ERROR;
1079        UnicodeSet s1("[:upper:]", status);
1080        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Uppercase}"), status);
1081        TEST_ASSERT_SUCCESS(status);
1082        TEST_ASSERT(s1==s2);
1083    }
1084    {
1085        UErrorCode status = U_ZERO_ERROR;
1086        UnicodeSet s1("[:punct:]", status);
1087        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=Punctuation}"), status);
1088        TEST_ASSERT_SUCCESS(status);
1089        TEST_ASSERT(s1==s2);
1090    }
1091    {
1092        UErrorCode status = U_ZERO_ERROR;
1093        UnicodeSet s1("[:digit:]", status);
1094        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{gc=DecimalNumber}"), status);
1095        TEST_ASSERT_SUCCESS(status);
1096        TEST_ASSERT(s1==s2);
1097    }
1098    {
1099        UErrorCode status = U_ZERO_ERROR;
1100        UnicodeSet s1("[:xdigit:]", status);
1101        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{DecimalNumber}\\p{HexDigit}]"), status);
1102        TEST_ASSERT_SUCCESS(status);
1103        TEST_ASSERT(s1==s2);
1104    }
1105    {
1106        UErrorCode status = U_ZERO_ERROR;
1107        UnicodeSet s1("[:alnum:]", status);
1108        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Alphabetic}\\p{DecimalNumber}]"), status);
1109        TEST_ASSERT_SUCCESS(status);
1110        TEST_ASSERT(s1==s2);
1111    }
1112    {
1113        UErrorCode status = U_ZERO_ERROR;
1114        UnicodeSet s1("[:space:]", status);
1115        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Whitespace}"), status);
1116        TEST_ASSERT_SUCCESS(status);
1117        TEST_ASSERT(s1==s2);
1118    }
1119    {
1120        UErrorCode status = U_ZERO_ERROR;
1121        UnicodeSet s1("[:blank:]", status);
1122        TEST_ASSERT_SUCCESS(status);
1123        UnicodeSet s2(UNICODE_STRING_SIMPLE("[\\p{Whitespace}-[\\u000a\\u000B\\u000c\\u000d\\u0085\\p{LineSeparator}\\p{ParagraphSeparator}]]"),
1124            status);
1125        TEST_ASSERT_SUCCESS(status);
1126        TEST_ASSERT(s1==s2);
1127    }
1128    {
1129        UErrorCode status = U_ZERO_ERROR;
1130        UnicodeSet s1("[:cntrl:]", status);
1131        TEST_ASSERT_SUCCESS(status);
1132        UnicodeSet s2(UNICODE_STRING_SIMPLE("\\p{Control}"), status);
1133        TEST_ASSERT_SUCCESS(status);
1134        TEST_ASSERT(s1==s2);
1135    }
1136    {
1137        UErrorCode status = U_ZERO_ERROR;
1138        UnicodeSet s1("[:graph:]", status);
1139        TEST_ASSERT_SUCCESS(status);
1140        UnicodeSet s2(UNICODE_STRING_SIMPLE("[^\\p{Whitespace}\\p{Control}\\p{Surrogate}\\p{Unassigned}]"), status);
1141        TEST_ASSERT_SUCCESS(status);
1142        TEST_ASSERT(s1==s2);
1143    }
1144    {
1145        UErrorCode status = U_ZERO_ERROR;
1146        UnicodeSet s1("[:print:]", status);
1147        TEST_ASSERT_SUCCESS(status);
1148        UnicodeSet s2(UNICODE_STRING_SIMPLE("[[:graph:][:blank:]-[\\p{Control}]]") ,status);
1149        TEST_ASSERT_SUCCESS(status);
1150        TEST_ASSERT(s1==s2);
1151    }
1152}
1153/**
1154 * Test cloning of UnicodeSet.  For C++, we test the copy constructor.
1155 */
1156void UnicodeSetTest::TestClone() {
1157    UErrorCode ec = U_ZERO_ERROR;
1158    UnicodeSet s("[abcxyz]", ec);
1159    UnicodeSet t(s);
1160    expectContainment(t, "abc", "def");
1161}
1162
1163/**
1164 * Test the indexOf() and charAt() methods.
1165 */
1166void UnicodeSetTest::TestIndexOf() {
1167    UErrorCode ec = U_ZERO_ERROR;
1168    UnicodeSet set("[a-cx-y3578]", ec);
1169    if (U_FAILURE(ec)) {
1170        errln("FAIL: UnicodeSet constructor");
1171        return;
1172    }
1173    for (int32_t i=0; i<set.size(); ++i) {
1174        UChar32 c = set.charAt(i);
1175        if (set.indexOf(c) != i) {
1176            errln("FAIL: charAt(%d) = %X => indexOf() => %d",
1177                i, c, set.indexOf(c));
1178        }
1179    }
1180    UChar32 c = set.charAt(set.size());
1181    if (c != -1) {
1182        errln("FAIL: charAt(<out of range>) = %X", c);
1183    }
1184    int32_t j = set.indexOf((UChar32)0x71/*'q'*/);
1185    if (j != -1) {
1186        errln((UnicodeString)"FAIL: indexOf('q') = " + j);
1187    }
1188}
1189
1190/**
1191 * Test closure API.
1192 */
1193void UnicodeSetTest::TestCloseOver() {
1194    UErrorCode ec = U_ZERO_ERROR;
1195
1196    char CASE[] = {(char)USET_CASE_INSENSITIVE};
1197    char CASE_MAPPINGS[] = {(char)USET_ADD_CASE_MAPPINGS};
1198    const char* DATA[] = {
1199        // selector, input, output
1200        CASE,
1201        "[aq\\u00DF{Bc}{bC}{Fi}]",
1202        "[aAqQ\\u00DF\\u1E9E\\uFB01{ss}{bc}{fi}]",  // U+1E9E LATIN CAPITAL LETTER SHARP S is new in Unicode 5.1
1203
1204        CASE,
1205        "[\\u01F1]", // 'DZ'
1206        "[\\u01F1\\u01F2\\u01F3]",
1207
1208        CASE,
1209        "[\\u1FB4]",
1210        "[\\u1FB4{\\u03AC\\u03B9}]",
1211
1212        CASE,
1213        "[{F\\uFB01}]",
1214        "[\\uFB03{ffi}]",
1215
1216        CASE, // make sure binary search finds limits
1217        "[a\\uFF3A]",
1218        "[aA\\uFF3A\\uFF5A]",
1219
1220        CASE,
1221        "[a-z]","[A-Za-z\\u017F\\u212A]",
1222        CASE,
1223        "[abc]","[A-Ca-c]",
1224        CASE,
1225        "[ABC]","[A-Ca-c]",
1226
1227        CASE, "[i]", "[iI]",
1228
1229        CASE, "[\\u0130]",          "[\\u0130{i\\u0307}]", // dotted I
1230        CASE, "[{i\\u0307}]",       "[\\u0130{i\\u0307}]", // i with dot
1231
1232        CASE, "[\\u0131]",          "[\\u0131]", // dotless i
1233
1234        CASE, "[\\u0390]",          "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
1235
1236        CASE, "[\\u03c2]",          "[\\u03a3\\u03c2\\u03c3]", // sigmas
1237
1238        CASE, "[\\u03f2]",          "[\\u03f2\\u03f9]", // lunate sigmas
1239
1240        CASE, "[\\u03f7]",          "[\\u03f7\\u03f8]",
1241
1242        CASE, "[\\u1fe3]",          "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
1243
1244        CASE, "[\\ufb05]",          "[\\ufb05\\ufb06{st}]",
1245        CASE, "[{st}]",             "[\\ufb05\\ufb06{st}]",
1246
1247        CASE, "[\\U0001044F]",      "[\\U00010427\\U0001044F]",
1248
1249        CASE, "[{a\\u02BE}]",       "[\\u1E9A{a\\u02BE}]", // first in sorted table
1250
1251        CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
1252
1253#if !UCONFIG_NO_FILE_IO
1254        CASE_MAPPINGS,
1255        "[aq\\u00DF{Bc}{bC}{Fi}]",
1256        "[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
1257#endif
1258
1259        CASE_MAPPINGS,
1260        "[\\u01F1]", // 'DZ'
1261        "[\\u01F1\\u01F2\\u01F3]",
1262
1263        CASE_MAPPINGS,
1264        "[a-z]",
1265        "[A-Za-z]",
1266
1267        NULL
1268    };
1269
1270    UnicodeSet s;
1271    UnicodeSet t;
1272    UnicodeString buf;
1273    for (int32_t i=0; DATA[i]!=NULL; i+=3) {
1274        int32_t selector = DATA[i][0];
1275        UnicodeString pat(DATA[i+1], -1, US_INV);
1276        UnicodeString exp(DATA[i+2], -1, US_INV);
1277        s.applyPattern(pat, ec);
1278        s.closeOver(selector);
1279        t.applyPattern(exp, ec);
1280        if (U_FAILURE(ec)) {
1281            errln("FAIL: applyPattern failed");
1282            continue;
1283        }
1284        if (s == t) {
1285            logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
1286        } else {
1287            dataerrln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
1288                  s.toPattern(buf, TRUE) + ", expected " + exp);
1289        }
1290    }
1291
1292#if 0
1293    /*
1294     * Unused test code.
1295     * This was used to compare the old implementation (using USET_CASE)
1296     * with the new one (using 0x100 temporarily)
1297     * while transitioning from hardcoded case closure tables in uniset.cpp
1298     * (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
1299     * and using ucase.c functions for closure.
1300     * See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
1301     *
1302     * Note: The old and new implementation never fully matched because
1303     * the old implementation turned out to not map U+0130 and U+0131 correctly
1304     * (dotted I and dotless i) and because the old implementation's data tables
1305     * were outdated compared to Unicode 4.0.1 at the time of the change to the
1306     * new implementation. (So sigmas and some other characters were not handled
1307     * according to the newer Unicode version.)
1308     */
1309    UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
1310    UnicodeSetIterator si(sens);
1311    UnicodeString str, buf2;
1312    const UnicodeString *pStr;
1313    UChar32 c;
1314    while(si.next()) {
1315        if(!si.isString()) {
1316            c=si.getCodepoint();
1317            s.clear();
1318            s.add(c);
1319
1320            str.setTo(c);
1321            str.foldCase();
1322            sens2.add(str);
1323
1324            t=s;
1325            s.closeOver(USET_CASE);
1326            t.closeOver(0x100);
1327            if(s!=t) {
1328                errln("FAIL: closeOver(U+%04x) differs: ", c);
1329                errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1330            }
1331        }
1332    }
1333    // remove all code points
1334    // should contain all full case folding mapping strings
1335    sens2.remove(0, 0x10ffff);
1336    si.reset(sens2);
1337    while(si.next()) {
1338        if(si.isString()) {
1339            pStr=&si.getString();
1340            s.clear();
1341            s.add(*pStr);
1342            t=s2=s;
1343            s.closeOver(USET_CASE);
1344            t.closeOver(0x100);
1345            if(s!=t) {
1346                errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
1347                errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
1348            }
1349        }
1350    }
1351#endif
1352
1353    // Test the pattern API
1354    s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
1355    if (U_FAILURE(ec)) {
1356        errln("FAIL: applyPattern failed");
1357    } else {
1358        expectContainment(s, "abcABC", "defDEF");
1359    }
1360    UnicodeSet v("[^abc]", USET_CASE_INSENSITIVE, NULL, ec);
1361    if (U_FAILURE(ec)) {
1362        errln("FAIL: constructor failed");
1363    } else {
1364        expectContainment(v, "defDEF", "abcABC");
1365    }
1366    UnicodeSet cm("[abck]", USET_ADD_CASE_MAPPINGS, NULL, ec);
1367    if (U_FAILURE(ec)) {
1368        errln("FAIL: construct w/case mappings failed");
1369    } else {
1370        expectContainment(cm, "abckABCK", CharsToUnicodeString("defDEF\\u212A"));
1371    }
1372}
1373
1374void UnicodeSetTest::TestEscapePattern() {
1375    const char pattern[] =
1376        "[\\uFEFF \\u200A-\\u200E \\U0001D173-\\U0001D17A \\U000F0000-\\U000FFFFD ]";
1377    const char exp[] =
1378        "[\\u200A-\\u200E\\uFEFF\\U0001D173-\\U0001D17A\\U000F0000-\\U000FFFFD]";
1379    // We test this with two passes; in the second pass we
1380    // pre-unescape the pattern.  Since U+200E is rule whitespace,
1381    // this fails -- which is what we expect.
1382    for (int32_t pass=1; pass<=2; ++pass) {
1383        UErrorCode ec = U_ZERO_ERROR;
1384        UnicodeString pat(pattern, -1, US_INV);
1385        if (pass==2) {
1386            pat = pat.unescape();
1387        }
1388        // Pattern is only good for pass 1
1389        UBool isPatternValid = (pass==1);
1390
1391        UnicodeSet set(pat, ec);
1392        if (U_SUCCESS(ec) != isPatternValid){
1393            errln((UnicodeString)"FAIL: applyPattern(" +
1394                  escape(pat) + ") => " +
1395                  u_errorName(ec));
1396            continue;
1397        }
1398        if (U_FAILURE(ec)) {
1399            continue;
1400        }
1401        if (set.contains((UChar)0x0644)){
1402            errln((UnicodeString)"FAIL: " + escape(pat) + " contains(U+0664)");
1403        }
1404
1405        UnicodeString newpat;
1406        set.toPattern(newpat, TRUE);
1407        if (newpat == UnicodeString(exp, -1, US_INV)) {
1408            logln(escape(pat) + " => " + newpat);
1409        } else {
1410            errln((UnicodeString)"FAIL: " + escape(pat) + " => " + newpat);
1411        }
1412
1413        for (int32_t i=0; i<set.getRangeCount(); ++i) {
1414            UnicodeString str("Range ");
1415            str.append((UChar)(0x30 + i))
1416                .append(": ")
1417                .append((UChar32)set.getRangeStart(i))
1418                .append(" - ")
1419                .append((UChar32)set.getRangeEnd(i));
1420            str = str + " (" + set.getRangeStart(i) + " - " +
1421                set.getRangeEnd(i) + ")";
1422            if (set.getRangeStart(i) < 0) {
1423                errln((UnicodeString)"FAIL: " + escape(str));
1424            } else {
1425                logln(escape(str));
1426            }
1427        }
1428    }
1429}
1430
1431void UnicodeSetTest::expectRange(const UnicodeString& label,
1432                                 const UnicodeSet& set,
1433                                 UChar32 start, UChar32 end) {
1434    UnicodeSet exp(start, end);
1435    UnicodeString pat;
1436    if (set == exp) {
1437        logln(label + " => " + set.toPattern(pat, TRUE));
1438    } else {
1439        UnicodeString xpat;
1440        errln((UnicodeString)"FAIL: " + label + " => " +
1441              set.toPattern(pat, TRUE) +
1442              ", expected " + exp.toPattern(xpat, TRUE));
1443    }
1444}
1445
1446void UnicodeSetTest::TestInvalidCodePoint() {
1447
1448    const UChar32 DATA[] = {
1449        // Test range             Expected range
1450        0, 0x10FFFF,              0, 0x10FFFF,
1451        (UChar32)-1, 8,           0, 8,
1452        8, 0x110000,              8, 0x10FFFF
1453    };
1454    const int32_t DATA_LENGTH = sizeof(DATA)/sizeof(DATA[0]);
1455
1456    UnicodeString pat;
1457    int32_t i;
1458
1459    for (i=0; i<DATA_LENGTH; i+=4) {
1460        UChar32 start  = DATA[i];
1461        UChar32 end    = DATA[i+1];
1462        UChar32 xstart = DATA[i+2];
1463        UChar32 xend   = DATA[i+3];
1464
1465        // Try various API using the test code points
1466
1467        UnicodeSet set(start, end);
1468        expectRange((UnicodeString)"ct(" + start + "," + end + ")",
1469                    set, xstart, xend);
1470
1471        set.clear();
1472        set.set(start, end);
1473        expectRange((UnicodeString)"set(" + start + "," + end + ")",
1474                    set, xstart, xend);
1475
1476        UBool b = set.contains(start);
1477        b = set.contains(start, end);
1478        b = set.containsNone(start, end);
1479        b = set.containsSome(start, end);
1480
1481        /*int32_t index = set.indexOf(start);*/
1482
1483        set.clear();
1484        set.add(start);
1485        set.add(start, end);
1486        expectRange((UnicodeString)"add(" + start + "," + end + ")",
1487                    set, xstart, xend);
1488
1489        set.set(0, 0x10FFFF);
1490        set.retain(start, end);
1491        expectRange((UnicodeString)"retain(" + start + "," + end + ")",
1492                    set, xstart, xend);
1493        set.retain(start);
1494
1495        set.set(0, 0x10FFFF);
1496        set.remove(start);
1497        set.remove(start, end);
1498        set.complement();
1499        expectRange((UnicodeString)"!remove(" + start + "," + end + ")",
1500                    set, xstart, xend);
1501
1502        set.set(0, 0x10FFFF);
1503        set.complement(start, end);
1504        set.complement();
1505        expectRange((UnicodeString)"!complement(" + start + "," + end + ")",
1506                    set, xstart, xend);
1507        set.complement(start);
1508    }
1509
1510    const UChar32 DATA2[] = {
1511        0,
1512        0x10FFFF,
1513        (UChar32)-1,
1514        0x110000
1515    };
1516    const int32_t DATA2_LENGTH = sizeof(DATA2)/sizeof(DATA2[0]);
1517
1518    for (i=0; i<DATA2_LENGTH; ++i) {
1519        UChar32 c = DATA2[i], end = 0x10FFFF;
1520        UBool valid = (c >= 0 && c <= 0x10FFFF);
1521
1522        UnicodeSet set(0, 0x10FFFF);
1523
1524        // For single-codepoint contains, invalid codepoints are NOT contained
1525        UBool b = set.contains(c);
1526        if (b == valid) {
1527            logln((UnicodeString)"[\\u0000-\\U0010FFFF].contains(" + c +
1528                  ") = " + b);
1529        } else {
1530            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].contains(" + c +
1531                  ") = " + b);
1532        }
1533
1534        // For codepoint range contains, containsNone, and containsSome,
1535        // invalid or empty (start > end) ranges have UNDEFINED behavior.
1536        b = set.contains(c, end);
1537        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].contains(" + c +
1538              "," + end + ") = " + b);
1539
1540        b = set.containsNone(c, end);
1541        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsNone(" + c +
1542              "," + end + ") = " + b);
1543
1544        b = set.containsSome(c, end);
1545        logln((UnicodeString)"* [\\u0000-\\U0010FFFF].containsSome(" + c +
1546              "," + end + ") = " + b);
1547
1548        int32_t index = set.indexOf(c);
1549        if ((index >= 0) == valid) {
1550            logln((UnicodeString)"[\\u0000-\\U0010FFFF].indexOf(" + c +
1551                  ") = " + index);
1552        } else {
1553            errln((UnicodeString)"FAIL: [\\u0000-\\U0010FFFF].indexOf(" + c +
1554                  ") = " + index);
1555        }
1556    }
1557}
1558
1559// Used by TestSymbolTable
1560class TokenSymbolTable : public SymbolTable {
1561public:
1562    Hashtable contents;
1563
1564    TokenSymbolTable(UErrorCode& ec) : contents(FALSE, ec) {
1565        contents.setValueDeleter(uhash_deleteUnicodeString);
1566    }
1567
1568    ~TokenSymbolTable() {}
1569
1570    /**
1571     * (Non-SymbolTable API) Add the given variable and value to
1572     * the table.  Variable should NOT contain leading '$'.
1573     */
1574    void add(const UnicodeString& var, const UnicodeString& value,
1575             UErrorCode& ec) {
1576        if (U_SUCCESS(ec)) {
1577            contents.put(var, new UnicodeString(value), ec);
1578        }
1579    }
1580
1581    /**
1582     * SymbolTable API
1583     */
1584    virtual const UnicodeString* lookup(const UnicodeString& s) const {
1585        return (const UnicodeString*) contents.get(s);
1586    }
1587
1588    /**
1589     * SymbolTable API
1590     */
1591    virtual const UnicodeFunctor* lookupMatcher(UChar32 /*ch*/) const {
1592        return NULL;
1593    }
1594
1595    /**
1596     * SymbolTable API
1597     */
1598    virtual UnicodeString parseReference(const UnicodeString& text,
1599                                         ParsePosition& pos, int32_t limit) const {
1600        int32_t start = pos.getIndex();
1601        int32_t i = start;
1602        UnicodeString result;
1603        while (i < limit) {
1604            UChar c = text.charAt(i);
1605            if ((i==start && !u_isIDStart(c)) || !u_isIDPart(c)) {
1606                break;
1607            }
1608            ++i;
1609        }
1610        if (i == start) { // No valid name chars
1611            return result; // Indicate failure with empty string
1612        }
1613        pos.setIndex(i);
1614        text.extractBetween(start, i, result);
1615        return result;
1616    }
1617};
1618
1619void UnicodeSetTest::TestSymbolTable() {
1620    // Multiple test cases can be set up here.  Each test case
1621    // is terminated by null:
1622    // var, value, var, value,..., input pat., exp. output pat., null
1623    const char* DATA[] = {
1624        "us", "a-z", "[0-1$us]", "[0-1a-z]", NULL,
1625        "us", "[a-z]", "[0-1$us]", "[0-1[a-z]]", NULL,
1626        "us", "\\[a\\-z\\]", "[0-1$us]", "[-01\\[\\]az]", NULL,
1627        NULL
1628    };
1629
1630    for (int32_t i=0; DATA[i]!=NULL; ++i) {
1631        UErrorCode ec = U_ZERO_ERROR;
1632        TokenSymbolTable sym(ec);
1633        if (U_FAILURE(ec)) {
1634            errln("FAIL: couldn't construct TokenSymbolTable");
1635            continue;
1636        }
1637
1638        // Set up variables
1639        while (DATA[i+2] != NULL) {
1640            sym.add(UnicodeString(DATA[i], -1, US_INV), UnicodeString(DATA[i+1], -1, US_INV), ec);
1641            if (U_FAILURE(ec)) {
1642                errln("FAIL: couldn't add to TokenSymbolTable");
1643                continue;
1644            }
1645            i += 2;
1646        }
1647
1648        // Input pattern and expected output pattern
1649        UnicodeString inpat = UnicodeString(DATA[i], -1, US_INV), exppat = UnicodeString(DATA[i+1], -1, US_INV);
1650        i += 2;
1651
1652        ParsePosition pos(0);
1653        UnicodeSet us(inpat, pos, USET_IGNORE_SPACE, &sym, ec);
1654        if (U_FAILURE(ec)) {
1655            errln("FAIL: couldn't construct UnicodeSet");
1656            continue;
1657        }
1658
1659        // results
1660        if (pos.getIndex() != inpat.length()) {
1661            errln((UnicodeString)"Failed to read to end of string \""
1662                  + inpat + "\": read to "
1663                  + pos.getIndex() + ", length is "
1664                  + inpat.length());
1665        }
1666
1667        UnicodeSet us2(exppat, ec);
1668        if (U_FAILURE(ec)) {
1669            errln("FAIL: couldn't construct expected UnicodeSet");
1670            continue;
1671        }
1672
1673        UnicodeString a, b;
1674        if (us != us2) {
1675            errln((UnicodeString)"Failed, got " + us.toPattern(a, TRUE) +
1676                  ", expected " + us2.toPattern(b, TRUE));
1677        } else {
1678            logln((UnicodeString)"Ok, got " + us.toPattern(a, TRUE));
1679        }
1680    }
1681}
1682
1683void UnicodeSetTest::TestSurrogate() {
1684    const char* DATA[] = {
1685        // These should all behave identically
1686        "[abc\\uD800\\uDC00]",
1687        // "[abc\uD800\uDC00]", // Can't do this on C -- only Java
1688        "[abc\\U00010000]",
1689        0
1690    };
1691    for (int i=0; DATA[i] != 0; ++i) {
1692        UErrorCode ec = U_ZERO_ERROR;
1693        logln((UnicodeString)"Test pattern " + i + " :" + UnicodeString(DATA[i], -1, US_INV));
1694        UnicodeString str = UnicodeString(DATA[i], -1, US_INV);
1695        UnicodeSet set(str, ec);
1696        if (U_FAILURE(ec)) {
1697            errln("FAIL: UnicodeSet constructor");
1698            continue;
1699        }
1700        expectContainment(set,
1701                          CharsToUnicodeString("abc\\U00010000"),
1702                          CharsToUnicodeString("\\uD800;\\uDC00")); // split apart surrogate-pair
1703        if (set.size() != 4) {
1704            errln((UnicodeString)"FAIL: " + UnicodeString(DATA[i], -1, US_INV) + ".size() == " +
1705                  set.size() + ", expected 4");
1706        }
1707    }
1708}
1709
1710void UnicodeSetTest::TestExhaustive() {
1711    // exhaustive tests. Simulate UnicodeSets with integers.
1712    // That gives us very solid tests (except for large memory tests).
1713
1714    int32_t limit = 128;
1715
1716    UnicodeSet x, y, z, aa;
1717
1718    for (int32_t i = 0; i < limit; ++i) {
1719        bitsToSet(i, x);
1720        logln((UnicodeString)"Testing " + i + ", " + x);
1721        _testComplement(i, x, y);
1722
1723        // AS LONG AS WE ARE HERE, check roundtrip
1724        checkRoundTrip(bitsToSet(i, aa));
1725
1726        for (int32_t j = 0; j < limit; ++j) {
1727            _testAdd(i,j,  x,y,z);
1728            _testXor(i,j,  x,y,z);
1729            _testRetain(i,j,  x,y,z);
1730            _testRemove(i,j,  x,y,z);
1731        }
1732    }
1733}
1734
1735void UnicodeSetTest::_testComplement(int32_t a, UnicodeSet& x, UnicodeSet& z) {
1736    bitsToSet(a, x);
1737    z = x;
1738    z.complement();
1739    int32_t c = setToBits(z);
1740    if (c != (~a)) {
1741        errln((UnicodeString)"FAILED: add: ~" + x +  " != " + z);
1742        errln((UnicodeString)"FAILED: add: ~" + a + " != " + c);
1743    }
1744    checkCanonicalRep(z, (UnicodeString)"complement " + a);
1745}
1746
1747void UnicodeSetTest::_testAdd(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1748    bitsToSet(a, x);
1749    bitsToSet(b, y);
1750    z = x;
1751    z.addAll(y);
1752    int32_t c = setToBits(z);
1753    if (c != (a | b)) {
1754        errln((UnicodeString)"FAILED: add: " + x + " | " + y + " != " + z);
1755        errln((UnicodeString)"FAILED: add: " + a + " | " + b + " != " + c);
1756    }
1757    checkCanonicalRep(z, (UnicodeString)"add " + a + "," + b);
1758}
1759
1760void UnicodeSetTest::_testRetain(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1761    bitsToSet(a, x);
1762    bitsToSet(b, y);
1763    z = x;
1764    z.retainAll(y);
1765    int32_t c = setToBits(z);
1766    if (c != (a & b)) {
1767        errln((UnicodeString)"FAILED: retain: " + x + " & " + y + " != " + z);
1768        errln((UnicodeString)"FAILED: retain: " + a + " & " + b + " != " + c);
1769    }
1770    checkCanonicalRep(z, (UnicodeString)"retain " + a + "," + b);
1771}
1772
1773void UnicodeSetTest::_testRemove(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1774    bitsToSet(a, x);
1775    bitsToSet(b, y);
1776    z = x;
1777    z.removeAll(y);
1778    int32_t c = setToBits(z);
1779    if (c != (a &~ b)) {
1780        errln((UnicodeString)"FAILED: remove: " + x + " &~ " + y + " != " + z);
1781        errln((UnicodeString)"FAILED: remove: " + a + " &~ " + b + " != " + c);
1782    }
1783    checkCanonicalRep(z, (UnicodeString)"remove " + a + "," + b);
1784}
1785
1786void UnicodeSetTest::_testXor(int32_t a, int32_t b, UnicodeSet& x, UnicodeSet& y, UnicodeSet& z) {
1787    bitsToSet(a, x);
1788    bitsToSet(b, y);
1789    z = x;
1790    z.complementAll(y);
1791    int32_t c = setToBits(z);
1792    if (c != (a ^ b)) {
1793        errln((UnicodeString)"FAILED: complement: " + x + " ^ " + y + " != " + z);
1794        errln((UnicodeString)"FAILED: complement: " + a + " ^ " + b + " != " + c);
1795    }
1796    checkCanonicalRep(z, (UnicodeString)"complement " + a + "," + b);
1797}
1798
1799/**
1800 * Check that ranges are monotonically increasing and non-
1801 * overlapping.
1802 */
1803void UnicodeSetTest::checkCanonicalRep(const UnicodeSet& set, const UnicodeString& msg) {
1804    int32_t n = set.getRangeCount();
1805    if (n < 0) {
1806        errln((UnicodeString)"FAIL result of " + msg +
1807              ": range count should be >= 0 but is " +
1808              n /*+ " for " + set.toPattern())*/);
1809        return;
1810    }
1811    UChar32 last = 0;
1812    for (int32_t i=0; i<n; ++i) {
1813        UChar32 start = set.getRangeStart(i);
1814        UChar32 end = set.getRangeEnd(i);
1815        if (start > end) {
1816            errln((UnicodeString)"FAIL result of " + msg +
1817                  ": range " + (i+1) +
1818                  " start > end: " + (int)start + ", " + (int)end +
1819                  " for " + set);
1820        }
1821        if (i > 0 && start <= last) {
1822            errln((UnicodeString)"FAIL result of " + msg +
1823                  ": range " + (i+1) +
1824                  " overlaps previous range: " + (int)start + ", " + (int)end +
1825                  " for " + set);
1826        }
1827        last = end;
1828    }
1829}
1830
1831/**
1832 * Convert a bitmask to a UnicodeSet.
1833 */
1834UnicodeSet& UnicodeSetTest::bitsToSet(int32_t a, UnicodeSet& result) {
1835    result.clear();
1836    for (UChar32 i = 0; i < 32; ++i) {
1837        if ((a & (1<<i)) != 0) {
1838            result.add(i);
1839        }
1840    }
1841    return result;
1842}
1843
1844/**
1845 * Convert a UnicodeSet to a bitmask.  Only the characters
1846 * U+0000 to U+0020 are represented in the bitmask.
1847 */
1848int32_t UnicodeSetTest::setToBits(const UnicodeSet& x) {
1849    int32_t result = 0;
1850    for (int32_t i = 0; i < 32; ++i) {
1851        if (x.contains((UChar32)i)) {
1852            result |= (1<<i);
1853        }
1854    }
1855    return result;
1856}
1857
1858/**
1859 * Return the representation of an inversion list based UnicodeSet
1860 * as a pairs list.  Ranges are listed in ascending Unicode order.
1861 * For example, the set [a-zA-M3] is represented as "33AMaz".
1862 */
1863UnicodeString UnicodeSetTest::getPairs(const UnicodeSet& set) {
1864    UnicodeString pairs;
1865    for (int32_t i=0; i<set.getRangeCount(); ++i) {
1866        UChar32 start = set.getRangeStart(i);
1867        UChar32 end = set.getRangeEnd(i);
1868        if (end > 0xFFFF) {
1869            end = 0xFFFF;
1870            i = set.getRangeCount(); // Should be unnecessary
1871        }
1872        pairs.append((UChar)start).append((UChar)end);
1873    }
1874    return pairs;
1875}
1876
1877/**
1878 * Basic consistency check for a few items.
1879 * That the iterator works, and that we can create a pattern and
1880 * get the same thing back
1881 */
1882void UnicodeSetTest::checkRoundTrip(const UnicodeSet& s) {
1883    UErrorCode ec = U_ZERO_ERROR;
1884
1885    UnicodeSet t(s);
1886    checkEqual(s, t, "copy ct");
1887
1888    t = s;
1889    checkEqual(s, t, "operator=");
1890
1891    copyWithIterator(t, s, FALSE);
1892    checkEqual(s, t, "iterator roundtrip");
1893
1894    copyWithIterator(t, s, TRUE); // try range
1895    checkEqual(s, t, "iterator roundtrip");
1896
1897    UnicodeString pat; s.toPattern(pat, FALSE);
1898    t.applyPattern(pat, ec);
1899    if (U_FAILURE(ec)) {
1900        errln("FAIL: applyPattern");
1901        return;
1902    } else {
1903        checkEqual(s, t, "toPattern(false)");
1904    }
1905
1906    s.toPattern(pat, TRUE);
1907    t.applyPattern(pat, ec);
1908    if (U_FAILURE(ec)) {
1909        errln("FAIL: applyPattern");
1910        return;
1911    } else {
1912        checkEqual(s, t, "toPattern(true)");
1913    }
1914}
1915
1916void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool withRange) {
1917    t.clear();
1918    UnicodeSetIterator it(s);
1919    if (withRange) {
1920        while (it.nextRange()) {
1921            if (it.isString()) {
1922                t.add(it.getString());
1923            } else {
1924                t.add(it.getCodepoint(), it.getCodepointEnd());
1925            }
1926        }
1927    } else {
1928        while (it.next()) {
1929            if (it.isString()) {
1930                t.add(it.getString());
1931            } else {
1932                t.add(it.getCodepoint());
1933            }
1934        }
1935    }
1936}
1937
1938UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
1939    UnicodeString source; s.toPattern(source, TRUE);
1940    UnicodeString result; t.toPattern(result, TRUE);
1941    if (s != t) {
1942        errln((UnicodeString)"FAIL: " + message
1943              + "; source = " + source
1944              + "; result = " + result
1945              );
1946        return FALSE;
1947    } else {
1948        logln((UnicodeString)"Ok: " + message
1949              + "; source = " + source
1950              + "; result = " + result
1951              );
1952    }
1953    return TRUE;
1954}
1955
1956void
1957UnicodeSetTest::expectContainment(const UnicodeString& pat,
1958                                  const UnicodeString& charsIn,
1959                                  const UnicodeString& charsOut) {
1960    UErrorCode ec = U_ZERO_ERROR;
1961    UnicodeSet set(pat, ec);
1962    if (U_FAILURE(ec)) {
1963        dataerrln((UnicodeString)"FAIL: pattern \"" +
1964              pat + "\" => " + u_errorName(ec));
1965        return;
1966    }
1967    expectContainment(set, pat, charsIn, charsOut);
1968}
1969
1970void
1971UnicodeSetTest::expectContainment(const UnicodeSet& set,
1972                                  const UnicodeString& charsIn,
1973                                  const UnicodeString& charsOut) {
1974    UnicodeString pat;
1975    set.toPattern(pat);
1976    expectContainment(set, pat, charsIn, charsOut);
1977}
1978
1979void
1980UnicodeSetTest::expectContainment(const UnicodeSet& set,
1981                                  const UnicodeString& setName,
1982                                  const UnicodeString& charsIn,
1983                                  const UnicodeString& charsOut) {
1984    UnicodeString bad;
1985    UChar32 c;
1986    int32_t i;
1987
1988    for (i=0; i<charsIn.length(); i+=U16_LENGTH(c)) {
1989        c = charsIn.char32At(i);
1990        if (!set.contains(c)) {
1991            bad.append(c);
1992        }
1993    }
1994    if (bad.length() > 0) {
1995        errln((UnicodeString)"Fail: set " + setName + " does not contain " + prettify(bad) +
1996              ", expected containment of " + prettify(charsIn));
1997    } else {
1998        logln((UnicodeString)"Ok: set " + setName + " contains " + prettify(charsIn));
1999    }
2000
2001    bad.truncate(0);
2002    for (i=0; i<charsOut.length(); i+=U16_LENGTH(c)) {
2003        c = charsOut.char32At(i);
2004        if (set.contains(c)) {
2005            bad.append(c);
2006        }
2007    }
2008    if (bad.length() > 0) {
2009        errln((UnicodeString)"Fail: set " + setName + " contains " + prettify(bad) +
2010              ", expected non-containment of " + prettify(charsOut));
2011    } else {
2012        logln((UnicodeString)"Ok: set " + setName + " does not contain " + prettify(charsOut));
2013    }
2014}
2015
2016void
2017UnicodeSetTest::expectPattern(UnicodeSet& set,
2018                              const UnicodeString& pattern,
2019                              const UnicodeString& expectedPairs){
2020    UErrorCode status = U_ZERO_ERROR;
2021    set.applyPattern(pattern, status);
2022    if (U_FAILURE(status)) {
2023        errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2024              "\") failed");
2025        return;
2026    } else {
2027        if (getPairs(set) != expectedPairs ) {
2028            errln(UnicodeString("FAIL: applyPattern(\"") + pattern +
2029                  "\") => pairs \"" +
2030                  escape(getPairs(set)) + "\", expected \"" +
2031                  escape(expectedPairs) + "\"");
2032        } else {
2033            logln(UnicodeString("Ok:   applyPattern(\"") + pattern +
2034                  "\") => pairs \"" +
2035                  escape(getPairs(set)) + "\"");
2036        }
2037    }
2038    // the result of calling set.toPattern(), which is the string representation of
2039    // this set(set), is passed to a  UnicodeSet constructor, and tested that it
2040    // will produce another set that is equal to this one.
2041    UnicodeString temppattern;
2042    set.toPattern(temppattern);
2043    UnicodeSet *tempset=new UnicodeSet(temppattern, status);
2044    if (U_FAILURE(status)) {
2045        errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => invalid pattern"));
2046        return;
2047    }
2048    if(*tempset != set || getPairs(*tempset) != getPairs(set)){
2049        errln(UnicodeString("FAIL: applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \""+ escape(getPairs(*tempset)) + "\", expected pairs \"" +
2050            escape(getPairs(set)) + "\""));
2051    } else{
2052        logln(UnicodeString("Ok:   applyPattern(\""+ pattern + "\").toPattern() => " + temppattern + " => pairs \"" + escape(getPairs(*tempset)) + "\""));
2053    }
2054
2055    delete tempset;
2056
2057}
2058
2059void
2060UnicodeSetTest::expectPairs(const UnicodeSet& set, const UnicodeString& expectedPairs) {
2061    if (getPairs(set) != expectedPairs) {
2062        errln(UnicodeString("FAIL: Expected pair list \"") +
2063              escape(expectedPairs) + "\", got \"" +
2064              escape(getPairs(set)) + "\"");
2065    }
2066}
2067
2068void UnicodeSetTest::expectToPattern(const UnicodeSet& set,
2069                                     const UnicodeString& expPat,
2070                                     const char** expStrings) {
2071    UnicodeString pat;
2072    set.toPattern(pat, TRUE);
2073    if (pat == expPat) {
2074        logln((UnicodeString)"Ok:   toPattern() => \"" + pat + "\"");
2075    } else {
2076        errln((UnicodeString)"FAIL: toPattern() => \"" + pat + "\", expected \"" + expPat + "\"");
2077        return;
2078    }
2079    if (expStrings == NULL) {
2080        return;
2081    }
2082    UBool in = TRUE;
2083    for (int32_t i=0; expStrings[i] != NULL; ++i) {
2084        if (expStrings[i] == NOT) { // sic; pointer comparison
2085            in = FALSE;
2086            continue;
2087        }
2088        UnicodeString s = CharsToUnicodeString(expStrings[i]);
2089        UBool contained = set.contains(s);
2090        if (contained == in) {
2091            logln((UnicodeString)"Ok: " + expPat +
2092                  (contained ? " contains {" : " does not contain {") +
2093                  escape(expStrings[i]) + "}");
2094        } else {
2095            errln((UnicodeString)"FAIL: " + expPat +
2096                  (contained ? " contains {" : " does not contain {") +
2097                  escape(expStrings[i]) + "}");
2098        }
2099    }
2100}
2101
2102static UChar toHexString(int32_t i) { return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); }
2103
2104void
2105UnicodeSetTest::doAssert(UBool condition, const char *message)
2106{
2107    if (!condition) {
2108        errln(UnicodeString("ERROR : ") + message);
2109    }
2110}
2111
2112UnicodeString
2113UnicodeSetTest::escape(const UnicodeString& s) {
2114    UnicodeString buf;
2115    for (int32_t i=0; i<s.length(); )
2116    {
2117        UChar32 c = s.char32At(i);
2118        if (0x0020 <= c && c <= 0x007F) {
2119            buf += c;
2120        } else {
2121            if (c <= 0xFFFF) {
2122                buf += (UChar)0x5c; buf += (UChar)0x75;
2123            } else {
2124                buf += (UChar)0x5c; buf += (UChar)0x55;
2125                buf += toHexString((c & 0xF0000000) >> 28);
2126                buf += toHexString((c & 0x0F000000) >> 24);
2127                buf += toHexString((c & 0x00F00000) >> 20);
2128                buf += toHexString((c & 0x000F0000) >> 16);
2129            }
2130            buf += toHexString((c & 0xF000) >> 12);
2131            buf += toHexString((c & 0x0F00) >> 8);
2132            buf += toHexString((c & 0x00F0) >> 4);
2133            buf += toHexString(c & 0x000F);
2134        }
2135        i += U16_LENGTH(c);
2136    }
2137    return buf;
2138}
2139
2140void UnicodeSetTest::TestFreezable() {
2141    UErrorCode errorCode=U_ZERO_ERROR;
2142    UnicodeString idPattern=UNICODE_STRING("[:ID_Continue:]", 15);
2143    UnicodeSet idSet(idPattern, errorCode);
2144    if(U_FAILURE(errorCode)) {
2145        dataerrln("FAIL: unable to create UnicodeSet([:ID_Continue:]) - %s", u_errorName(errorCode));
2146        return;
2147    }
2148
2149    UnicodeString wsPattern=UNICODE_STRING("[:White_Space:]", 15);
2150    UnicodeSet wsSet(wsPattern, errorCode);
2151    if(U_FAILURE(errorCode)) {
2152        dataerrln("FAIL: unable to create UnicodeSet([:White_Space:]) - %s", u_errorName(errorCode));
2153        return;
2154    }
2155
2156    idSet.add(idPattern);
2157    UnicodeSet frozen(idSet);
2158    frozen.freeze();
2159
2160    if(idSet.isFrozen() || !frozen.isFrozen()) {
2161        errln("FAIL: isFrozen() is wrong");
2162    }
2163    if(frozen!=idSet || !(frozen==idSet)) {
2164        errln("FAIL: a copy-constructed frozen set differs from its original");
2165    }
2166
2167    frozen=wsSet;
2168    if(frozen!=idSet || !(frozen==idSet)) {
2169        errln("FAIL: a frozen set was modified by operator=");
2170    }
2171
2172    UnicodeSet frozen2(frozen);
2173    if(frozen2!=frozen || frozen2!=idSet) {
2174        errln("FAIL: a copied frozen set differs from its frozen original");
2175    }
2176    if(!frozen2.isFrozen()) {
2177        errln("FAIL: copy-constructing a frozen set results in a thawed one");
2178    }
2179    UnicodeSet frozen3(5, 55);  // Set to some values to really test assignment below, not copy construction.
2180    if(frozen3.contains(0, 4) || !frozen3.contains(5, 55) || frozen3.contains(56, 0x10ffff)) {
2181        errln("FAIL: UnicodeSet(5, 55) failed");
2182    }
2183    frozen3=frozen;
2184    if(!frozen3.isFrozen()) {
2185        errln("FAIL: copying a frozen set results in a thawed one");
2186    }
2187
2188    UnicodeSet *cloned=(UnicodeSet *)frozen.clone();
2189    if(!cloned->isFrozen() || *cloned!=frozen || cloned->containsSome(0xd802, 0xd805)) {
2190        errln("FAIL: clone() failed");
2191    }
2192    cloned->add(0xd802, 0xd805);
2193    if(cloned->containsSome(0xd802, 0xd805)) {
2194        errln("FAIL: unable to modify clone");
2195    }
2196    delete cloned;
2197
2198    UnicodeSet *thawed=(UnicodeSet *)frozen.cloneAsThawed();
2199    if(thawed->isFrozen() || *thawed!=frozen || thawed->containsSome(0xd802, 0xd805)) {
2200        errln("FAIL: cloneAsThawed() failed");
2201    }
2202    thawed->add(0xd802, 0xd805);
2203    if(!thawed->contains(0xd802, 0xd805)) {
2204        errln("FAIL: unable to modify thawed clone");
2205    }
2206    delete thawed;
2207
2208    frozen.set(5, 55);
2209    if(frozen!=idSet || !(frozen==idSet)) {
2210        errln("FAIL: UnicodeSet::set() modified a frozen set");
2211    }
2212
2213    frozen.clear();
2214    if(frozen!=idSet || !(frozen==idSet)) {
2215        errln("FAIL: UnicodeSet::clear() modified a frozen set");
2216    }
2217
2218    frozen.closeOver(USET_CASE_INSENSITIVE);
2219    if(frozen!=idSet || !(frozen==idSet)) {
2220        errln("FAIL: UnicodeSet::closeOver() modified a frozen set");
2221    }
2222
2223    frozen.compact();
2224    if(frozen!=idSet || !(frozen==idSet)) {
2225        errln("FAIL: UnicodeSet::compact() modified a frozen set");
2226    }
2227
2228    ParsePosition pos;
2229    frozen.
2230        applyPattern(wsPattern, errorCode).
2231        applyPattern(wsPattern, USET_IGNORE_SPACE, NULL, errorCode).
2232        applyPattern(wsPattern, pos, USET_IGNORE_SPACE, NULL, errorCode).
2233        applyIntPropertyValue(UCHAR_CANONICAL_COMBINING_CLASS, 230, errorCode).
2234        applyPropertyAlias(UNICODE_STRING_SIMPLE("Assigned"), UnicodeString(), errorCode);
2235    if(frozen!=idSet || !(frozen==idSet)) {
2236        errln("FAIL: UnicodeSet::applyXYZ() modified a frozen set");
2237    }
2238
2239    frozen.
2240        add(0xd800).
2241        add(0xd802, 0xd805).
2242        add(wsPattern).
2243        addAll(idPattern).
2244        addAll(wsSet);
2245    if(frozen!=idSet || !(frozen==idSet)) {
2246        errln("FAIL: UnicodeSet::addXYZ() modified a frozen set");
2247    }
2248
2249    frozen.
2250        retain(0x62).
2251        retain(0x64, 0x69).
2252        retainAll(wsPattern).
2253        retainAll(wsSet);
2254    if(frozen!=idSet || !(frozen==idSet)) {
2255        errln("FAIL: UnicodeSet::retainXYZ() modified a frozen set");
2256    }
2257
2258    frozen.
2259        remove(0x62).
2260        remove(0x64, 0x69).
2261        remove(idPattern).
2262        removeAll(idPattern).
2263        removeAll(idSet);
2264    if(frozen!=idSet || !(frozen==idSet)) {
2265        errln("FAIL: UnicodeSet::removeXYZ() modified a frozen set");
2266    }
2267
2268    frozen.
2269        complement().
2270        complement(0x62).
2271        complement(0x64, 0x69).
2272        complement(idPattern).
2273        complementAll(idPattern).
2274        complementAll(idSet);
2275    if(frozen!=idSet || !(frozen==idSet)) {
2276        errln("FAIL: UnicodeSet::complementXYZ() modified a frozen set");
2277    }
2278}
2279
2280// Test span() etc. -------------------------------------------------------- ***
2281
2282// Append the UTF-8 version of the string to t and return the appended UTF-8 length.
2283static int32_t
2284appendUTF8(const UChar *s, int32_t length, char *t, int32_t capacity) {
2285    UErrorCode errorCode=U_ZERO_ERROR;
2286    int32_t length8=0;
2287    u_strToUTF8(t, capacity, &length8, s, length, &errorCode);
2288    if(U_SUCCESS(errorCode)) {
2289        return length8;
2290    } else {
2291        // The string contains an unpaired surrogate.
2292        // Ignore this string.
2293        return 0;
2294    }
2295}
2296
2297class UnicodeSetWithStringsIterator;
2298
2299// Make the strings in a UnicodeSet easily accessible.
2300class UnicodeSetWithStrings {
2301public:
2302    UnicodeSetWithStrings(const UnicodeSet &normalSet) :
2303            set(normalSet), stringsLength(0), hasSurrogates(FALSE) {
2304        int32_t size=set.size();
2305        if(size>0 && set.charAt(size-1)<0) {
2306            // If a set's last element is not a code point, then it must contain strings.
2307            // Iterate over the set, skip all code point ranges, and cache the strings.
2308            // Convert them to UTF-8 for spanUTF8().
2309            UnicodeSetIterator iter(set);
2310            const UnicodeString *s;
2311            char *s8=utf8;
2312            int32_t length8, utf8Count=0;
2313            while(iter.nextRange() && stringsLength<LENGTHOF(strings)) {
2314                if(iter.isString()) {
2315                    // Store the pointer to the set's string element
2316                    // which we happen to know is a stable pointer.
2317                    strings[stringsLength]=s=&iter.getString();
2318                    utf8Count+=
2319                        utf8Lengths[stringsLength]=length8=
2320                        appendUTF8(s->getBuffer(), s->length(),
2321                                   s8, (int32_t)(sizeof(utf8)-utf8Count));
2322                    if(length8==0) {
2323                        hasSurrogates=TRUE;  // Contains unpaired surrogates.
2324                    }
2325                    s8+=length8;
2326                    ++stringsLength;
2327                }
2328            }
2329        }
2330    }
2331
2332    const UnicodeSet &getSet() const {
2333        return set;
2334    }
2335
2336    UBool hasStrings() const {
2337        return (UBool)(stringsLength>0);
2338    }
2339
2340    UBool hasStringsWithSurrogates() const {
2341        return hasSurrogates;
2342    }
2343
2344private:
2345    friend class UnicodeSetWithStringsIterator;
2346
2347    const UnicodeSet &set;
2348
2349    const UnicodeString *strings[20];
2350    int32_t stringsLength;
2351    UBool hasSurrogates;
2352
2353    char utf8[1024];
2354    int32_t utf8Lengths[20];
2355
2356    int32_t nextStringIndex;
2357    int32_t nextUTF8Start;
2358};
2359
2360class UnicodeSetWithStringsIterator {
2361public:
2362    UnicodeSetWithStringsIterator(const UnicodeSetWithStrings &set) :
2363            fSet(set), nextStringIndex(0), nextUTF8Start(0) {
2364    }
2365
2366    void reset() {
2367        nextStringIndex=nextUTF8Start=0;
2368    }
2369
2370    const UnicodeString *nextString() {
2371        if(nextStringIndex<fSet.stringsLength) {
2372            return fSet.strings[nextStringIndex++];
2373        } else {
2374            return NULL;
2375        }
2376    }
2377
2378    // Do not mix with calls to nextString().
2379    const char *nextUTF8(int32_t &length) {
2380        if(nextStringIndex<fSet.stringsLength) {
2381            const char *s8=fSet.utf8+nextUTF8Start;
2382            nextUTF8Start+=length=fSet.utf8Lengths[nextStringIndex++];
2383            return s8;
2384        } else {
2385            length=0;
2386            return NULL;
2387        }
2388    }
2389
2390private:
2391    const UnicodeSetWithStrings &fSet;
2392    int32_t nextStringIndex;
2393    int32_t nextUTF8Start;
2394};
2395
2396// Compare 16-bit Unicode strings (which may be malformed UTF-16)
2397// at code point boundaries.
2398// That is, each edge of a match must not be in the middle of a surrogate pair.
2399static inline UBool
2400matches16CPB(const UChar *s, int32_t start, int32_t limit, const UnicodeString &t) {
2401    s+=start;
2402    limit-=start;
2403    int32_t length=t.length();
2404    return 0==t.compare(s, length) &&
2405           !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
2406           !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
2407}
2408
2409// Implement span() with contains() for comparison.
2410static int32_t containsSpanUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2411                                 USetSpanCondition spanCondition) {
2412    const UnicodeSet &realSet(set.getSet());
2413    if(!set.hasStrings()) {
2414        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2415            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2416        }
2417
2418        UChar32 c;
2419        int32_t start=0, prev;
2420        while((prev=start)<length) {
2421            U16_NEXT(s, start, length, c);
2422            if(realSet.contains(c)!=spanCondition) {
2423                break;
2424            }
2425        }
2426        return prev;
2427    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2428        UnicodeSetWithStringsIterator iter(set);
2429        UChar32 c;
2430        int32_t start, next;
2431        for(start=next=0; start<length;) {
2432            U16_NEXT(s, next, length, c);
2433            if(realSet.contains(c)) {
2434                break;
2435            }
2436            const UnicodeString *str;
2437            iter.reset();
2438            while((str=iter.nextString())!=NULL) {
2439                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2440                    // spanNeedsStrings=TRUE;
2441                    return start;
2442                }
2443            }
2444            start=next;
2445        }
2446        return start;
2447    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2448        UnicodeSetWithStringsIterator iter(set);
2449        UChar32 c;
2450        int32_t start, next, maxSpanLimit=0;
2451        for(start=next=0; start<length;) {
2452            U16_NEXT(s, next, length, c);
2453            if(!realSet.contains(c)) {
2454                next=start;  // Do not span this single, not-contained code point.
2455            }
2456            const UnicodeString *str;
2457            iter.reset();
2458            while((str=iter.nextString())!=NULL) {
2459                if(str->length()<=(length-start) && matches16CPB(s, start, length, *str)) {
2460                    // spanNeedsStrings=TRUE;
2461                    int32_t matchLimit=start+str->length();
2462                    if(matchLimit==length) {
2463                        return length;
2464                    }
2465                    if(spanCondition==USET_SPAN_CONTAINED) {
2466                        // Iterate for the shortest match at each position.
2467                        // Recurse for each but the shortest match.
2468                        if(next==start) {
2469                            next=matchLimit;  // First match from start.
2470                        } else {
2471                            if(matchLimit<next) {
2472                                // Remember shortest match from start for iteration.
2473                                int32_t temp=next;
2474                                next=matchLimit;
2475                                matchLimit=temp;
2476                            }
2477                            // Recurse for non-shortest match from start.
2478                            int32_t spanLength=containsSpanUTF16(set, s+matchLimit, length-matchLimit,
2479                                                                 USET_SPAN_CONTAINED);
2480                            if((matchLimit+spanLength)>maxSpanLimit) {
2481                                maxSpanLimit=matchLimit+spanLength;
2482                                if(maxSpanLimit==length) {
2483                                    return length;
2484                                }
2485                            }
2486                        }
2487                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
2488                        if(matchLimit>next) {
2489                            // Remember longest match from start.
2490                            next=matchLimit;
2491                        }
2492                    }
2493                }
2494            }
2495            if(next==start) {
2496                break;  // No match from start.
2497            }
2498            start=next;
2499        }
2500        if(start>maxSpanLimit) {
2501            return start;
2502        } else {
2503            return maxSpanLimit;
2504        }
2505    }
2506}
2507
2508static int32_t containsSpanBackUTF16(const UnicodeSetWithStrings &set, const UChar *s, int32_t length,
2509                                     USetSpanCondition spanCondition) {
2510    if(length==0) {
2511        return 0;
2512    }
2513    const UnicodeSet &realSet(set.getSet());
2514    if(!set.hasStrings()) {
2515        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2516            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2517        }
2518
2519        UChar32 c;
2520        int32_t prev=length;
2521        do {
2522            U16_PREV(s, 0, length, c);
2523            if(realSet.contains(c)!=spanCondition) {
2524                break;
2525            }
2526        } while((prev=length)>0);
2527        return prev;
2528    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
2529        UnicodeSetWithStringsIterator iter(set);
2530        UChar32 c;
2531        int32_t prev=length, length0=length;
2532        do {
2533            U16_PREV(s, 0, length, c);
2534            if(realSet.contains(c)) {
2535                break;
2536            }
2537            const UnicodeString *str;
2538            iter.reset();
2539            while((str=iter.nextString())!=NULL) {
2540                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2541                    // spanNeedsStrings=TRUE;
2542                    return prev;
2543                }
2544            }
2545        } while((prev=length)>0);
2546        return prev;
2547    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
2548        UnicodeSetWithStringsIterator iter(set);
2549        UChar32 c;
2550        int32_t prev=length, minSpanStart=length, length0=length;
2551        do {
2552            U16_PREV(s, 0, length, c);
2553            if(!realSet.contains(c)) {
2554                length=prev;  // Do not span this single, not-contained code point.
2555            }
2556            const UnicodeString *str;
2557            iter.reset();
2558            while((str=iter.nextString())!=NULL) {
2559                if(str->length()<=prev && matches16CPB(s, prev-str->length(), length0, *str)) {
2560                    // spanNeedsStrings=TRUE;
2561                    int32_t matchStart=prev-str->length();
2562                    if(matchStart==0) {
2563                        return 0;
2564                    }
2565                    if(spanCondition==USET_SPAN_CONTAINED) {
2566                        // Iterate for the shortest match at each position.
2567                        // Recurse for each but the shortest match.
2568                        if(length==prev) {
2569                            length=matchStart;  // First match from prev.
2570                        } else {
2571                            if(matchStart>length) {
2572                                // Remember shortest match from prev for iteration.
2573                                int32_t temp=length;
2574                                length=matchStart;
2575                                matchStart=temp;
2576                            }
2577                            // Recurse for non-shortest match from prev.
2578                            int32_t spanStart=containsSpanBackUTF16(set, s, matchStart,
2579                                                                    USET_SPAN_CONTAINED);
2580                            if(spanStart<minSpanStart) {
2581                                minSpanStart=spanStart;
2582                                if(minSpanStart==0) {
2583                                    return 0;
2584                                }
2585                            }
2586                        }
2587                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
2588                        if(matchStart<length) {
2589                            // Remember longest match from prev.
2590                            length=matchStart;
2591                        }
2592                    }
2593                }
2594            }
2595            if(length==prev) {
2596                break;  // No match from prev.
2597            }
2598        } while((prev=length)>0);
2599        if(prev<minSpanStart) {
2600            return prev;
2601        } else {
2602            return minSpanStart;
2603        }
2604    }
2605}
2606
/*
 * Straightforward reference implementation of a forward span over UTF-8 text.
 * getSpans() uses it for the "containsUTF8" span types, and testSpan() compares
 * the resulting limits with the ones computed via UnicodeSet::spanUTF8().
 *
 * set:           the set together with its strings
 * s:             UTF-8 text
 * length:        number of bytes in s; it is used directly as the loop limit,
 *                so callers pass a non-negative value
 *                (getSpans() resolves length<0 before calling)
 * spanCondition: USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
 *
 * Returns the byte offset (0..length) at which the span ends.
 * Ill-formed byte sequences, for which U8_NEXT() delivers c<0, are treated as U+FFFD.
 */
static int32_t containsSpanUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                USetSpanCondition spanCondition) {
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: Only single code points matter, and
        // CONTAINED and SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        int32_t start=0, prev;
        // prev trails start by one code point: It is the offset before the
        // code point that is tested in this iteration.
        while((prev=start)<length) {
            U8_NEXT(s, start, length, c);
            if(c<0) {
                c=0xfffd;
            }
            // contains() is compared with the 0/1 span condition.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        }
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Span while neither the next code point is in the set
        // nor any of the set's strings starts at the current position.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t start, next;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Entries with length8==0 are skipped
                // (presumably strings without a UTF-8 form - confirm with nextUTF8()).
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return start;
                }
            }
            start=next;
        }
        return start;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // maxSpanLimit is the farthest limit reached by any recursive call.
        int32_t start, next, maxSpanLimit=0;
        for(start=next=0; start<length;) {
            U8_NEXT(s, next, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                next=start;  // Do not span this single, not-contained code point.
            }
            // From here on, next==start means "nothing matched at start so far".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=(length-start) && 0==memcmp(s+start, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchLimit=start+length8;
                    if(matchLimit==length) {
                        // This match reaches the end of the text: The span cannot get longer.
                        return length;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(next==start) {
                            next=matchLimit;  // First match from start.
                        } else {
                            if(matchLimit<next) {
                                // Remember shortest match from start for iteration.
                                // After the swap, matchLimit holds the longer of the two limits.
                                int32_t temp=next;
                                next=matchLimit;
                                matchLimit=temp;
                            }
                            // Recurse for non-shortest match from start.
                            int32_t spanLength=containsSpanUTF8(set, s+matchLimit, length-matchLimit,
                                                                USET_SPAN_CONTAINED);
                            if((matchLimit+spanLength)>maxSpanLimit) {
                                maxSpanLimit=matchLimit+spanLength;
                                if(maxSpanLimit==length) {
                                    return length;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchLimit>next) {
                            // Remember longest match from start.
                            next=matchLimit;
                        }
                    }
                }
            }
            if(next==start) {
                break;  // No match from start.
            }
            start=next;
        }
        // The result is the farther of the iterated limit and the best recursive limit.
        if(start>maxSpanLimit) {
            return start;
        } else {
            return maxSpanLimit;
        }
    }
}
2715
/*
 * Straightforward reference implementation of a backward span over UTF-8 text,
 * the mirror image of containsSpanUTF8().
 * getSpans() uses it for the "containsBackUTF8" span types, and testSpan() compares
 * the resulting limits with the ones computed via UnicodeSet::spanBackUTF8().
 *
 * set:           the set together with its strings
 * s:             UTF-8 text
 * length:        number of bytes in s; the span ends at this offset and is extended
 *                towards offset 0 (getSpans() resolves length<0 before calling)
 * spanCondition: USET_SPAN_NOT_CONTAINED, USET_SPAN_CONTAINED or USET_SPAN_SIMPLE
 *
 * Returns the byte offset (0..length) at which the span starts.
 * Ill-formed byte sequences, for which U8_PREV() delivers c<0, are treated as U+FFFD.
 */
static int32_t containsSpanBackUTF8(const UnicodeSetWithStrings &set, const char *s, int32_t length,
                                    USetSpanCondition spanCondition) {
    // The do-while loops below read at least one code point, so handle empty text here.
    if(length==0) {
        return 0;
    }
    const UnicodeSet &realSet(set.getSet());
    if(!set.hasStrings()) {
        // No strings: Only single code points matter, and
        // CONTAINED and SIMPLE behave the same.
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
        }

        UChar32 c;
        // prev is the offset after the code point that is tested in this iteration;
        // U8_PREV() moves length to the offset before it.
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            // contains() is compared with the 0/1 span condition.
            if(realSet.contains(c)!=spanCondition) {
                break;
            }
        } while((prev=length)>0);
        return prev;
    } else if(spanCondition==USET_SPAN_NOT_CONTAINED) {
        // Span backward while neither the previous code point is in the set
        // nor any of the set's strings ends at the current position.
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        int32_t prev=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(realSet.contains(c)) {
                break;
            }
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                // Entries with length8==0 are skipped
                // (presumably strings without a UTF-8 form - confirm with nextUTF8()).
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    return prev;
                }
            }
        } while((prev=length)>0);
        return prev;
    } else /* USET_SPAN_CONTAINED or USET_SPAN_SIMPLE */ {
        UnicodeSetWithStringsIterator iter(set);
        UChar32 c;
        // minSpanStart is the smallest start offset reached by any recursive call.
        int32_t prev=length, minSpanStart=length;
        do {
            U8_PREV(s, 0, length, c);
            if(c<0) {
                c=0xfffd;
            }
            if(!realSet.contains(c)) {
                length=prev;  // Do not span this single, not-contained code point.
            }
            // From here on, length==prev means "nothing matched before prev so far".
            const char *s8;
            int32_t length8;
            iter.reset();
            while((s8=iter.nextUTF8(length8))!=NULL) {
                if(length8!=0 && length8<=prev && 0==memcmp(s+prev-length8, s8, length8)) {
                    // spanNeedsStrings=TRUE;
                    int32_t matchStart=prev-length8;
                    if(matchStart==0) {
                        // This match reaches the start of the text: The span cannot get longer.
                        return 0;
                    }
                    if(spanCondition==USET_SPAN_CONTAINED) {
                        // Iterate for the shortest match at each position.
                        // Recurse for each but the shortest match.
                        if(length==prev) {
                            length=matchStart;  // First match from prev.
                        } else {
                            if(matchStart>length) {
                                // Remember shortest match from prev for iteration.
                                // After the swap, matchStart holds the start of the longer match.
                                int32_t temp=length;
                                length=matchStart;
                                matchStart=temp;
                            }
                            // Recurse for non-shortest match from prev.
                            int32_t spanStart=containsSpanBackUTF8(set, s, matchStart,
                                                                   USET_SPAN_CONTAINED);
                            if(spanStart<minSpanStart) {
                                minSpanStart=spanStart;
                                if(minSpanStart==0) {
                                    return 0;
                                }
                            }
                        }
                    } else /* spanCondition==USET_SPAN_SIMPLE */ {
                        if(matchStart<length) {
                            // Remember longest match from prev.
                            length=matchStart;
                        }
                    }
                }
            }
            if(length==prev) {
                break;  // No match from prev.
            }
        } while((prev=length)>0);
        // The result is the smaller of the iterated start and the best recursive start.
        if(prev<minSpanStart) {
            return prev;
        } else {
            return minSpanStart;
        }
    }
}
2825
// Bit flags that select the spans to be performed and compared.
// Each group has one bit per alternative, followed by a mask that combines the group's bits.
enum {
    SPAN_UTF16          =1,     // test UTF-16 text
    SPAN_UTF8           =2,     // test UTF-8 text
    SPAN_UTFS           =3,     // mask: SPAN_UTF16|SPAN_UTF8

    SPAN_SET            =4,     // use the original, uncomplemented sets
    SPAN_COMPLEMENT     =8,     // use the complemented sets
    SPAN_POLARITY       =0xc,   // mask: SPAN_SET|SPAN_COMPLEMENT

    SPAN_FWD            =0x10,  // span forward
    SPAN_BACK           =0x20,  // span backward
    SPAN_DIRS           =0x30,  // mask: SPAN_FWD|SPAN_BACK

    SPAN_CONTAINED      =0x100, // use USET_SPAN_CONTAINED
    SPAN_SIMPLE         =0x200, // use USET_SPAN_SIMPLE
    SPAN_CONDITION      =0x300, // mask: SPAN_CONTAINED|SPAN_SIMPLE

    SPAN_ALL            =0x33f  // all of the above bits
};
2846
2847static inline USetSpanCondition invertSpanCondition(USetSpanCondition spanCondition, USetSpanCondition contained) {
2848    return spanCondition == USET_SPAN_NOT_CONTAINED ? contained : USET_SPAN_NOT_CONTAINED;
2849}
2850
2851static inline int32_t slen(const void *s, UBool isUTF16) {
2852    return isUTF16 ? u_strlen((const UChar *)s) : strlen((const char *)s);
2853}
2854
/*
 * Count spans on a string with the method according to type and set the span limits.
 * The set may be the complement of the original.
 * When using spanBack() and comparing with span(), use a span condition for the first spanBack()
 * according to the expected number of spans.
 * Sets typeName to an empty string if there is no such type.
 * Returns -1 if the span option is filtered out.
 *
 * type selects the method:
 *   0, 1: forward, using the reference functions containsSpanUTF16()/containsSpanUTF8()
 *   2, 3: forward, using UnicodeSet::span()/spanUTF8()
 *   4, 5: backward, using the reference functions containsSpanBackUTF16()/containsSpanBackUTF8()
 *   6, 7: backward, using UnicodeSet::spanBack()/spanBackUTF8()
 * Even types use USET_SPAN_CONTAINED, odd types use USET_SPAN_SIMPLE.
 *
 * s, length, isUTF16: the text, as UChars if isUTF16 or else as bytes;
 *   length<0 means NUL-terminated.
 * whichSpans: SPAN_xyz bit flags; only the direction and condition bits are checked here.
 * limits[], limitsCapacity: receive the limit of each span, in text order.
 * expectCount: the number of spans found by the forward iteration;
 *   only its parity is used, and only for backward types when SPAN_FWD is also set.
 *
 * Otherwise returns the number of spans, which may exceed limitsCapacity;
 * in that case not all of the limits are stored.
 * Returns 0 (with an empty typeName) for a type outside 0..7.
 */
static int32_t getSpans(const UnicodeSetWithStrings &set, UBool isComplement,
                        const void *s, int32_t length, UBool isUTF16,
                        uint32_t whichSpans,
                        int type, const char *&typeName,
                        int32_t limits[], int32_t limitsCapacity,
                        int32_t expectCount) {
    const UnicodeSet &realSet(set.getSet());
    int32_t start, count;
    USetSpanCondition spanCondition, firstSpanCondition, contained;
    UBool isForward;

    // The empty typeName tells the caller that it has tried all types.
    if(type<0 || 7<type) {
        typeName="";
        return 0;
    }

    // Names for failure messages, indexed by type.
    static const char *const typeNames16[]={
        "contains", "contains(LM)",
        "span", "span(LM)",
        "containsBack", "containsBack(LM)",
        "spanBack", "spanBack(LM)"
    };

    static const char *const typeNames8[]={
        "containsUTF8", "containsUTF8(LM)",
        "spanUTF8", "spanUTF8(LM)",
        "containsBackUTF8", "containsBackUTF8(LM)", // not implemented
        "spanBackUTF8", "spanBackUTF8(LM)"
    };

    typeName= isUTF16 ? typeNames16[type] : typeNames8[type];

    // filter span options
    if(type<=3) {
        // span forward
        if((whichSpans&SPAN_FWD)==0) {
            return -1;
        }
        isForward=TRUE;
    } else {
        // span backward
        if((whichSpans&SPAN_BACK)==0) {
            return -1;
        }
        isForward=FALSE;
    }
    if((type&1)==0) {
        // use USET_SPAN_CONTAINED
        if((whichSpans&SPAN_CONTAINED)==0) {
            return -1;
        }
        contained=USET_SPAN_CONTAINED;
    } else {
        // use USET_SPAN_SIMPLE
        if((whichSpans&SPAN_SIMPLE)==0) {
            return -1;
        }
        contained=USET_SPAN_SIMPLE;
    }

    // Default first span condition for going forward with an uncomplemented set.
    spanCondition=USET_SPAN_NOT_CONTAINED;
    if(isComplement) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    // First span condition for span(), used to terminate the spanBack() iteration.
    firstSpanCondition=spanCondition;

    // spanBack(): Its initial span condition is span()'s last span condition,
    // which is the opposite of span()'s first span condition
    // if we expect an even number of spans.
    // (The loop inverts spanCondition (expectCount-1) times
    // before the expectCount'th span() call.)
    // If we do not compare forward and backward directions, then we do not have an
    // expectCount and just start with firstSpanCondition.
    if(!isForward && (whichSpans&SPAN_FWD)!=0 && (expectCount&1)==0) {
        spanCondition=invertSpanCondition(spanCondition, contained);
    }

    count=0;
    switch(type) {
    case 0:
    case 1:
        // Forward with the reference implementation, which needs an explicit length.
        start=0;
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            start+= isUTF16 ? containsSpanUTF16(set, (const UChar *)s+start, length-start, spanCondition) :
                              containsSpanUTF8(set, (const char *)s+start, length-start, spanCondition);
            // Keep counting even when limits[] is full so that the caller can see the overflow.
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            if(start>=length) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 2:
    case 3:
        // Forward with UnicodeSet::span(); a negative length is passed through
        // so that NUL-terminated input is exercised as well.
        start=0;
        for(;;) {
            start+= isUTF16 ? realSet.span((const UChar *)s+start, length>=0 ? length-start : length, spanCondition) :
                              realSet.spanUTF8((const char *)s+start, length>=0 ? length-start : length, spanCondition);
            if(count<limitsCapacity) {
                limits[count]=start;
            }
            ++count;
            // Stop at the end of the text: at the length limit or at the terminating NUL.
            if(length>=0 ? start>=length :
                           isUTF16 ? ((const UChar *)s)[start]==0 :
                                     ((const char *)s)[start]==0
            ) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        break;
    case 4:
    case 5:
        // Backward with the reference implementation, which needs an explicit length.
        if(length<0) {
            length=slen(s, isUTF16);
        }
        for(;;) {
            ++count;
            // Fill limits[] from its end because the spans are found in reverse text order.
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]=length;
            }
            length= isUTF16 ? containsSpanBackUTF16(set, (const UChar *)s, length, spanCondition) :
                              containsSpanBackUTF8(set, (const char *)s, length, spanCondition);
            // Done when the start of the text is reached with span()'s first condition,
            // so that the backward iteration yields the same spans as the forward one.
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        // Move the limits to the front of the array unless it is exactly full or overflowed.
        if(count<limitsCapacity) {
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    case 6:
    case 7:
        // Backward with UnicodeSet::spanBack().
        for(;;) {
            ++count;
            if(count<=limitsCapacity) {
                limits[limitsCapacity-count]= length >=0 ? length : slen(s, isUTF16);
            }
            // Note: Length<0 is tested only for the first spanBack().
            // If we wanted to keep length<0 for all spanBack()s, we would have to
            // temporarily modify the string by placing a NUL where the previous spanBack() stopped.
            length= isUTF16 ? realSet.spanBack((const UChar *)s, length, spanCondition) :
                              realSet.spanBackUTF8((const char *)s, length, spanCondition);
            if(length==0 && spanCondition==firstSpanCondition) {
                break;
            }
            spanCondition=invertSpanCondition(spanCondition, contained);
        }
        if(count<limitsCapacity) {
            memmove(limits, limits+(limitsCapacity-count), count*4);
        }
        break;
    default:
        // Not reached: Types outside 0..7 were rejected at the top of the function.
        typeName="";
        return -1;
    }

    return count;
}
3032
// sets to be tested; odd index=isComplement
// These values index the sets[] arrays that are passed to the testSpan() functions,
// as well as setNames[].
// NOTE(review): "slow"/"fast" presumably stand for the unfrozen/frozen versions of the set
// that the testSpan() comment mentions - confirm where the sets[] arrays are built.
enum {
    SLOW,
    SLOW_NOT,
    FAST,
    FAST_NOT,
    SET_COUNT   // number of sets, not a set index
};
3041
// Names of the sets for failure messages, indexed by SLOW..FAST_NOT.
static const char *const setNames[SET_COUNT]={
    "slow",
    "slow.not",
    "fast",
    "fast.not"
};
3048
3049/*
3050 * Verify that we get the same results whether we look at text with contains(),
3051 * span() or spanBack(), using unfrozen or frozen versions of the set,
3052 * and using the set or its complement (switching the spanConditions accordingly).
3053 * The latter verifies that
3054 *   set.span(spanCondition) == set.complement().span(!spanCondition).
3055 *
3056 * The expectLimits[] are either provided by the caller (with expectCount>=0)
3057 * or returned to the caller (with an input expectCount<0).
3058 */
3059void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3060                              const void *s, int32_t length, UBool isUTF16,
3061                              uint32_t whichSpans,
3062                              int32_t expectLimits[], int32_t &expectCount,
3063                              const char *testName, int32_t index) {
3064    int32_t limits[500];
3065    int32_t limitsCount;
3066    int i, j;
3067
3068    const char *typeName;
3069    int type;
3070
3071    for(i=0; i<SET_COUNT; ++i) {
3072        if((i&1)==0) {
3073            // Even-numbered sets are original, uncomplemented sets.
3074            if((whichSpans&SPAN_SET)==0) {
3075                continue;
3076            }
3077        } else {
3078            // Odd-numbered sets are complemented.
3079            if((whichSpans&SPAN_COMPLEMENT)==0) {
3080                continue;
3081            }
3082        }
3083        for(type=0;; ++type) {
3084            limitsCount=getSpans(*sets[i], (UBool)(i&1),
3085                                 s, length, isUTF16,
3086                                 whichSpans,
3087                                 type, typeName,
3088                                 limits, LENGTHOF(limits), expectCount);
3089            if(typeName[0]==0) {
3090                break; // All types tried.
3091            }
3092            if(limitsCount<0) {
3093                continue; // Span option filtered out.
3094            }
3095            if(expectCount<0) {
3096                expectCount=limitsCount;
3097                if(limitsCount>LENGTHOF(limits)) {
3098                    errln("FAIL: %s[0x%lx].%s.%s span count=%ld > %ld capacity - too many spans",
3099                          testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)LENGTHOF(limits));
3100                    return;
3101                }
3102                memcpy(expectLimits, limits, limitsCount*4);
3103            } else if(limitsCount!=expectCount) {
3104                errln("FAIL: %s[0x%lx].%s.%s span count=%ld != %ld",
3105                      testName, (long)index, setNames[i], typeName, (long)limitsCount, (long)expectCount);
3106            } else {
3107                for(j=0; j<limitsCount; ++j) {
3108                    if(limits[j]!=expectLimits[j]) {
3109                        errln("FAIL: %s[0x%lx].%s.%s span count=%ld limits[%d]=%ld != %ld",
3110                              testName, (long)index, setNames[i], typeName, (long)limitsCount,
3111                              j, (long)limits[j], (long)expectLimits[j]);
3112                        break;
3113                    }
3114                }
3115            }
3116        }
3117    }
3118
3119    // Compare span() with containsAll()/containsNone(),
3120    // but only if we have expectLimits[] from the uncomplemented set.
3121    if(isUTF16 && (whichSpans&SPAN_SET)!=0) {
3122        const UChar *s16=(const UChar *)s;
3123        UnicodeString string;
3124        int32_t prev=0, limit, length;
3125        for(i=0; i<expectCount; ++i) {
3126            limit=expectLimits[i];
3127            length=limit-prev;
3128            if(length>0) {
3129                string.setTo(FALSE, s16+prev, length);  // read-only alias
3130                if(i&1) {
3131                    if(!sets[SLOW]->getSet().containsAll(string)) {
3132                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3133                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3134                        return;
3135                    }
3136                    if(!sets[FAST]->getSet().containsAll(string)) {
3137                        errln("FAIL: %s[0x%lx].%s.containsAll(%ld..%ld)==FALSE contradicts span()",
3138                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3139                        return;
3140                    }
3141                } else {
3142                    if(!sets[SLOW]->getSet().containsNone(string)) {
3143                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3144                              testName, (long)index, setNames[SLOW], (long)prev, (long)limit);
3145                        return;
3146                    }
3147                    if(!sets[FAST]->getSet().containsNone(string)) {
3148                        errln("FAIL: %s[0x%lx].%s.containsNone(%ld..%ld)==FALSE contradicts span()",
3149                              testName, (long)index, setNames[FAST], (long)prev, (long)limit);
3150                        return;
3151                    }
3152                }
3153            }
3154            prev=limit;
3155        }
3156    }
3157}
3158
3159// Specifically test either UTF-16 or UTF-8.
3160void UnicodeSetTest::testSpan(const UnicodeSetWithStrings *sets[4],
3161                              const void *s, int32_t length, UBool isUTF16,
3162                              uint32_t whichSpans,
3163                              const char *testName, int32_t index) {
3164    int32_t expectLimits[500];
3165    int32_t expectCount=-1;
3166    testSpan(sets, s, length, isUTF16, whichSpans, expectLimits, expectCount, testName, index);
3167}
3168
3169UBool stringContainsUnpairedSurrogate(const UChar *s, int32_t length) {
3170    UChar c, c2;
3171
3172    if(length>=0) {
3173        while(length>0) {
3174            c=*s++;
3175            --length;
3176            if(0xd800<=c && c<0xe000) {
3177                if(c>=0xdc00 || length==0 || !U16_IS_TRAIL(c2=*s++)) {
3178                    return TRUE;
3179                }
3180                --length;
3181            }
3182        }
3183    } else {
3184        while((c=*s++)!=0) {
3185            if(0xd800<=c && c<0xe000) {
3186                if(c>=0xdc00 || !U16_IS_TRAIL(c2=*s++)) {
3187                    return TRUE;
3188                }
3189            }
3190        }
3191    }
3192    return FALSE;
3193}
3194
3195// Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
3196// unless either UTF is turned off in whichSpans.
3197// Testing UTF-16 and UTF-8 together requires that surrogate code points
3198// have the same contains(c) value as U+FFFD.
3199void UnicodeSetTest::testSpanBothUTFs(const UnicodeSetWithStrings *sets[4],
3200                                      const UChar *s16, int32_t length16,
3201                                      uint32_t whichSpans,
3202                                      const char *testName, int32_t index) {
3203    int32_t expectLimits[500];
3204    int32_t expectCount;
3205
3206    expectCount=-1;  // Get expectLimits[] from testSpan().
3207
3208    if((whichSpans&SPAN_UTF16)!=0) {
3209        testSpan(sets, s16, length16, TRUE, whichSpans, expectLimits, expectCount, testName, index);
3210    }
3211    if((whichSpans&SPAN_UTF8)==0) {
3212        return;
3213    }
3214
3215    // Convert s16[] and expectLimits[] to UTF-8.
3216    uint8_t s8[3000];
3217    int32_t offsets[3000];
3218
3219    const UChar *s16Limit=s16+length16;
3220    char *t=(char *)s8;
3221    char *tLimit=t+sizeof(s8);
3222    int32_t *o=offsets;
3223    UErrorCode errorCode=U_ZERO_ERROR;
3224
3225    // Convert with substitution: Turn unpaired surrogates into U+FFFD.
3226    ucnv_fromUnicode(openUTF8Converter(), &t, tLimit, &s16, s16Limit, o, TRUE, &errorCode);
3227    if(U_FAILURE(errorCode)) {
3228        errln("FAIL: %s[0x%lx] ucnv_fromUnicode(to UTF-8) fails with %s",
3229              testName, (long)index, u_errorName(errorCode));
3230        ucnv_resetFromUnicode(utf8Cnv);
3231        return;
3232    }
3233    int32_t length8=(int32_t)(t-(char *)s8);
3234
3235    // Convert expectLimits[].
3236    int32_t i, j, expect;
3237    for(i=j=0; i<expectCount; ++i) {
3238        expect=expectLimits[i];
3239        if(expect==length16) {
3240            expectLimits[i]=length8;
3241        } else {
3242            while(offsets[j]<expect) {
3243                ++j;
3244            }
3245            expectLimits[i]=j;
3246        }
3247    }
3248
3249    testSpan(sets, s8, length8, FALSE, whichSpans, expectLimits, expectCount, testName, index);
3250}
3251
3252static UChar32 nextCodePoint(UChar32 c) {
3253    // Skip some large and boring ranges.
3254    switch(c) {
3255    case 0x3441:
3256        return 0x4d7f;
3257    case 0x5100:
3258        return 0x9f00;
3259    case 0xb040:
3260        return 0xd780;
3261    case 0xe041:
3262        return 0xf8fe;
3263    case 0x10100:
3264        return 0x20000;
3265    case 0x20041:
3266        return 0xe0000;
3267    case 0xe0101:
3268        return 0x10fffd;
3269    default:
3270        return c+1;
3271    }
3272}
3273
3274// Verify that all implementations represent the same set.
3275void UnicodeSetTest::testSpanContents(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3276    // contains(U+FFFD) is inconsistent with contains(some surrogates),
3277    // or the set contains strings with unpaired surrogates which don't translate to valid UTF-8:
3278    // Skip the UTF-8 part of the test - if the string contains surrogates -
3279    // because it is likely to produce a different result.
3280    UBool inconsistentSurrogates=
3281            (!(sets[0]->getSet().contains(0xfffd) ?
3282               sets[0]->getSet().contains(0xd800, 0xdfff) :
3283               sets[0]->getSet().containsNone(0xd800, 0xdfff)) ||
3284             sets[0]->hasStringsWithSurrogates());
3285
3286    UChar s[1000];
3287    int32_t length=0;
3288    uint32_t localWhichSpans;
3289
3290    UChar32 c, first;
3291    for(first=c=0;; c=nextCodePoint(c)) {
3292        if(c>0x10ffff || length>(LENGTHOF(s)-U16_MAX_LENGTH)) {
3293            localWhichSpans=whichSpans;
3294            if(stringContainsUnpairedSurrogate(s, length) && inconsistentSurrogates) {
3295                localWhichSpans&=~SPAN_UTF8;
3296            }
3297            testSpanBothUTFs(sets, s, length, localWhichSpans, testName, first);
3298            if(c>0x10ffff) {
3299                break;
3300            }
3301            length=0;
3302            first=c;
3303        }
3304        U16_APPEND_UNSAFE(s, length, c);
3305    }
3306}
3307
3308// Test with a particular, interesting string.
3309// Specify length and try NUL-termination.
3310void UnicodeSetTest::testSpanUTF16String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3311    static const UChar s[]={
3312        0x61, 0x62, 0x20,                       // Latin, space
3313        0x3b1, 0x3b2, 0x3b3,                    // Greek
3314        0xd900,                                 // lead surrogate
3315        0x3000, 0x30ab, 0x30ad,                 // wide space, Katakana
3316        0xdc05,                                 // trail surrogate
3317        0xa0, 0xac00, 0xd7a3,                   // nbsp, Hangul
3318        0xd900, 0xdc05,                         // unassigned supplementary
3319        0xd840, 0xdfff, 0xd860, 0xdffe,         // Han supplementary
3320        0xd7a4, 0xdc05, 0xd900, 0x2028,         // unassigned, surrogates in wrong order, LS
3321        0                                       // NUL
3322    };
3323
3324    if((whichSpans&SPAN_UTF16)==0) {
3325        return;
3326    }
3327    testSpan(sets, s, -1, TRUE, (whichSpans&~SPAN_UTF8), testName, 0);
3328    testSpan(sets, s, LENGTHOF(s)-1, TRUE, (whichSpans&~SPAN_UTF8), testName, 1);
3329}
3330
3331void UnicodeSetTest::testSpanUTF8String(const UnicodeSetWithStrings *sets[4], uint32_t whichSpans, const char *testName) {
3332    static const char s[]={
3333        "abc"                                   // Latin
3334
3335        /* trail byte in lead position */
3336        "\x80"
3337
3338        " "                                     // space
3339
3340        /* truncated multi-byte sequences */
3341        "\xd0"
3342        "\xe0"
3343        "\xe1"
3344        "\xed"
3345        "\xee"
3346        "\xf0"
3347        "\xf1"
3348        "\xf4"
3349        "\xf8"
3350        "\xfc"
3351
3352        "\xCE\xB1\xCE\xB2\xCE\xB3"              // Greek
3353
3354        /* trail byte in lead position */
3355        "\x80"
3356
3357        "\xe0\x80"
3358        "\xe0\xa0"
3359        "\xe1\x80"
3360        "\xed\x80"
3361        "\xed\xa0"
3362        "\xee\x80"
3363        "\xf0\x80"
3364        "\xf0\x90"
3365        "\xf1\x80"
3366        "\xf4\x80"
3367        "\xf4\x90"
3368        "\xf8\x80"
3369        "\xfc\x80"
3370
3371        "\xE3\x80\x80\xE3\x82\xAB\xE3\x82\xAD"  // wide space, Katakana
3372
3373        /* trail byte in lead position */
3374        "\x80"
3375
3376        "\xf0\x80\x80"
3377        "\xf0\x90\x80"
3378        "\xf1\x80\x80"
3379        "\xf4\x80\x80"
3380        "\xf4\x90\x80"
3381        "\xf8\x80\x80"
3382        "\xfc\x80\x80"
3383
3384        "\xC2\xA0\xEA\xB0\x80\xED\x9E\xA3"      // nbsp, Hangul
3385
3386        /* trail byte in lead position */
3387        "\x80"
3388
3389        "\xf8\x80\x80\x80"
3390        "\xfc\x80\x80\x80"
3391
3392        "\xF1\x90\x80\x85"                      // unassigned supplementary
3393
3394        /* trail byte in lead position */
3395        "\x80"
3396
3397        "\xfc\x80\x80\x80\x80"
3398
3399        "\xF0\xA0\x8F\xBF\xF0\xA8\x8F\xBE"      // Han supplementary
3400
3401        /* trail byte in lead position */
3402        "\x80"
3403
3404        /* complete sequences but non-shortest forms or out of range etc. */
3405        "\xc0\x80"
3406        "\xe0\x80\x80"
3407        "\xed\xa0\x80"
3408        "\xf0\x80\x80\x80"
3409        "\xf4\x90\x80\x80"
3410        "\xf8\x80\x80\x80\x80"
3411        "\xfc\x80\x80\x80\x80\x80"
3412        "\xfe"
3413        "\xff"
3414
3415        /* trail byte in lead position */
3416        "\x80"
3417
3418        "\xED\x9E\xA4\xE2\x80\xA8"              // unassigned, LS, NUL-terminated
3419    };
3420
3421    if((whichSpans&SPAN_UTF8)==0) {
3422        return;
3423    }
3424    testSpan(sets, s, -1, FALSE, (whichSpans&~SPAN_UTF16), testName, 0);
3425    testSpan(sets, s, LENGTHOF(s)-1, FALSE, (whichSpans&~SPAN_UTF16), testName, 1);
3426}
3427
// Expand an array of span option sets into one portion per alternative.
// Every existing entry is first reduced with mask; the first portion
// (in place) then gets option a. If b!=0, a second portion of the same size
// is written right after the first, with option b instead. If additionally
// c!=0, a third portion follows with option c. c is ignored when b==0.
// The caller must provide room for up to 3*whichSpansCount entries.
// Returns the new number of entries (1x, 2x or 3x whichSpansCount).
static int32_t
addAlternative(uint32_t whichSpans[], int32_t whichSpansCount,
               uint32_t mask, uint32_t a, uint32_t b, uint32_t c) {
    // Number of portions that will exist after this call.
    const int32_t portions= (b==0) ? 1 : ((c==0) ? 2 : 3);

    for(int32_t idx=0; idx<whichSpansCount; ++idx) {
        const uint32_t base=whichSpans[idx]&mask;
        whichSpans[idx]=base|a;
        if(portions>=2) {
            whichSpans[whichSpansCount+idx]=base|b;
        }
        if(portions==3) {
            whichSpans[2*whichSpansCount+idx]=base|c;
        }
    }
    return portions*whichSpansCount;
}
3450
// String-literal building blocks: runs of exactly 63 or 64 'a's or 'b's.
// Each 63-run is written as seven 8-character pieces plus one 7-character
// piece so that the length is easy to verify; each 64-run appends one more
// character. Adjacent string literals are concatenated by the compiler.
#define _63_a "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" \
              "aaaaaaaa" "aaaaaaaa" "aaaaaaaa" "aaaaaaa"
#define _64_a _63_a "a"
#define _63_b "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" \
              "bbbbbbbb" "bbbbbbbb" "bbbbbbbb" "bbbbbbb"
#define _64_b _63_b "b"
3455
3456void UnicodeSetTest::TestSpan() {
3457    // "[...]" is a UnicodeSet pattern.
3458    // "*" performs tests on all Unicode code points and on a selection of
3459    //   malformed UTF-8/16 strings.
3460    // "-options" limits the scope of testing for the current set.
3461    //   By default, the test verifies that equivalent boundaries are found
3462    //   for UTF-16 and UTF-8, going forward and backward,
3463    //   alternating USET_SPAN_NOT_CONTAINED with
3464    //   either USET_SPAN_CONTAINED or USET_SPAN_SIMPLE.
3465    //   Single-character options:
3466    //     8 -- UTF-16 and UTF-8 boundaries may differ.
3467    //          Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
3468    //          or the set contains strings with unpaired surrogates
3469    //          which do not translate to valid UTF-8.
3470    //     c -- set.span() and set.complement().span() boundaries may differ.
3471    //          Cause: Set strings are not complemented.
3472    //     b -- span() and spanBack() boundaries may differ.
3473    //          Cause: Strings in the set overlap, and spanBack(USET_SPAN_CONTAINED)
3474    //          and spanBack(USET_SPAN_SIMPLE) are defined to
3475    //          match with non-overlapping substrings.
3476    //          For example, with a set containing "ab" and "ba",
3477    //          span() of "aba" yields boundaries { 0, 2, 3 }
3478    //          because the initial "ab" matches from 0 to 2,
3479    //          while spanBack() yields boundaries { 0, 1, 3 }
3480    //          because the final "ba" matches from 1 to 3.
3481    //     l -- USET_SPAN_CONTAINED and USET_SPAN_SIMPLE boundaries may differ.
3482    //          Cause: Strings in the set overlap, and a longer match may
3483    //          require a sequence including non-longest substrings.
3484    //          For example, with a set containing "ab", "abc" and "cd",
3485    //          span(contained) of "abcd" spans the entire string
3486    //          but span(longest match) only spans the first 3 characters.
3487    //   Each "-options" first resets all options and then applies the specified options.
3488    //   A "-" without options resets the options.
3489    //   The options are also reset for each new set.
3490    // Other strings will be spanned.
3491    static const char *const testdata[]={
3492        "[:ID_Continue:]",
3493        "*",
3494        "[:White_Space:]",
3495        "*",
3496        "[]",
3497        "*",
3498        "[\\u0000-\\U0010FFFF]",
3499        "*",
3500        "[\\u0000\\u0080\\u0800\\U00010000]",
3501        "*",
3502        "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
3503        "*",
3504        "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]",
3505        "-c",
3506        "*",
3507        "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
3508        "-c",
3509        "*",
3510
3511        // Overlapping strings cause overlapping attempts to match.
3512        "[x{xy}{xya}{axy}{ax}]",
3513        "-cl",
3514
3515        // More repetitions of "xya" would take too long with the recursive
3516        // reference implementation.
3517        // containsAll()=FALSE
3518        // test_string 0x14
3519        "xx"
3520        "xyaxyaxyaxya"  // set.complement().span(longest match) will stop here.
3521        "xx"            // set.complement().span(contained) will stop between the two 'x'es.
3522        "xyaxyaxyaxya"
3523        "xx"
3524        "xyaxyaxyaxya"  // span() ends here.
3525        "aaa",
3526
3527        // containsAll()=TRUE
3528        // test_string 0x15
3529        "xx"
3530        "xyaxyaxyaxya"
3531        "xx"
3532        "xyaxyaxyaxya"
3533        "xx"
3534        "xyaxyaxyaxy",
3535
3536        "-bc",
3537        // test_string 0x17
3538        "byayaxya",  // span() -> { 4, 7, 8 }  spanBack() -> { 5, 8 }
3539        "-c",
3540        "byayaxy",   // span() -> { 4, 7 }     complement.span() -> { 7 }
3541        "byayax",    // span() -> { 4, 6 }     complement.span() -> { 6 }
3542        "-",
3543        "byaya",     // span() -> { 5 }
3544        "byay",      // span() -> { 4 }
3545        "bya",       // span() -> { 3 }
3546
3547        // span(longest match) will not span the whole string.
3548        "[a{ab}{bc}]",
3549        "-cl",
3550        // test_string 0x21
3551        "abc",
3552
3553        "[a{ab}{abc}{cd}]",
3554        "-cl",
3555        "acdabcdabccd",
3556
3557        // spanBack(longest match) will not span the whole string.
3558        "[c{ab}{bc}]",
3559        "-cl",
3560        "abc",
3561
3562        "[d{cd}{bcd}{ab}]",
3563        "-cl",
3564        "abbcdabcdabd",
3565
3566        // Test with non-ASCII set strings - test proper handling of surrogate pairs
3567        // and UTF-8 trail bytes.
3568        // Copies of above test sets and strings, but transliterated to have
3569        // different code points with similar trail units.
3570        // Previous: a      b         c            d
3571        // Unicode:  042B   30AB      200AB        204AB
3572        // UTF-16:   042B   30AB      D840 DCAB    D841 DCAB
3573        // UTF-8:    D0 AB  E3 82 AB  F0 A0 82 AB  F0 A0 92 AB
3574        "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
3575        "-cl",
3576        "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
3577
3578        "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
3579        "-cl",
3580        "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
3581
3582        // Stress bookkeeping and recursion.
3583        // The following strings are barely doable with the recursive
3584        // reference implementation.
3585        // The not-contained character at the end prevents an early exit from the span().
3586        "[b{bb}]",
3587        "-c",
3588        // test_string 0x33
3589        "bbbbbbbbbbbbbbbbbbbbbbbb-",
3590        // On complement sets, span() and spanBack() get different results
3591        // because b is not in the complement set and there is an odd number of b's
3592        // in the test string.
3593        "-bc",
3594        "bbbbbbbbbbbbbbbbbbbbbbbbb-",
3595
3596        // Test with set strings with an initial or final code point span
3597        // longer than 254.
3598        "[a{" _64_a _64_a _64_a _64_a "b}"
3599          "{a" _64_b _64_b _64_b _64_b "}]",
3600        "-c",
3601        _64_a _64_a _64_a _63_a "b",
3602        _64_a _64_a _64_a _64_a "b",
3603        _64_a _64_a _64_a _64_a "aaaabbbb",
3604        "a" _64_b _64_b _64_b _63_b,
3605        "a" _64_b _64_b _64_b _64_b,
3606        "aaaabbbb" _64_b _64_b _64_b _64_b,
3607
3608        // Test with strings containing unpaired surrogates.
3609        // They are not representable in UTF-8, and a leading trail surrogate
3610        // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
3611        // U+20001 == \\uD840\\uDC01
3612        // U+20400 == \\uD841\\uDC00
3613        "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]",
3614        "-8cl",
3615        "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb"
3616    };
3617    uint32_t whichSpans[96]={ SPAN_ALL };
3618    int32_t whichSpansCount=1;
3619
3620    UnicodeSet *sets[SET_COUNT]={ NULL };
3621    const UnicodeSetWithStrings *sets_with_str[SET_COUNT]={ NULL };
3622
3623    char testName[1024];
3624    char *testNameLimit=testName;
3625
3626    int32_t i, j;
3627    for(i=0; i<LENGTHOF(testdata); ++i) {
3628        const char *s=testdata[i];
3629        if(s[0]=='[') {
3630            // Create new test sets from this pattern.
3631            for(j=0; j<SET_COUNT; ++j) {
3632                delete sets_with_str[j];
3633                delete sets[j];
3634            }
3635            UErrorCode errorCode=U_ZERO_ERROR;
3636            sets[SLOW]=new UnicodeSet(UnicodeString(s, -1, US_INV).unescape(), errorCode);
3637            if(U_FAILURE(errorCode)) {
3638                dataerrln("FAIL: Unable to create UnicodeSet(%s) - %s", s, u_errorName(errorCode));
3639                break;
3640            }
3641            sets[SLOW_NOT]=new UnicodeSet(*sets[SLOW]);
3642            sets[SLOW_NOT]->complement();
3643            // Intermediate set: Test cloning of a frozen set.
3644            UnicodeSet *fast=new UnicodeSet(*sets[SLOW]);
3645            fast->freeze();
3646            sets[FAST]=(UnicodeSet *)fast->clone();
3647            delete fast;
3648            UnicodeSet *fastNot=new UnicodeSet(*sets[SLOW_NOT]);
3649            fastNot->freeze();
3650            sets[FAST_NOT]=(UnicodeSet *)fastNot->clone();
3651            delete fastNot;
3652
3653            for(j=0; j<SET_COUNT; ++j) {
3654                sets_with_str[j]=new UnicodeSetWithStrings(*sets[j]);
3655            }
3656
3657            strcpy(testName, s);
3658            testNameLimit=strchr(testName, 0);
3659            *testNameLimit++=':';
3660            *testNameLimit=0;
3661
3662            whichSpans[0]=SPAN_ALL;
3663            whichSpansCount=1;
3664        } else if(s[0]=='-') {
3665            whichSpans[0]=SPAN_ALL;
3666            whichSpansCount=1;
3667
3668            while(*++s!=0) {
3669                switch(*s) {
3670                case 'c':
3671                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3672                                                   ~SPAN_POLARITY,
3673                                                   SPAN_SET,
3674                                                   SPAN_COMPLEMENT,
3675                                                   0);
3676                    break;
3677                case 'b':
3678                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3679                                                   ~SPAN_DIRS,
3680                                                   SPAN_FWD,
3681                                                   SPAN_BACK,
3682                                                   0);
3683                    break;
3684                case 'l':
3685                    // test USET_SPAN_CONTAINED FWD & BACK, and separately
3686                    // USET_SPAN_SIMPLE only FWD, and separately
3687                    // USET_SPAN_SIMPLE only BACK
3688                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3689                                                   ~(SPAN_DIRS|SPAN_CONDITION),
3690                                                   SPAN_DIRS|SPAN_CONTAINED,
3691                                                   SPAN_FWD|SPAN_SIMPLE,
3692                                                   SPAN_BACK|SPAN_SIMPLE);
3693                    break;
3694                case '8':
3695                    whichSpansCount=addAlternative(whichSpans, whichSpansCount,
3696                                                   ~SPAN_UTFS,
3697                                                   SPAN_UTF16,
3698                                                   SPAN_UTF8,
3699                                                   0);
3700                    break;
3701                default:
3702                    errln("FAIL: unrecognized span set option in \"%s\"", testdata[i]);
3703                    break;
3704                }
3705            }
3706        } else if(0==strcmp(s, "*")) {
3707            strcpy(testNameLimit, "bad_string");
3708            for(j=0; j<whichSpansCount; ++j) {
3709                if(whichSpansCount>1) {
3710                    sprintf(testNameLimit+10 /* strlen("bad_string") */,
3711                            "%%0x%3x",
3712                            whichSpans[j]);
3713                }
3714                testSpanUTF16String(sets_with_str, whichSpans[j], testName);
3715                testSpanUTF8String(sets_with_str, whichSpans[j], testName);
3716            }
3717
3718            strcpy(testNameLimit, "contents");
3719            for(j=0; j<whichSpansCount; ++j) {
3720                if(whichSpansCount>1) {
3721                    sprintf(testNameLimit+8 /* strlen("contents") */,
3722                            "%%0x%3x",
3723                            whichSpans[j]);
3724                }
3725                testSpanContents(sets_with_str, whichSpans[j], testName);
3726            }
3727        } else {
3728            UnicodeString string=UnicodeString(s, -1, US_INV).unescape();
3729            strcpy(testNameLimit, "test_string");
3730            for(j=0; j<whichSpansCount; ++j) {
3731                if(whichSpansCount>1) {
3732                    sprintf(testNameLimit+11 /* strlen("test_string") */,
3733                            "%%0x%3x",
3734                            whichSpans[j]);
3735                }
3736                testSpanBothUTFs(sets_with_str, string.getBuffer(), string.length(), whichSpans[j], testName, i);
3737            }
3738        }
3739    }
3740    for(j=0; j<SET_COUNT; ++j) {
3741        delete sets_with_str[j];
3742        delete sets[j];
3743    }
3744}
3745
3746// Test select patterns and strings, and test USET_SPAN_SIMPLE.
3747void UnicodeSetTest::TestStringSpan() {
3748    static const char *pattern="[x{xy}{xya}{axy}{ax}]";
3749    static const char *const string=
3750        "xx"
3751        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3752        "xx"
3753        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya"
3754        "xx"
3755        "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy"
3756        "aaaa";
3757
3758    UErrorCode errorCode=U_ZERO_ERROR;
3759    UnicodeString pattern16=UnicodeString(pattern, -1, US_INV);
3760    UnicodeSet set(pattern16, errorCode);
3761    if(U_FAILURE(errorCode)) {
3762        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3763        return;
3764    }
3765
3766    UnicodeString string16=UnicodeString(string, -1, US_INV).unescape();
3767
3768    if(set.containsAll(string16)) {
3769        errln("FAIL: UnicodeSet(%s).containsAll(%s) should be FALSE", pattern, string);
3770    }
3771
3772    // Remove trailing "aaaa".
3773    string16.truncate(string16.length()-4);
3774    if(!set.containsAll(string16)) {
3775        errln("FAIL: UnicodeSet(%s).containsAll(%s[:-4]) should be TRUE", pattern, string);
3776    }
3777
3778    string16=UNICODE_STRING_SIMPLE("byayaxya");
3779    const UChar *s16=string16.getBuffer();
3780    int32_t length16=string16.length();
3781    if( set.span(s16, 8, USET_SPAN_NOT_CONTAINED)!=4 ||
3782        set.span(s16, 7, USET_SPAN_NOT_CONTAINED)!=4 ||
3783        set.span(s16, 6, USET_SPAN_NOT_CONTAINED)!=4 ||
3784        set.span(s16, 5, USET_SPAN_NOT_CONTAINED)!=5 ||
3785        set.span(s16, 4, USET_SPAN_NOT_CONTAINED)!=4 ||
3786        set.span(s16, 3, USET_SPAN_NOT_CONTAINED)!=3
3787    ) {
3788        errln("FAIL: UnicodeSet(%s).span(while not) returns the wrong value", pattern);
3789    }
3790
3791    pattern="[a{ab}{abc}{cd}]";
3792    pattern16=UnicodeString(pattern, -1, US_INV);
3793    set.applyPattern(pattern16, errorCode);
3794    if(U_FAILURE(errorCode)) {
3795        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3796        return;
3797    }
3798    string16=UNICODE_STRING_SIMPLE("acdabcdabccd");
3799    s16=string16.getBuffer();
3800    length16=string16.length();
3801    if( set.span(s16, 12, USET_SPAN_CONTAINED)!=12 ||
3802        set.span(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3803        set.span(s16+7, 5, USET_SPAN_SIMPLE)!=5
3804    ) {
3805        errln("FAIL: UnicodeSet(%s).span(while longest match) returns the wrong value", pattern);
3806    }
3807
3808    pattern="[d{cd}{bcd}{ab}]";
3809    pattern16=UnicodeString(pattern, -1, US_INV);
3810    set.applyPattern(pattern16, errorCode).freeze();
3811    if(U_FAILURE(errorCode)) {
3812        errln("FAIL: Unable to create UnicodeSet(%s) - %s", pattern, u_errorName(errorCode));
3813        return;
3814    }
3815    string16=UNICODE_STRING_SIMPLE("abbcdabcdabd");
3816    s16=string16.getBuffer();
3817    length16=string16.length();
3818    if( set.spanBack(s16, 12, USET_SPAN_CONTAINED)!=0 ||
3819        set.spanBack(s16, 12, USET_SPAN_SIMPLE)!=6 ||
3820        set.spanBack(s16, 5, USET_SPAN_SIMPLE)!=0
3821    ) {
3822        errln("FAIL: UnicodeSet(%s).spanBack(while longest match) returns the wrong value", pattern);
3823    }
3824}
3825