1/*
2**********************************************************************
3* Copyright (c) 2002-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5**********************************************************************
6*/
7#include "unicode/uset.h"
8#include "unicode/ustring.h"
9#include "cintltst.h"
10#include "cmemory.h"
11#include <stdlib.h>
12#include <string.h>
13
14#define TEST(x) addTest(root, &x, "uset/" # x)
15
16static void TestAPI(void);
17static void Testj2269(void);
18static void TestSerialized(void);
19static void TestNonInvariantPattern(void);
20static void TestBadPattern(void);
21static void TestFreezable(void);
22static void TestSpan(void);
23
24void addUSetTest(TestNode** root);
25
26static void expect(const USet* set,
27                   const char* inList,
28                   const char* outList,
29                   UErrorCode* ec);
30static void expectContainment(const USet* set,
31                              const char* list,
32                              UBool isIn);
33static char oneUCharToChar(UChar32 c);
34static void expectItems(const USet* set,
35                        const char* items);
36
37void
38addUSetTest(TestNode** root) {
39    TEST(TestAPI);
40    TEST(Testj2269);
41    TEST(TestSerialized);
42    TEST(TestNonInvariantPattern);
43    TEST(TestBadPattern);
44    TEST(TestFreezable);
45    TEST(TestSpan);
46}
47
48/*------------------------------------------------------------------
49 * Tests
50 *------------------------------------------------------------------*/
51
52static void Testj2269() {
53  UErrorCode status = U_ZERO_ERROR;
54  UChar a[4] = { 0x61, 0x62, 0x63, 0 };
55  USet *s = uset_open(1, 0);
56  uset_addString(s, a, 3);
57  a[0] = 0x63; a[1] = 0x63;
58  expect(s, "{abc}", "{ccc}", &status);
59  uset_close(s);
60}
61
62static const UChar PAT[] = {91,97,45,99,123,97,98,125,93,0}; /* "[a-c{ab}]" */
63static const int32_t PAT_LEN = (sizeof(PAT) / sizeof(PAT[0])) - 1;
64
65static const UChar PAT_lb[] = {0x6C, 0x62, 0}; /* "lb" */
66static const int32_t PAT_lb_LEN = (sizeof(PAT_lb) / sizeof(PAT_lb[0])) - 1;
67
68static const UChar VAL_SP[] = {0x53, 0x50, 0}; /* "SP" */
69static const int32_t VAL_SP_LEN = (sizeof(VAL_SP) / sizeof(VAL_SP[0])) - 1;
70
71static const UChar STR_bc[] = {98,99,0}; /* "bc" */
72static const int32_t STR_bc_LEN = (sizeof(STR_bc) / sizeof(STR_bc[0])) - 1;
73
74static const UChar STR_ab[] = {97,98,0}; /* "ab" */
75static const int32_t STR_ab_LEN = (sizeof(STR_ab) / sizeof(STR_ab[0])) - 1;
76
77/**
78 * Basic API test for uset.x
79 */
80static void TestAPI() {
81    USet* set;
82    USet* set2;
83    UErrorCode ec;
84
85    /* [] */
86    set = uset_openEmpty();
87    expect(set, "", "abc{ab}", NULL);
88    uset_close(set);
89
90    set = uset_open(1, 0);
91    expect(set, "", "abc{ab}", NULL);
92    uset_close(set);
93
94    set = uset_open(1, 1);
95    uset_clear(set);
96    expect(set, "", "abc{ab}", NULL);
97    uset_close(set);
98
99    /* [ABC] */
100    set = uset_open(0x0041, 0x0043);
101    expect(set, "ABC", "DEF{ab}", NULL);
102    uset_close(set);
103
104    /* [a-c{ab}] */
105    ec = U_ZERO_ERROR;
106    set = uset_openPattern(PAT, PAT_LEN, &ec);
107    if(U_FAILURE(ec)) {
108        log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec));
109        return;
110    }
111    if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
112        log_err("uset_resemblesPattern of PAT failed\n");
113    }
114    expect(set, "abc{ab}", "def{bc}", &ec);
115
116    /* [a-d{ab}] */
117    uset_add(set, 0x64);
118    expect(set, "abcd{ab}", "ef{bc}", NULL);
119
120    /* [acd{ab}{bc}] */
121    uset_remove(set, 0x62);
122    uset_addString(set, STR_bc, STR_bc_LEN);
123    expect(set, "acd{ab}{bc}", "bef{cd}", NULL);
124
125    /* [acd{bc}] */
126    uset_removeString(set, STR_ab, STR_ab_LEN);
127    expect(set, "acd{bc}", "bfg{ab}", NULL);
128
129    /* [^acd{bc}] */
130    uset_complement(set);
131    expect(set, "bef{bc}", "acd{ac}", NULL);
132
133    /* [a-e{bc}] */
134    uset_complement(set);
135    uset_addRange(set, 0x0062, 0x0065);
136    expect(set, "abcde{bc}", "fg{ab}", NULL);
137
138    /* [de{bc}] */
139    uset_removeRange(set, 0x0050, 0x0063);
140    expect(set, "de{bc}", "bcfg{ab}", NULL);
141
142    /* [g-l] */
143    uset_set(set, 0x0067, 0x006C);
144    expect(set, "ghijkl", "de{bc}", NULL);
145
146    if (uset_indexOf(set, 0x0067) != 0) {
147        log_err("uset_indexOf failed finding correct index of 'g'\n");
148    }
149
150    if (uset_charAt(set, 0) != 0x0067) {
151        log_err("uset_charAt failed finding correct char 'g' at index 0\n");
152    }
153
154    /* How to test this one...? */
155    uset_compact(set);
156
157    /* [g-i] */
158    uset_retain(set, 0x0067, 0x0069);
159    expect(set, "ghi", "dejkl{bc}", NULL);
160
161    /* UCHAR_ASCII_HEX_DIGIT */
162    uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec);
163    if(U_FAILURE(ec)) {
164        log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec));
165        return;
166    }
167    expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);
168
169    /* [ab] */
170    uset_clear(set);
171    uset_addAllCodePoints(set, STR_ab, STR_ab_LEN);
172    expect(set, "ab", "def{ab}", NULL);
173    if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){
174        log_err("set should not conatin all characters of \"bc\" \n");
175    }
176
177    /* [] */
178    set2 = uset_open(1, 1);
179    uset_clear(set2);
180
181    /* space */
182    uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec);
183    expect(set2, " ", "abcdefghi{bc}", NULL);
184
185    /* [a-c] */
186    uset_set(set2, 0x0061, 0x0063);
187    /* [g-i] */
188    uset_set(set, 0x0067, 0x0069);
189
190    /* [a-c g-i] */
191    if (uset_containsSome(set, set2)) {
192        log_err("set should not contain some of set2 yet\n");
193    }
194    uset_complementAll(set, set2);
195    if (!uset_containsSome(set, set2)) {
196        log_err("set should contain some of set2\n");
197    }
198    expect(set, "abcghi", "def{bc}", NULL);
199
200    /* [g-i] */
201    uset_removeAll(set, set2);
202    expect(set, "ghi", "abcdef{bc}", NULL);
203
204    /* [a-c g-i] */
205    uset_addAll(set2, set);
206    expect(set2, "abcghi", "def{bc}", NULL);
207
208    /* [g-i] */
209    uset_retainAll(set2, set);
210    expect(set2, "ghi", "abcdef{bc}", NULL);
211
212    uset_close(set);
213    uset_close(set2);
214}
215
216/*------------------------------------------------------------------
217 * Support
218 *------------------------------------------------------------------*/
219
220/**
221 * Verifies that the given set contains the characters and strings in
222 * inList, and does not contain those in outList.  Also verifies that
223 * 'set' is not NULL and that 'ec' succeeds.
224 * @param set the set to test, or NULL (on error)
225 * @param inList list of set contents, in iteration order.  Format is
226 * list of individual strings, in iteration order, followed by sorted
227 * list of strings, delimited by {}.  This means we do not test
228 * characters '{' or '}' and we do not test strings containing those
229 * characters either.
230 * @param outList list of things not in the set.  Same format as
231 * inList.
232 * @param ec an error code, checked for success.  May be NULL in which
233 * case it is ignored.
234 */
235static void expect(const USet* set,
236                   const char* inList,
237                   const char* outList,
238                   UErrorCode* ec) {
239    if (ec!=NULL && U_FAILURE(*ec)) {
240        log_err("FAIL: %s\n", u_errorName(*ec));
241        return;
242    }
243    if (set == NULL) {
244        log_err("FAIL: USet is NULL\n");
245        return;
246    }
247    expectContainment(set, inList, TRUE);
248    expectContainment(set, outList, FALSE);
249    expectItems(set, inList);
250}
251
252static void expectContainment(const USet* set,
253                              const char* list,
254                              UBool isIn) {
255    const char* p = list;
256    UChar ustr[4096];
257    char *pat;
258    UErrorCode ec;
259    int32_t rangeStart = -1, rangeEnd = -1, length;
260
261    ec = U_ZERO_ERROR;
262    length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
263    if(U_FAILURE(ec)) {
264        log_err("FAIL: uset_toPattern() fails in expectContainment() - %s\n", u_errorName(ec));
265        return;
266    }
267    pat=aescstrdup(ustr, length);
268
269    while (*p) {
270        if (*p=='{') {
271            const char* stringStart = ++p;
272            int32_t stringLength = 0;
273            char strCopy[64];
274
275            while (*p++ != '}') {
276            }
277            stringLength = (int32_t)(p - stringStart - 1);
278            strncpy(strCopy, stringStart, stringLength);
279            strCopy[stringLength] = 0;
280
281            u_charsToUChars(stringStart, ustr, stringLength);
282
283            if (uset_containsString(set, ustr, stringLength) == isIn) {
284                log_verbose("Ok: %s %s \"%s\"\n", pat,
285                            (isIn ? "contains" : "does not contain"),
286                            strCopy);
287            } else {
288                log_data_err("FAIL: %s %s \"%s\" (Are you missing data?)\n", pat,
289                        (isIn ? "does not contain" : "contains"),
290                        strCopy);
291            }
292        }
293
294        else {
295            UChar32 c;
296
297            u_charsToUChars(p, ustr, 1);
298            c = ustr[0];
299
300            if (uset_contains(set, c) == isIn) {
301                log_verbose("Ok: %s %s '%c'\n", pat,
302                            (isIn ? "contains" : "does not contain"),
303                            *p);
304            } else {
305                log_data_err("FAIL: %s %s '%c' (Are you missing data?)\n", pat,
306                        (isIn ? "does not contain" : "contains"),
307                        *p);
308            }
309
310            /* Test the range API too by looking for ranges */
311            if (c == rangeEnd+1) {
312                rangeEnd = c;
313            } else {
314                if (rangeStart >= 0) {
315                    if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) {
316                        log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat,
317                                    (isIn ? "contains" : "does not contain"),
318                                    rangeStart, rangeEnd);
319                    } else {
320                        log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat,
321                                (isIn ? "does not contain" : "contains"),
322                                rangeStart, rangeEnd);
323                    }
324                }
325                rangeStart = rangeEnd = c;
326            }
327
328            ++p;
329        }
330    }
331
332    if (rangeStart >= 0) {
333        if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) {
334            log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat,
335                        (isIn ? "contains" : "does not contain"),
336                        rangeStart, rangeEnd);
337        } else {
338            log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat,
339                    (isIn ? "does not contain" : "contains"),
340                    rangeStart, rangeEnd);
341        }
342    }
343}
344
345/* This only works for invariant BMP chars */
346static char oneUCharToChar(UChar32 c) {
347    UChar ubuf[1];
348    char buf[1];
349    ubuf[0] = (UChar) c;
350    u_UCharsToChars(ubuf, buf, 1);
351    return buf[0];
352}
353
354static void expectItems(const USet* set,
355                        const char* items) {
356    const char* p = items;
357    UChar ustr[4096], itemStr[4096];
358    char buf[4096];
359    char *pat;
360    UErrorCode ec;
361    int32_t expectedSize = 0;
362    int32_t itemCount = uset_getItemCount(set);
363    int32_t itemIndex = 0;
364    UChar32 start = 1, end = 0;
365    int32_t itemLen = 0, length;
366
367    ec = U_ZERO_ERROR;
368    length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
369    if (U_FAILURE(ec)) {
370        log_err("FAIL: uset_toPattern => %s\n", u_errorName(ec));
371        return;
372    }
373    pat=aescstrdup(ustr, length);
374
375    if (uset_isEmpty(set) != (strlen(items)==0)) {
376        log_data_err("FAIL: %s should return %s from isEmpty (Are you missing data?)\n",
377                pat,
378                strlen(items)==0 ? "TRUE" : "FALSE");
379    }
380
381    /* Don't test patterns starting with "[^" */
382    if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) {
383        return;
384    }
385
386    while (*p) {
387
388        ++expectedSize;
389
390        if (start > end || start == -1) {
391            /* Fetch our next item */
392            if (itemIndex >= itemCount) {
393                log_data_err("FAIL: ran out of items iterating %s (Are you missing data?)\n", pat);
394                return;
395            }
396
397            itemLen = uset_getItem(set, itemIndex, &start, &end,
398                                   itemStr, sizeof(itemStr), &ec);
399            if (U_FAILURE(ec) || itemLen < 0) {
400                log_err("FAIL: uset_getItem => %s\n", u_errorName(ec));
401                return;
402            }
403
404            if (itemLen == 0) {
405                log_verbose("Ok: %s item %d is %c-%c\n", pat,
406                            itemIndex, oneUCharToChar(start),
407                            oneUCharToChar(end));
408            } else {
409                itemStr[itemLen] = 0;
410                u_UCharsToChars(itemStr, buf, itemLen+1);
411                log_verbose("Ok: %s item %d is \"%s\"\n", pat, itemIndex, buf);
412            }
413
414            ++itemIndex;
415        }
416
417        if (*p=='{') {
418            const char* stringStart = ++p;
419            int32_t stringLength = 0;
420            char strCopy[64];
421
422            while (*p++ != '}') {
423            }
424            stringLength = (int32_t)(p - stringStart - 1);
425            strncpy(strCopy, stringStart, stringLength);
426            strCopy[stringLength] = 0;
427
428            u_charsToUChars(stringStart, ustr, stringLength);
429            ustr[stringLength] = 0;
430
431            if (itemLen == 0) {
432                log_err("FAIL: for %s expect \"%s\" next, but got a char\n",
433                        pat, strCopy);
434                return;
435            }
436
437            if (u_strcmp(ustr, itemStr) != 0) {
438                log_err("FAIL: for %s expect \"%s\" next\n",
439                        pat, strCopy);
440                return;
441            }
442        }
443
444        else {
445            UChar32 c;
446
447            u_charsToUChars(p, ustr, 1);
448            c = ustr[0];
449
450            if (itemLen != 0) {
451                log_err("FAIL: for %s expect '%c' next, but got a string\n",
452                        pat, *p);
453                return;
454            }
455
456            if (c != start++) {
457                log_err("FAIL: for %s expect '%c' next\n",
458                        pat, *p);
459                return;
460            }
461
462            ++p;
463        }
464    }
465
466    if (uset_size(set) == expectedSize) {
467        log_verbose("Ok: %s size is %d\n", pat, expectedSize);
468    } else {
469        log_err("FAIL: %s size is %d, expected %d\n",
470                pat, uset_size(set), expectedSize);
471    }
472}
473
474static void
475TestSerialized() {
476    uint16_t buffer[1000];
477    USerializedSet sset;
478    USet *set;
479    UErrorCode errorCode;
480    UChar32 c;
481    int32_t length;
482
483    /* use a pattern that generates both BMP and supplementary code points */
484    U_STRING_DECL(pattern, "[:Cf:]", 6);
485    U_STRING_INIT(pattern, "[:Cf:]", 6);
486
487    errorCode=U_ZERO_ERROR;
488    set=uset_openPattern(pattern, -1, &errorCode);
489    if(U_FAILURE(errorCode)) {
490        log_data_err("uset_openPattern([:Cf:]) failed - %s (Are you missing data?)\n", u_errorName(errorCode));
491        return;
492    }
493
494    length=uset_serialize(set, buffer, UPRV_LENGTHOF(buffer), &errorCode);
495    if(U_FAILURE(errorCode)) {
496        log_err("unable to uset_serialize([:Cf:]) - %s\n", u_errorName(errorCode));
497        uset_close(set);
498        return;
499    }
500
501    uset_getSerializedSet(&sset, buffer, length);
502    for(c=0; c<=0x10ffff; ++c) {
503        if(uset_contains(set, c)!=uset_serializedContains(&sset, c)) {
504            log_err("uset_contains(U+%04x)!=uset_serializedContains(U+%04x)\n", c);
505            break;
506        }
507    }
508
509    uset_close(set);
510}
511
512/**
513 * Make sure that when non-invariant chars are passed to uset_openPattern
514 * they do not cause an ugly failure mode (e.g. assertion failure).
515 * JB#3795.
516 */
517static void
518TestNonInvariantPattern() {
519    UErrorCode ec = U_ZERO_ERROR;
520    /* The critical part of this test is that the following pattern
521       must contain a non-invariant character. */
522    static const char *pattern = "[:ccc!=0:]";
523    UChar buf[256];
524    int32_t len = u_unescape(pattern, buf, 256);
525    /* This test 'fails' by having an assertion failure within the
526       following call.  It passes by running to completion with no
527       assertion failure. */
528    USet *set = uset_openPattern(buf, len, &ec);
529    uset_close(set);
530}
531
532static void TestBadPattern(void) {
533    UErrorCode status = U_ZERO_ERROR;
534    USet *pat;
535    U_STRING_DECL(pattern, "[", 1);
536    U_STRING_INIT(pattern, "[", 1);
537    pat = uset_openPatternOptions(pattern, u_strlen(pattern), 0, &status);
538    if (pat != NULL || U_SUCCESS(status)) {
539        log_err("uset_openPatternOptions did not fail as expected %s\n", u_errorName(status));
540    }
541}
542
543static USet *openIDSet() {
544    UErrorCode errorCode = U_ZERO_ERROR;
545    U_STRING_DECL(pattern, "[:ID_Continue:]", 15);
546    U_STRING_INIT(pattern, "[:ID_Continue:]", 15);
547    return uset_openPattern(pattern, 15, &errorCode);
548}
549
550static void TestFreezable() {
551    USet *idSet;
552    USet *frozen;
553    USet *thawed;
554
555    idSet=openIDSet();
556
557    if (idSet == NULL) {
558        log_data_err("openIDSet() returned NULL. (Are you missing data?)\n");
559        uset_close(idSet);
560        return;
561    }
562
563    frozen=uset_clone(idSet);
564
565    if (frozen == NULL) {
566        log_err("uset_Clone() returned NULL\n");
567        return;
568    }
569
570    if(!uset_equals(frozen, idSet)) {
571        log_err("uset_clone() did not make an equal copy\n");
572    }
573
574    uset_freeze(frozen);
575    uset_addRange(frozen, 0xd802, 0xd805);
576
577    if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) {
578        log_err("uset_freeze() or uset_isFrozen() does not work\n");
579    }
580
581    thawed=uset_cloneAsThawed(frozen);
582
583    if (thawed == NULL) {
584        log_err("uset_cloneAsThawed(frozen) returned NULL");
585        uset_close(frozen);
586        uset_close(idSet);
587        return;
588    }
589
590    uset_addRange(thawed, 0xd802, 0xd805);
591
592    if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) {
593        log_err("uset_cloneAsThawed() does not work\n");
594    }
595
596    uset_close(idSet);
597    uset_close(frozen);
598    uset_close(thawed);
599}
600
601static void TestSpan() {
602    static const UChar s16[2]={ 0xe01, 0x3000 };
603    static const char* s8="\xE0\xB8\x81\xE3\x80\x80";
604
605    USet *idSet=openIDSet();
606
607    if (idSet == NULL) {
608        log_data_err("openIDSet() returned NULL (Are you missing data?)\n");
609        return;
610    }
611
612    if(
613        1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
614        0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
615        2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
616        1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
617    ) {
618        log_err("uset_span() or uset_spanBack() does not work\n");
619    }
620
621    if(
622        3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
623        0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
624        6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
625        3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
626    ) {
627        log_err("uset_spanUTF8() or uset_spanBackUTF8() does not work\n");
628    }
629
630    uset_freeze(idSet);
631
632    if(
633        1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
634        0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
635        2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
636        1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
637    ) {
638        log_err("uset_span(frozen) or uset_spanBack(frozen) does not work\n");
639    }
640
641    if(
642        3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
643        0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
644        6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
645        3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
646    ) {
647        log_err("uset_spanUTF8(frozen) or uset_spanBackUTF8(frozen) does not work\n");
648    }
649
650    uset_close(idSet);
651}
652
653/*eof*/
654