1/*
2**********************************************************************
3* Copyright (c) 2002-2009, International Business Machines
4* Corporation and others.  All Rights Reserved.
5**********************************************************************
6*/
7#include "unicode/uset.h"
8#include "unicode/ustring.h"
9#include "cintltst.h"
10#include <stdlib.h>
11#include <string.h>
12
/* Number of elements in a true array (not a pointer), as an int32_t.
   The whole expansion is parenthesized so that it stays a single operand
   when combined with other operators (e.g. a leading sizeof or unary op). */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))

/* Passes test function x to addTest() under the name "uset/<x>".
   Requires a variable named 'root' in the calling scope. */
#define TEST(x) addTest(root, &x, "uset/" # x)
16
17static void TestAPI(void);
18static void Testj2269(void);
19static void TestSerialized(void);
20static void TestNonInvariantPattern(void);
21static void TestBadPattern(void);
22static void TestFreezable(void);
23static void TestSpan(void);
24
25void addUSetTest(TestNode** root);
26
27static void expect(const USet* set,
28                   const char* inList,
29                   const char* outList,
30                   UErrorCode* ec);
31static void expectContainment(const USet* set,
32                              const char* list,
33                              UBool isIn);
34static char oneUCharToChar(UChar32 c);
35static void expectItems(const USet* set,
36                        const char* items);
37
38void
39addUSetTest(TestNode** root) {
40    TEST(TestAPI);
41    TEST(Testj2269);
42    TEST(TestSerialized);
43    TEST(TestNonInvariantPattern);
44    TEST(TestBadPattern);
45    TEST(TestFreezable);
46    TEST(TestSpan);
47}
48
49/*------------------------------------------------------------------
50 * Tests
51 *------------------------------------------------------------------*/
52
53static void Testj2269() {
54  UErrorCode status = U_ZERO_ERROR;
55  UChar a[4] = { 0x61, 0x62, 0x63, 0 };
56  USet *s = uset_open(1, 0);
57  uset_addString(s, a, 3);
58  a[0] = 0x63; a[1] = 0x63;
59  expect(s, "{abc}", "{ccc}", &status);
60  uset_close(s);
61}
62
63static const UChar PAT[] = {91,97,45,99,123,97,98,125,93,0}; /* "[a-c{ab}]" */
64static const int32_t PAT_LEN = (sizeof(PAT) / sizeof(PAT[0])) - 1;
65
66static const UChar PAT_lb[] = {0x6C, 0x62, 0}; /* "lb" */
67static const int32_t PAT_lb_LEN = (sizeof(PAT_lb) / sizeof(PAT_lb[0])) - 1;
68
69static const UChar VAL_SP[] = {0x53, 0x50, 0}; /* "SP" */
70static const int32_t VAL_SP_LEN = (sizeof(VAL_SP) / sizeof(VAL_SP[0])) - 1;
71
72static const UChar STR_bc[] = {98,99,0}; /* "bc" */
73static const int32_t STR_bc_LEN = (sizeof(STR_bc) / sizeof(STR_bc[0])) - 1;
74
75static const UChar STR_ab[] = {97,98,0}; /* "ab" */
76static const int32_t STR_ab_LEN = (sizeof(STR_ab) / sizeof(STR_ab[0])) - 1;
77
78/**
79 * Basic API test for uset.x
80 */
81static void TestAPI() {
82    USet* set;
83    USet* set2;
84    UErrorCode ec;
85
86    /* [] */
87    set = uset_openEmpty();
88    expect(set, "", "abc{ab}", NULL);
89    uset_close(set);
90
91    set = uset_open(1, 0);
92    expect(set, "", "abc{ab}", NULL);
93    uset_close(set);
94
95    set = uset_open(1, 1);
96    uset_clear(set);
97    expect(set, "", "abc{ab}", NULL);
98    uset_close(set);
99
100    /* [ABC] */
101    set = uset_open(0x0041, 0x0043);
102    expect(set, "ABC", "DEF{ab}", NULL);
103    uset_close(set);
104
105    /* [a-c{ab}] */
106    ec = U_ZERO_ERROR;
107    set = uset_openPattern(PAT, PAT_LEN, &ec);
108    if(U_FAILURE(ec)) {
109        log_err("uset_openPattern([a-c{ab}]) failed - %s\n", u_errorName(ec));
110        return;
111    }
112    if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
113        log_err("uset_resemblesPattern of PAT failed\n");
114    }
115    expect(set, "abc{ab}", "def{bc}", &ec);
116
117    /* [a-d{ab}] */
118    uset_add(set, 0x64);
119    expect(set, "abcd{ab}", "ef{bc}", NULL);
120
121    /* [acd{ab}{bc}] */
122    uset_remove(set, 0x62);
123    uset_addString(set, STR_bc, STR_bc_LEN);
124    expect(set, "acd{ab}{bc}", "bef{cd}", NULL);
125
126    /* [acd{bc}] */
127    uset_removeString(set, STR_ab, STR_ab_LEN);
128    expect(set, "acd{bc}", "bfg{ab}", NULL);
129
130    /* [^acd{bc}] */
131    uset_complement(set);
132    expect(set, "bef{bc}", "acd{ac}", NULL);
133
134    /* [a-e{bc}] */
135    uset_complement(set);
136    uset_addRange(set, 0x0062, 0x0065);
137    expect(set, "abcde{bc}", "fg{ab}", NULL);
138
139    /* [de{bc}] */
140    uset_removeRange(set, 0x0050, 0x0063);
141    expect(set, "de{bc}", "bcfg{ab}", NULL);
142
143    /* [g-l] */
144    uset_set(set, 0x0067, 0x006C);
145    expect(set, "ghijkl", "de{bc}", NULL);
146
147    if (uset_indexOf(set, 0x0067) != 0) {
148        log_err("uset_indexOf failed finding correct index of 'g'\n");
149    }
150
151    if (uset_charAt(set, 0) != 0x0067) {
152        log_err("uset_charAt failed finding correct char 'g' at index 0\n");
153    }
154
155    /* How to test this one...? */
156    uset_compact(set);
157
158    /* [g-i] */
159    uset_retain(set, 0x0067, 0x0069);
160    expect(set, "ghi", "dejkl{bc}", NULL);
161
162    /* UCHAR_ASCII_HEX_DIGIT */
163    uset_applyIntPropertyValue(set, UCHAR_ASCII_HEX_DIGIT, 1, &ec);
164    if(U_FAILURE(ec)) {
165        log_err("uset_applyIntPropertyValue([UCHAR_ASCII_HEX_DIGIT]) failed - %s\n", u_errorName(ec));
166        return;
167    }
168    expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);
169
170    /* [ab] */
171    uset_clear(set);
172    uset_addAllCodePoints(set, STR_ab, STR_ab_LEN);
173    expect(set, "ab", "def{ab}", NULL);
174    if (uset_containsAllCodePoints(set, STR_bc, STR_bc_LEN)){
175        log_err("set should not conatin all characters of \"bc\" \n");
176    }
177
178    /* [] */
179    set2 = uset_open(1, 1);
180    uset_clear(set2);
181
182    /* space */
183    uset_applyPropertyAlias(set2, PAT_lb, PAT_lb_LEN, VAL_SP, VAL_SP_LEN, &ec);
184    expect(set2, " ", "abcdefghi{bc}", NULL);
185
186    /* [a-c] */
187    uset_set(set2, 0x0061, 0x0063);
188    /* [g-i] */
189    uset_set(set, 0x0067, 0x0069);
190
191    /* [a-c g-i] */
192    if (uset_containsSome(set, set2)) {
193        log_err("set should not contain some of set2 yet\n");
194    }
195    uset_complementAll(set, set2);
196    if (!uset_containsSome(set, set2)) {
197        log_err("set should contain some of set2\n");
198    }
199    expect(set, "abcghi", "def{bc}", NULL);
200
201    /* [g-i] */
202    uset_removeAll(set, set2);
203    expect(set, "ghi", "abcdef{bc}", NULL);
204
205    /* [a-c g-i] */
206    uset_addAll(set2, set);
207    expect(set2, "abcghi", "def{bc}", NULL);
208
209    /* [g-i] */
210    uset_retainAll(set2, set);
211    expect(set2, "ghi", "abcdef{bc}", NULL);
212
213    uset_close(set);
214    uset_close(set2);
215}
216
217/*------------------------------------------------------------------
218 * Support
219 *------------------------------------------------------------------*/
220
221/**
222 * Verifies that the given set contains the characters and strings in
223 * inList, and does not contain those in outList.  Also verifies that
224 * 'set' is not NULL and that 'ec' succeeds.
225 * @param set the set to test, or NULL (on error)
226 * @param inList list of set contents, in iteration order.  Format is
227 * list of individual strings, in iteration order, followed by sorted
228 * list of strings, delimited by {}.  This means we do not test
229 * characters '{' or '}' and we do not test strings containing those
230 * characters either.
231 * @param outList list of things not in the set.  Same format as
232 * inList.
233 * @param ec an error code, checked for success.  May be NULL in which
234 * case it is ignored.
235 */
236static void expect(const USet* set,
237                   const char* inList,
238                   const char* outList,
239                   UErrorCode* ec) {
240    if (ec!=NULL && U_FAILURE(*ec)) {
241        log_err("FAIL: %s\n", u_errorName(*ec));
242        return;
243    }
244    if (set == NULL) {
245        log_err("FAIL: USet is NULL\n");
246        return;
247    }
248    expectContainment(set, inList, TRUE);
249    expectContainment(set, outList, FALSE);
250    expectItems(set, inList);
251}
252
253static void expectContainment(const USet* set,
254                              const char* list,
255                              UBool isIn) {
256    const char* p = list;
257    UChar ustr[4096];
258    char *pat;
259    UErrorCode ec;
260    int32_t rangeStart = -1, rangeEnd = -1, length;
261
262    ec = U_ZERO_ERROR;
263    length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
264    if(U_FAILURE(ec)) {
265        log_err("FAIL: uset_toPattern() fails in expectContainment() - %s\n", u_errorName(ec));
266        return;
267    }
268    pat=aescstrdup(ustr, length);
269
270    while (*p) {
271        if (*p=='{') {
272            const char* stringStart = ++p;
273            int32_t stringLength = 0;
274            char strCopy[64];
275
276            while (*p++ != '}') {
277            }
278            stringLength = (int32_t)(p - stringStart - 1);
279            strncpy(strCopy, stringStart, stringLength);
280            strCopy[stringLength] = 0;
281
282            u_charsToUChars(stringStart, ustr, stringLength);
283
284            if (uset_containsString(set, ustr, stringLength) == isIn) {
285                log_verbose("Ok: %s %s \"%s\"\n", pat,
286                            (isIn ? "contains" : "does not contain"),
287                            strCopy);
288            } else {
289                log_data_err("FAIL: %s %s \"%s\" (Are you missing data?)\n", pat,
290                        (isIn ? "does not contain" : "contains"),
291                        strCopy);
292            }
293        }
294
295        else {
296            UChar32 c;
297
298            u_charsToUChars(p, ustr, 1);
299            c = ustr[0];
300
301            if (uset_contains(set, c) == isIn) {
302                log_verbose("Ok: %s %s '%c'\n", pat,
303                            (isIn ? "contains" : "does not contain"),
304                            *p);
305            } else {
306                log_data_err("FAIL: %s %s '%c' (Are you missing data?)\n", pat,
307                        (isIn ? "does not contain" : "contains"),
308                        *p);
309            }
310
311            /* Test the range API too by looking for ranges */
312            if (c == rangeEnd+1) {
313                rangeEnd = c;
314            } else {
315                if (rangeStart >= 0) {
316                    if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) {
317                        log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat,
318                                    (isIn ? "contains" : "does not contain"),
319                                    rangeStart, rangeEnd);
320                    } else {
321                        log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat,
322                                (isIn ? "does not contain" : "contains"),
323                                rangeStart, rangeEnd);
324                    }
325                }
326                rangeStart = rangeEnd = c;
327            }
328
329            ++p;
330        }
331    }
332
333    if (rangeStart >= 0) {
334        if (uset_containsRange(set, rangeStart, rangeEnd) == isIn) {
335            log_verbose("Ok: %s %s U+%04X-U+%04X\n", pat,
336                        (isIn ? "contains" : "does not contain"),
337                        rangeStart, rangeEnd);
338        } else {
339            log_data_err("FAIL: %s %s U+%04X-U+%04X (Are you missing data?)\n", pat,
340                    (isIn ? "does not contain" : "contains"),
341                    rangeStart, rangeEnd);
342        }
343    }
344}
345
346/* This only works for invariant BMP chars */
347static char oneUCharToChar(UChar32 c) {
348    UChar ubuf[1];
349    char buf[1];
350    ubuf[0] = (UChar) c;
351    u_UCharsToChars(ubuf, buf, 1);
352    return buf[0];
353}
354
355static void expectItems(const USet* set,
356                        const char* items) {
357    const char* p = items;
358    UChar ustr[4096], itemStr[4096];
359    char buf[4096];
360    char *pat;
361    UErrorCode ec;
362    int32_t expectedSize = 0;
363    int32_t itemCount = uset_getItemCount(set);
364    int32_t itemIndex = 0;
365    UChar32 start = 1, end = 0;
366    int32_t itemLen = 0, length;
367
368    ec = U_ZERO_ERROR;
369    length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
370    if (U_FAILURE(ec)) {
371        log_err("FAIL: uset_toPattern => %s\n", u_errorName(ec));
372        return;
373    }
374    pat=aescstrdup(ustr, length);
375
376    if (uset_isEmpty(set) != (strlen(items)==0)) {
377        log_data_err("FAIL: %s should return %s from isEmpty (Are you missing data?)\n",
378                pat,
379                strlen(items)==0 ? "TRUE" : "FALSE");
380    }
381
382    /* Don't test patterns starting with "[^" */
383    if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) {
384        return;
385    }
386
387    while (*p) {
388
389        ++expectedSize;
390
391        if (start > end || start == -1) {
392            /* Fetch our next item */
393            if (itemIndex >= itemCount) {
394                log_data_err("FAIL: ran out of items iterating %s (Are you missing data?)\n", pat);
395                return;
396            }
397
398            itemLen = uset_getItem(set, itemIndex, &start, &end,
399                                   itemStr, sizeof(itemStr), &ec);
400            if (U_FAILURE(ec) || itemLen < 0) {
401                log_err("FAIL: uset_getItem => %s\n", u_errorName(ec));
402                return;
403            }
404
405            if (itemLen == 0) {
406                log_verbose("Ok: %s item %d is %c-%c\n", pat,
407                            itemIndex, oneUCharToChar(start),
408                            oneUCharToChar(end));
409            } else {
410                itemStr[itemLen] = 0;
411                u_UCharsToChars(itemStr, buf, itemLen+1);
412                log_verbose("Ok: %s item %d is \"%s\"\n", pat, itemIndex, buf);
413            }
414
415            ++itemIndex;
416        }
417
418        if (*p=='{') {
419            const char* stringStart = ++p;
420            int32_t stringLength = 0;
421            char strCopy[64];
422
423            while (*p++ != '}') {
424            }
425            stringLength = (int32_t)(p - stringStart - 1);
426            strncpy(strCopy, stringStart, stringLength);
427            strCopy[stringLength] = 0;
428
429            u_charsToUChars(stringStart, ustr, stringLength);
430            ustr[stringLength] = 0;
431
432            if (itemLen == 0) {
433                log_err("FAIL: for %s expect \"%s\" next, but got a char\n",
434                        pat, strCopy);
435                return;
436            }
437
438            if (u_strcmp(ustr, itemStr) != 0) {
439                log_err("FAIL: for %s expect \"%s\" next\n",
440                        pat, strCopy);
441                return;
442            }
443        }
444
445        else {
446            UChar32 c;
447
448            u_charsToUChars(p, ustr, 1);
449            c = ustr[0];
450
451            if (itemLen != 0) {
452                log_err("FAIL: for %s expect '%c' next, but got a string\n",
453                        pat, *p);
454                return;
455            }
456
457            if (c != start++) {
458                log_err("FAIL: for %s expect '%c' next\n",
459                        pat, *p);
460                return;
461            }
462
463            ++p;
464        }
465    }
466
467    if (uset_size(set) == expectedSize) {
468        log_verbose("Ok: %s size is %d\n", pat, expectedSize);
469    } else {
470        log_err("FAIL: %s size is %d, expected %d\n",
471                pat, uset_size(set), expectedSize);
472    }
473}
474
475static void
476TestSerialized() {
477    uint16_t buffer[1000];
478    USerializedSet sset;
479    USet *set;
480    UErrorCode errorCode;
481    UChar32 c;
482    int32_t length;
483
484    /* use a pattern that generates both BMP and supplementary code points */
485    U_STRING_DECL(pattern, "[:Cf:]", 6);
486    U_STRING_INIT(pattern, "[:Cf:]", 6);
487
488    errorCode=U_ZERO_ERROR;
489    set=uset_openPattern(pattern, -1, &errorCode);
490    if(U_FAILURE(errorCode)) {
491        log_data_err("uset_openPattern([:Cf:]) failed - %s (Are you missing data?)\n", u_errorName(errorCode));
492        return;
493    }
494
495    length=uset_serialize(set, buffer, LENGTHOF(buffer), &errorCode);
496    if(U_FAILURE(errorCode)) {
497        log_err("unable to uset_serialize([:Cf:]) - %s\n", u_errorName(errorCode));
498        uset_close(set);
499        return;
500    }
501
502    uset_getSerializedSet(&sset, buffer, length);
503    for(c=0; c<=0x10ffff; ++c) {
504        if(uset_contains(set, c)!=uset_serializedContains(&sset, c)) {
505            log_err("uset_contains(U+%04x)!=uset_serializedContains(U+%04x)\n", c);
506            break;
507        }
508    }
509
510    uset_close(set);
511}
512
513/**
514 * Make sure that when non-invariant chars are passed to uset_openPattern
515 * they do not cause an ugly failure mode (e.g. assertion failure).
516 * JB#3795.
517 */
518static void
519TestNonInvariantPattern() {
520    UErrorCode ec = U_ZERO_ERROR;
521    /* The critical part of this test is that the following pattern
522       must contain a non-invariant character. */
523    static const char *pattern = "[:ccc!=0:]";
524    UChar buf[256];
525    int32_t len = u_unescape(pattern, buf, 256);
526    /* This test 'fails' by having an assertion failure within the
527       following call.  It passes by running to completion with no
528       assertion failure. */
529    USet *set = uset_openPattern(buf, len, &ec);
530    uset_close(set);
531}
532
533static void TestBadPattern(void) {
534    UErrorCode status = U_ZERO_ERROR;
535    USet *pat;
536    U_STRING_DECL(pattern, "[", 1);
537    U_STRING_INIT(pattern, "[", 1);
538    pat = uset_openPatternOptions(pattern, u_strlen(pattern), 0, &status);
539    if (pat != NULL || U_SUCCESS(status)) {
540        log_err("uset_openPatternOptions did not fail as expected %s\n", u_errorName(status));
541    }
542}
543
544static USet *openIDSet() {
545    UErrorCode errorCode = U_ZERO_ERROR;
546    U_STRING_DECL(pattern, "[:ID_Continue:]", 15);
547    U_STRING_INIT(pattern, "[:ID_Continue:]", 15);
548    return uset_openPattern(pattern, 15, &errorCode);
549}
550
551static void TestFreezable() {
552    USet *idSet;
553    USet *frozen;
554    USet *thawed;
555
556    idSet=openIDSet();
557
558    if (idSet == NULL) {
559        log_data_err("openIDSet() returned NULL. (Are you missing data?)\n");
560        uset_close(idSet);
561        return;
562    }
563
564    frozen=uset_clone(idSet);
565
566    if (frozen == NULL) {
567        log_err("uset_Clone() returned NULL\n");
568        return;
569    }
570
571    if(!uset_equals(frozen, idSet)) {
572        log_err("uset_clone() did not make an equal copy\n");
573    }
574
575    uset_freeze(frozen);
576    uset_addRange(frozen, 0xd802, 0xd805);
577
578    if(uset_isFrozen(idSet) || !uset_isFrozen(frozen) || !uset_equals(frozen, idSet)) {
579        log_err("uset_freeze() or uset_isFrozen() does not work\n");
580    }
581
582    thawed=uset_cloneAsThawed(frozen);
583
584    if (thawed == NULL) {
585        log_err("uset_cloneAsThawed(frozen) returned NULL");
586        uset_close(frozen);
587        uset_close(idSet);
588        return;
589    }
590
591    uset_addRange(thawed, 0xd802, 0xd805);
592
593    if(uset_isFrozen(thawed) || uset_equals(thawed, idSet) || !uset_containsRange(thawed, 0xd802, 0xd805)) {
594        log_err("uset_cloneAsThawed() does not work\n");
595    }
596
597    uset_close(idSet);
598    uset_close(frozen);
599    uset_close(thawed);
600}
601
602static void TestSpan() {
603    static const UChar s16[2]={ 0xe01, 0x3000 };
604    static const char* s8="\xE0\xB8\x81\xE3\x80\x80";
605
606    USet *idSet=openIDSet();
607
608    if (idSet == NULL) {
609        log_data_err("openIDSet() returned NULL (Are you missing data?)\n");
610        return;
611    }
612
613    if(
614        1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
615        0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
616        2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
617        1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
618    ) {
619        log_err("uset_span() or uset_spanBack() does not work\n");
620    }
621
622    if(
623        3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
624        0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
625        6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
626        3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
627    ) {
628        log_err("uset_spanUTF8() or uset_spanBackUTF8() does not work\n");
629    }
630
631    uset_freeze(idSet);
632
633    if(
634        1!=uset_span(idSet, s16, 2, USET_SPAN_CONTAINED) ||
635        0!=uset_span(idSet, s16, 2, USET_SPAN_NOT_CONTAINED) ||
636        2!=uset_spanBack(idSet, s16, 2, USET_SPAN_CONTAINED) ||
637        1!=uset_spanBack(idSet, s16, 2, USET_SPAN_NOT_CONTAINED)
638    ) {
639        log_err("uset_span(frozen) or uset_spanBack(frozen) does not work\n");
640    }
641
642    if(
643        3!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
644        0!=uset_spanUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED) ||
645        6!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_CONTAINED) ||
646        3!=uset_spanBackUTF8(idSet, s8, 6, USET_SPAN_NOT_CONTAINED)
647    ) {
648        log_err("uset_spanUTF8(frozen) or uset_spanBackUTF8(frozen) does not work\n");
649    }
650
651    uset_close(idSet);
652}
653
654/*eof*/
655