1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 * Copyright (C) 2009-2014, International Business Machines Corporation and
7 * others. All Rights Reserved.
8 *******************************************************************************
9 */
10package android.icu.dev.test.lang;
11
12import java.util.Collection;
13
14import org.junit.Test;
15
16import android.icu.dev.test.TestFmwk;
17import android.icu.impl.Utility;
18import android.icu.text.UTF16;
19import android.icu.text.UnicodeSet;
20import android.icu.text.UnicodeSet.SpanCondition;
21import android.icu.util.OutputInt;
22
23/**
24 * @test
25 * @summary General test of UnicodeSet string span.
26 */
27public class UnicodeSetStringSpanTest extends TestFmwk {
28    // Simple test first, easier to debug.
29    @Test
30    public void TestSimpleStringSpan() {
31        String pattern = "[a{ab}{bc}]";
32        String string = "abc";
33        UnicodeSet set = new UnicodeSet(pattern);
34        set.complement();
35        int pos = set.spanBack(string, 3, SpanCondition.SIMPLE);
36        if (pos != 1) {
37            errln(String.format("FAIL: UnicodeSet(%s).spanBack(%s) returns the wrong value pos %d (!= 1)",
38                    set.toString(), string, pos));
39        }
40        pos = set.span(string, SpanCondition.SIMPLE);
41        if (pos != 3) {
42            errln(String.format("FAIL: UnicodeSet(%s).span(%s) returns the wrong value pos %d (!= 3)",
43                    set.toString(), string, pos));
44        }
45        pos = set.span(string, 1, SpanCondition.SIMPLE);
46        if (pos != 3) {
47            errln(String.format("FAIL: UnicodeSet(%s).span(%s, 1) returns the wrong value pos %d (!= 3)",
48                    set.toString(), string, pos));
49        }
50    }
51
52    // test our slow implementation
53    @Test
54    public void TestSimpleStringSpanSlow() {
55        String pattern = "[a{ab}{bc}]";
56        String string = "abc";
57        UnicodeSet uset = new UnicodeSet(pattern);
58        uset.complement();
59        UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
60
61        int length = containsSpanBackUTF16(set, string, 3, SpanCondition.SIMPLE);
62        if (length != 1) {
63            errln(String.format("FAIL: UnicodeSet(%s) containsSpanBackUTF16(%s) returns the wrong value length %d (!= 1)",
64                    set.toString(), string, length));
65        }
66        length = containsSpanUTF16(set, string, SpanCondition.SIMPLE);
67        if (length != 3) {
68            errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 3)",
69                    set.toString(), string, length));
70        }
71        length = containsSpanUTF16(set, string.substring(1), SpanCondition.SIMPLE);
72        if (length != 2) {
73            errln(String.format("FAIL: UnicodeSet(%s) containsSpanUTF16(%s) returns the wrong value length %d (!= 2)",
74                    set.toString(), string, length));
75        }
76    }
77
78    // Test select patterns and strings, and test SIMPLE.
79    @Test
80    public void TestSimpleStringSpanAndFreeze() {
81        String pattern = "[x{xy}{xya}{axy}{ax}]";
82        final String string = "xx"
83                + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx"
84                + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxya" + "xx"
85                + "xyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxyaxy" + "aaaa";
86
87        UnicodeSet set = new UnicodeSet(pattern);
88        if (set.containsAll(string)) {
89            errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + ") should be FALSE");
90        }
91
92        // Remove trailing "aaaa".
93        String string16 = string.substring(0, string.length() - 4);
94        if (!set.containsAll(string16)) {
95            errln("FAIL: UnicodeSet(" + pattern + ").containsAll(" + string + "[:-4]) should be TRUE");
96        }
97
98        String s16 = "byayaxya";
99        if (   set.span(s16.substring(0, 8), SpanCondition.NOT_CONTAINED) != 4
100            || set.span(s16.substring(0, 7), SpanCondition.NOT_CONTAINED) != 4
101            || set.span(s16.substring(0, 6), SpanCondition.NOT_CONTAINED) != 4
102            || set.span(s16.substring(0, 5), SpanCondition.NOT_CONTAINED) != 5
103            || set.span(s16.substring(0, 4), SpanCondition.NOT_CONTAINED) != 4
104            || set.span(s16.substring(0, 3), SpanCondition.NOT_CONTAINED) != 3) {
105            errln("FAIL: UnicodeSet(" + pattern + ").span(while not) returns the wrong value");
106        }
107
108        pattern = "[a{ab}{abc}{cd}]";
109        set.applyPattern(pattern);
110        s16 = "acdabcdabccd";
111        if (   set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12
112            || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6
113            || set.span(s16.substring(7),     SpanCondition.SIMPLE) != 5) {
114            errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value");
115        }
116        set.freeze();
117        if (   set.span(s16.substring(0, 12), SpanCondition.CONTAINED) != 12
118            || set.span(s16.substring(0, 12), SpanCondition.SIMPLE) != 6
119            || set.span(s16.substring(7),     SpanCondition.SIMPLE) != 5) {
120            errln("FAIL: UnicodeSet(" + pattern + ").span(while longest match) returns the wrong value");
121        }
122
123        pattern = "[d{cd}{bcd}{ab}]";
124        set = (UnicodeSet)set.cloneAsThawed();
125        set.applyPattern(pattern).freeze();
126        s16 = "abbcdabcdabd";
127        if (   set.spanBack(s16, 12, SpanCondition.CONTAINED) != 0
128            || set.spanBack(s16, 12, SpanCondition.SIMPLE) != 6
129            || set.spanBack(s16,  5, SpanCondition.SIMPLE) != 0) {
130            errln("FAIL: UnicodeSet(" + pattern + ").spanBack(while longest match) returns the wrong value");
131        }
132    }
133
134    // more complex test. --------------------------------------------------------
135
136    // Make the strings in a UnicodeSet easily accessible.
137    private static class UnicodeSetWithStrings {
138        private UnicodeSet set;
139        private Collection<String> setStrings;
140        private int stringsLength;
141
142        public UnicodeSetWithStrings(final UnicodeSet normalSet) {
143            set = normalSet;
144            setStrings = normalSet.strings();
145            stringsLength = setStrings.size();
146        }
147
148        public final UnicodeSet getSet() {
149            return set;
150        }
151
152        public boolean hasStrings() {
153            return (stringsLength > 0);
154        }
155
156        public Iterable<String> strings() {
157            return setStrings;
158        }
159    }
160
161    // Compare 16-bit Unicode strings (which may be malformed UTF-16)
162    // at code point boundaries.
163    // That is, each edge of a match must not be in the middle of a surrogate pair.
164    static boolean matches16CPB(final String s, int start, int limit, final String t) {
165        limit -= start;
166        int length = t.length();
167        return t.equals(s.substring(start, start + length))
168                && !(0 < start && UTF16.isLeadSurrogate (s.charAt(start - 1)) &&
169                                  UTF16.isTrailSurrogate(s.charAt(start)))
170                && !(length < limit && UTF16.isLeadSurrogate (s.charAt(start + length - 1)) &&
171                                       UTF16.isTrailSurrogate(s.charAt(start + length)));
172    }
173
174    // Implement span() with contains() for comparison.
175    static int containsSpanUTF16(final UnicodeSetWithStrings set, final String s,
176            SpanCondition spanCondition) {
177        final UnicodeSet realSet = set.getSet();
178        int length = s.length();
179        if (!set.hasStrings()) {
180            boolean spanContained = false;
181            if (spanCondition != SpanCondition.NOT_CONTAINED) {
182                spanContained = true; // Pin to 0/1 values.
183            }
184
185            int c;
186            int start = 0, prev;
187            while ((prev = start) < length) {
188                c = s.codePointAt(start);
189                start = s.offsetByCodePoints(start, 1);
190                if (realSet.contains(c) != spanContained) {
191                    break;
192                }
193            }
194            return prev;
195        } else if (spanCondition == SpanCondition.NOT_CONTAINED) {
196            int c;
197            int start, next;
198            for (start = next = 0; start < length;) {
199                c = s.codePointAt(next);
200                next = s.offsetByCodePoints(next, 1);
201                if (realSet.contains(c)) {
202                    break;
203                }
204                for (String str : set.strings()) {
205                    if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
206                        // spanNeedsStrings=true;
207                        return start;
208                    }
209                }
210                start = next;
211            }
212            return start;
213        } else /* CONTAINED or SIMPLE */{
214            int c;
215            int start, next, maxSpanLimit = 0;
216            for (start = next = 0; start < length;) {
217                c = s.codePointAt(next);
218                next = s.offsetByCodePoints(next, 1);
219                if (!realSet.contains(c)) {
220                    next = start; // Do not span this single, not-contained code point.
221                }
222                for (String str : set.strings()) {
223                    if (str.length() <= (length - start) && matches16CPB(s, start, length, str)) {
224                        // spanNeedsStrings=true;
225                        int matchLimit = start + str.length();
226                        if (matchLimit == length) {
227                            return length;
228                        }
229                        if (spanCondition == SpanCondition.CONTAINED) {
230                            // Iterate for the shortest match at each position.
231                            // Recurse for each but the shortest match.
232                            if (next == start) {
233                                next = matchLimit; // First match from start.
234                            } else {
235                                if (matchLimit < next) {
236                                    // Remember shortest match from start for iteration.
237                                    int temp = next;
238                                    next = matchLimit;
239                                    matchLimit = temp;
240                                }
241                                // Recurse for non-shortest match from start.
242                                int spanLength = containsSpanUTF16(set, s.substring(matchLimit),
243                                        SpanCondition.CONTAINED);
244                                if ((matchLimit + spanLength) > maxSpanLimit) {
245                                    maxSpanLimit = matchLimit + spanLength;
246                                    if (maxSpanLimit == length) {
247                                        return length;
248                                    }
249                                }
250                            }
251                        } else /* spanCondition==SIMPLE */{
252                            if (matchLimit > next) {
253                                // Remember longest match from start.
254                                next = matchLimit;
255                            }
256                        }
257                    }
258                }
259                if (next == start) {
260                    break; // No match from start.
261                }
262                start = next;
263            }
264            if (start > maxSpanLimit) {
265                return start;
266            } else {
267                return maxSpanLimit;
268            }
269        }
270    }
271
272    static int containsSpanBackUTF16(final UnicodeSetWithStrings set, final String s, int length,
273            SpanCondition spanCondition) {
274        if (length == 0) {
275            return 0;
276        }
277        final UnicodeSet realSet = set.getSet();
278        if (!set.hasStrings()) {
279            boolean spanContained = false;
280            if (spanCondition != SpanCondition.NOT_CONTAINED) {
281                spanContained = true; // Pin to 0/1 values.
282            }
283
284            int c;
285            int prev = length;
286            do {
287                c = s.codePointBefore(prev);
288                if (realSet.contains(c) != spanContained) {
289                    break;
290                }
291                prev = s.offsetByCodePoints(prev, -1);
292            } while (prev > 0);
293            return prev;
294        } else if (spanCondition == SpanCondition.NOT_CONTAINED) {
295            int c;
296            int prev = length, length0 = length;
297            do {
298                c = s.codePointBefore(prev);
299                if (realSet.contains(c)) {
300                    break;
301                }
302                for (String str : set.strings()) {
303                    if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
304                        // spanNeedsStrings=true;
305                        return prev;
306                    }
307                }
308                prev = s.offsetByCodePoints(prev, -1);
309            } while (prev > 0);
310            return prev;
311        } else /* SpanCondition.CONTAINED or SIMPLE */{
312            int c;
313            int prev = length, minSpanStart = length, length0 = length;
314            do {
315                c = s.codePointBefore(length);
316                length = s.offsetByCodePoints(length, -1);
317                if (!realSet.contains(c)) {
318                    length = prev; // Do not span this single, not-contained code point.
319                }
320                for (String str : set.strings()) {
321                    if (str.length() <= prev && matches16CPB(s, prev - str.length(), length0, str)) {
322                        // spanNeedsStrings=true;
323                        int matchStart = prev - str.length();
324                        if (matchStart == 0) {
325                            return 0;
326                        }
327                        if (spanCondition == SpanCondition.CONTAINED) {
328                            // Iterate for the shortest match at each position.
329                            // Recurse for each but the shortest match.
330                            if (length == prev) {
331                                length = matchStart; // First match from prev.
332                            } else {
333                                if (matchStart > length) {
334                                    // Remember shortest match from prev for iteration.
335                                    int temp = length;
336                                    length = matchStart;
337                                    matchStart = temp;
338                                }
339                                // Recurse for non-shortest match from prev.
340                                int spanStart = containsSpanBackUTF16(set, s, matchStart,
341                                        SpanCondition.CONTAINED);
342                                if (spanStart < minSpanStart) {
343                                    minSpanStart = spanStart;
344                                    if (minSpanStart == 0) {
345                                        return 0;
346                                    }
347                                }
348                            }
349                        } else /* spanCondition==SIMPLE */{
350                            if (matchStart < length) {
351                                // Remember longest match from prev.
352                                length = matchStart;
353                            }
354                        }
355                    }
356                }
357                if (length == prev) {
358                    break; // No match from prev.
359                }
360            } while ((prev = length) > 0);
361            if (prev < minSpanStart) {
362                return prev;
363            } else {
364                return minSpanStart;
365            }
366        }
367    }
368
369    // spans to be performed and compared
370    static final int SPAN_UTF16 = 1;
371    static final int SPAN_UTF8 = 2;
372    static final int SPAN_UTFS = 3;
373
374    static final int SPAN_SET = 4;
375    static final int SPAN_COMPLEMENT = 8;
376    static final int SPAN_POLARITY = 0xc;
377
378    static final int SPAN_FWD = 0x10;
379    static final int SPAN_BACK = 0x20;
380    static final int SPAN_DIRS = 0x30;
381
382    static final int SPAN_CONTAINED = 0x100;
383    static final int SPAN_SIMPLE = 0x200;
384    static final int SPAN_CONDITION = 0x300;
385
386    static final int SPAN_ALL = 0x33f;
387
388    static SpanCondition invertSpanCondition(SpanCondition spanCondition, SpanCondition contained) {
389        return spanCondition == SpanCondition.NOT_CONTAINED ? contained
390                : SpanCondition.NOT_CONTAINED;
391    }
392
393    /*
394     * Count spans on a string with the method according to type and set the span limits. The set may be the complement
395     * of the original. When using spanBack() and comparing with span(), use a span condition for the first spanBack()
396     * according to the expected number of spans. Sets typeName to an empty string if there is no such type. Returns -1
397     * if the span option is filtered out.
398     */
399    static int getSpans(final UnicodeSetWithStrings set, boolean isComplement, final String s,
400            int whichSpans, int type, String[] typeName, int limits[], int limitsCapacity,
401            int expectCount) {
402        final UnicodeSet realSet = set.getSet();
403        int start, count, i;
404        SpanCondition spanCondition, firstSpanCondition, contained;
405        boolean isForward;
406
407        int length = s.length();
408        if (type < 0 || 7 < type) {
409            typeName[0] = null;
410            return 0;
411        }
412
413        final String typeNames16[] = {
414                "contains",
415                "contains(LM)",
416                "span",
417                "span(LM)",
418                "containsBack",
419                "containsBack(LM)",
420                "spanBack",
421                "spanBack(LM)" };
422
423        typeName[0] = typeNames16[type];
424
425        // filter span options
426        if (type <= 3) {
427            // span forward
428            if ((whichSpans & SPAN_FWD) == 0) {
429                return -1;
430            }
431            isForward = true;
432        } else {
433            // span backward
434            if ((whichSpans & SPAN_BACK) == 0) {
435                return -1;
436            }
437            isForward = false;
438        }
439        if ((type & 1) == 0) {
440            // use SpanCondition.CONTAINED
441            if ((whichSpans & SPAN_CONTAINED) == 0) {
442                return -1;
443            }
444            contained = SpanCondition.CONTAINED;
445        } else {
446            // use SIMPLE
447            if ((whichSpans & SPAN_SIMPLE) == 0) {
448                return -1;
449            }
450            contained = SpanCondition.SIMPLE;
451        }
452
453        // Default first span condition for going forward with an uncomplemented set.
454        spanCondition = SpanCondition.NOT_CONTAINED;
455        if (isComplement) {
456            spanCondition = invertSpanCondition(spanCondition, contained);
457        }
458
459        // First span condition for span(), used to terminate the spanBack() iteration.
460        firstSpanCondition = spanCondition;
461
462        // spanBack(): Its initial span condition is span()'s last span condition,
463        // which is the opposite of span()'s first span condition
464        // if we expect an even number of spans.
465        // (The loop inverts spanCondition (expectCount-1) times
466        // before the expectCount'th span() call.)
467        // If we do not compare forward and backward directions, then we do not have an
468        // expectCount and just start with firstSpanCondition.
469        if (!isForward && (whichSpans & SPAN_FWD) != 0 && (expectCount & 1) == 0) {
470            spanCondition = invertSpanCondition(spanCondition, contained);
471        }
472
473        count = 0;
474        switch (type) {
475        case 0:
476        case 1:
477            start = 0;
478            for (;;) {
479                start += containsSpanUTF16(set, s.substring(start), spanCondition);
480                if (count < limitsCapacity) {
481                    limits[count] = start;
482                }
483                ++count;
484                if (start >= length) {
485                    break;
486                }
487                spanCondition = invertSpanCondition(spanCondition, contained);
488            }
489            break;
490        case 2:
491        case 3:
492            start = 0;
493            for (;;) {
494                start = realSet.span(s, start, spanCondition);
495                if (count < limitsCapacity) {
496                    limits[count] = start;
497                }
498                ++count;
499                if (start >= length) {
500                    break;
501                }
502                spanCondition = invertSpanCondition(spanCondition, contained);
503            }
504            break;
505        case 4:
506        case 5:
507            for (;;) {
508                ++count;
509                if (count <= limitsCapacity) {
510                    limits[limitsCapacity - count] = length;
511                }
512                length = containsSpanBackUTF16(set, s, length, spanCondition);
513                if (length == 0 && spanCondition == firstSpanCondition) {
514                    break;
515                }
516                spanCondition = invertSpanCondition(spanCondition, contained);
517            }
518            if (count < limitsCapacity) {
519                for (i = count; i-- > 0;) {
520                    limits[i] = limits[limitsCapacity - count + i];
521                }
522            }
523            break;
524        case 6:
525        case 7:
526            for (;;) {
527                ++count;
528                if (count <= limitsCapacity) {
529                    limits[limitsCapacity - count] = length >= 0 ? length : s.length();
530                }
531                length = realSet.spanBack(s, length, spanCondition);
532                if (length == 0 && spanCondition == firstSpanCondition) {
533                    break;
534                }
535                spanCondition = invertSpanCondition(spanCondition, contained);
536            }
537            if (count < limitsCapacity) {
538                for (i = count; i-- > 0;) {
539                    limits[i] = limits[limitsCapacity - count + i];
540                }
541            }
542            break;
543        default:
544            typeName = null;
545            return -1;
546        }
547
548        return count;
549    }
550
551    // sets to be tested; odd index=isComplement
552    static final int SLOW = 0;
553    static final int SLOW_NOT = 1;
554    static final int FAST = 2;
555    static final int FAST_NOT = 3;
556    static final int SET_COUNT = 4;
557
558    static final String setNames[] = { "slow", "slow.not", "fast", "fast.not" };
559
560    /*
561     * Verify that we get the same results whether we look at text with contains(), span() or spanBack(), using unfrozen
562     * or frozen versions of the set, and using the set or its complement (switching the spanConditions accordingly).
563     * The latter verifies that set.span(spanCondition) == set.complement().span(!spanCondition).
564     *
565     * The expectLimits[] are either provided by the caller (with expectCount>=0) or returned to the caller (with an
566     * input expectCount<0).
567     */
568    void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
569            int expectLimits[], int expectCount,
570            final String testName, int index) {
571        int[] limits = new int[500];
572        int limitsCount;
573        int i, j;
574        String[] typeName = new String[1];
575        int type;
576
577        for (i = 0; i < SET_COUNT; ++i) {
578            if ((i & 1) == 0) {
579                // Even-numbered sets are original, uncomplemented sets.
580                if ((whichSpans & SPAN_SET) == 0) {
581                    continue;
582                }
583            } else {
584                // Odd-numbered sets are complemented.
585                if ((whichSpans & SPAN_COMPLEMENT) == 0) {
586                    continue;
587                }
588            }
589            for (type = 0;; ++type) {
590                limitsCount = getSpans(sets[i], (0 != (i & 1)), s, whichSpans, type, typeName, limits,
591                        limits.length, expectCount);
592                if (typeName[0] == null) {
593                    break; // All types tried.
594                }
595                if (limitsCount < 0) {
596                    continue; // Span option filtered out.
597                }
598                if (expectCount < 0) {
599                    expectCount = limitsCount;
600                    if (limitsCount > limits.length) {
601                        errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d > %d capacity - too many spans",
602                                testName, index, setNames[i], typeName[0], limitsCount, limits.length));
603                        return;
604                    }
605                    for (j = limitsCount; j-- > 0;) {
606                        expectLimits[j] = limits[j];
607                    }
608                } else if (limitsCount != expectCount) {
609                    errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d != %d", testName, index, setNames[i],
610                            typeName[0], limitsCount, expectCount));
611                } else {
612                    for (j = 0; j < limitsCount; ++j) {
613                        if (limits[j] != expectLimits[j]) {
614                            errln(String.format("FAIL: %s[0x%x].%s.%s span count=%d limits[%d]=%d != %d", testName,
615                                    index, setNames[i], typeName[0], limitsCount, j, limits[j], expectLimits[j]));
616                            break;
617                        }
618                    }
619                }
620            }
621        }
622
623        // Compare span() with containsAll()/containsNone(),
624        // but only if we have expectLimits[] from the uncomplemented set.
625        if ((whichSpans & SPAN_SET) != 0) {
626            final String s16 = s;
627            String string;
628            int prev = 0, limit, len;
629            for (i = 0; i < expectCount; ++i) {
630                limit = expectLimits[i];
631                len = limit - prev;
632                if (len > 0) {
633                    string = s16.substring(prev, prev + len); // read-only alias
634                    if (0 != (i & 1)) {
635                        if (!sets[SLOW].getSet().containsAll(string)) {
636                            errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()",
637                                    testName, index, setNames[SLOW], prev, limit));
638                            return;
639                        }
640                        if (!sets[FAST].getSet().containsAll(string)) {
641                            errln(String.format("FAIL: %s[0x%x].%s.containsAll(%d..%d)==false contradicts span()",
642                                    testName, index, setNames[FAST], prev, limit));
643                            return;
644                        }
645                    } else {
646                        if (!sets[SLOW].getSet().containsNone(string)) {
647                            errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()",
648                                    testName, index, setNames[SLOW], prev, limit));
649                            return;
650                        }
651                        if (!sets[FAST].getSet().containsNone(string)) {
652                            errln(String.format("FAIL: %s[0x%x].%s.containsNone(%d..%d)==false contradicts span()",
653                                    testName, index, setNames[FAST], prev, limit));
654                            return;
655                        }
656                    }
657                }
658                prev = limit;
659            }
660        }
661    }
662
663    // Specifically test either UTF-16 or UTF-8.
664    void verifySpan(final UnicodeSetWithStrings sets[], final String s, int whichSpans,
665            final String testName, int index) {
666        int[] expectLimits = new int[500];
667        int expectCount = -1;
668        verifySpan(sets, s, whichSpans, expectLimits, expectCount, testName, index);
669    }
670
671    // Test both UTF-16 and UTF-8 versions of span() etc. on the same sets and text,
672    // unless either UTF is turned off in whichSpans.
673    // Testing UTF-16 and UTF-8 together requires that surrogate code points
674    // have the same contains(c) value as U+FFFD.
675    void verifySpanBothUTFs(final UnicodeSetWithStrings sets[], final String s16, int whichSpans,
676            final String testName, int index) {
677        int[] expectLimits = new int[500];
678        int expectCount;
679
680        expectCount = -1; // Get expectLimits[] from verifySpan().
681
682        if ((whichSpans & SPAN_UTF16) != 0) {
683            verifySpan(sets, s16, whichSpans, expectLimits, expectCount, testName, index);
684        }
685    }
686
687    static int nextCodePoint(int c) {
688        // Skip some large and boring ranges.
689        switch (c) {
690        case 0x3441:
691            return 0x4d7f;
692        case 0x5100:
693            return 0x9f00;
694        case 0xb040:
695            return 0xd780;
696        case 0xe041:
697            return 0xf8fe;
698        case 0x10100:
699            return 0x20000;
700        case 0x20041:
701            return 0xe0000;
702        case 0xe0101:
703            return 0x10fffd;
704        default:
705            return c + 1;
706        }
707    }
708
709    // Verify that all implementations represent the same set.
710    void verifySpanContents(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) {
711        StringBuffer s = new StringBuffer();
712        int localWhichSpans;
713        int c, first;
714        for (first = c = 0;; c = nextCodePoint(c)) {
715            if (c > 0x10ffff || s.length() > 1024) {
716                localWhichSpans = whichSpans;
717                verifySpanBothUTFs(sets, s.toString(), localWhichSpans, testName, first);
718                if (c > 0x10ffff) {
719                    break;
720                }
721                s.delete(0, s.length());
722                first = c;
723            }
724            UTF16.append(s, c);
725        }
726    }
727
728    // Test with a particular, interesting string.
729    // Specify length and try NUL-termination.
730    static final char interestingStringChars[] = { 0x61, 0x62, 0x20, // Latin, space
731            0x3b1, 0x3b2, 0x3b3, // Greek
732            0xd900, // lead surrogate
733            0x3000, 0x30ab, 0x30ad, // wide space, Katakana
734            0xdc05, // trail surrogate
735            0xa0, 0xac00, 0xd7a3, // nbsp, Hangul
736            0xd900, 0xdc05, // unassigned supplementary
737            0xd840, 0xdfff, 0xd860, 0xdffe, // Han supplementary
738            0xd7a4, 0xdc05, 0xd900, 0x2028  // unassigned, surrogates in wrong order, LS
739    };
740    static String interestingString = new String(interestingStringChars);
741    static final String unicodeSet1 = "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u3000\\u30ab}{\\u3000\\u30ab\\u30ad}]";
742
743    @Test
744    public void TestInterestingStringSpan() {
745        UnicodeSet uset = new UnicodeSet(Utility.unescape(unicodeSet1));
746        SpanCondition spanCondition = SpanCondition.NOT_CONTAINED;
747        int expect = 2;
748        int start = 14;
749
750        int c = 0xd840;
751        boolean contains = uset.contains(c);
752        if (false != contains) {
753            errln(String.format("FAIL: UnicodeSet(unicodeSet1).contains(%d) = true (expect false)",
754                  c));
755        }
756
757        UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
758        int len = containsSpanUTF16(set, interestingString.substring(start), spanCondition);
759        if (expect != len) {
760            errln(String.format("FAIL: containsSpanUTF16(unicodeSet1, \"%s(%d)\") = %d (expect %d)",
761                  interestingString, start, len, expect));
762        }
763
764        len = uset.span(interestingString, start, spanCondition) - start;
765        if (expect != len) {
766            errln(String.format("FAIL: UnicodeSet(unicodeSet1).span(\"%s\", %d) = %d (expect %d)",
767                  interestingString, start, len, expect));
768        }
769    }
770
771    void verifySpanUTF16String(final UnicodeSetWithStrings sets[], int whichSpans, final String testName) {
772        if ((whichSpans & SPAN_UTF16) == 0) {
773            return;
774        }
775        verifySpan(sets, interestingString, (whichSpans & ~SPAN_UTF8), testName, 1);
776    }
777
778    // Take a set of span options and multiply them so that
779    // each portion only has one of the options a, b and c.
780    // If b==0, then the set of options is just modified with mask and a.
781    // If b!=0 and c==0, then the set of options is just modified with mask, a and b.
782    static int addAlternative(int whichSpans[], int whichSpansCount, int mask, int a, int b, int c) {
783        int s;
784        int i;
785
786        for (i = 0; i < whichSpansCount; ++i) {
787            s = whichSpans[i] & mask;
788            whichSpans[i] = s | a;
789            if (b != 0) {
790                whichSpans[whichSpansCount + i] = s | b;
791                if (c != 0) {
792                    whichSpans[2 * whichSpansCount + i] = s | c;
793                }
794            }
795        }
796        return b == 0 ? whichSpansCount : c == 0 ? 2 * whichSpansCount : 3 * whichSpansCount;
797    }
798
799    // They are not representable in UTF-8, and a leading trail surrogate
800    // and a trailing lead surrogate must not match in the middle of a proper surrogate pair.
801    // U+20001 == \\uD840\\uDC01
802    // U+20400 == \\uD841\\uDC00
803    static final String patternWithUnpairedSurrogate =
804        "[a\\U00020001\\U00020400{ab}{b\\uD840}{\\uDC00a}]";
805    static final String stringWithUnpairedSurrogate =
806        "aaab\\U00020001ba\\U00020400aba\\uD840ab\\uD840\\U00020000b\\U00020000a\\U00020000\\uDC00a\\uDC00babbb";
807
808    static final String _63_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
809    static final String _64_a = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
810    static final String _63_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
811    static final String _64_b = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
812    static final String longPattern =
813        "[a{" + _64_a + _64_a + _64_a + _64_a + "b}" + "{a" + _64_b + _64_b + _64_b + _64_b + "}]";
814
815    @Test
816    public void TestStringWithUnpairedSurrogateSpan() {
817        String string = Utility.unescape(stringWithUnpairedSurrogate);
818        UnicodeSet uset = new UnicodeSet(Utility.unescape(patternWithUnpairedSurrogate));
819        SpanCondition spanCondition = SpanCondition.NOT_CONTAINED;
820        int start = 17;
821        int expect = 5;
822
823        UnicodeSetWithStrings set = new UnicodeSetWithStrings(uset);
824        int len = containsSpanUTF16(set, string.substring(start), spanCondition);
825        if (expect != len) {
826            errln(String.format("FAIL: containsSpanUTF16(patternWithUnpairedSurrogate, \"%s(%d)\") = %d (expect %d)",
827                  string, start, len, expect));
828        }
829
830        len = uset.span(string, start, spanCondition) - start;
831        if (expect != len) {
832            errln(String.format("FAIL: UnicodeSet(patternWithUnpairedSurrogate).span(\"%s\", %d) = %d (expect %d)",
833                  string, start, len, expect));
834        }
835    }
836
837    @Test
838    public void TestSpan() {
839        // "[...]" is a UnicodeSet pattern.
840        // "*" performs tests on all Unicode code points and on a selection of
841        // malformed UTF-8/16 strings.
842        // "-options" limits the scope of testing for the current set.
843        // By default, the test verifies that equivalent boundaries are found
844        // for UTF-16 and UTF-8, going forward and backward,
845        // alternating NOT_CONTAINED with
846        // either CONTAINED or SIMPLE.
847        // Single-character options:
848        // 8 -- UTF-16 and UTF-8 boundaries may differ.
849        // Cause: contains(U+FFFD) is inconsistent with contains(some surrogates),
850        // or the set contains strings with unpaired surrogates
851        // which do not translate to valid UTF-8.
852        // c -- set.span() and set.complement().span() boundaries may differ.
853        // Cause: Set strings are not complemented.
854        // b -- span() and spanBack() boundaries may differ.
855        // Cause: Strings in the set overlap, and spanBack(CONTAINED)
856        // and spanBack(SIMPLE) are defined to
857        // match with non-overlapping substrings.
858        // For example, with a set containing "ab" and "ba",
859        // span() of "aba" yields boundaries { 0, 2, 3 }
860        // because the initial "ab" matches from 0 to 2,
861        // while spanBack() yields boundaries { 0, 1, 3 }
862        // because the final "ba" matches from 1 to 3.
863        // l -- CONTAINED and SIMPLE boundaries may differ.
864        // Cause: Strings in the set overlap, and a longer match may
865        // require a sequence including non-longest substrings.
866        // For example, with a set containing "ab", "abc" and "cd",
867        // span(contained) of "abcd" spans the entire string
868        // but span(longest match) only spans the first 3 characters.
869        // Each "-options" first resets all options and then applies the specified options.
870        // A "-" without options resets the options.
871        // The options are also reset for each new set.
872        // Other strings will be spanned.
873        final String testdata[] = {
874                "[:ID_Continue:]",
875                "*",
876                "[:White_Space:]",
877                "*",
878                "[]",
879                "*",
880                "[\\u0000-\\U0010FFFF]",
881                "*",
882                "[\\u0000\\u0080\\u0800\\U00010000]",
883                "*",
884                "[\\u007F\\u07FF\\uFFFF\\U0010FFFF]",
885                "*",
886                unicodeSet1,
887                "-c",
888                "*",
889                "[[[:ID_Continue:]-[\\u30ab\\u30ad]]{\\u30ab\\u30ad}{\\u3000\\u30ab\\u30ad}]",
890                "-c",
891                "*",
892
893                // Overlapping strings cause overlapping attempts to match.
894                "[x{xy}{xya}{axy}{ax}]",
895                "-cl",
896
897                // More repetitions of "xya" would take too long with the recursive
898                // reference implementation.
899                // containsAll()=false
900                // test_string 0x14
901                "xx" + "xyaxyaxyaxya" + // set.complement().span(longest match) will stop here.
902                        "xx" + // set.complement().span(contained) will stop between the two 'x'es.
903                        "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + // span() ends here.
904                        "aaa",
905
906                // containsAll()=true
907                // test_string 0x15
908                "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxya" + "xx" + "xyaxyaxyaxy",
909
910                "-bc",
911                // test_string 0x17
912                "byayaxya", // span() -> { 4, 7, 8 } spanBack() -> { 5, 8 }
913                "-c",
914                "byayaxy", // span() -> { 4, 7 } complement.span() -> { 7 }
915                "byayax", // span() -> { 4, 6 } complement.span() -> { 6 }
916                "-",
917                "byaya", // span() -> { 5 }
918                "byay", // span() -> { 4 }
919                "bya", // span() -> { 3 }
920
921                // span(longest match) will not span the whole string.
922                "[a{ab}{bc}]",
923                "-cl",
924                // test_string 0x21
925                "abc",
926
927                "[a{ab}{abc}{cd}]",
928                "-cl",
929                "acdabcdabccd",
930
931                // spanBack(longest match) will not span the whole string.
932                "[c{ab}{bc}]",
933                "-cl",
934                "abc",
935
936                "[d{cd}{bcd}{ab}]",
937                "-cl",
938                "abbcdabcdabd",
939
940                // Test with non-ASCII set strings - test proper handling of surrogate pairs
941                // and UTF-8 trail bytes.
942                // Copies of above test sets and strings, but transliterated to have
943                // different code points with similar trail units.
944                // Previous: a b c d
945                // Unicode: 042B 30AB 200AB 204AB
946                // UTF-16: 042B 30AB D840 DCAB D841 DCAB
947                // UTF-8: D0 AB E3 82 AB F0 A0 82 AB F0 A0 92 AB
948                "[\\u042B{\\u042B\\u30AB}{\\u042B\\u30AB\\U000200AB}{\\U000200AB\\U000204AB}]",
949                "-cl",
950                "\\u042B\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000200AB\\U000204AB",
951
952                "[\\U000204AB{\\U000200AB\\U000204AB}{\\u30AB\\U000200AB\\U000204AB}{\\u042B\\u30AB}]",
953                "-cl",
954                "\\u042B\\u30AB\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000200AB\\U000204AB\\u042B\\u30AB\\U000204AB",
955
956                // Stress bookkeeping and recursion.
957                // The following strings are barely doable with the recursive
958                // reference implementation.
959                // The not-contained character at the end prevents an early exit from the span().
960                "[b{bb}]",
961                "-c",
962                // test_string 0x33
963                "bbbbbbbbbbbbbbbbbbbbbbbb-",
964                // On complement sets, span() and spanBack() get different results
965                // because b is not in the complement set and there is an odd number of b's
966                // in the test string.
967                "-bc",
968                "bbbbbbbbbbbbbbbbbbbbbbbbb-",
969
970                // Test with set strings with an initial or final code point span
971                // longer than 254.
972                longPattern,
973                "-c",
974                _64_a + _64_a + _64_a + _63_a + "b",
975                _64_a + _64_a + _64_a + _64_a + "b",
976                _64_a + _64_a + _64_a + _64_a + "aaaabbbb",
977                "a" + _64_b + _64_b + _64_b + _63_b,
978                "a" + _64_b + _64_b + _64_b + _64_b,
979                "aaaabbbb" + _64_b + _64_b + _64_b + _64_b,
980
981                // Test with strings containing unpaired surrogates.
982                patternWithUnpairedSurrogate, "-8cl",
983                stringWithUnpairedSurrogate };
984        int i, j;
985        int whichSpansCount = 1;
986        int[] whichSpans = new int[96];
987        for (i = whichSpans.length; i-- > 0;) {
988            whichSpans[i] = SPAN_ALL;
989        }
990
991        UnicodeSet[] sets = new UnicodeSet[SET_COUNT];
992        UnicodeSetWithStrings[] sets_with_str = new UnicodeSetWithStrings[SET_COUNT];
993
994        String testName = null;
995        @SuppressWarnings("unused")
996        String testNameLimit;
997
998        for (i = 0; i < testdata.length; ++i) {
999            final String s = testdata[i];
1000            if (s.charAt(0) == '[') {
1001                // Create new test sets from this pattern.
1002                for (j = 0; j < SET_COUNT; ++j) {
1003                    sets_with_str[j] = null;
1004                    sets[j] = null;
1005                }
1006                sets[SLOW] = new UnicodeSet(Utility.unescape(s));
1007                sets[SLOW_NOT] = new UnicodeSet(sets[SLOW]);
1008                sets[SLOW_NOT].complement();
1009                // Intermediate set: Test cloning of a frozen set.
1010                UnicodeSet fast = new UnicodeSet(sets[SLOW]);
1011                fast.freeze();
1012                sets[FAST] = (UnicodeSet) fast.clone();
1013                fast = null;
1014                UnicodeSet fastNot = new UnicodeSet(sets[SLOW_NOT]);
1015                fastNot.freeze();
1016                sets[FAST_NOT] = (UnicodeSet) fastNot.clone();
1017                fastNot = null;
1018
1019                for (j = 0; j < SET_COUNT; ++j) {
1020                    sets_with_str[j] = new UnicodeSetWithStrings(sets[j]);
1021                }
1022
1023                testName = s + ':';
1024                whichSpans[0] = SPAN_ALL;
1025                whichSpansCount = 1;
1026            } else if (s.charAt(0) == '-') {
1027                whichSpans[0] = SPAN_ALL;
1028                whichSpansCount = 1;
1029
1030                for (j = 1; j < s.length(); j++) {
1031                    switch (s.charAt(j)) {
1032                    case 'c':
1033                        whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_POLARITY, SPAN_SET,
1034                                SPAN_COMPLEMENT, 0);
1035                        break;
1036                    case 'b':
1037                        whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_DIRS, SPAN_FWD, SPAN_BACK,
1038                                0);
1039                        break;
1040                    case 'l':
1041                        // test CONTAINED FWD & BACK, and separately
1042                        // SIMPLE only FWD, and separately
1043                        // SIMPLE only BACK
1044                        whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~(SPAN_DIRS | SPAN_CONDITION),
1045                                SPAN_DIRS | SPAN_CONTAINED, SPAN_FWD | SPAN_SIMPLE, SPAN_BACK | SPAN_SIMPLE);
1046                        break;
1047                    case '8':
1048                        whichSpansCount = addAlternative(whichSpans, whichSpansCount, ~SPAN_UTFS, SPAN_UTF16,
1049                                SPAN_UTF8, 0);
1050                        break;
1051                    default:
1052                        errln(String.format("FAIL: unrecognized span set option in \"%s\"", testdata[i]));
1053                        break;
1054                    }
1055                }
1056            } else if (s.equals("*")) {
1057                testNameLimit = "bad_string";
1058                for (j = 0; j < whichSpansCount; ++j) {
1059                    if (whichSpansCount > 1) {
1060                        testNameLimit += String.format("%%0x%3x", whichSpans[j]);
1061                    }
1062                    verifySpanUTF16String(sets_with_str, whichSpans[j], testName);
1063                }
1064
1065                testNameLimit = "contents";
1066                for (j = 0; j < whichSpansCount; ++j) {
1067                    if (whichSpansCount > 1) {
1068                        testNameLimit += String.format("%%0x%3x", whichSpans[j]);
1069                    }
1070                    verifySpanContents(sets_with_str, whichSpans[j], testName);
1071                }
1072            } else {
1073                String string = Utility.unescape(s);
1074                testNameLimit = "test_string";
1075                for (j = 0; j < whichSpansCount; ++j) {
1076                    if (whichSpansCount > 1) {
1077                        testNameLimit += String.format("%%0x%3x", whichSpans[j]);
1078                    }
1079                    verifySpanBothUTFs(sets_with_str, string, whichSpans[j], testName, i);
1080                }
1081            }
1082        }
1083    }
1084
1085    @Test
1086    public void TestSpanAndCount() {
1087        // a set with no strings
1088        UnicodeSet abc = new UnicodeSet('a', 'c');
1089        // a set with an "irrelevant" string (fully contained in the code point set)
1090        UnicodeSet crlf = new UnicodeSet().add('\n').add('\r').add("\r\n");
1091        // a set with no "irrelevant" string but some interesting overlaps
1092        UnicodeSet ab_cd = new UnicodeSet().add('a').add("ab").add("abc").add("cd");
1093        String s = "ab\n\r\r\n" + UTF16.valueOf(0x50000) + "abcde";
1094        OutputInt count = new OutputInt();
1095        assertEquals("abc span[8, 11[", 11,
1096                abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
1097        assertEquals("abc count=3", 3, count.value);
1098        assertEquals("no abc span[2, 8[", 8,
1099                abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
1100        assertEquals("no abc count=5", 5, count.value);
1101        assertEquals("line endings span[2, 6[", 6,
1102                crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count));
1103        assertEquals("line endings count=3", 3, count.value);
1104        assertEquals("no ab+cd span[2, 8[", 8,
1105                ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
1106        assertEquals("no ab+cd count=5", 5, count.value);
1107        assertEquals("ab+cd span[8, 12[", 12,
1108                ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count));
1109        assertEquals("ab+cd count=2", 2, count.value);
1110        assertEquals("1x abc span[8, 11[", 11,
1111                ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
1112        assertEquals("1x abc count=1", 1, count.value);
1113
1114        abc.freeze();
1115        crlf.freeze();
1116        ab_cd.freeze();
1117        assertEquals("abc span[8, 11[ (frozen)", 11,
1118                abc.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
1119        assertEquals("abc count=3 (frozen)", 3, count.value);
1120        assertEquals("no abc span[2, 8[ (frozen)", 8,
1121                abc.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
1122        assertEquals("no abc count=5 (frozen)", 5, count.value);
1123        assertEquals("line endings span[2, 6[ (frozen)", 6,
1124                crlf.spanAndCount(s, 2, SpanCondition.CONTAINED, count));
1125        assertEquals("line endings count=3 (frozen)", 3, count.value);
1126        assertEquals("no ab+cd span[2, 8[ (frozen)", 8,
1127                ab_cd.spanAndCount(s, 2, SpanCondition.NOT_CONTAINED, count));
1128        assertEquals("no ab+cd count=5 (frozen)", 5, count.value);
1129        assertEquals("ab+cd span[8, 12[ (frozen)", 12,
1130                ab_cd.spanAndCount(s, 8, SpanCondition.CONTAINED, count));
1131        assertEquals("ab+cd count=2 (frozen)", 2, count.value);
1132        assertEquals("1x abc span[8, 11[ (frozen)", 11,
1133                ab_cd.spanAndCount(s, 8, SpanCondition.SIMPLE, count));
1134        assertEquals("1x abc count=1 (frozen)", 1, count.value);
1135    }
1136}
1137