1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2009, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8//   regextst.cpp
9//
10//      ICU Regular Expressions test, part of intltest.
11//
12
13#include "intltest.h"
14#if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16#include "unicode/regex.h"
17#include "unicode/uchar.h"
18#include "unicode/ucnv.h"
19#include "regextst.h"
20#include "uvector.h"
21#include "util.h"
22#include <stdlib.h>
23#include <string.h>
24#include <stdio.h>
25
26
27//---------------------------------------------------------------------------
28//
29//  Test class boilerplate
30//
31//---------------------------------------------------------------------------
32RegexTest::RegexTest()
33{
34}
35
36
37RegexTest::~RegexTest()
38{
39}
40
41
42
43void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
44{
45    if (exec) logln("TestSuite RegexTest: ");
46    switch (index) {
47
48        case 0: name = "Basic";
49            if (exec) Basic();
50            break;
51        case 1: name = "API_Match";
52            if (exec) API_Match();
53            break;
54        case 2: name = "API_Replace";
55            if (exec) API_Replace();
56            break;
57        case 3: name = "API_Pattern";
58            if (exec) API_Pattern();
59            break;
60        case 4: name = "Extended";
61            if (exec) Extended();
62            break;
63        case 5: name = "Errors";
64            if (exec) Errors();
65            break;
66        case 6: name = "PerlTests";
67            if (exec) PerlTests();
68            break;
69        case 7: name = "Callbacks";
70            if (exec) Callbacks();
71            break;
72        case 8: name = "Bug 6149";
73             if (exec) Bug6149();
74             break;
75
76        default: name = "";
77            break; //needed to end loop
78    }
79}
80
81
82//---------------------------------------------------------------------------
83//
84//   Error Checking / Reporting macros used in all of the tests.
85//
86//---------------------------------------------------------------------------
// All of these macros expand to a brace-enclosed block.  Those that mention
// `status` expect a UErrorCode variable of that name at the point of use.

// If `status` is a failure code: report it (as a data error) together with the
// line of the macro invocation, then return from the enclosing function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("RegexTest failure at line %d.  status=%s", __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Report an error if `expr` evaluates to FALSE.  Execution continues afterwards.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d.\n", __LINE__); \
    } \
}

// Evaluate `expr` against a fresh, block-local `status` (which hides any outer
// variable of the same name) and report a data error unless that local status
// ends up equal to `errcode`.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
            __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS, but also prints a caller-supplied line number,
// prints the status numerically, and does NOT return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT, but also prints a caller-supplied line number and
// returns from the enclosing function when the assertion fails.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}
101
102
103
104//---------------------------------------------------------------------------
105//
//    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
//                       for the lookingAt() and matches() functions.
108//
109//       usage:
110//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
111//
112//          The expected results are UBool - TRUE or FALSE.
113//          The input text is unescaped.  The pattern is not.
114//
115//
116//---------------------------------------------------------------------------
117
118#define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
119
120UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
121    const UnicodeString pattern(pat, -1, US_INV);
122    const UnicodeString inputText(text, -1, US_INV);
123    UErrorCode          status  = U_ZERO_ERROR;
124    UParseError         pe;
125    RegexPattern        *REPattern = NULL;
126    RegexMatcher        *REMatcher = NULL;
127    UBool               retVal     = TRUE;
128
129    UnicodeString patString(pat, -1, US_INV);
130    REPattern = RegexPattern::compile(patString, 0, pe, status);
131    if (U_FAILURE(status)) {
132        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
133            line, u_errorName(status));
134        return FALSE;
135    }
136    if (line==376) { RegexPatternDump(REPattern);}
137
138    UnicodeString inputString(inputText);
139    UnicodeString unEscapedInput = inputString.unescape();
140    REMatcher = REPattern->matcher(unEscapedInput, status);
141    if (U_FAILURE(status)) {
142        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
143            line, u_errorName(status));
144        return FALSE;
145    }
146
147    UBool actualmatch;
148    actualmatch = REMatcher->lookingAt(status);
149    if (U_FAILURE(status)) {
150        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
151            line, u_errorName(status));
152        retVal =  FALSE;
153    }
154    if (actualmatch != looking) {
155        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
156        retVal = FALSE;
157    }
158
159    status = U_ZERO_ERROR;
160    actualmatch = REMatcher->matches(status);
161    if (U_FAILURE(status)) {
162        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
163            line, u_errorName(status));
164        retVal = FALSE;
165    }
166    if (actualmatch != match) {
167        errln("RegexTest: wrong return from matches() at line %d.\n", line);
168        retVal = FALSE;
169    }
170
171    if (retVal == FALSE) {
172        RegexPatternDump(REPattern);
173    }
174
175    delete REPattern;
176    delete REMatcher;
177    return retVal;
178}
179
180
181
182
183
184//---------------------------------------------------------------------------
185//
//    REGEX_ERR       Macro + invocation function to simplify writing
//                       regex tests for incorrect patterns
188//
189//       usage:
190//          REGEX_ERR("pattern",   expected error line, column, expected status);
191//
192//---------------------------------------------------------------------------
193#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
194
195void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
196                          UErrorCode expectedStatus, int32_t line) {
197    UnicodeString       pattern(pat);
198
199    UErrorCode          status         = U_ZERO_ERROR;
200    UParseError         pe;
201    RegexPattern        *callerPattern = NULL;
202
203    //
204    //  Compile the caller's pattern
205    //
206    UnicodeString patString(pat);
207    callerPattern = RegexPattern::compile(patString, 0, pe, status);
208    if (status != expectedStatus) {
209        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
210    } else {
211        if (status != U_ZERO_ERROR) {
212            if (pe.line != errLine || pe.offset != errCol) {
213                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
214                    line, errLine, errCol, pe.line, pe.offset);
215            }
216        }
217    }
218
219    delete callerPattern;
220}
221
222
223
224//---------------------------------------------------------------------------
225//
226//      Basic      Check for basic functionality of regex pattern matching.
227//                 Avoid the use of REGEX_FIND test macro, which has
228//                 substantial dependencies on basic Regex functionality.
229//
230//---------------------------------------------------------------------------
231void RegexTest::Basic() {
232
233
234//
235// Debug - slide failing test cases early
236//
237#if 0
238    {
239        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
240        UParseError pe;
241        UErrorCode  status = U_ZERO_ERROR;
242        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
243        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
244        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
245    }
246    exit(1);
247#endif
248
249
250    //
251    // Pattern with parentheses
252    //
253    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
254    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
255    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
256
257    //
258    // Patterns with *
259    //
260    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
261    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
262    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
263    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
264    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
265
266    REGEX_TESTLM("a*", "",  TRUE, TRUE);
267    REGEX_TESTLM("a*", "b", TRUE, FALSE);
268
269
270    //
271    //  Patterns with "."
272    //
273    REGEX_TESTLM(".", "abc", TRUE, FALSE);
274    REGEX_TESTLM("...", "abc", TRUE, TRUE);
275    REGEX_TESTLM("....", "abc", FALSE, FALSE);
276    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
277    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
278    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
279    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
280    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
281
282    //
283    //  Patterns with * applied to chars at end of literal string
284    //
285    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
286    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
287
288    //
289    //  Supplemental chars match as single chars, not a pair of surrogates.
290    //
291    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
292    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
293    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
294
295
296    //
297    //  UnicodeSets in the pattern
298    //
299    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
300    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
301    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
302    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
303    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
304    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
305
306    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
307    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
308    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
309    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
310    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
311
312    //
313    //   OR operator in patterns
314    //
315    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
316    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
317    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
318    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
319
320    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
321    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
322    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
323    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
324    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
325    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
326
327    //
328    //  +
329    //
330    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
331    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
332    REGEX_TESTLM("b+", "", FALSE, FALSE);
333    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
334    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
335    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
336
337    //
338    //   ?
339    //
340    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
341    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
342    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
343    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
344    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
345    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
346    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
347    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
348    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
349
350    //
351    //  Escape sequences that become single literal chars, handled internally
352    //   by ICU's Unescape.
353    //
354
355    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
356    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
357    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
358    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
359    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
360    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
361    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
362    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
363    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
364    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
365
366    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
367    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
368
369    // Escape of special chars in patterns
370    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
371
372
373}
374
375
376//---------------------------------------------------------------------------
377//
378//      API_Match   Test that the API for class RegexMatcher
379//                  is present and nominally working, but excluding functions
380//                  implementing replace operations.
381//
382//---------------------------------------------------------------------------
383void RegexTest::API_Match() {
384    UParseError         pe;
385    UErrorCode          status=U_ZERO_ERROR;
386    int32_t             flags = 0;
387
388    //
389    // Debug - slide failing test cases early
390    //
391#if 0
392    {
393    }
394    return;
395#endif
396
397    //
398    // Simple pattern compilation
399    //
400    {
401        UnicodeString       re("abc");
402        RegexPattern        *pat2;
403        pat2 = RegexPattern::compile(re, flags, pe, status);
404        REGEX_CHECK_STATUS;
405
406        UnicodeString inStr1 = "abcdef this is a test";
407        UnicodeString instr2 = "not abc";
408        UnicodeString empty  = "";
409
410
411        //
412        // Matcher creation and reset.
413        //
414        RegexMatcher *m1 = pat2->matcher(inStr1, status);
415        REGEX_CHECK_STATUS;
416        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
417        REGEX_ASSERT(m1->input() == inStr1);
418        m1->reset(instr2);
419        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
420        REGEX_ASSERT(m1->input() == instr2);
421        m1->reset(inStr1);
422        REGEX_ASSERT(m1->input() == inStr1);
423        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
424        m1->reset(empty);
425        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
426        REGEX_ASSERT(m1->input() == empty);
427        REGEX_ASSERT(&m1->pattern() == pat2);
428
429        //
430        //  reset(pos, status)
431        //
432        m1->reset(inStr1);
433        m1->reset(4, status);
434        REGEX_CHECK_STATUS;
435        REGEX_ASSERT(m1->input() == inStr1);
436        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
437
438        m1->reset(-1, status);
439        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
440        status = U_ZERO_ERROR;
441
442        m1->reset(0, status);
443        REGEX_CHECK_STATUS;
444        status = U_ZERO_ERROR;
445
446        int32_t len = m1->input().length();
447        m1->reset(len-1, status);
448        REGEX_CHECK_STATUS;
449        status = U_ZERO_ERROR;
450
451        m1->reset(len, status);
452        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
453        status = U_ZERO_ERROR;
454
455        //
456        // match(pos, status)
457        //
458        m1->reset(instr2);
459        REGEX_ASSERT(m1->matches(4, status) == TRUE);
460        m1->reset();
461        REGEX_ASSERT(m1->matches(3, status) == FALSE);
462        m1->reset();
463        REGEX_ASSERT(m1->matches(5, status) == FALSE);
464        REGEX_ASSERT(m1->matches(4, status) == TRUE);
465        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
466        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
467
468        // Match() at end of string should fail, but should not
469        //  be an error.
470        status = U_ZERO_ERROR;
471        len = m1->input().length();
472        REGEX_ASSERT(m1->matches(len, status) == FALSE);
473        REGEX_CHECK_STATUS;
474
475        // Match beyond end of string should fail with an error.
476        status = U_ZERO_ERROR;
477        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
478        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
479
480        // Successful match at end of string.
481        {
482            status = U_ZERO_ERROR;
483            RegexMatcher m("A?", 0, status);  // will match zero length string.
484            REGEX_CHECK_STATUS;
485            m.reset(inStr1);
486            len = inStr1.length();
487            REGEX_ASSERT(m.matches(len, status) == TRUE);
488            REGEX_CHECK_STATUS;
489            m.reset(empty);
490            REGEX_ASSERT(m.matches(0, status) == TRUE);
491            REGEX_CHECK_STATUS;
492        }
493
494
495        //
496        // lookingAt(pos, status)
497        //
498        status = U_ZERO_ERROR;
499        m1->reset(instr2);  // "not abc"
500        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
501        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
502        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
503        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
504        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
505        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
506        status = U_ZERO_ERROR;
507        len = m1->input().length();
508        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
509        REGEX_CHECK_STATUS;
510        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
511        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
512
513        delete m1;
514        delete pat2;
515    }
516
517
518    //
519    // Capture Group.
520    //     RegexMatcher::start();
521    //     RegexMatcher::end();
522    //     RegexMatcher::groupCount();
523    //
524    {
525        int32_t             flags=0;
526        UParseError         pe;
527        UErrorCode          status=U_ZERO_ERROR;
528
529        UnicodeString       re("01(23(45)67)(.*)");
530        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
531        REGEX_CHECK_STATUS;
532        UnicodeString data = "0123456789";
533
534        RegexMatcher *matcher = pat->matcher(data, status);
535        REGEX_CHECK_STATUS;
536        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
537        static const int32_t matchStarts[] = {0,  2, 4, 8};
538        static const int32_t matchEnds[]   = {10, 8, 6, 10};
539        int32_t i;
540        for (i=0; i<4; i++) {
541            int32_t actualStart = matcher->start(i, status);
542            REGEX_CHECK_STATUS;
543            if (actualStart != matchStarts[i]) {
544                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
545                    __LINE__, i, matchStarts[i], actualStart);
546            }
547            int32_t actualEnd = matcher->end(i, status);
548            REGEX_CHECK_STATUS;
549            if (actualEnd != matchEnds[i]) {
550                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
551                    __LINE__, i, matchEnds[i], actualEnd);
552            }
553        }
554
555        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
556        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
557
558        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
559        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
560        matcher->reset();
561        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
562
563        matcher->lookingAt(status);
564        REGEX_ASSERT(matcher->group(status)    == "0123456789");
565        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
566        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
567        REGEX_ASSERT(matcher->group(2, status) == "45"        );
568        REGEX_ASSERT(matcher->group(3, status) == "89"        );
569        REGEX_CHECK_STATUS;
570        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
571        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
572        matcher->reset();
573        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
574
575        delete matcher;
576        delete pat;
577
578    }
579
580    //
581    //  find
582    //
583    {
584        int32_t             flags=0;
585        UParseError         pe;
586        UErrorCode          status=U_ZERO_ERROR;
587
588        UnicodeString       re("abc");
589        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
590        REGEX_CHECK_STATUS;
591        UnicodeString data = ".abc..abc...abc..";
592        //                    012345678901234567
593
594        RegexMatcher *matcher = pat->matcher(data, status);
595        REGEX_CHECK_STATUS;
596        REGEX_ASSERT(matcher->find());
597        REGEX_ASSERT(matcher->start(status) == 1);
598        REGEX_ASSERT(matcher->find());
599        REGEX_ASSERT(matcher->start(status) == 6);
600        REGEX_ASSERT(matcher->find());
601        REGEX_ASSERT(matcher->start(status) == 12);
602        REGEX_ASSERT(matcher->find() == FALSE);
603        REGEX_ASSERT(matcher->find() == FALSE);
604
605        matcher->reset();
606        REGEX_ASSERT(matcher->find());
607        REGEX_ASSERT(matcher->start(status) == 1);
608
609        REGEX_ASSERT(matcher->find(0, status));
610        REGEX_ASSERT(matcher->start(status) == 1);
611        REGEX_ASSERT(matcher->find(1, status));
612        REGEX_ASSERT(matcher->start(status) == 1);
613        REGEX_ASSERT(matcher->find(2, status));
614        REGEX_ASSERT(matcher->start(status) == 6);
615        REGEX_ASSERT(matcher->find(12, status));
616        REGEX_ASSERT(matcher->start(status) == 12);
617        REGEX_ASSERT(matcher->find(13, status) == FALSE);
618        REGEX_ASSERT(matcher->find(16, status) == FALSE);
619        REGEX_ASSERT(matcher->find(17, status) == FALSE);
620        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
621
622        status = U_ZERO_ERROR;
623        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
624        status = U_ZERO_ERROR;
625        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
626
627        REGEX_ASSERT(matcher->groupCount() == 0);
628
629        delete matcher;
630        delete pat;
631    }
632
633
634    //
635    //  find, with \G in pattern (true if at the end of a previous match).
636    //
637    {
638        int32_t             flags=0;
639        UParseError         pe;
640        UErrorCode          status=U_ZERO_ERROR;
641
642        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
643        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
644        REGEX_CHECK_STATUS;
645        UnicodeString data = ".abcabc.abc..";
646        //                    012345678901234567
647
648        RegexMatcher *matcher = pat->matcher(data, status);
649        REGEX_CHECK_STATUS;
650        REGEX_ASSERT(matcher->find());
651        REGEX_ASSERT(matcher->start(status) == 0);
652        REGEX_ASSERT(matcher->start(1, status) == -1);
653        REGEX_ASSERT(matcher->start(2, status) == 1);
654
655        REGEX_ASSERT(matcher->find());
656        REGEX_ASSERT(matcher->start(status) == 4);
657        REGEX_ASSERT(matcher->start(1, status) == 4);
658        REGEX_ASSERT(matcher->start(2, status) == -1);
659        REGEX_CHECK_STATUS;
660
661        delete matcher;
662        delete pat;
663    }
664
665    //
666    //   find with zero length matches, match position should bump ahead
667    //     to prevent loops.
668    //
669    {
670        int32_t                 i;
671        UErrorCode          status=U_ZERO_ERROR;
672        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
673                                                      //   using an always-true look-ahead.
674        REGEX_CHECK_STATUS;
675        UnicodeString s("    ");
676        m.reset(s);
677        for (i=0; ; i++) {
678            if (m.find() == FALSE) {
679                break;
680            }
681            REGEX_ASSERT(m.start(status) == i);
682            REGEX_ASSERT(m.end(status) == i);
683        }
684        REGEX_ASSERT(i==5);
685
686        // Check that the bump goes over surrogate pairs OK
687        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
688        s = s.unescape();
689        m.reset(s);
690        for (i=0; ; i+=2) {
691            if (m.find() == FALSE) {
692                break;
693            }
694            REGEX_ASSERT(m.start(status) == i);
695            REGEX_ASSERT(m.end(status) == i);
696        }
697        REGEX_ASSERT(i==10);
698    }
699    {
700        // find() loop breaking test.
701        //        with pattern of /.?/, should see a series of one char matches, then a single
702        //        match of zero length at the end of the input string.
703        int32_t                 i;
704        UErrorCode          status=U_ZERO_ERROR;
705        RegexMatcher        m(".?", 0, status);
706        REGEX_CHECK_STATUS;
707        UnicodeString s("    ");
708        m.reset(s);
709        for (i=0; ; i++) {
710            if (m.find() == FALSE) {
711                break;
712            }
713            REGEX_ASSERT(m.start(status) == i);
714            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
715        }
716        REGEX_ASSERT(i==5);
717    }
718
719
720    //
721    // Matchers with no input string behave as if they had an empty input string.
722    //
723
724    {
725        UErrorCode status = U_ZERO_ERROR;
726        RegexMatcher  m(".?", 0, status);
727        REGEX_CHECK_STATUS;
728        REGEX_ASSERT(m.find());
729        REGEX_ASSERT(m.start(status) == 0);
730        REGEX_ASSERT(m.input() == "");
731    }
732    {
733        UErrorCode status = U_ZERO_ERROR;
734        RegexPattern  *p = RegexPattern::compile(".", 0, status);
735        RegexMatcher  *m = p->matcher(status);
736        REGEX_CHECK_STATUS;
737
738        REGEX_ASSERT(m->find() == FALSE);
739        REGEX_ASSERT(m->input() == "");
740        delete m;
741        delete p;
742    }
743
744    //
745    // Regions
746    //
747    {
748        UErrorCode status = U_ZERO_ERROR;
749        UnicodeString testString("This is test data");
750        RegexMatcher m(".*", testString,  0, status);
751        REGEX_CHECK_STATUS;
752        REGEX_ASSERT(m.regionStart() == 0);
753        REGEX_ASSERT(m.regionEnd() == testString.length());
754        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
755        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
756
757        m.region(2,4, status);
758        REGEX_CHECK_STATUS;
759        REGEX_ASSERT(m.matches(status));
760        REGEX_ASSERT(m.start(status)==2);
761        REGEX_ASSERT(m.end(status)==4);
762        REGEX_CHECK_STATUS;
763
764        m.reset();
765        REGEX_ASSERT(m.regionStart() == 0);
766        REGEX_ASSERT(m.regionEnd() == testString.length());
767
768        UnicodeString shorterString("short");
769        m.reset(shorterString);
770        REGEX_ASSERT(m.regionStart() == 0);
771        REGEX_ASSERT(m.regionEnd() == shorterString.length());
772
773        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
774        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
775        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
776        REGEX_ASSERT(&m == &m.reset());
777        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
778
779        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
780        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
781        REGEX_ASSERT(&m == &m.reset());
782        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
783
784        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
785        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
786        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
787        REGEX_ASSERT(&m == &m.reset());
788        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
789
790        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
791        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
792        REGEX_ASSERT(&m == &m.reset());
793        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
794
795    }
796
797    //
798    // hitEnd() and requireEnd()
799    //
800    {
801        UErrorCode status = U_ZERO_ERROR;
802        UnicodeString testString("aabb");
803        RegexMatcher m1(".*", testString,  0, status);
804        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
805        REGEX_ASSERT(m1.hitEnd() == TRUE);
806        REGEX_ASSERT(m1.requireEnd() == FALSE);
807        REGEX_CHECK_STATUS;
808
809        status = U_ZERO_ERROR;
810        RegexMatcher m2("a*", testString, 0, status);
811        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
812        REGEX_ASSERT(m2.hitEnd() == FALSE);
813        REGEX_ASSERT(m2.requireEnd() == FALSE);
814        REGEX_CHECK_STATUS;
815
816        status = U_ZERO_ERROR;
817        RegexMatcher m3(".*$", testString, 0, status);
818        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
819        REGEX_ASSERT(m3.hitEnd() == TRUE);
820        REGEX_ASSERT(m3.requireEnd() == TRUE);
821        REGEX_CHECK_STATUS;
822    }
823
824
825    //
826    // Compilation error on reset with UChar *
827    //   These were a hazard that people were stumbling over with runtime errors.
828    //   Changed them to compiler errors by adding private methods that more closely
829    //   matched the incorrect use of the functions.
830    //
831#if 0
832    {
833        UErrorCode status = U_ZERO_ERROR;
834        UChar ucharString[20];
835        RegexMatcher m(".", 0, status);
836        m.reset(ucharString);  // should not compile.
837
838        RegexPattern *p = RegexPattern::compile(".", 0, status);
839        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
840
841        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
842    }
843#endif
844
845    //
846    //  Time Outs.
847    //       Note:  These tests will need to be changed when the regexp engine is
848    //              able to detect and cut short the exponential time behavior on
849    //              this type of match.
850    //
851    {
852        UErrorCode status = U_ZERO_ERROR;
853        //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
855        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
856        RegexMatcher matcher("(a+)+b", testString, 0, status);
857        REGEX_CHECK_STATUS;
858        REGEX_ASSERT(matcher.getTimeLimit() == 0);
859        matcher.setTimeLimit(100, status);
860        REGEX_ASSERT(matcher.getTimeLimit() == 100);
861        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
862        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
863    }
864    {
865        UErrorCode status = U_ZERO_ERROR;
866        //   Few enough 'a's to slip in under the time limit.
867        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
868        RegexMatcher matcher("(a+)+b", testString, 0, status);
869        REGEX_CHECK_STATUS;
870        matcher.setTimeLimit(100, status);
871        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
872        REGEX_CHECK_STATUS;
873    }
874
875    //
876    //  Stack Limits
877    //
878    {
879        UErrorCode status = U_ZERO_ERROR;
880        UnicodeString testString(600000, 0x41, 600000);  // Length 600,000, filled with 'A'
881
882        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
883        //   of the '+', and makes the stack frames larger.
884        RegexMatcher matcher("(A)+A$", testString, 0, status);
885
886        // With the default stack, this match should fail to run
887        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
888        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
889
890        // With unlimited stack, it should run
891        status = U_ZERO_ERROR;
892        matcher.setStackLimit(0, status);
893        REGEX_CHECK_STATUS;
894        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
895        REGEX_CHECK_STATUS;
896        REGEX_ASSERT(matcher.getStackLimit() == 0);
897
        // With a limited stack, the match should fail
899        status = U_ZERO_ERROR;
900        matcher.setStackLimit(10000, status);
901        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
902        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
903        REGEX_ASSERT(matcher.getStackLimit() == 10000);
904    }
905
906        // A pattern that doesn't save state should work with
907        //   a minimal sized stack
908    {
909        UErrorCode status = U_ZERO_ERROR;
910        UnicodeString testString = "abc";
911        RegexMatcher matcher("abc", testString, 0, status);
912        REGEX_CHECK_STATUS;
913        matcher.setStackLimit(30, status);
914        REGEX_CHECK_STATUS;
915        REGEX_ASSERT(matcher.matches(status) == TRUE);
916        REGEX_CHECK_STATUS;
917        REGEX_ASSERT(matcher.getStackLimit() == 30);
918
919        // Negative stack sizes should fail
920        status = U_ZERO_ERROR;
921        matcher.setStackLimit(1000, status);
922        REGEX_CHECK_STATUS;
923        matcher.setStackLimit(-1, status);
924        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
925        REGEX_ASSERT(matcher.getStackLimit() == 1000);
926    }
927
928
929}
930
931
932
933
934
935
936//---------------------------------------------------------------------------
937//
938//      API_Replace        API test for class RegexMatcher, testing the
939//                         Replace family of functions.
940//
941//---------------------------------------------------------------------------
942void RegexTest::API_Replace() {
943    //
944    //  Replace
945    //
946    int32_t             flags=0;
947    UParseError         pe;
948    UErrorCode          status=U_ZERO_ERROR;
949
950    UnicodeString       re("abc");
951    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
952    REGEX_CHECK_STATUS;
953    UnicodeString data = ".abc..abc...abc..";
954    //                    012345678901234567
955    RegexMatcher *matcher = pat->matcher(data, status);
956
957    //
958    //  Plain vanilla matches.
959    //
960    UnicodeString  dest;
961    dest = matcher->replaceFirst("yz", status);
962    REGEX_CHECK_STATUS;
963    REGEX_ASSERT(dest == ".yz..abc...abc..");
964
965    dest = matcher->replaceAll("yz", status);
966    REGEX_CHECK_STATUS;
967    REGEX_ASSERT(dest == ".yz..yz...yz..");
968
969    //
970    //  Plain vanilla non-matches.
971    //
972    UnicodeString d2 = ".abx..abx...abx..";
973    matcher->reset(d2);
974    dest = matcher->replaceFirst("yz", status);
975    REGEX_CHECK_STATUS;
976    REGEX_ASSERT(dest == ".abx..abx...abx..");
977
978    dest = matcher->replaceAll("yz", status);
979    REGEX_CHECK_STATUS;
980    REGEX_ASSERT(dest == ".abx..abx...abx..");
981
982    //
983    // Empty source string
984    //
985    UnicodeString d3 = "";
986    matcher->reset(d3);
987    dest = matcher->replaceFirst("yz", status);
988    REGEX_CHECK_STATUS;
989    REGEX_ASSERT(dest == "");
990
991    dest = matcher->replaceAll("yz", status);
992    REGEX_CHECK_STATUS;
993    REGEX_ASSERT(dest == "");
994
995    //
996    // Empty substitution string
997    //
998    matcher->reset(data);              // ".abc..abc...abc.."
999    dest = matcher->replaceFirst("", status);
1000    REGEX_CHECK_STATUS;
1001    REGEX_ASSERT(dest == "...abc...abc..");
1002
1003    dest = matcher->replaceAll("", status);
1004    REGEX_CHECK_STATUS;
1005    REGEX_ASSERT(dest == "........");
1006
1007    //
1008    // match whole string
1009    //
1010    UnicodeString d4 = "abc";
1011    matcher->reset(d4);
1012    dest = matcher->replaceFirst("xyz", status);
1013    REGEX_CHECK_STATUS;
1014    REGEX_ASSERT(dest == "xyz");
1015
1016    dest = matcher->replaceAll("xyz", status);
1017    REGEX_CHECK_STATUS;
1018    REGEX_ASSERT(dest == "xyz");
1019
1020    //
1021    // Capture Group, simple case
1022    //
1023    UnicodeString       re2("a(..)");
1024    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1025    REGEX_CHECK_STATUS;
1026    UnicodeString d5 = "abcdefg";
1027    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1028    REGEX_CHECK_STATUS;
1029    dest = matcher2->replaceFirst("$1$1", status);
1030    REGEX_CHECK_STATUS;
1031    REGEX_ASSERT(dest == "bcbcdefg");
1032
1033    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1034    REGEX_CHECK_STATUS;
1035    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1036
1037    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1038    REGEX_CHECK_STATUS;
1039    REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1040
1041    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1042    replacement = replacement.unescape();
1043    dest = matcher2->replaceFirst(replacement, status);
1044    REGEX_CHECK_STATUS;
1045    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1046
1047    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1048
1049
1050    //
1051    // Replacement String with \u hex escapes
1052    //
1053    {
1054        UnicodeString  src = "abc 1 abc 2 abc 3";
1055        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1056        matcher->reset(src);
1057        UnicodeString  result = matcher->replaceAll(substitute, status);
1058        REGEX_CHECK_STATUS;
1059        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1060    }
1061    {
1062        UnicodeString  src = "abc !";
1063        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1064        matcher->reset(src);
1065        UnicodeString  result = matcher->replaceAll(substitute, status);
1066        REGEX_CHECK_STATUS;
1067        UnicodeString expected = UnicodeString("--");
1068        expected.append((UChar32)0x10000);
1069        expected.append("-- !");
1070        REGEX_ASSERT(result == expected);
1071    }
1072    // TODO:  need more through testing of capture substitutions.
1073
1074    // Bug 4057
1075    //
1076    {
1077        status = U_ZERO_ERROR;
1078        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1079        RegexMatcher m("ss(.*?)ee", 0, status);
1080        REGEX_CHECK_STATUS;
1081        UnicodeString result;
1082
1083        // Multiple finds do NOT bump up the previous appendReplacement postion.
1084        m.reset(s);
1085        m.find();
1086        m.find();
1087        m.appendReplacement(result, "ooh", status);
1088        REGEX_CHECK_STATUS;
1089        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1090
1091        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1092        status = U_ZERO_ERROR;
1093        result.truncate(0);
1094        m.reset(10, status);
1095        m.find();
1096        m.find();
1097        m.appendReplacement(result, "ooh", status);
1098        REGEX_CHECK_STATUS;
1099        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1100
1101        // find() at interior of string, appendReplacemnt still starts at beginning.
1102        status = U_ZERO_ERROR;
1103        result.truncate(0);
1104        m.reset();
1105        m.find(10, status);
1106        m.find();
1107        m.appendReplacement(result, "ooh", status);
1108        REGEX_CHECK_STATUS;
1109        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1110
1111        m.appendTail(result);
1112        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1113
1114    }
1115
1116    delete matcher2;
1117    delete pat2;
1118    delete matcher;
1119    delete pat;
1120}
1121
1122
1123//---------------------------------------------------------------------------
1124//
1125//      API_Pattern       Test that the API for class RegexPattern is
1126//                        present and nominally working.
1127//
1128//---------------------------------------------------------------------------
1129void RegexTest::API_Pattern() {
1130    RegexPattern        pata;    // Test default constructor to not crash.
1131    RegexPattern        patb;
1132
1133    REGEX_ASSERT(pata == patb);
1134    REGEX_ASSERT(pata == pata);
1135
1136    UnicodeString re1("abc[a-l][m-z]");
1137    UnicodeString re2("def");
1138    UErrorCode    status = U_ZERO_ERROR;
1139    UParseError   pe;
1140
1141    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1142    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1143    REGEX_CHECK_STATUS;
1144    REGEX_ASSERT(*pat1 == *pat1);
1145    REGEX_ASSERT(*pat1 != pata);
1146
1147    // Assign
1148    patb = *pat1;
1149    REGEX_ASSERT(patb == *pat1);
1150
1151    // Copy Construct
1152    RegexPattern patc(*pat1);
1153    REGEX_ASSERT(patc == *pat1);
1154    REGEX_ASSERT(patb == patc);
1155    REGEX_ASSERT(pat1 != pat2);
1156    patb = *pat2;
1157    REGEX_ASSERT(patb != patc);
1158    REGEX_ASSERT(patb == *pat2);
1159
1160    // Compile with no flags.
1161    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1162    REGEX_ASSERT(*pat1a == *pat1);
1163
1164    REGEX_ASSERT(pat1a->flags() == 0);
1165
1166    // Compile with different flags should be not equal
1167    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1168    REGEX_CHECK_STATUS;
1169
1170    REGEX_ASSERT(*pat1b != *pat1a);
1171    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1172    REGEX_ASSERT(pat1a->flags() == 0);
1173    delete pat1b;
1174
1175    // clone
1176    RegexPattern *pat1c = pat1->clone();
1177    REGEX_ASSERT(*pat1c == *pat1);
1178    REGEX_ASSERT(*pat1c != *pat2);
1179
1180    delete pat1c;
1181    delete pat1a;
1182    delete pat1;
1183    delete pat2;
1184
1185
1186    //
1187    //   Verify that a matcher created from a cloned pattern works.
1188    //     (Jitterbug 3423)
1189    //
1190    {
1191        UErrorCode     status     = U_ZERO_ERROR;
1192        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1193        RegexPattern  *pClone     = pSource->clone();
1194        delete         pSource;
1195        RegexMatcher  *mFromClone = pClone->matcher(status);
1196        REGEX_CHECK_STATUS;
1197        UnicodeString s = "Hello World";
1198        mFromClone->reset(s);
1199        REGEX_ASSERT(mFromClone->find() == TRUE);
1200        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1201        REGEX_ASSERT(mFromClone->find() == TRUE);
1202        REGEX_ASSERT(mFromClone->group(status) == "World");
1203        REGEX_ASSERT(mFromClone->find() == FALSE);
1204        delete mFromClone;
1205        delete pClone;
1206    }
1207
1208    //
1209    //   matches convenience API
1210    //
1211    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1212    REGEX_CHECK_STATUS;
1213    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1214    REGEX_CHECK_STATUS;
1215    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1216    REGEX_CHECK_STATUS;
1217    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1218    REGEX_CHECK_STATUS;
1219    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1220    REGEX_CHECK_STATUS;
1221    status = U_INDEX_OUTOFBOUNDS_ERROR;
1222    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1223    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1224
1225
1226    //
1227    // Split()
1228    //
1229    status = U_ZERO_ERROR;
1230    pat1 = RegexPattern::compile(" +",  pe, status);
1231    REGEX_CHECK_STATUS;
1232    UnicodeString  fields[10];
1233
1234    int32_t n;
1235    n = pat1->split("Now is the time", fields, 10, status);
1236    REGEX_CHECK_STATUS;
1237    REGEX_ASSERT(n==4);
1238    REGEX_ASSERT(fields[0]=="Now");
1239    REGEX_ASSERT(fields[1]=="is");
1240    REGEX_ASSERT(fields[2]=="the");
1241    REGEX_ASSERT(fields[3]=="time");
1242    REGEX_ASSERT(fields[4]=="");
1243
1244    n = pat1->split("Now is the time", fields, 2, status);
1245    REGEX_CHECK_STATUS;
1246    REGEX_ASSERT(n==2);
1247    REGEX_ASSERT(fields[0]=="Now");
1248    REGEX_ASSERT(fields[1]=="is the time");
1249    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1250
1251    fields[1] = "*";
1252    status = U_ZERO_ERROR;
1253    n = pat1->split("Now is the time", fields, 1, status);
1254    REGEX_CHECK_STATUS;
1255    REGEX_ASSERT(n==1);
1256    REGEX_ASSERT(fields[0]=="Now is the time");
1257    REGEX_ASSERT(fields[1]=="*");
1258    status = U_ZERO_ERROR;
1259
1260    n = pat1->split("    Now       is the time   ", fields, 10, status);
1261    REGEX_CHECK_STATUS;
1262    REGEX_ASSERT(n==5);
1263    REGEX_ASSERT(fields[0]=="");
1264    REGEX_ASSERT(fields[1]=="Now");
1265    REGEX_ASSERT(fields[2]=="is");
1266    REGEX_ASSERT(fields[3]=="the");
1267    REGEX_ASSERT(fields[4]=="time");
1268    REGEX_ASSERT(fields[5]=="");
1269
1270    n = pat1->split("     ", fields, 10, status);
1271    REGEX_CHECK_STATUS;
1272    REGEX_ASSERT(n==1);
1273    REGEX_ASSERT(fields[0]=="");
1274
1275    fields[0] = "foo";
1276    n = pat1->split("", fields, 10, status);
1277    REGEX_CHECK_STATUS;
1278    REGEX_ASSERT(n==0);
1279    REGEX_ASSERT(fields[0]=="foo");
1280
1281    delete pat1;
1282
1283    //  split, with a pattern with (capture)
1284    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1285    REGEX_CHECK_STATUS;
1286
1287    status = U_ZERO_ERROR;
1288    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1289    REGEX_CHECK_STATUS;
1290    REGEX_ASSERT(n==6);
1291    REGEX_ASSERT(fields[0]=="");
1292    REGEX_ASSERT(fields[1]=="a");
1293    REGEX_ASSERT(fields[2]=="Now is ");
1294    REGEX_ASSERT(fields[3]=="b");
1295    REGEX_ASSERT(fields[4]=="the time");
1296    REGEX_ASSERT(fields[5]=="c");
1297    REGEX_ASSERT(fields[6]=="");
1298    REGEX_ASSERT(status==U_ZERO_ERROR);
1299
1300    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1301    REGEX_CHECK_STATUS;
1302    REGEX_ASSERT(n==6);
1303    REGEX_ASSERT(fields[0]=="  ");
1304    REGEX_ASSERT(fields[1]=="a");
1305    REGEX_ASSERT(fields[2]=="Now is ");
1306    REGEX_ASSERT(fields[3]=="b");
1307    REGEX_ASSERT(fields[4]=="the time");
1308    REGEX_ASSERT(fields[5]=="c");
1309    REGEX_ASSERT(fields[6]=="");
1310
1311    status = U_ZERO_ERROR;
1312    fields[6] = "foo";
1313    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1314    REGEX_CHECK_STATUS;
1315    REGEX_ASSERT(n==6);
1316    REGEX_ASSERT(fields[0]=="  ");
1317    REGEX_ASSERT(fields[1]=="a");
1318    REGEX_ASSERT(fields[2]=="Now is ");
1319    REGEX_ASSERT(fields[3]=="b");
1320    REGEX_ASSERT(fields[4]=="the time");
1321    REGEX_ASSERT(fields[5]=="c");
1322    REGEX_ASSERT(fields[6]=="foo");
1323
1324    status = U_ZERO_ERROR;
1325    fields[5] = "foo";
1326    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1327    REGEX_CHECK_STATUS;
1328    REGEX_ASSERT(n==5);
1329    REGEX_ASSERT(fields[0]=="  ");
1330    REGEX_ASSERT(fields[1]=="a");
1331    REGEX_ASSERT(fields[2]=="Now is ");
1332    REGEX_ASSERT(fields[3]=="b");
1333    REGEX_ASSERT(fields[4]=="the time<c>");
1334    REGEX_ASSERT(fields[5]=="foo");
1335
1336    status = U_ZERO_ERROR;
1337    fields[5] = "foo";
1338    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1339    REGEX_CHECK_STATUS;
1340    REGEX_ASSERT(n==5);
1341    REGEX_ASSERT(fields[0]=="  ");
1342    REGEX_ASSERT(fields[1]=="a");
1343    REGEX_ASSERT(fields[2]=="Now is ");
1344    REGEX_ASSERT(fields[3]=="b");
1345    REGEX_ASSERT(fields[4]=="the time");
1346    REGEX_ASSERT(fields[5]=="foo");
1347
1348    status = U_ZERO_ERROR;
1349    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1350    REGEX_CHECK_STATUS;
1351    REGEX_ASSERT(n==4);
1352    REGEX_ASSERT(fields[0]=="  ");
1353    REGEX_ASSERT(fields[1]=="a");
1354    REGEX_ASSERT(fields[2]=="Now is ");
1355    REGEX_ASSERT(fields[3]=="the time<c>");
1356    status = U_ZERO_ERROR;
1357    delete pat1;
1358
1359    pat1 = RegexPattern::compile("([-,])",  pe, status);
1360    REGEX_CHECK_STATUS;
1361    n = pat1->split("1-10,20", fields, 10, status);
1362    REGEX_CHECK_STATUS;
1363    REGEX_ASSERT(n==5);
1364    REGEX_ASSERT(fields[0]=="1");
1365    REGEX_ASSERT(fields[1]=="-");
1366    REGEX_ASSERT(fields[2]=="10");
1367    REGEX_ASSERT(fields[3]==",");
1368    REGEX_ASSERT(fields[4]=="20");
1369    delete pat1;
1370
1371
1372    //
1373    // RegexPattern::pattern()
1374    //
1375    pat1 = new RegexPattern();
1376    REGEX_ASSERT(pat1->pattern() == "");
1377    delete pat1;
1378
1379    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1380    REGEX_CHECK_STATUS;
1381    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1382    delete pat1;
1383
1384
1385    //
1386    // classID functions
1387    //
1388    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1389    REGEX_CHECK_STATUS;
1390    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1391    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1392    UnicodeString Hello("Hello, world.");
1393    RegexMatcher *m = pat1->matcher(Hello, status);
1394    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1395    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1396    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1397    delete m;
1398    delete pat1;
1399
1400}
1401
1402//---------------------------------------------------------------------------
1403//
1404//      Extended       A more thorough check for features of regex patterns
1405//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
1407//                     A description of the test data format is included in that file.
1408//
1409//---------------------------------------------------------------------------
1410
1411const char *
1412RegexTest::getPath(char buffer[2048], const char *filename) {
1413    UErrorCode status=U_ZERO_ERROR;
1414    const char *testDataDirectory = IntlTest::getSourceTestData(status);
1415    if (U_FAILURE(status)) {
1416        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
1417        return NULL;
1418    }
1419
1420    strcpy(buffer, testDataDirectory);
1421    strcat(buffer, filename);
1422    return buffer;
1423}
1424
1425void RegexTest::Extended() {
1426    char tdd[2048];
1427    const char *srcPath;
1428    UErrorCode  status  = U_ZERO_ERROR;
1429    int32_t     lineNum = 0;
1430
1431    //
1432    //  Open and read the test data file.
1433    //
1434    srcPath=getPath(tdd, "regextst.txt");
1435    if(srcPath==NULL) {
1436        return; /* something went wrong, error already output */
1437    }
1438
1439    int32_t    len;
1440    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
1441    if (U_FAILURE(status)) {
1442        return; /* something went wrong, error already output */
1443    }
1444
1445    //
1446    //  Put the test data into a UnicodeString
1447    //
1448    UnicodeString testString(FALSE, testData, len);
1449
1450    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
1451    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
1452    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
1453
1454    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
1455    UnicodeString   testPattern;   // The pattern for test from the test file.
1456    UnicodeString   testFlags;     // the flags   for a test.
1457    UnicodeString   matchString;   // The marked up string to be used as input
1458
1459    if (U_FAILURE(status)){
1460        dataerrln("Construct RegexMatcher() error.");
1461        delete [] testData;
1462        return;
1463    }
1464
1465    //
1466    //  Loop over the test data file, once per line.
1467    //
1468    while (lineMat.find()) {
1469        lineNum++;
1470        if (U_FAILURE(status)) {
1471            errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
1472        }
1473
1474        status = U_ZERO_ERROR;
1475        UnicodeString testLine = lineMat.group(1, status);
1476        if (testLine.length() == 0) {
1477            continue;
1478        }
1479
1480        //
1481        // Parse the test line.  Skip blank and comment only lines.
1482        // Separate out the three main fields - pattern, flags, target.
1483        //
1484
1485        commentMat.reset(testLine);
1486        if (commentMat.lookingAt(status)) {
1487            // This line is a comment, or blank.
1488            continue;
1489        }
1490
1491        //
1492        //  Pull out the pattern field, remove it from the test file line.
1493        //
1494        quotedStuffMat.reset(testLine);
1495        if (quotedStuffMat.lookingAt(status)) {
1496            testPattern = quotedStuffMat.group(2, status);
1497            testLine.remove(0, quotedStuffMat.end(0, status));
1498        } else {
1499            errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
1500            continue;
1501        }
1502
1503
1504        //
1505        //  Pull out the flags from the test file line.
1506        //
1507        flagsMat.reset(testLine);
1508        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
1509        testFlags = flagsMat.group(1, status);
1510        if (flagsMat.group(2, status).length() > 0) {
1511            errln("Bad Match flag at line %d. Scanning %c\n",
1512                lineNum, flagsMat.group(2, status).charAt(0));
1513            continue;
1514        }
1515        testLine.remove(0, flagsMat.end(0, status));
1516
1517        //
1518        //  Pull out the match string, as a whole.
1519        //    We'll process the <tags> later.
1520        //
1521        quotedStuffMat.reset(testLine);
1522        if (quotedStuffMat.lookingAt(status)) {
1523            matchString = quotedStuffMat.group(2, status);
1524            testLine.remove(0, quotedStuffMat.end(0, status));
1525        } else {
1526            errln("Bad match string at test file line %d", lineNum);
1527            continue;
1528        }
1529
1530        //
1531        //  The only thing left from the input line should be an optional trailing comment.
1532        //
1533        commentMat.reset(testLine);
1534        if (commentMat.lookingAt(status) == FALSE) {
1535            errln("Line %d: unexpected characters at end of test line.", lineNum);
1536            continue;
1537        }
1538
1539        //
1540        //  Run the test
1541        //
1542        regex_find(testPattern, testFlags, matchString, lineNum);
1543    }
1544
1545    delete [] testData;
1546
1547}
1548
1549
1550
1551//---------------------------------------------------------------------------
1552//
1553//    regex_find(pattern, flags, inputString, lineNumber)
1554//
1555//         Function to run a single test from the Extended (data driven) tests.
1556//         See file test/testdata/regextst.txt for a description of the
1557//         pattern and inputString fields, and the allowed flags.
1558//         lineNumber is the source line in regextst.txt of the test.
1559//
1560//---------------------------------------------------------------------------
1561
1562
1563//  Set a value into a UVector at position specified by a decimal number in
1564//   a UnicodeString.   This is a utility function needed by the actual test function,
1565//   which follows.
1566static void set(UVector &vec, int32_t val, UnicodeString index) {
1567    UErrorCode  status=U_ZERO_ERROR;
1568    int32_t  idx = 0;
1569    for (int32_t i=0; i<index.length(); i++) {
1570        int32_t d=u_charDigitValue(index.charAt(i));
1571        if (d<0) {return;}
1572        idx = idx*10 + d;
1573    }
1574    while (vec.size()<idx+1) {vec.addElement(-1, status);}
1575    vec.setElementAt(val, idx);
1576}
1577
1578void RegexTest::regex_find(const UnicodeString &pattern,
1579                           const UnicodeString &flags,
1580                           const UnicodeString &inputString,
1581                           int32_t line) {
1582    UnicodeString       unEscapedInput;
1583    UnicodeString       deTaggedInput;
1584
1585    UErrorCode          status         = U_ZERO_ERROR;
1586    UParseError         pe;
1587    RegexPattern        *parsePat      = NULL;
1588    RegexMatcher        *parseMatcher  = NULL;
1589    RegexPattern        *callerPattern = NULL;
1590    RegexMatcher        *matcher       = NULL;
1591    UVector             groupStarts(status);
1592    UVector             groupEnds(status);
1593    UBool               isMatch        = FALSE;
1594    UBool               failed         = FALSE;
1595    int32_t             numFinds;
1596    int32_t             i;
1597    UBool               useMatchesFunc   = FALSE;
1598    UBool               useLookingAtFunc = FALSE;
1599    int32_t             regionStart      = -1;
1600    int32_t             regionEnd        = -1;
1601
1602    //
1603    //  Compile the caller's pattern
1604    //
1605    uint32_t bflags = 0;
1606    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
1607        bflags |= UREGEX_CASE_INSENSITIVE;
1608    }
1609    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
1610        bflags |= UREGEX_COMMENTS;
1611    }
1612    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
1613        bflags |= UREGEX_DOTALL;
1614    }
1615    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
1616        bflags |= UREGEX_MULTILINE;
1617    }
1618
1619    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
1620        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
1621    }
1622    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
1623        bflags |= UREGEX_UNIX_LINES;
1624    }
1625
1626
1627    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
1628    if (status != U_ZERO_ERROR) {
1629        #if UCONFIG_NO_BREAK_ITERATION==1
1630        // 'v' test flag means that the test pattern should not compile if ICU was configured
1631        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
1632        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
1633            goto cleanupAndReturn;
1634        }
1635        #endif
1636        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
1637            // Expected pattern compilation error.
1638            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
1639                logln("Pattern Compile returns \"%s\"", u_errorName(status));
1640            }
1641            goto cleanupAndReturn;
1642        } else {
1643            // Unexpected pattern compilation error.
1644            errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
1645            goto cleanupAndReturn;
1646        }
1647    }
1648
1649    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
1650        RegexPatternDump(callerPattern);
1651    }
1652
1653    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
1654        errln("Expected, but did not get, a pattern compilation error.");
1655        goto cleanupAndReturn;
1656    }
1657
1658
1659    //
1660    // Number of times find() should be called on the test string, default to 1
1661    //
1662    numFinds = 1;
1663    for (i=2; i<=9; i++) {
1664        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
1665            if (numFinds != 1) {
1666                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
1667                goto cleanupAndReturn;
1668            }
1669            numFinds = i;
1670        }
1671    }
1672
1673    // 'M' flag.  Use matches() instead of find()
1674    if (flags.indexOf((UChar)0x4d) >= 0) {
1675        useMatchesFunc = TRUE;
1676    }
1677    if (flags.indexOf((UChar)0x4c) >= 0) {
1678        useLookingAtFunc = TRUE;
1679    }
1680
1681    //
1682    //  Find the tags in the input data, remove them, and record the group boundary
1683    //    positions.
1684    //
1685    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
1686    REGEX_CHECK_STATUS_L(line);
1687
1688    unEscapedInput = inputString.unescape();
1689    parseMatcher = parsePat->matcher(unEscapedInput, status);
1690    REGEX_CHECK_STATUS_L(line);
1691    while(parseMatcher->find()) {
1692        parseMatcher->appendReplacement(deTaggedInput, "", status);
1693        REGEX_CHECK_STATUS;
1694        UnicodeString groupNum = parseMatcher->group(2, status);
1695        if (groupNum == "r") {
1696            // <r> or </r>, a region specification within the string
1697            if (parseMatcher->group(1, status) == "/") {
1698                regionEnd = deTaggedInput.length();
1699            } else {
1700                regionStart = deTaggedInput.length();
1701            }
1702        } else {
1703            // <digits> or </digits>, a group match boundary tag.
1704            if (parseMatcher->group(1, status) == "/") {
1705                set(groupEnds, deTaggedInput.length(), groupNum);
1706            } else {
1707                set(groupStarts, deTaggedInput.length(), groupNum);
1708            }
1709        }
1710    }
1711    parseMatcher->appendTail(deTaggedInput);
1712    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
1713    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
1714      errln("mismatched <r> tags");
1715      failed = TRUE;
1716      goto cleanupAndReturn;
1717    }
1718
1719
1720    //
1721    //  Configure the matcher according to the flags specified with this test.
1722    //
1723    matcher = callerPattern->matcher(deTaggedInput, status);
1724    REGEX_CHECK_STATUS_L(line);
1725    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
1726        matcher->setTrace(TRUE);
1727    }
1728    if (regionStart>=0) {
1729       matcher->region(regionStart, regionEnd, status);
1730       REGEX_CHECK_STATUS_L(line);
1731    }
1732    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
1733        matcher->useAnchoringBounds(FALSE);
1734    }
1735    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
1736        matcher->useTransparentBounds(TRUE);
1737    }
1738
1739
1740
1741    //
1742    // Do a find on the de-tagged input using the caller's pattern
1743    //     TODO: error on count>1 and not find().
1744    //           error on both matches() and lookingAt().
1745    //
1746    for (i=0; i<numFinds; i++) {
1747        if (useMatchesFunc) {
1748            isMatch = matcher->matches(status);
1749        } else  if (useLookingAtFunc) {
1750            isMatch = matcher->lookingAt(status);
1751        } else {
1752            isMatch = matcher->find();
1753        }
1754    }
1755    matcher->setTrace(FALSE);
1756
1757    //
1758    // Match up the groups from the find() with the groups from the tags
1759    //
1760
1761    // number of tags should match number of groups from find operation.
1762    // matcher->groupCount does not include group 0, the entire match, hence the +1.
1763    //   G option in test means that capture group data is not available in the
1764    //     expected results, so the check needs to be suppressed.
1765    if (isMatch == FALSE && groupStarts.size() != 0) {
1766        errln("Error at line %d:  Match expected, but none found.\n", line);
1767        failed = TRUE;
1768        goto cleanupAndReturn;
1769    }
1770
1771    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
1772        // Only check for match / no match.  Don't check capture groups.
1773        if (isMatch && groupStarts.size() == 0) {
1774            errln("Error at line %d:  No match expected, but one found.\n", line);
1775            failed = TRUE;
1776        }
1777        goto cleanupAndReturn;
1778    }
1779
1780    for (i=0; i<=matcher->groupCount(); i++) {
1781        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
1782        if (matcher->start(i, status) != expectedStart) {
1783            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
1784                line, i, expectedStart, matcher->start(i, status));
1785            failed = TRUE;
1786            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
1787        }
1788        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
1789        if (matcher->end(i, status) != expectedEnd) {
1790            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
1791                line, i, expectedEnd, matcher->end(i, status));
1792            failed = TRUE;
1793            // Error on end position;  keep going; real error is probably yet to come as group
1794            //   end positions work from end of the input data towards the front.
1795        }
1796    }
1797    if ( matcher->groupCount()+1 < groupStarts.size()) {
1798        errln("Error at line %d: Expected %d capture groups, found %d.",
1799            line, groupStarts.size()-1, matcher->groupCount());
1800        failed = TRUE;
1801        }
1802
1803    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
1804        matcher->requireEnd() == TRUE) {
1805        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
1806        failed = TRUE;
1807    }
1808    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
1809        matcher->requireEnd() == FALSE) {
1810        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
1811        failed = TRUE;
1812    }
1813    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
1814        matcher->hitEnd() == TRUE) {
1815        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
1816        failed = TRUE;
1817    }
1818    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
1819        matcher->hitEnd() == FALSE) {
1820        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
1821        failed = TRUE;
1822    }
1823
1824
1825cleanupAndReturn:
1826    if (failed) {
1827        errln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
1828            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
1829        // callerPattern->dump();
1830    }
1831    delete parseMatcher;
1832    delete parsePat;
1833    delete matcher;
1834    delete callerPattern;
1835}
1836
1837
1838
1839
1840//---------------------------------------------------------------------------
1841//
1842//      Errors     Check for error handling in patterns.
1843//
1844//---------------------------------------------------------------------------
void RegexTest::Errors() {
    // Each REGEX_ERR line below feeds a deliberately malformed pattern to the
    // regex compiler.  The macro is defined elsewhere in this file; judging by
    // the call sites its arguments are (pattern, expected error line,
    // expected error column, expected UErrorCode) -- e.g. "abc\ndef(*2)"
    // expects line 2, column 5.  NOTE(review): confirm against the macro.

    // \escape sequences that aren't implemented yet.
    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);

    // Extra close paren
    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

    // Look-ahead, Look-behind
    //  TODO:  add tests for unbounded length look-behinds.
    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct

    // Attempt to use non-default flags
    //   This combination of flags (it includes UREGEX_CANON_EQ) is asserted to be
    //   rejected by compile() with U_REGEX_UNIMPLEMENTED.
    {
        UParseError   pe;
        UErrorCode    status = U_ZERO_ERROR;
        int32_t       flags  = UREGEX_CANON_EQ |
                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
                               UREGEX_MULTILINE;
        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
        delete pat1;   // deleting is harmless whether or not a pattern object was returned
    }


    // Quantifiers are allowed only after something that can be quantified.
    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);

    // Mal-formed {min,max} quantifiers
    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
    REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

    // Ticket 5389
    //   A quantifier as the very first character of a pattern is a syntax error.
    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

    // Invalid Back Reference \0
    //    For ICU 3.8 and earlier
    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
    //    The pattern below has no octal digits after the \0, and is expected
    //    to be rejected as a bad escape sequence.
    //
    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);

}
1902
1903
1904//-------------------------------------------------------------------------------
1905//
1906//  Read a text data file, convert it to UChars, and return the data
//    in one big UChar * buffer, which the caller must free with delete [].
1908//
1909//--------------------------------------------------------------------------------
1910UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
1911                                     const char *defEncoding, UErrorCode &status) {
1912    UChar       *retPtr  = NULL;
1913    char        *fileBuf = NULL;
1914    UConverter* conv     = NULL;
1915    FILE        *f       = NULL;
1916
1917    ulen = 0;
1918    if (U_FAILURE(status)) {
1919        return retPtr;
1920    }
1921
1922    //
1923    //  Open the file.
1924    //
1925    f = fopen(fileName, "rb");
1926    if (f == 0) {
1927        dataerrln("Error opening test data file %s\n", fileName);
1928        status = U_FILE_ACCESS_ERROR;
1929        return NULL;
1930    }
1931    //
1932    //  Read it in
1933    //
1934    int32_t            fileSize;
1935    int32_t            amt_read;
1936
1937    fseek( f, 0, SEEK_END);
1938    fileSize = ftell(f);
1939    fileBuf = new char[fileSize];
1940    fseek(f, 0, SEEK_SET);
1941    amt_read = fread(fileBuf, 1, fileSize, f);
1942    if (amt_read != fileSize || fileSize <= 0) {
1943        errln("Error reading test data file.");
1944        goto cleanUpAndReturn;
1945    }
1946
1947    //
1948    // Look for a Unicode Signature (BOM) on the data just read
1949    //
1950    int32_t        signatureLength;
1951    const char *   fileBufC;
1952    const char*    encoding;
1953
1954    fileBufC = fileBuf;
1955    encoding = ucnv_detectUnicodeSignature(
1956        fileBuf, fileSize, &signatureLength, &status);
1957    if(encoding!=NULL ){
1958        fileBufC  += signatureLength;
1959        fileSize  -= signatureLength;
1960    } else {
1961        encoding = defEncoding;
1962        if (strcmp(encoding, "utf-8") == 0) {
1963            errln("file %s is missing its BOM", fileName);
1964        }
1965    }
1966
1967    //
1968    // Open a converter to take the rule file to UTF-16
1969    //
1970    conv = ucnv_open(encoding, &status);
1971    if (U_FAILURE(status)) {
1972        goto cleanUpAndReturn;
1973    }
1974
1975    //
1976    // Convert the rules to UChar.
1977    //  Preflight first to determine required buffer size.
1978    //
1979    ulen = ucnv_toUChars(conv,
1980        NULL,           //  dest,
1981        0,              //  destCapacity,
1982        fileBufC,
1983        fileSize,
1984        &status);
1985    if (status == U_BUFFER_OVERFLOW_ERROR) {
1986        // Buffer Overflow is expected from the preflight operation.
1987        status = U_ZERO_ERROR;
1988
1989        retPtr = new UChar[ulen+1];
1990        ucnv_toUChars(conv,
1991            retPtr,       //  dest,
1992            ulen+1,
1993            fileBufC,
1994            fileSize,
1995            &status);
1996    }
1997
1998cleanUpAndReturn:
1999    fclose(f);
2000    delete[] fileBuf;
2001    ucnv_close(conv);
2002    if (U_FAILURE(status)) {
2003        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2004        delete retPtr;
2005        retPtr = 0;
2006        ulen   = 0;
2007    };
2008    return retPtr;
2009}
2010
2011
2012//-------------------------------------------------------------------------------
2013//
2014//   PerlTests  - Run Perl's regular expression tests
2015//                The input file for this test is re_tests, the standard regular
2016//                expression test data distributed with the Perl source code.
2017//
2018//                Here is Perl's description of the test data file:
2019//
2020//        # The tests are in a separate file 't/op/re_tests'.
2021//        # Each line in that file is a separate test.
2022//        # There are five columns, separated by tabs.
2023//        #
2024//        # Column 1 contains the pattern, optionally enclosed in C<''>.
2025//        # Modifiers can be put after the closing C<'>.
2026//        #
2027//        # Column 2 contains the string to be matched.
2028//        #
2029//        # Column 3 contains the expected result:
2030//        #     y   expect a match
2031//        #     n   expect no match
2032//        #     c   expect an error
2033//        # B   test exposes a known bug in Perl, should be skipped
2034//        # b   test exposes a known bug in Perl, should be skipped if noamp
2035//        #
2036//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
2037//        #
2038//        # Column 4 contains a string, usually C<$&>.
2039//        #
2040//        # Column 5 contains the expected result of double-quote
2041//        # interpolating that string after the match, or start of error message.
2042//        #
2043//        # Column 6, if present, contains a reason why the test is skipped.
2044//        # This is printed with "skipped", for harness to pick up.
2045//        #
2046//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
2047//        #
2048//        # If you want to add a regular expression test that can't be expressed
2049//        # in this format, don't add it here: put it in op/pat.t instead.
2050//
2051//        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
2053//        (The i is in addition to whatever was there before.)
2054//
2055//-------------------------------------------------------------------------------
2056void RegexTest::PerlTests() {
2057    char tdd[2048];
2058    const char *srcPath;
2059    UErrorCode  status = U_ZERO_ERROR;
2060    UParseError pe;
2061
2062    //
2063    //  Open and read the test data file.
2064    //
2065    srcPath=getPath(tdd, "re_tests.txt");
2066    if(srcPath==NULL) {
2067        return; /* something went wrong, error already output */
2068    }
2069
2070    int32_t    len;
2071    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
2072    if (U_FAILURE(status)) {
2073        return; /* something went wrong, error already output */
2074    }
2075
2076    //
2077    //  Put the test data into a UnicodeString
2078    //
2079    UnicodeString testDataString(FALSE, testData, len);
2080
2081    //
2082    //  Regex to break the input file into lines, and strip the new lines.
2083    //     One line per match, capture group one is the desired data.
2084    //
2085    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
2086    if (U_FAILURE(status)) {
2087        dataerrln("RegexPattern::compile() error");
2088        return;
2089    }
2090    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
2091
2092    //
2093    //  Regex to split a test file line into fields.
2094    //    There are six fields, separated by tabs.
2095    //
2096    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
2097
2098    //
2099    //  Regex to identify test patterns with flag settings, and to separate them.
2100    //    Test patterns with flags look like 'pattern'i
2101    //    Test patterns without flags are not quoted:   pattern
2102    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
2103    //
2104    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
2105    RegexMatcher* flagMat = flagPat->matcher(status);
2106
2107    //
2108    // The Perl tests reference several perl-isms, which are evaluated/substituted
2109    //   in the test data.  Not being perl, this must be done explicitly.  Here
2110    //   are string constants and REs for these constructs.
2111    //
2112    UnicodeString nulnulSrc("${nulnul}");
2113    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
2114    nulnul = nulnul.unescape();
2115
2116    UnicodeString ffffSrc("${ffff}");
2117    UnicodeString ffff("\\uffff", -1, US_INV);
2118    ffff = ffff.unescape();
2119
2120    //  regexp for $-[0], $+[2], etc.
2121    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
2122    RegexMatcher *groupsMat = groupsPat->matcher(status);
2123
2124    //  regexp for $0, $1, $2, etc.
2125    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
2126    RegexMatcher *cgMat = cgPat->matcher(status);
2127
2128
2129    //
2130    // Main Loop for the Perl Tests, runs once per line from the
2131    //   test data file.
2132    //
2133    int32_t  lineNum = 0;
2134    int32_t  skippedUnimplementedCount = 0;
2135    while (lineMat->find()) {
2136        lineNum++;
2137
2138        //
2139        //  Get a line, break it into its fields, do the Perl
2140        //    variable substitutions.
2141        //
2142        UnicodeString line = lineMat->group(1, status);
2143        UnicodeString fields[7];
2144        fieldPat->split(line, fields, 7, status);
2145
2146        flagMat->reset(fields[0]);
2147        flagMat->matches(status);
2148        UnicodeString pattern  = flagMat->group(2, status);
2149        pattern.findAndReplace("${bang}", "!");
2150        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
2151        pattern.findAndReplace(ffffSrc, ffff);
2152
2153        //
2154        //  Identify patterns that include match flag settings,
2155        //    split off the flags, remove the extra quotes.
2156        //
2157        UnicodeString flagStr = flagMat->group(3, status);
2158        if (U_FAILURE(status)) {
2159            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2160            return;
2161        }
2162        int32_t flags = 0;
2163        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
2164        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
2165        const UChar UChar_m = 0x6d;
2166        const UChar UChar_x = 0x78;
2167        const UChar UChar_y = 0x79;
2168        if (flagStr.indexOf(UChar_i) != -1) {
2169            flags |= UREGEX_CASE_INSENSITIVE;
2170        }
2171        if (flagStr.indexOf(UChar_m) != -1) {
2172            flags |= UREGEX_MULTILINE;
2173        }
2174        if (flagStr.indexOf(UChar_x) != -1) {
2175            flags |= UREGEX_COMMENTS;
2176        }
2177
2178        //
2179        // Compile the test pattern.
2180        //
2181        status = U_ZERO_ERROR;
2182        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
2183        if (status == U_REGEX_UNIMPLEMENTED) {
2184            //
2185            // Test of a feature that is planned for ICU, but not yet implemented.
2186            //   skip the test.
2187            skippedUnimplementedCount++;
2188            delete testPat;
2189            status = U_ZERO_ERROR;
2190            continue;
2191        }
2192
2193        if (U_FAILURE(status)) {
2194            // Some tests are supposed to generate errors.
2195            //   Only report an error for tests that are supposed to succeed.
2196            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
2197                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
2198            {
2199                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
2200            }
2201            status = U_ZERO_ERROR;
2202            delete testPat;
2203            continue;
2204        }
2205
2206        if (fields[2].indexOf(UChar_i) >= 0) {
2207            // ICU should skip this test.
2208            delete testPat;
2209            continue;
2210        }
2211
2212        if (fields[2].indexOf(UChar_c) >= 0) {
2213            // This pattern should have caused a compilation error, but didn't/
2214            errln("line %d: Expected a pattern compile error, got success.", lineNum);
2215            delete testPat;
2216            continue;
2217        }
2218
2219        //
2220        // replace the Perl variables that appear in some of the
2221        //   match data strings.
2222        //
2223        UnicodeString matchString = fields[1];
2224        matchString.findAndReplace(nulnulSrc, nulnul);
2225        matchString.findAndReplace(ffffSrc,   ffff);
2226
2227        // Replace any \n in the match string with an actual new-line char.
2228        //  Don't do full unescape, as this unescapes more than Perl does, which
2229        //  causes other spurious failures in the tests.
2230        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
2231
2232
2233
2234        //
2235        // Run the test, check for expected match/don't match result.
2236        //
2237        RegexMatcher *testMat = testPat->matcher(matchString, status);
2238        UBool found = testMat->find();
2239        UBool expected = FALSE;
2240        if (fields[2].indexOf(UChar_y) >=0) {
2241            expected = TRUE;
2242        }
2243        if (expected != found) {
2244            errln("line %d: Expected %smatch, got %smatch",
2245                lineNum, expected?"":"no ", found?"":"no " );
2246            continue;
2247        }
2248
2249        // Don't try to check expected results if there is no match.
2250        //   (Some have stuff in the expected fields)
2251        if (!found) {
2252            delete testMat;
2253            delete testPat;
2254            continue;
2255        }
2256
2257        //
2258        // Interpret the Perl expression from the fourth field of the data file,
2259        // building up an ICU string from the results of the ICU match.
2260        //   The Perl expression will contain references to the results of
2261        //     a regex match, including the matched string, capture group strings,
2262        //     group starting and ending indicies, etc.
2263        //
2264        UnicodeString resultString;
2265        UnicodeString perlExpr = fields[3];
2266        groupsMat->reset(perlExpr);
2267        cgMat->reset(perlExpr);
2268
2269        while (perlExpr.length() > 0) {
2270            if (perlExpr.startsWith("$&")) {
2271                resultString.append(testMat->group(status));
2272                perlExpr.remove(0, 2);
2273            }
2274
2275            else if (groupsMat->lookingAt(status)) {
2276                // $-[0]   $+[2]  etc.
2277                UnicodeString digitString = groupsMat->group(2, status);
2278                int32_t t = 0;
2279                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2280                UnicodeString plusOrMinus = groupsMat->group(1, status);
2281                int32_t matchPosition;
2282                if (plusOrMinus.compare("+") == 0) {
2283                    matchPosition = testMat->end(groupNum, status);
2284                } else {
2285                    matchPosition = testMat->start(groupNum, status);
2286                }
2287                if (matchPosition != -1) {
2288                    ICU_Utility::appendNumber(resultString, matchPosition);
2289                }
2290                perlExpr.remove(0, groupsMat->end(status));
2291            }
2292
2293            else if (cgMat->lookingAt(status)) {
2294                // $1, $2, $3, etc.
2295                UnicodeString digitString = cgMat->group(1, status);
2296                int32_t t = 0;
2297                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2298                if (U_SUCCESS(status)) {
2299                    resultString.append(testMat->group(groupNum, status));
2300                    status = U_ZERO_ERROR;
2301                }
2302                perlExpr.remove(0, cgMat->end(status));
2303            }
2304
2305            else if (perlExpr.startsWith("@-")) {
2306                int32_t i;
2307                for (i=0; i<=testMat->groupCount(); i++) {
2308                    if (i>0) {
2309                        resultString.append(" ");
2310                    }
2311                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
2312                }
2313                perlExpr.remove(0, 2);
2314            }
2315
2316            else if (perlExpr.startsWith("@+")) {
2317                int32_t i;
2318                for (i=0; i<=testMat->groupCount(); i++) {
2319                    if (i>0) {
2320                        resultString.append(" ");
2321                    }
2322                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
2323                }
2324                perlExpr.remove(0, 2);
2325            }
2326
2327            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
2328                                                     //           or as an escaped sequence (e.g. \n)
2329                if (perlExpr.length() > 1) {
2330                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
2331                }
2332                UChar c = perlExpr.charAt(0);
2333                switch (c) {
2334                case 'n':   c = '\n'; break;
2335                // add any other escape sequences that show up in the test expected results.
2336                }
2337                resultString.append(c);
2338                perlExpr.remove(0, 1);
2339            }
2340
2341            else  {
2342                // Any characters from the perl expression that we don't explicitly
2343                //  recognize before here are assumed to be literals and copied
2344                //  as-is to the expected results.
2345                resultString.append(perlExpr.charAt(0));
2346                perlExpr.remove(0, 1);
2347            }
2348
2349            if (U_FAILURE(status)) {
2350                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
2351                break;
2352            }
2353        }
2354
2355        //
2356        // Expected Results Compare
2357        //
2358        UnicodeString expectedS(fields[4]);
2359        expectedS.findAndReplace(nulnulSrc, nulnul);
2360        expectedS.findAndReplace(ffffSrc,   ffff);
2361        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
2362
2363
2364        if (expectedS.compare(resultString) != 0) {
2365            err("Line %d: Incorrect perl expression results.", lineNum);
2366            errln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
2367        }
2368
2369        delete testMat;
2370        delete testPat;
2371    }
2372
2373    //
2374    // All done.  Clean up allocated stuff.
2375    //
2376    delete cgMat;
2377    delete cgPat;
2378
2379    delete groupsMat;
2380    delete groupsPat;
2381
2382    delete flagMat;
2383    delete flagPat;
2384
2385    delete lineMat;
2386    delete linePat;
2387
2388    delete fieldPat;
2389    delete [] testData;
2390
2391
2392    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
2393
2394}
2395
2396
2397//--------------------------------------------------------------
2398//
2399//  Bug6149   Verify limits to heap expansion for backtrack stack.
2400//             Use this pattern,
2401//                 "(a?){1,}"
2402//             The zero-length match will repeat forever.
2403//                (That this goes into a loop is another bug)
2404//
2405//---------------------------------------------------------------
2406void RegexTest::Bug6149() {
2407    UnicodeString pattern("(a?){1,}");
2408    UnicodeString s("xyz");
2409    uint32_t flags = 0;
2410    UErrorCode status = U_ZERO_ERROR;
2411
2412    RegexMatcher  matcher(pattern, s, flags, status);
2413    UBool result = false;
2414    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
2415    REGEX_ASSERT(result == FALSE);
2416 }
2417
2418
2419//
2420//   Callbacks()    Test the callback function.
2421//                  When set, callbacks occur periodically during matching operations,
2422//                  giving the application code the ability to abort the operation
//                  before its normal completion.
2424//
2425
2426struct callBackContext {
2427    RegexTest        *test;
2428    int32_t          maxCalls;
2429    int32_t          numCalls;
2430    int32_t          lastSteps;
2431    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
2432};
2433
2434U_CDECL_BEGIN
2435static UBool U_CALLCONV
2436testCallBackFn(const void *context, int32_t steps) {
2437    callBackContext  *info = (callBackContext *)context;
2438    if (info->lastSteps+1 != steps) {
2439        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
2440    }
2441    info->lastSteps = steps;
2442    info->numCalls++;
2443    return (info->numCalls < info->maxCalls);
2444}
2445U_CDECL_END
2446
// Exercise RegexMatcher::setMatchCallback() / getMatchCallback(), using
//   testCallBackFn and a callBackContext to count and limit the callbacks.
void RegexTest::Callbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void          *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        callBackContext cbInfo = {this, 0, 0, 0};   // {test, maxCalls, numCalls, lastSteps}
        const void          *returnedContext;       // filled in by getMatchCallback() below
        URegexMatchCallback *returnedFn;            // filled in by getMatchCallback() below
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        //   callback, but not so long that the callback aborts it.
        //   (With a limit of 4, testCallBackFn returns FALSE on its fourth call.)
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        //   The abort is expected to surface as U_REGEX_STOPPED_BY_CALLER, after
        //   exactly the four callbacks permitted by reset(4).
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}
2511
2512#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
2513
2514