regextst.cpp revision 50294ead5e5d23f5bbfed76e00e6b510bd41eee1
1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8//   regextst.cpp
9//
10//      ICU Regular Expressions test, part of intltest.
11//
12
13#include "intltest.h"
14#if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16#include "unicode/regex.h"
17#include "unicode/uchar.h"
18#include "unicode/ucnv.h"
19#include "unicode/ustring.h"
20#include "regextst.h"
21#include "uvector.h"
22#include "util.h"
23#include <stdlib.h>
24#include <string.h>
25#include <stdio.h>
26
27#define SUPPORT_MUTATING_INPUT_STRING   0
28
29
30//---------------------------------------------------------------------------
31//
32//  Test class boilerplate
33//
34//---------------------------------------------------------------------------
35RegexTest::RegexTest()
36{
37}
38
39
40RegexTest::~RegexTest()
41{
42}
43
44
45
46void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
47{
48    if (exec) logln("TestSuite RegexTest: ");
49    switch (index) {
50
51        case 0: name = "Basic";
52            if (exec) Basic();
53            break;
54        case 1: name = "API_Match";
55            if (exec) API_Match();
56            break;
57        case 2: name = "API_Replace";
58            if (exec) API_Replace();
59            break;
60        case 3: name = "API_Pattern";
61            if (exec) API_Pattern();
62            break;
63        case 4:
64#if !UCONFIG_NO_FILE_IO
65            name = "Extended";
66            if (exec) Extended();
67#else
68            name = "skip";
69#endif
70            break;
71        case 5: name = "Errors";
72            if (exec) Errors();
73            break;
74        case 6: name = "PerlTests";
75            if (exec) PerlTests();
76            break;
77        case 7: name = "Callbacks";
78            if (exec) Callbacks();
79            break;
80        case 8: name = "Bug 6149";
81             if (exec) Bug6149();
82             break;
83        case 9: name = "UTextBasic";
84          if (exec) UTextBasic();
85          break;
86        case 10: name = "API_Match_UTF8";
87          if (exec) API_Match_UTF8();
88          break;
89        case 11: name = "API_Replace_UTF8";
90          if (exec) API_Replace_UTF8();
91          break;
92        case 12: name = "API_Pattern_UTF8";
93          if (exec) API_Pattern_UTF8();
94          break;
95        case 13: name = "PerlTestsUTF8";
96          if (exec) PerlTestsUTF8();
97          break;
98        case 14: name = "PreAllocatedUTextCAPI";
99          if (exec) PreAllocatedUTextCAPI();
100          break;
101        case 15: name = "Bug 7651";
102          if (exec) Bug7651();
103          break;
104
105        default: name = "";
106            break; //needed to end loop
107    }
108}
109
110
111//---------------------------------------------------------------------------
112//
113//   Error Checking / Reporting macros used in all of the tests.
114//
115//---------------------------------------------------------------------------
// REGEX_CHECK_STATUS
//   Requires a UErrorCode variable named "status" in the enclosing scope.
//   If it holds a failure code, reports the source line and the error name
//   with dataerrln() and then RETURNS from the enclosing function (the bare
//   "return;" means it can only be used in functions returning void).
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("RegexTest failure at line %d.  status=%s", \
__LINE__, u_errorName(status)); return;}}

// REGEX_ASSERT(expr)
//   Reports a failure with errln() when (expr) compares equal to FALSE.
//   Does not return; execution continues with the next statement.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}

// REGEX_ASSERT_FAIL(expr, errcode)
//   Declares its OWN local "status" (initialized to U_ZERO_ERROR), which hides
//   any "status" variable of the enclosing scope while (expr) is evaluated.
//   Reports with dataerrln() if that local status does not end up equal to
//   errcode.  The caller's own status variable is not modified by (expr).
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// REGEX_CHECK_STATUS_L(line)
//   For use in helper functions that receive the caller's line number.
//   Reports both the current line and (line) with errln() when "status" holds
//   a failure code; the status is printed numerically.  Does not return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// REGEX_ASSERT_L(expr, line)
//   Like REGEX_ASSERT but also reports the caller-supplied (line), and
//   RETURNS from the enclosing (void) function when the assertion fails.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
130
131void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
132    UErrorCode status = U_ZERO_ERROR;
133    UText expectedText = UTEXT_INITIALIZER;
134    utext_openUTF8(&expectedText, expected, -1, &status);
135    utext_setNativeIndex(actual, 0);
136    if (utext_compare(&expectedText, -1, actual, -1) != 0) {
137        char buf[201 /*21*/];
138        char *bufPtr = buf;
139        UChar32 c = utext_next32From(actual, 0);
140        while (c != U_SENTINEL && bufPtr < buf+200/*20*/) {
141            if (0x20<c && c<0x7e) {
142                *bufPtr = c;
143            } else {
144                *bufPtr = '.';
145            }
146            bufPtr++;
147            c = UTEXT_NEXT32(actual);
148        }
149        *bufPtr = 0;
150
151        errln("Failure at file %s, line %d, expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expected, utext_nativeLength(&expectedText), buf, utext_nativeLength(actual));
152    }
153    utext_close(&expectedText);
154}
155
// REGEX_ASSERT_UTEXT(expected, actual)
//   Convenience wrapper for assertUText() that supplies the file name and
//   line number of the invocation.
#define REGEX_ASSERT_UTEXT(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
157
158
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation functions to simplify writing quick tests
//                       for the lookingAt() and matches() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Each use runs the test case twice: once through doRegexLMTest()
//          and once through doRegexLMTestUTF8().  The line number of the
//          invocation is passed to both so that failures can be traced back
//          to the test case.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
174
175UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
176    const UnicodeString pattern(pat, -1, US_INV);
177    const UnicodeString inputText(text, -1, US_INV);
178    UErrorCode          status  = U_ZERO_ERROR;
179    UParseError         pe;
180    RegexPattern        *REPattern = NULL;
181    RegexMatcher        *REMatcher = NULL;
182    UBool               retVal     = TRUE;
183
184    UnicodeString patString(pat, -1, US_INV);
185    REPattern = RegexPattern::compile(patString, 0, pe, status);
186    if (U_FAILURE(status)) {
187        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
188            line, u_errorName(status));
189        return FALSE;
190    }
191    if (line==376) { RegexPatternDump(REPattern);}
192
193    UnicodeString inputString(inputText);
194    UnicodeString unEscapedInput = inputString.unescape();
195    REMatcher = REPattern->matcher(unEscapedInput, status);
196    if (U_FAILURE(status)) {
197        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
198            line, u_errorName(status));
199        return FALSE;
200    }
201
202    UBool actualmatch;
203    actualmatch = REMatcher->lookingAt(status);
204    if (U_FAILURE(status)) {
205        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
206            line, u_errorName(status));
207        retVal =  FALSE;
208    }
209    if (actualmatch != looking) {
210        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
211        retVal = FALSE;
212    }
213
214    status = U_ZERO_ERROR;
215    actualmatch = REMatcher->matches(status);
216    if (U_FAILURE(status)) {
217        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
218            line, u_errorName(status));
219        retVal = FALSE;
220    }
221    if (actualmatch != match) {
222        errln("RegexTest: wrong return from matches() at line %d.\n", line);
223        retVal = FALSE;
224    }
225
226    if (retVal == FALSE) {
227        RegexPatternDump(REPattern);
228    }
229
230    delete REPattern;
231    delete REMatcher;
232    return retVal;
233}
234
235
236UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
237    UText               pattern    = UTEXT_INITIALIZER;
238    int32_t             inputUTF8Length;
239    char                *textChars = NULL;
240    UText               inputText  = UTEXT_INITIALIZER;
241    UErrorCode          status     = U_ZERO_ERROR;
242    UParseError         pe;
243    RegexPattern        *REPattern = NULL;
244    RegexMatcher        *REMatcher = NULL;
245    UBool               retVal     = TRUE;
246
247    utext_openUTF8(&pattern, pat, -1, &status);
248    REPattern = RegexPattern::compile(&pattern, 0, pe, status);
249    if (U_FAILURE(status)) {
250        dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
251            line, u_errorName(status));
252        return FALSE;
253    }
254
255    UnicodeString inputString(text, -1, US_INV);
256    UnicodeString unEscapedInput = inputString.unescape();
257    LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
258    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
259
260    inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
261    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
262        // UTF-8 does not allow unpaired surrogates, so this could actually happen
263        logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
264        return TRUE; // not a failure of the Regex engine
265    }
266    status = U_ZERO_ERROR; // buffer overflow
267    textChars = new char[inputUTF8Length+1];
268    unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
269    utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
270
271    REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
272    if (U_FAILURE(status)) {
273        errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
274            line, u_errorName(status));
275        return FALSE;
276    }
277
278    UBool actualmatch;
279    actualmatch = REMatcher->lookingAt(status);
280    if (U_FAILURE(status)) {
281        errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
282            line, u_errorName(status));
283        retVal =  FALSE;
284    }
285    if (actualmatch != looking) {
286        errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
287        retVal = FALSE;
288    }
289
290    status = U_ZERO_ERROR;
291    actualmatch = REMatcher->matches(status);
292    if (U_FAILURE(status)) {
293        errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
294            line, u_errorName(status));
295        retVal = FALSE;
296    }
297    if (actualmatch != match) {
298        errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
299        retVal = FALSE;
300    }
301
302    if (retVal == FALSE) {
303        RegexPatternDump(REPattern);
304    }
305
306    delete REPattern;
307    delete REMatcher;
308    utext_close(&inputText);
309    utext_close(&pattern);
310    delete[] textChars;
311    return retVal;
312}
313
314
315
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing
//                       regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       Note: the macro expansion already ends with a semicolon, and it
//             passes the line number of the invocation to regex_err().
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
326
327void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
328                          UErrorCode expectedStatus, int32_t line) {
329    UnicodeString       pattern(pat);
330
331    UErrorCode          status         = U_ZERO_ERROR;
332    UParseError         pe;
333    RegexPattern        *callerPattern = NULL;
334
335    //
336    //  Compile the caller's pattern
337    //
338    UnicodeString patString(pat);
339    callerPattern = RegexPattern::compile(patString, 0, pe, status);
340    if (status != expectedStatus) {
341        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
342    } else {
343        if (status != U_ZERO_ERROR) {
344            if (pe.line != errLine || pe.offset != errCol) {
345                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
346                    line, errLine, errCol, pe.line, pe.offset);
347            }
348        }
349    }
350
351    delete callerPattern;
352
353    //
354    //  Compile again, using a UTF-8-based UText
355    //
356    UText patternText = UTEXT_INITIALIZER;
357    utext_openUTF8(&patternText, pat, -1, &status);
358    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
359    if (status != expectedStatus) {
360        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
361    } else {
362        if (status != U_ZERO_ERROR) {
363            if (pe.line != errLine || pe.offset != errCol) {
364                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
365                    line, errLine, errCol, pe.line, pe.offset);
366            }
367        }
368    }
369
370    delete callerPattern;
371    utext_close(&patternText);
372}
373
374
375
376//---------------------------------------------------------------------------
377//
378//      Basic      Check for basic functionality of regex pattern matching.
379//                 Avoid the use of REGEX_FIND test macro, which has
380//                 substantial dependencies on basic Regex functionality.
381//
382//---------------------------------------------------------------------------
383void RegexTest::Basic() {
384
385
386//
387// Debug - slide failing test cases early
388//
389#if 0
390    {
391        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
392        UParseError pe;
393        UErrorCode  status = U_ZERO_ERROR;
394        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
395        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
396        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
397    }
398    exit(1);
399#endif
400
401
402    //
403    // Pattern with parentheses
404    //
405    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
406    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
407    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
408
409    //
410    // Patterns with *
411    //
412    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
413    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
414    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
415    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
416    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
417
418    REGEX_TESTLM("a*", "",  TRUE, TRUE);
419    REGEX_TESTLM("a*", "b", TRUE, FALSE);
420
421
422    //
423    //  Patterns with "."
424    //
425    REGEX_TESTLM(".", "abc", TRUE, FALSE);
426    REGEX_TESTLM("...", "abc", TRUE, TRUE);
427    REGEX_TESTLM("....", "abc", FALSE, FALSE);
428    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
429    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
430    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
431    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
432    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
433
434    //
435    //  Patterns with * applied to chars at end of literal string
436    //
437    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
438    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
439
440    //
441    //  Supplemental chars match as single chars, not a pair of surrogates.
442    //
443    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
444    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
445    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
446
447
448    //
449    //  UnicodeSets in the pattern
450    //
451    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
452    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
453    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
454    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
455    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
456    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
457
458    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
459    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
460    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
461    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
462    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
463
464    //
465    //   OR operator in patterns
466    //
467    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
468    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
469    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
470    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
471
472    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
473    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
474    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
475    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
476    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
477    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
478
479    //
480    //  +
481    //
482    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
483    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
484    REGEX_TESTLM("b+", "", FALSE, FALSE);
485    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
486    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
487    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
488
489    //
490    //   ?
491    //
492    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
493    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
494    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
495    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
496    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
497    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
498    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
499    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
500    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
501
502    //
503    //  Escape sequences that become single literal chars, handled internally
504    //   by ICU's Unescape.
505    //
506
507    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
508    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
509    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
510    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
511    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
512    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
513    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
514    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
515    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
516    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
517
518    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
519    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
520
521    // Escape of special chars in patterns
522    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
523}
524
525
526//---------------------------------------------------------------------------
527//
528//    UTextBasic   Check for quirks that are specific to the UText
529//                 implementation.
530//
531//---------------------------------------------------------------------------
532void RegexTest::UTextBasic() {
533    UErrorCode status = U_ZERO_ERROR;
534    UText pattern = UTEXT_INITIALIZER;
535    utext_openUTF8(&pattern, "abc", -1, &status);
536    RegexMatcher matcher(&pattern, 0, status);
537    REGEX_CHECK_STATUS;
538
539    UText input = UTEXT_INITIALIZER;
540    utext_openUTF8(&input, "abc", -1, &status);
541    REGEX_CHECK_STATUS;
542    matcher.reset(&input);
543    REGEX_CHECK_STATUS;
544    REGEX_ASSERT_UTEXT("abc", matcher.inputText());
545
546    matcher.reset(matcher.inputText());
547    REGEX_CHECK_STATUS;
548    REGEX_ASSERT_UTEXT("abc", matcher.inputText());
549
550    utext_close(&pattern);
551    utext_close(&input);
552}
553
554
555//---------------------------------------------------------------------------
556//
557//      API_Match   Test that the API for class RegexMatcher
558//                  is present and nominally working, but excluding functions
559//                  implementing replace operations.
560//
561//---------------------------------------------------------------------------
562void RegexTest::API_Match() {
563    UParseError         pe;
564    UErrorCode          status=U_ZERO_ERROR;
565    int32_t             flags = 0;
566
567    //
568    // Debug - slide failing test cases early
569    //
570#if 0
571    {
572    }
573    return;
574#endif
575
576    //
577    // Simple pattern compilation
578    //
579    {
580        UnicodeString       re("abc");
581        RegexPattern        *pat2;
582        pat2 = RegexPattern::compile(re, flags, pe, status);
583        REGEX_CHECK_STATUS;
584
585        UnicodeString inStr1 = "abcdef this is a test";
586        UnicodeString instr2 = "not abc";
587        UnicodeString empty  = "";
588
589
590        //
591        // Matcher creation and reset.
592        //
593        RegexMatcher *m1 = pat2->matcher(inStr1, status);
594        REGEX_CHECK_STATUS;
595        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
596        REGEX_ASSERT(m1->input() == inStr1);
597        m1->reset(instr2);
598        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
599        REGEX_ASSERT(m1->input() == instr2);
600        m1->reset(inStr1);
601        REGEX_ASSERT(m1->input() == inStr1);
602        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
603        m1->reset(empty);
604        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
605        REGEX_ASSERT(m1->input() == empty);
606        REGEX_ASSERT(&m1->pattern() == pat2);
607
608        //
609        //  reset(pos, status)
610        //
611        m1->reset(inStr1);
612        m1->reset(4, status);
613        REGEX_CHECK_STATUS;
614        REGEX_ASSERT(m1->input() == inStr1);
615        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
616
617        m1->reset(-1, status);
618        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
619        status = U_ZERO_ERROR;
620
621        m1->reset(0, status);
622        REGEX_CHECK_STATUS;
623        status = U_ZERO_ERROR;
624
625        int32_t len = m1->input().length();
626        m1->reset(len-1, status);
627        REGEX_CHECK_STATUS;
628        status = U_ZERO_ERROR;
629
630        m1->reset(len, status);
631        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
632        status = U_ZERO_ERROR;
633
634        //
635        // match(pos, status)
636        //
637        m1->reset(instr2);
638        REGEX_ASSERT(m1->matches(4, status) == TRUE);
639        m1->reset();
640        REGEX_ASSERT(m1->matches(3, status) == FALSE);
641        m1->reset();
642        REGEX_ASSERT(m1->matches(5, status) == FALSE);
643        REGEX_ASSERT(m1->matches(4, status) == TRUE);
644        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
645        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
646
647        // Match() at end of string should fail, but should not
648        //  be an error.
649        status = U_ZERO_ERROR;
650        len = m1->input().length();
651        REGEX_ASSERT(m1->matches(len, status) == FALSE);
652        REGEX_CHECK_STATUS;
653
654        // Match beyond end of string should fail with an error.
655        status = U_ZERO_ERROR;
656        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
657        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
658
659        // Successful match at end of string.
660        {
661            status = U_ZERO_ERROR;
662            RegexMatcher m("A?", 0, status);  // will match zero length string.
663            REGEX_CHECK_STATUS;
664            m.reset(inStr1);
665            len = inStr1.length();
666            REGEX_ASSERT(m.matches(len, status) == TRUE);
667            REGEX_CHECK_STATUS;
668            m.reset(empty);
669            REGEX_ASSERT(m.matches(0, status) == TRUE);
670            REGEX_CHECK_STATUS;
671        }
672
673
674        //
675        // lookingAt(pos, status)
676        //
677        status = U_ZERO_ERROR;
678        m1->reset(instr2);  // "not abc"
679        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
680        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
681        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
682        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
683        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
684        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
685        status = U_ZERO_ERROR;
686        len = m1->input().length();
687        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
688        REGEX_CHECK_STATUS;
689        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
690        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
691
692        delete m1;
693        delete pat2;
694    }
695
696
697    //
698    // Capture Group.
699    //     RegexMatcher::start();
700    //     RegexMatcher::end();
701    //     RegexMatcher::groupCount();
702    //
703    {
704        int32_t             flags=0;
705        UParseError         pe;
706        UErrorCode          status=U_ZERO_ERROR;
707
708        UnicodeString       re("01(23(45)67)(.*)");
709        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
710        REGEX_CHECK_STATUS;
711        UnicodeString data = "0123456789";
712
713        RegexMatcher *matcher = pat->matcher(data, status);
714        REGEX_CHECK_STATUS;
715        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
716        static const int32_t matchStarts[] = {0,  2, 4, 8};
717        static const int32_t matchEnds[]   = {10, 8, 6, 10};
718        int32_t i;
719        for (i=0; i<4; i++) {
720            int32_t actualStart = matcher->start(i, status);
721            REGEX_CHECK_STATUS;
722            if (actualStart != matchStarts[i]) {
723                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
724                    __LINE__, i, matchStarts[i], actualStart);
725            }
726            int32_t actualEnd = matcher->end(i, status);
727            REGEX_CHECK_STATUS;
728            if (actualEnd != matchEnds[i]) {
729                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
730                    __LINE__, i, matchEnds[i], actualEnd);
731            }
732        }
733
734        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
735        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
736
737        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
738        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
739        matcher->reset();
740        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
741
742        matcher->lookingAt(status);
743        REGEX_ASSERT(matcher->group(status)    == "0123456789");
744        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
745        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
746        REGEX_ASSERT(matcher->group(2, status) == "45"        );
747        REGEX_ASSERT(matcher->group(3, status) == "89"        );
748        REGEX_CHECK_STATUS;
749        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
750        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
751        matcher->reset();
752        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
753
754        delete matcher;
755        delete pat;
756
757    }
758
759    //
760    //  find
761    //
762    {
763        int32_t             flags=0;
764        UParseError         pe;
765        UErrorCode          status=U_ZERO_ERROR;
766
767        UnicodeString       re("abc");
768        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
769        REGEX_CHECK_STATUS;
770        UnicodeString data = ".abc..abc...abc..";
771        //                    012345678901234567
772
773        RegexMatcher *matcher = pat->matcher(data, status);
774        REGEX_CHECK_STATUS;
775        REGEX_ASSERT(matcher->find());
776        REGEX_ASSERT(matcher->start(status) == 1);
777        REGEX_ASSERT(matcher->find());
778        REGEX_ASSERT(matcher->start(status) == 6);
779        REGEX_ASSERT(matcher->find());
780        REGEX_ASSERT(matcher->start(status) == 12);
781        REGEX_ASSERT(matcher->find() == FALSE);
782        REGEX_ASSERT(matcher->find() == FALSE);
783
784        matcher->reset();
785        REGEX_ASSERT(matcher->find());
786        REGEX_ASSERT(matcher->start(status) == 1);
787
788        REGEX_ASSERT(matcher->find(0, status));
789        REGEX_ASSERT(matcher->start(status) == 1);
790        REGEX_ASSERT(matcher->find(1, status));
791        REGEX_ASSERT(matcher->start(status) == 1);
792        REGEX_ASSERT(matcher->find(2, status));
793        REGEX_ASSERT(matcher->start(status) == 6);
794        REGEX_ASSERT(matcher->find(12, status));
795        REGEX_ASSERT(matcher->start(status) == 12);
796        REGEX_ASSERT(matcher->find(13, status) == FALSE);
797        REGEX_ASSERT(matcher->find(16, status) == FALSE);
798        REGEX_ASSERT(matcher->find(17, status) == FALSE);
799        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
800
801        status = U_ZERO_ERROR;
802        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
803        status = U_ZERO_ERROR;
804        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
805
806        REGEX_ASSERT(matcher->groupCount() == 0);
807
808        delete matcher;
809        delete pat;
810    }
811
812
813    //
814    //  find, with \G in pattern (true if at the end of a previous match).
815    //
816    {
817        int32_t             flags=0;
818        UParseError         pe;
819        UErrorCode          status=U_ZERO_ERROR;
820
821        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
822        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
823        REGEX_CHECK_STATUS;
824        UnicodeString data = ".abcabc.abc..";
825        //                    012345678901234567
826
827        RegexMatcher *matcher = pat->matcher(data, status);
828        REGEX_CHECK_STATUS;
829        REGEX_ASSERT(matcher->find());
830        REGEX_ASSERT(matcher->start(status) == 0);
831        REGEX_ASSERT(matcher->start(1, status) == -1);
832        REGEX_ASSERT(matcher->start(2, status) == 1);
833
834        REGEX_ASSERT(matcher->find());
835        REGEX_ASSERT(matcher->start(status) == 4);
836        REGEX_ASSERT(matcher->start(1, status) == 4);
837        REGEX_ASSERT(matcher->start(2, status) == -1);
838        REGEX_CHECK_STATUS;
839
840        delete matcher;
841        delete pat;
842    }
843
844    //
845    //   find with zero length matches, match position should bump ahead
846    //     to prevent loops.
847    //
848    {
849        int32_t                 i;
850        UErrorCode          status=U_ZERO_ERROR;
851        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
852                                                      //   using an always-true look-ahead.
853        REGEX_CHECK_STATUS;
854        UnicodeString s("    ");
855        m.reset(s);
856        for (i=0; ; i++) {
857            if (m.find() == FALSE) {
858                break;
859            }
860            REGEX_ASSERT(m.start(status) == i);
861            REGEX_ASSERT(m.end(status) == i);
862        }
863        REGEX_ASSERT(i==5);
864
865        // Check that the bump goes over surrogate pairs OK
866        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
867        s = s.unescape();
868        m.reset(s);
869        for (i=0; ; i+=2) {
870            if (m.find() == FALSE) {
871                break;
872            }
873            REGEX_ASSERT(m.start(status) == i);
874            REGEX_ASSERT(m.end(status) == i);
875        }
876        REGEX_ASSERT(i==10);
877    }
878    {
879        // find() loop breaking test.
880        //        with pattern of /.?/, should see a series of one char matches, then a single
881        //        match of zero length at the end of the input string.
882        int32_t                 i;
883        UErrorCode          status=U_ZERO_ERROR;
884        RegexMatcher        m(".?", 0, status);
885        REGEX_CHECK_STATUS;
886        UnicodeString s("    ");
887        m.reset(s);
888        for (i=0; ; i++) {
889            if (m.find() == FALSE) {
890                break;
891            }
892            REGEX_ASSERT(m.start(status) == i);
893            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
894        }
895        REGEX_ASSERT(i==5);
896    }
897
898
899    //
900    // Matchers with no input string behave as if they had an empty input string.
901    //
902
903    {
904        UErrorCode status = U_ZERO_ERROR;
905        RegexMatcher  m(".?", 0, status);
906        REGEX_CHECK_STATUS;
907        REGEX_ASSERT(m.find());
908        REGEX_ASSERT(m.start(status) == 0);
909        REGEX_ASSERT(m.input() == "");
910    }
911    {
912        UErrorCode status = U_ZERO_ERROR;
913        RegexPattern  *p = RegexPattern::compile(".", 0, status);
914        RegexMatcher  *m = p->matcher(status);
915        REGEX_CHECK_STATUS;
916
917        REGEX_ASSERT(m->find() == FALSE);
918        REGEX_ASSERT(m->input() == "");
919        delete m;
920        delete p;
921    }
922
923    //
924    // Regions
925    //
926    {
927        UErrorCode status = U_ZERO_ERROR;
928        UnicodeString testString("This is test data");
929        RegexMatcher m(".*", testString,  0, status);
930        REGEX_CHECK_STATUS;
931        REGEX_ASSERT(m.regionStart() == 0);
932        REGEX_ASSERT(m.regionEnd() == testString.length());
933        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
934        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
935
936        m.region(2,4, status);
937        REGEX_CHECK_STATUS;
938        REGEX_ASSERT(m.matches(status));
939        REGEX_ASSERT(m.start(status)==2);
940        REGEX_ASSERT(m.end(status)==4);
941        REGEX_CHECK_STATUS;
942
943        m.reset();
944        REGEX_ASSERT(m.regionStart() == 0);
945        REGEX_ASSERT(m.regionEnd() == testString.length());
946
947        UnicodeString shorterString("short");
948        m.reset(shorterString);
949        REGEX_ASSERT(m.regionStart() == 0);
950        REGEX_ASSERT(m.regionEnd() == shorterString.length());
951
952        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
953        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
954        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
955        REGEX_ASSERT(&m == &m.reset());
956        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
957
958        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
959        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
960        REGEX_ASSERT(&m == &m.reset());
961        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
962
963        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
964        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
965        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
966        REGEX_ASSERT(&m == &m.reset());
967        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
968
969        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
970        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
971        REGEX_ASSERT(&m == &m.reset());
972        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
973
974    }
975
976    //
977    // hitEnd() and requireEnd()
978    //
979    {
980        UErrorCode status = U_ZERO_ERROR;
981        UnicodeString testString("aabb");
982        RegexMatcher m1(".*", testString,  0, status);
983        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
984        REGEX_ASSERT(m1.hitEnd() == TRUE);
985        REGEX_ASSERT(m1.requireEnd() == FALSE);
986        REGEX_CHECK_STATUS;
987
988        status = U_ZERO_ERROR;
989        RegexMatcher m2("a*", testString, 0, status);
990        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
991        REGEX_ASSERT(m2.hitEnd() == FALSE);
992        REGEX_ASSERT(m2.requireEnd() == FALSE);
993        REGEX_CHECK_STATUS;
994
995        status = U_ZERO_ERROR;
996        RegexMatcher m3(".*$", testString, 0, status);
997        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
998        REGEX_ASSERT(m3.hitEnd() == TRUE);
999        REGEX_ASSERT(m3.requireEnd() == TRUE);
1000        REGEX_CHECK_STATUS;
1001    }
1002
1003
1004    //
1005    // Compilation error on reset with UChar *
1006    //   These were a hazard that people were stumbling over with runtime errors.
1007    //   Changed them to compiler errors by adding private methods that more closely
1008    //   matched the incorrect use of the functions.
1009    //
1010#if 0
1011    {
1012        UErrorCode status = U_ZERO_ERROR;
1013        UChar ucharString[20];
1014        RegexMatcher m(".", 0, status);
1015        m.reset(ucharString);  // should not compile.
1016
1017        RegexPattern *p = RegexPattern::compile(".", 0, status);
1018        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1019
1020        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1021    }
1022#endif
1023
1024    //
1025    //  Time Outs.
1026    //       Note:  These tests will need to be changed when the regexp engine is
1027    //              able to detect and cut short the exponential time behavior on
1028    //              this type of match.
1029    //
1030    {
1031        UErrorCode status = U_ZERO_ERROR;
1032        //    Enough 'a's in the string to cause the match to time out.
1033        //       (Each on additonal 'a' doubles the time)
1034        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1035        RegexMatcher matcher("(a+)+b", testString, 0, status);
1036        REGEX_CHECK_STATUS;
1037        REGEX_ASSERT(matcher.getTimeLimit() == 0);
1038        matcher.setTimeLimit(100, status);
1039        REGEX_ASSERT(matcher.getTimeLimit() == 100);
1040        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1041        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1042    }
1043    {
1044        UErrorCode status = U_ZERO_ERROR;
1045        //   Few enough 'a's to slip in under the time limit.
1046        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1047        RegexMatcher matcher("(a+)+b", testString, 0, status);
1048        REGEX_CHECK_STATUS;
1049        matcher.setTimeLimit(100, status);
1050        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1051        REGEX_CHECK_STATUS;
1052    }
1053
1054    //
1055    //  Stack Limits
1056    //
1057    {
1058        UErrorCode status = U_ZERO_ERROR;
1059        UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1060
1061        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1062        //   of the '+', and makes the stack frames larger.
1063        RegexMatcher matcher("(A)+A$", testString, 0, status);
1064
1065        // With the default stack, this match should fail to run
1066        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1067        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1068
1069        // With unlimited stack, it should run
1070        status = U_ZERO_ERROR;
1071        matcher.setStackLimit(0, status);
1072        REGEX_CHECK_STATUS;
1073        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1074        REGEX_CHECK_STATUS;
1075        REGEX_ASSERT(matcher.getStackLimit() == 0);
1076
1077        // With a limited stack, it the match should fail
1078        status = U_ZERO_ERROR;
1079        matcher.setStackLimit(10000, status);
1080        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1081        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1082        REGEX_ASSERT(matcher.getStackLimit() == 10000);
1083    }
1084
1085        // A pattern that doesn't save state should work with
1086        //   a minimal sized stack
1087    {
1088        UErrorCode status = U_ZERO_ERROR;
1089        UnicodeString testString = "abc";
1090        RegexMatcher matcher("abc", testString, 0, status);
1091        REGEX_CHECK_STATUS;
1092        matcher.setStackLimit(30, status);
1093        REGEX_CHECK_STATUS;
1094        REGEX_ASSERT(matcher.matches(status) == TRUE);
1095        REGEX_CHECK_STATUS;
1096        REGEX_ASSERT(matcher.getStackLimit() == 30);
1097
1098        // Negative stack sizes should fail
1099        status = U_ZERO_ERROR;
1100        matcher.setStackLimit(1000, status);
1101        REGEX_CHECK_STATUS;
1102        matcher.setStackLimit(-1, status);
1103        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1104        REGEX_ASSERT(matcher.getStackLimit() == 1000);
1105    }
1106
1107
1108}
1109
1110
1111
1112
1113
1114
1115//---------------------------------------------------------------------------
1116//
1117//      API_Replace        API test for class RegexMatcher, testing the
1118//                         Replace family of functions.
1119//
1120//---------------------------------------------------------------------------
1121void RegexTest::API_Replace() {
1122    //
1123    //  Replace
1124    //
1125    int32_t             flags=0;
1126    UParseError         pe;
1127    UErrorCode          status=U_ZERO_ERROR;
1128
1129    UnicodeString       re("abc");
1130    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1131    REGEX_CHECK_STATUS;
1132    UnicodeString data = ".abc..abc...abc..";
1133    //                    012345678901234567
1134    RegexMatcher *matcher = pat->matcher(data, status);
1135
1136    //
1137    //  Plain vanilla matches.
1138    //
1139    UnicodeString  dest;
1140    dest = matcher->replaceFirst("yz", status);
1141    REGEX_CHECK_STATUS;
1142    REGEX_ASSERT(dest == ".yz..abc...abc..");
1143
1144    dest = matcher->replaceAll("yz", status);
1145    REGEX_CHECK_STATUS;
1146    REGEX_ASSERT(dest == ".yz..yz...yz..");
1147
1148    //
1149    //  Plain vanilla non-matches.
1150    //
1151    UnicodeString d2 = ".abx..abx...abx..";
1152    matcher->reset(d2);
1153    dest = matcher->replaceFirst("yz", status);
1154    REGEX_CHECK_STATUS;
1155    REGEX_ASSERT(dest == ".abx..abx...abx..");
1156
1157    dest = matcher->replaceAll("yz", status);
1158    REGEX_CHECK_STATUS;
1159    REGEX_ASSERT(dest == ".abx..abx...abx..");
1160
1161    //
1162    // Empty source string
1163    //
1164    UnicodeString d3 = "";
1165    matcher->reset(d3);
1166    dest = matcher->replaceFirst("yz", status);
1167    REGEX_CHECK_STATUS;
1168    REGEX_ASSERT(dest == "");
1169
1170    dest = matcher->replaceAll("yz", status);
1171    REGEX_CHECK_STATUS;
1172    REGEX_ASSERT(dest == "");
1173
1174    //
1175    // Empty substitution string
1176    //
1177    matcher->reset(data);              // ".abc..abc...abc.."
1178    dest = matcher->replaceFirst("", status);
1179    REGEX_CHECK_STATUS;
1180    REGEX_ASSERT(dest == "...abc...abc..");
1181
1182    dest = matcher->replaceAll("", status);
1183    REGEX_CHECK_STATUS;
1184    REGEX_ASSERT(dest == "........");
1185
1186    //
1187    // match whole string
1188    //
1189    UnicodeString d4 = "abc";
1190    matcher->reset(d4);
1191    dest = matcher->replaceFirst("xyz", status);
1192    REGEX_CHECK_STATUS;
1193    REGEX_ASSERT(dest == "xyz");
1194
1195    dest = matcher->replaceAll("xyz", status);
1196    REGEX_CHECK_STATUS;
1197    REGEX_ASSERT(dest == "xyz");
1198
1199    //
1200    // Capture Group, simple case
1201    //
1202    UnicodeString       re2("a(..)");
1203    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1204    REGEX_CHECK_STATUS;
1205    UnicodeString d5 = "abcdefg";
1206    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1207    REGEX_CHECK_STATUS;
1208    dest = matcher2->replaceFirst("$1$1", status);
1209    REGEX_CHECK_STATUS;
1210    REGEX_ASSERT(dest == "bcbcdefg");
1211
1212    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1213    REGEX_CHECK_STATUS;
1214    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1215
1216    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1217    REGEX_CHECK_STATUS;
1218    REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1219
1220    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1221    replacement = replacement.unescape();
1222    dest = matcher2->replaceFirst(replacement, status);
1223    REGEX_CHECK_STATUS;
1224    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1225
1226    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1227
1228
1229    //
1230    // Replacement String with \u hex escapes
1231    //
1232    {
1233        UnicodeString  src = "abc 1 abc 2 abc 3";
1234        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1235        matcher->reset(src);
1236        UnicodeString  result = matcher->replaceAll(substitute, status);
1237        REGEX_CHECK_STATUS;
1238        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1239    }
1240    {
1241        UnicodeString  src = "abc !";
1242        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1243        matcher->reset(src);
1244        UnicodeString  result = matcher->replaceAll(substitute, status);
1245        REGEX_CHECK_STATUS;
1246        UnicodeString expected = UnicodeString("--");
1247        expected.append((UChar32)0x10000);
1248        expected.append("-- !");
1249        REGEX_ASSERT(result == expected);
1250    }
1251    // TODO:  need more through testing of capture substitutions.
1252
1253    // Bug 4057
1254    //
1255    {
1256        status = U_ZERO_ERROR;
1257        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1258        RegexMatcher m("ss(.*?)ee", 0, status);
1259        REGEX_CHECK_STATUS;
1260        UnicodeString result;
1261
1262        // Multiple finds do NOT bump up the previous appendReplacement postion.
1263        m.reset(s);
1264        m.find();
1265        m.find();
1266        m.appendReplacement(result, "ooh", status);
1267        REGEX_CHECK_STATUS;
1268        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1269
1270        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1271        status = U_ZERO_ERROR;
1272        result.truncate(0);
1273        m.reset(10, status);
1274        m.find();
1275        m.find();
1276        m.appendReplacement(result, "ooh", status);
1277        REGEX_CHECK_STATUS;
1278        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1279
1280        // find() at interior of string, appendReplacemnt still starts at beginning.
1281        status = U_ZERO_ERROR;
1282        result.truncate(0);
1283        m.reset();
1284        m.find(10, status);
1285        m.find();
1286        m.appendReplacement(result, "ooh", status);
1287        REGEX_CHECK_STATUS;
1288        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1289
1290        m.appendTail(result);
1291        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1292
1293    }
1294
1295    delete matcher2;
1296    delete pat2;
1297    delete matcher;
1298    delete pat;
1299}
1300
1301
1302//---------------------------------------------------------------------------
1303//
1304//      API_Pattern       Test that the API for class RegexPattern is
1305//                        present and nominally working.
1306//
1307//---------------------------------------------------------------------------
1308void RegexTest::API_Pattern() {
1309    RegexPattern        pata;    // Test default constructor to not crash.
1310    RegexPattern        patb;
1311
1312    REGEX_ASSERT(pata == patb);
1313    REGEX_ASSERT(pata == pata);
1314
1315    UnicodeString re1("abc[a-l][m-z]");
1316    UnicodeString re2("def");
1317    UErrorCode    status = U_ZERO_ERROR;
1318    UParseError   pe;
1319
1320    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1321    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1322    REGEX_CHECK_STATUS;
1323    REGEX_ASSERT(*pat1 == *pat1);
1324    REGEX_ASSERT(*pat1 != pata);
1325
1326    // Assign
1327    patb = *pat1;
1328    REGEX_ASSERT(patb == *pat1);
1329
1330    // Copy Construct
1331    RegexPattern patc(*pat1);
1332    REGEX_ASSERT(patc == *pat1);
1333    REGEX_ASSERT(patb == patc);
1334    REGEX_ASSERT(pat1 != pat2);
1335    patb = *pat2;
1336    REGEX_ASSERT(patb != patc);
1337    REGEX_ASSERT(patb == *pat2);
1338
1339    // Compile with no flags.
1340    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1341    REGEX_ASSERT(*pat1a == *pat1);
1342
1343    REGEX_ASSERT(pat1a->flags() == 0);
1344
1345    // Compile with different flags should be not equal
1346    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1347    REGEX_CHECK_STATUS;
1348
1349    REGEX_ASSERT(*pat1b != *pat1a);
1350    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1351    REGEX_ASSERT(pat1a->flags() == 0);
1352    delete pat1b;
1353
1354    // clone
1355    RegexPattern *pat1c = pat1->clone();
1356    REGEX_ASSERT(*pat1c == *pat1);
1357    REGEX_ASSERT(*pat1c != *pat2);
1358
1359    delete pat1c;
1360    delete pat1a;
1361    delete pat1;
1362    delete pat2;
1363
1364
1365    //
1366    //   Verify that a matcher created from a cloned pattern works.
1367    //     (Jitterbug 3423)
1368    //
1369    {
1370        UErrorCode     status     = U_ZERO_ERROR;
1371        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1372        RegexPattern  *pClone     = pSource->clone();
1373        delete         pSource;
1374        RegexMatcher  *mFromClone = pClone->matcher(status);
1375        REGEX_CHECK_STATUS;
1376        UnicodeString s = "Hello World";
1377        mFromClone->reset(s);
1378        REGEX_ASSERT(mFromClone->find() == TRUE);
1379        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1380        REGEX_ASSERT(mFromClone->find() == TRUE);
1381        REGEX_ASSERT(mFromClone->group(status) == "World");
1382        REGEX_ASSERT(mFromClone->find() == FALSE);
1383        delete mFromClone;
1384        delete pClone;
1385    }
1386
1387    //
1388    //   matches convenience API
1389    //
1390    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1391    REGEX_CHECK_STATUS;
1392    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1393    REGEX_CHECK_STATUS;
1394    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1395    REGEX_CHECK_STATUS;
1396    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1397    REGEX_CHECK_STATUS;
1398    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1399    REGEX_CHECK_STATUS;
1400    status = U_INDEX_OUTOFBOUNDS_ERROR;
1401    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1402    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1403
1404
1405    //
1406    // Split()
1407    //
1408    status = U_ZERO_ERROR;
1409    pat1 = RegexPattern::compile(" +",  pe, status);
1410    REGEX_CHECK_STATUS;
1411    UnicodeString  fields[10];
1412
1413    int32_t n;
1414    n = pat1->split("Now is the time", fields, 10, status);
1415    REGEX_CHECK_STATUS;
1416    REGEX_ASSERT(n==4);
1417    REGEX_ASSERT(fields[0]=="Now");
1418    REGEX_ASSERT(fields[1]=="is");
1419    REGEX_ASSERT(fields[2]=="the");
1420    REGEX_ASSERT(fields[3]=="time");
1421    REGEX_ASSERT(fields[4]=="");
1422
1423    n = pat1->split("Now is the time", fields, 2, status);
1424    REGEX_CHECK_STATUS;
1425    REGEX_ASSERT(n==2);
1426    REGEX_ASSERT(fields[0]=="Now");
1427    REGEX_ASSERT(fields[1]=="is the time");
1428    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1429
1430    fields[1] = "*";
1431    status = U_ZERO_ERROR;
1432    n = pat1->split("Now is the time", fields, 1, status);
1433    REGEX_CHECK_STATUS;
1434    REGEX_ASSERT(n==1);
1435    REGEX_ASSERT(fields[0]=="Now is the time");
1436    REGEX_ASSERT(fields[1]=="*");
1437    status = U_ZERO_ERROR;
1438
1439    n = pat1->split("    Now       is the time   ", fields, 10, status);
1440    REGEX_CHECK_STATUS;
1441    REGEX_ASSERT(n==5);
1442    REGEX_ASSERT(fields[0]=="");
1443    REGEX_ASSERT(fields[1]=="Now");
1444    REGEX_ASSERT(fields[2]=="is");
1445    REGEX_ASSERT(fields[3]=="the");
1446    REGEX_ASSERT(fields[4]=="time");
1447    REGEX_ASSERT(fields[5]=="");
1448
1449    n = pat1->split("     ", fields, 10, status);
1450    REGEX_CHECK_STATUS;
1451    REGEX_ASSERT(n==1);
1452    REGEX_ASSERT(fields[0]=="");
1453
1454    fields[0] = "foo";
1455    n = pat1->split("", fields, 10, status);
1456    REGEX_CHECK_STATUS;
1457    REGEX_ASSERT(n==0);
1458    REGEX_ASSERT(fields[0]=="foo");
1459
1460    delete pat1;
1461
1462    //  split, with a pattern with (capture)
1463    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1464    REGEX_CHECK_STATUS;
1465
1466    status = U_ZERO_ERROR;
1467    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1468    REGEX_CHECK_STATUS;
1469    REGEX_ASSERT(n==6);
1470    REGEX_ASSERT(fields[0]=="");
1471    REGEX_ASSERT(fields[1]=="a");
1472    REGEX_ASSERT(fields[2]=="Now is ");
1473    REGEX_ASSERT(fields[3]=="b");
1474    REGEX_ASSERT(fields[4]=="the time");
1475    REGEX_ASSERT(fields[5]=="c");
1476    REGEX_ASSERT(fields[6]=="");
1477    REGEX_ASSERT(status==U_ZERO_ERROR);
1478
1479    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1480    REGEX_CHECK_STATUS;
1481    REGEX_ASSERT(n==6);
1482    REGEX_ASSERT(fields[0]=="  ");
1483    REGEX_ASSERT(fields[1]=="a");
1484    REGEX_ASSERT(fields[2]=="Now is ");
1485    REGEX_ASSERT(fields[3]=="b");
1486    REGEX_ASSERT(fields[4]=="the time");
1487    REGEX_ASSERT(fields[5]=="c");
1488    REGEX_ASSERT(fields[6]=="");
1489
1490    status = U_ZERO_ERROR;
1491    fields[6] = "foo";
1492    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1493    REGEX_CHECK_STATUS;
1494    REGEX_ASSERT(n==6);
1495    REGEX_ASSERT(fields[0]=="  ");
1496    REGEX_ASSERT(fields[1]=="a");
1497    REGEX_ASSERT(fields[2]=="Now is ");
1498    REGEX_ASSERT(fields[3]=="b");
1499    REGEX_ASSERT(fields[4]=="the time");
1500    REGEX_ASSERT(fields[5]=="c");
1501    REGEX_ASSERT(fields[6]=="foo");
1502
1503    status = U_ZERO_ERROR;
1504    fields[5] = "foo";
1505    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1506    REGEX_CHECK_STATUS;
1507    REGEX_ASSERT(n==5);
1508    REGEX_ASSERT(fields[0]=="  ");
1509    REGEX_ASSERT(fields[1]=="a");
1510    REGEX_ASSERT(fields[2]=="Now is ");
1511    REGEX_ASSERT(fields[3]=="b");
1512    REGEX_ASSERT(fields[4]=="the time<c>");
1513    REGEX_ASSERT(fields[5]=="foo");
1514
1515    status = U_ZERO_ERROR;
1516    fields[5] = "foo";
1517    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1518    REGEX_CHECK_STATUS;
1519    REGEX_ASSERT(n==5);
1520    REGEX_ASSERT(fields[0]=="  ");
1521    REGEX_ASSERT(fields[1]=="a");
1522    REGEX_ASSERT(fields[2]=="Now is ");
1523    REGEX_ASSERT(fields[3]=="b");
1524    REGEX_ASSERT(fields[4]=="the time");
1525    REGEX_ASSERT(fields[5]=="foo");
1526
1527    status = U_ZERO_ERROR;
1528    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1529    REGEX_CHECK_STATUS;
1530    REGEX_ASSERT(n==4);
1531    REGEX_ASSERT(fields[0]=="  ");
1532    REGEX_ASSERT(fields[1]=="a");
1533    REGEX_ASSERT(fields[2]=="Now is ");
1534    REGEX_ASSERT(fields[3]=="the time<c>");
1535    status = U_ZERO_ERROR;
1536    delete pat1;
1537
1538    pat1 = RegexPattern::compile("([-,])",  pe, status);
1539    REGEX_CHECK_STATUS;
1540    n = pat1->split("1-10,20", fields, 10, status);
1541    REGEX_CHECK_STATUS;
1542    REGEX_ASSERT(n==5);
1543    REGEX_ASSERT(fields[0]=="1");
1544    REGEX_ASSERT(fields[1]=="-");
1545    REGEX_ASSERT(fields[2]=="10");
1546    REGEX_ASSERT(fields[3]==",");
1547    REGEX_ASSERT(fields[4]=="20");
1548    delete pat1;
1549
1550
1551    //
1552    // RegexPattern::pattern()
1553    //
1554    pat1 = new RegexPattern();
1555    REGEX_ASSERT(pat1->pattern() == "");
1556    delete pat1;
1557
1558    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1559    REGEX_CHECK_STATUS;
1560    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1561    delete pat1;
1562
1563
1564    //
1565    // classID functions
1566    //
1567    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1568    REGEX_CHECK_STATUS;
1569    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1570    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1571    UnicodeString Hello("Hello, world.");
1572    RegexMatcher *m = pat1->matcher(Hello, status);
1573    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1574    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1575    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1576    delete m;
1577    delete pat1;
1578
1579}
1580
1581//---------------------------------------------------------------------------
1582//
1583//      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1584//                       is present and working, but excluding functions
1585//                       implementing replace operations.
1586//
1587//---------------------------------------------------------------------------
1588void RegexTest::API_Match_UTF8() {
1589    UParseError         pe;
1590    UErrorCode          status=U_ZERO_ERROR;
1591    int32_t             flags = 0;
1592
1593    //
1594    // Debug - slide failing test cases early
1595    //
1596#if 0
1597    {
1598    }
1599    return;
1600#endif
1601
1602    //
1603    // Simple pattern compilation
1604    //
1605    {
1606        UText               re = UTEXT_INITIALIZER;
1607        utext_openUTF8(&re, "abc", -1, &status);
1608        RegexPattern        *pat2;
1609        pat2 = RegexPattern::compile(&re, flags, pe, status);
1610        REGEX_CHECK_STATUS;
1611
1612        UText input1 = UTEXT_INITIALIZER;
1613        UText input2 = UTEXT_INITIALIZER;
1614        UText empty  = UTEXT_INITIALIZER;
1615        utext_openUTF8(&input1, "abcdef this is a test", -1, &status);
1616        utext_openUTF8(&input2, "not abc", -1, &status);
1617        utext_openUChars(&empty, NULL, 0, &status);
1618
1619        int32_t input1Len = strlen("abcdef this is a test");
1620        int32_t input2Len = strlen("not abc");
1621
1622
1623        //
1624        // Matcher creation and reset.
1625        //
1626        RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status);
1627        REGEX_CHECK_STATUS;
1628        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1629        REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
1630        m1->reset(&input2);
1631        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1632        REGEX_ASSERT_UTEXT("not abc", m1->inputText());
1633        m1->reset(&input1);
1634        REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
1635        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1636        m1->reset(&empty);
1637        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1638        REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1639
1640        //
1641        //  reset(pos, status)
1642        //
1643        m1->reset(&input1);
1644        m1->reset(4, status);
1645        REGEX_CHECK_STATUS;
1646        REGEX_ASSERT_UTEXT("abcdef this is a test", m1->inputText());
1647        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1648
1649        m1->reset(-1, status);
1650        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1651        status = U_ZERO_ERROR;
1652
1653        m1->reset(0, status);
1654        REGEX_CHECK_STATUS;
1655        status = U_ZERO_ERROR;
1656
1657        m1->reset(input1Len-1, status);
1658        REGEX_CHECK_STATUS;
1659        status = U_ZERO_ERROR;
1660
1661        m1->reset(input1Len, status);
1662        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1663        status = U_ZERO_ERROR;
1664
1665        //
1666        // match(pos, status)
1667        //
1668        m1->reset(&input2);
1669        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1670        m1->reset();
1671        REGEX_ASSERT(m1->matches(3, status) == FALSE);
1672        m1->reset();
1673        REGEX_ASSERT(m1->matches(5, status) == FALSE);
1674        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1675        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1676        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1677
1678        // Match() at end of string should fail, but should not
1679        //  be an error.
1680        status = U_ZERO_ERROR;
1681        REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1682        REGEX_CHECK_STATUS;
1683
1684        // Match beyond end of string should fail with an error.
1685        status = U_ZERO_ERROR;
1686        REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1687        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1688
1689        // Successful match at end of string.
1690        {
1691            status = U_ZERO_ERROR;
1692            RegexMatcher m("A?", 0, status);  // will match zero length string.
1693            REGEX_CHECK_STATUS;
1694            m.reset(&input1);
1695            REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1696            REGEX_CHECK_STATUS;
1697            m.reset(&empty);
1698            REGEX_ASSERT(m.matches(0, status) == TRUE);
1699            REGEX_CHECK_STATUS;
1700        }
1701
1702
1703        //
1704        // lookingAt(pos, status)
1705        //
1706        status = U_ZERO_ERROR;
1707        m1->reset(&input2);  // "not abc"
1708        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1709        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1710        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1711        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1712        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1713        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1714        status = U_ZERO_ERROR;
1715        REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1716        REGEX_CHECK_STATUS;
1717        REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1718        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1719
1720        delete m1;
1721        delete pat2;
1722
1723        utext_close(&re);
1724        utext_close(&input1);
1725        utext_close(&input2);
1726        utext_close(&empty);
1727    }
1728
1729
1730    //
1731    // Capture Group.
1732    //     RegexMatcher::start();
1733    //     RegexMatcher::end();
1734    //     RegexMatcher::groupCount();
1735    //
1736    {
1737        int32_t             flags=0;
1738        UParseError         pe;
1739        UErrorCode          status=U_ZERO_ERROR;
1740        UText               re=UTEXT_INITIALIZER;
1741        utext_openUTF8(&re, "01(23(45)67)(.*)", -1, &status);
1742
1743        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1744        REGEX_CHECK_STATUS;
1745
1746        UText input = UTEXT_INITIALIZER;
1747        utext_openUTF8(&input, "0123456789", -1, &status);
1748
1749        RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1750        REGEX_CHECK_STATUS;
1751        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1752        static const int32_t matchStarts[] = {0,  2, 4, 8};
1753        static const int32_t matchEnds[]   = {10, 8, 6, 10};
1754        int32_t i;
1755        for (i=0; i<4; i++) {
1756            int32_t actualStart = matcher->start(i, status);
1757            REGEX_CHECK_STATUS;
1758            if (actualStart != matchStarts[i]) {
1759                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
1760                    __LINE__, i, matchStarts[i], actualStart);
1761            }
1762            int32_t actualEnd = matcher->end(i, status);
1763            REGEX_CHECK_STATUS;
1764            if (actualEnd != matchEnds[i]) {
1765                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
1766                    __LINE__, i, matchEnds[i], actualEnd);
1767            }
1768        }
1769
1770        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1771        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1772
1773        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1774        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1775        matcher->reset();
1776        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1777
1778        matcher->lookingAt(status);
1779
1780        UnicodeString dest;
1781        UText destText = UTEXT_INITIALIZER;
1782        utext_openUnicodeString(&destText, &dest, &status);
1783        UText *result;
1784
1785        result = matcher->group((UText *)NULL, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
1786        REGEX_CHECK_STATUS;
1787        REGEX_ASSERT_UTEXT("0123456789", result);
1788        utext_close(result);
1789        result = matcher->group(&destText, RegexMatcher::MATCHER_DEST_IS_UTEXT, status);
1790        REGEX_CHECK_STATUS;
1791        REGEX_ASSERT(result == &destText);
1792        REGEX_ASSERT_UTEXT("0123456789", result);
1793
1794        result = matcher->group(0, NULL, status);
1795        REGEX_CHECK_STATUS;
1796        REGEX_ASSERT_UTEXT("0123456789", result);
1797        utext_close(result);
1798        result = matcher->group(0, &destText, status);
1799        REGEX_CHECK_STATUS;
1800        REGEX_ASSERT(result == &destText);
1801        REGEX_ASSERT_UTEXT("0123456789", result);
1802
1803        result = matcher->group(1, NULL, status);
1804        REGEX_CHECK_STATUS;
1805        REGEX_ASSERT_UTEXT("234567", result);
1806        utext_close(result);
1807        result = matcher->group(1, &destText, status);
1808        REGEX_CHECK_STATUS;
1809        REGEX_ASSERT(result == &destText);
1810        REGEX_ASSERT_UTEXT("234567", result);
1811
1812        result = matcher->group(2, NULL, status);
1813        REGEX_CHECK_STATUS;
1814        REGEX_ASSERT_UTEXT("45", result);
1815        utext_close(result);
1816        result = matcher->group(2, &destText, status);
1817        REGEX_CHECK_STATUS;
1818        REGEX_ASSERT(result == &destText);
1819        REGEX_ASSERT_UTEXT("45", result);
1820
1821        result = matcher->group(3, NULL, status);
1822        REGEX_CHECK_STATUS;
1823        REGEX_ASSERT_UTEXT("89", result);
1824        utext_close(result);
1825        result = matcher->group(3, &destText, status);
1826        REGEX_CHECK_STATUS;
1827        REGEX_ASSERT(result == &destText);
1828        REGEX_ASSERT_UTEXT("89", result);
1829
1830        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1831        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1832        matcher->reset();
1833        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
1834
1835        delete matcher;
1836        delete pat;
1837
1838        utext_close(&destText);
1839        utext_close(&input);
1840        utext_close(&re);
1841    }
1842
1843    //
1844    //  find
1845    //
1846    {
1847        int32_t             flags=0;
1848        UParseError         pe;
1849        UErrorCode          status=U_ZERO_ERROR;
1850        UText               re=UTEXT_INITIALIZER;
1851        utext_openUTF8(&re, "abc", -1, &status);
1852
1853        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1854        REGEX_CHECK_STATUS;
1855        UText input = UTEXT_INITIALIZER;
1856        utext_openUTF8(&input, ".abc..abc...abc..", -1, &status);
1857        //                      012345678901234567
1858
1859        RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1860        REGEX_CHECK_STATUS;
1861        REGEX_ASSERT(matcher->find());
1862        REGEX_ASSERT(matcher->start(status) == 1);
1863        REGEX_ASSERT(matcher->find());
1864        REGEX_ASSERT(matcher->start(status) == 6);
1865        REGEX_ASSERT(matcher->find());
1866        REGEX_ASSERT(matcher->start(status) == 12);
1867        REGEX_ASSERT(matcher->find() == FALSE);
1868        REGEX_ASSERT(matcher->find() == FALSE);
1869
1870        matcher->reset();
1871        REGEX_ASSERT(matcher->find());
1872        REGEX_ASSERT(matcher->start(status) == 1);
1873
1874        REGEX_ASSERT(matcher->find(0, status));
1875        REGEX_ASSERT(matcher->start(status) == 1);
1876        REGEX_ASSERT(matcher->find(1, status));
1877        REGEX_ASSERT(matcher->start(status) == 1);
1878        REGEX_ASSERT(matcher->find(2, status));
1879        REGEX_ASSERT(matcher->start(status) == 6);
1880        REGEX_ASSERT(matcher->find(12, status));
1881        REGEX_ASSERT(matcher->start(status) == 12);
1882        REGEX_ASSERT(matcher->find(13, status) == FALSE);
1883        REGEX_ASSERT(matcher->find(16, status) == FALSE);
1884        REGEX_ASSERT(matcher->find(17, status) == FALSE);
1885        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1886
1887        status = U_ZERO_ERROR;
1888        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1889        status = U_ZERO_ERROR;
1890        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1891
1892        REGEX_ASSERT(matcher->groupCount() == 0);
1893
1894        delete matcher;
1895        delete pat;
1896
1897        utext_close(&input);
1898        utext_close(&re);
1899    }
1900
1901
1902    //
1903    //  find, with \G in pattern (true if at the end of a previous match).
1904    //
1905    {
1906        int32_t             flags=0;
1907        UParseError         pe;
1908        UErrorCode          status=U_ZERO_ERROR;
1909        UText               re=UTEXT_INITIALIZER;
1910        utext_openUTF8(&re, ".*?(?:(\\Gabc)|(abc))", -1, &status);
1911
1912        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1913
1914        REGEX_CHECK_STATUS;
1915        UText input = UTEXT_INITIALIZER;
1916        utext_openUTF8(&input, ".abcabc.abc..", -1, &status);
1917        //                      012345678901234567
1918
1919        RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1920        REGEX_CHECK_STATUS;
1921        REGEX_ASSERT(matcher->find());
1922        REGEX_ASSERT(matcher->start(status) == 0);
1923        REGEX_ASSERT(matcher->start(1, status) == -1);
1924        REGEX_ASSERT(matcher->start(2, status) == 1);
1925
1926        REGEX_ASSERT(matcher->find());
1927        REGEX_ASSERT(matcher->start(status) == 4);
1928        REGEX_ASSERT(matcher->start(1, status) == 4);
1929        REGEX_ASSERT(matcher->start(2, status) == -1);
1930        REGEX_CHECK_STATUS;
1931
1932        delete matcher;
1933        delete pat;
1934
1935        utext_close(&input);
1936        utext_close(&re);
1937    }
1938
1939    //
1940    //   find with zero length matches, match position should bump ahead
1941    //     to prevent loops.
1942    //
1943    {
1944        int32_t                 i;
1945        UErrorCode          status=U_ZERO_ERROR;
1946        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1947                                                      //   using an always-true look-ahead.
1948        REGEX_CHECK_STATUS;
1949        UText s = UTEXT_INITIALIZER;
1950        utext_openUTF8(&s, "    ", -1, &status);
1951        m.reset(&s);
1952        for (i=0; ; i++) {
1953            if (m.find() == FALSE) {
1954                break;
1955            }
1956            REGEX_ASSERT(m.start(status) == i);
1957            REGEX_ASSERT(m.end(status) == i);
1958        }
1959        REGEX_ASSERT(i==5);
1960
1961        // Check that the bump goes over characters outside the BMP OK
1962        // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
1963        unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
1964        utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
1965        m.reset(&s);
1966        for (i=0; ; i+=2) {
1967            if (m.find() == FALSE) {
1968                break;
1969            }
1970            REGEX_ASSERT(m.start(status) == i);
1971            REGEX_ASSERT(m.end(status) == i);
1972        }
1973        REGEX_ASSERT(i==10);
1974
1975        utext_close(&s);
1976    }
1977    {
1978        // find() loop breaking test.
1979        //        with pattern of /.?/, should see a series of one char matches, then a single
1980        //        match of zero length at the end of the input string.
1981        int32_t                 i;
1982        UErrorCode          status=U_ZERO_ERROR;
1983        RegexMatcher        m(".?", 0, status);
1984        REGEX_CHECK_STATUS;
1985        UText s = UTEXT_INITIALIZER;
1986        utext_openUTF8(&s, "    ", -1, &status);
1987        m.reset(&s);
1988        for (i=0; ; i++) {
1989            if (m.find() == FALSE) {
1990                break;
1991            }
1992            REGEX_ASSERT(m.start(status) == i);
1993            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1994        }
1995        REGEX_ASSERT(i==5);
1996
1997        utext_close(&s);
1998    }
1999
2000
2001    //
2002    // Matchers with no input string behave as if they had an empty input string.
2003    //
2004
2005    {
2006        UErrorCode status = U_ZERO_ERROR;
2007        RegexMatcher  m(".?", 0, status);
2008        REGEX_CHECK_STATUS;
2009        REGEX_ASSERT(m.find());
2010        REGEX_ASSERT(m.start(status) == 0);
2011        REGEX_ASSERT(m.input() == "");
2012    }
2013    {
2014        UErrorCode status = U_ZERO_ERROR;
2015        RegexPattern  *p = RegexPattern::compile(".", 0, status);
2016        RegexMatcher  *m = p->matcher(status);
2017        REGEX_CHECK_STATUS;
2018
2019        REGEX_ASSERT(m->find() == FALSE);
2020        REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2021        delete m;
2022        delete p;
2023    }
2024
2025    //
2026    // Regions
2027    //
2028    {
2029        UErrorCode status = U_ZERO_ERROR;
2030        UText testPattern = UTEXT_INITIALIZER;
2031        UText testText    = UTEXT_INITIALIZER;
2032        utext_openUTF8(&testPattern, ".*", -1, &status);
2033        utext_openUTF8(&testText, "This is test data", -1, &status);
2034
2035        RegexMatcher m(&testPattern, &testText, 0, status);
2036        REGEX_CHECK_STATUS;
2037        REGEX_ASSERT(m.regionStart() == 0);
2038        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2039        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2040        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2041
2042        m.region(2,4, status);
2043        REGEX_CHECK_STATUS;
2044        REGEX_ASSERT(m.matches(status));
2045        REGEX_ASSERT(m.start(status)==2);
2046        REGEX_ASSERT(m.end(status)==4);
2047        REGEX_CHECK_STATUS;
2048
2049        m.reset();
2050        REGEX_ASSERT(m.regionStart() == 0);
2051        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2052
2053        utext_openUTF8(&testText, "short", -1, &status);
2054        m.reset(&testText);
2055        REGEX_ASSERT(m.regionStart() == 0);
2056        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2057
2058        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2059        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2060        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2061        REGEX_ASSERT(&m == &m.reset());
2062        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2063
2064        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2065        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2066        REGEX_ASSERT(&m == &m.reset());
2067        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2068
2069        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2070        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2071        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2072        REGEX_ASSERT(&m == &m.reset());
2073        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2074
2075        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2076        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2077        REGEX_ASSERT(&m == &m.reset());
2078        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2079
2080        utext_close(&testText);
2081        utext_close(&testPattern);
2082    }
2083
2084    //
2085    // hitEnd() and requireEnd()
2086    //
2087    {
2088        UErrorCode status = U_ZERO_ERROR;
2089        UText testPattern = UTEXT_INITIALIZER;
2090        UText testText    = UTEXT_INITIALIZER;
2091        utext_openUTF8(&testPattern, ".*", -1, &status);
2092        utext_openUTF8(&testText, "aabb", -1, &status);
2093
2094        RegexMatcher m1(&testPattern, &testText,  0, status);
2095        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2096        REGEX_ASSERT(m1.hitEnd() == TRUE);
2097        REGEX_ASSERT(m1.requireEnd() == FALSE);
2098        REGEX_CHECK_STATUS;
2099
2100        status = U_ZERO_ERROR;
2101        utext_openUTF8(&testPattern, "a*", -1, &status);
2102        RegexMatcher m2(&testPattern, &testText, 0, status);
2103        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2104        REGEX_ASSERT(m2.hitEnd() == FALSE);
2105        REGEX_ASSERT(m2.requireEnd() == FALSE);
2106        REGEX_CHECK_STATUS;
2107
2108        status = U_ZERO_ERROR;
2109        utext_openUTF8(&testPattern, ".*$", -1, &status);
2110        RegexMatcher m3(&testPattern, &testText, 0, status);
2111        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2112        REGEX_ASSERT(m3.hitEnd() == TRUE);
2113        REGEX_ASSERT(m3.requireEnd() == TRUE);
2114        REGEX_CHECK_STATUS;
2115
2116        utext_close(&testText);
2117        utext_close(&testPattern);
2118    }
2119}
2120
2121
2122//---------------------------------------------------------------------------
2123//
2124//      API_Replace_UTF8   API test for class RegexMatcher, testing the
2125//                         Replace family of functions.
2126//
2127//---------------------------------------------------------------------------
2128void RegexTest::API_Replace_UTF8() {
2129    //
2130    //  Replace: exercises RegexMatcher::replaceFirst(), replaceAll(), appendReplacement() and
2131    //  appendTail() with UText arguments; pattern, input and replacement are opened over UTF-8 strings.
2132    int32_t             flags=0;
2133    UParseError         pe;
2134    UErrorCode          status=U_ZERO_ERROR;
2135    // Compile the literal pattern "abc" from a UText opened over a UTF-8 string.
2136    UText               re=UTEXT_INITIALIZER;
2137    utext_openUTF8(&re, "abc", -1, &status);
2138    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2139    REGEX_CHECK_STATUS;
2140    // Subject text; the ruler comment under it gives the last digit of each character offset.
2141    char data[] = ".abc..abc...abc..";
2142    //             012345678901234567
2143    UText dataText = UTEXT_INITIALIZER;
2144    utext_openUTF8(&dataText, data, -1, &status);
2145    RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2146    // NOTE(review): status is not checked right after matcher(); the next REGEX_CHECK_STATUS follows the first replaceFirst().
2147    //
2148    //  Plain vanilla matches.  Here and below each replaceFirst()/replaceAll() call is made twice: with a
2149    //  NULL destination (the test closes the returned UText) and with &destText (asserted to be the return value).
2150    UnicodeString  dest;
2151    UText destText = UTEXT_INITIALIZER;
2152    utext_openUnicodeString(&destText, &dest, &status);
2153    UText *result;
2154    // Replacement text; re-opened with utext_openUTF8() whenever a new replacement string is needed.
2155    UText replText = UTEXT_INITIALIZER;
2156
2157    utext_openUTF8(&replText, "yz", -1, &status);
2158    result = matcher->replaceFirst(&replText, NULL, status);
2159    REGEX_CHECK_STATUS;
2160    REGEX_ASSERT_UTEXT(".yz..abc...abc..", result);
2161    utext_close(result);
2162    result = matcher->replaceFirst(&replText, &destText, status);
2163    REGEX_CHECK_STATUS;
2164    REGEX_ASSERT(result == &destText);
2165    REGEX_ASSERT_UTEXT(".yz..abc...abc..", result);
2166
2167    result = matcher->replaceAll(&replText, NULL, status);
2168    REGEX_CHECK_STATUS;
2169    REGEX_ASSERT_UTEXT(".yz..yz...yz..", result);
2170    utext_close(result);
2171    // Empty destText (replace its whole native range with zero characters) before reusing it as a destination.
2172    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2173    result = matcher->replaceAll(&replText, &destText, status);
2174    REGEX_CHECK_STATUS;
2175    REGEX_ASSERT(result == &destText);
2176    REGEX_ASSERT_UTEXT(".yz..yz...yz..", result);
2177
2178    //
2179    //  Plain vanilla non-matches: "abc" does not occur in the new input, so every
2180    //  replace call is expected to produce the input text unchanged.
2181    utext_openUTF8(&dataText, ".abx..abx...abx..", -1, &status);
2182    matcher->reset(&dataText);
2183
2184    result = matcher->replaceFirst(&replText, NULL, status);
2185    REGEX_CHECK_STATUS;
2186    REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2187    utext_close(result);
2188    result = matcher->replaceFirst(&replText, &destText, status);
2189    REGEX_CHECK_STATUS;
2190    REGEX_ASSERT(result == &destText);
2191    REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2192
2193    result = matcher->replaceAll(&replText, NULL, status);
2194    REGEX_CHECK_STATUS;
2195    REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2196    utext_close(result);
2197    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2198    result = matcher->replaceAll(&replText, &destText, status);
2199    REGEX_CHECK_STATUS;
2200    REGEX_ASSERT(result == &destText);
2201    REGEX_ASSERT_UTEXT(".abx..abx...abx..", result);
2202
2203    //
2204    // Empty source string (dataText is opened over a NULL pointer with length 0);
2205    //   every replace call is expected to produce an empty result.
2206    utext_openUTF8(&dataText, NULL, 0, &status);
2207    matcher->reset(&dataText);
2208
2209    result = matcher->replaceFirst(&replText, NULL, status);
2210    REGEX_CHECK_STATUS;
2211    REGEX_ASSERT_UTEXT("", result);
2212    utext_close(result);
2213    result = matcher->replaceFirst(&replText, &destText, status);
2214    REGEX_CHECK_STATUS;
2215    REGEX_ASSERT(result == &destText);
2216    REGEX_ASSERT_UTEXT("", result);
2217
2218    result = matcher->replaceAll(&replText, NULL, status);
2219    REGEX_CHECK_STATUS;
2220    REGEX_ASSERT_UTEXT("", result);
2221    utext_close(result);
2222    result = matcher->replaceAll(&replText, &destText, status);
2223    REGEX_CHECK_STATUS;
2224    REGEX_ASSERT(result == &destText);
2225    REGEX_ASSERT_UTEXT("", result);
2226
2227    //
2228    // Empty substitution string (replText is opened over a NULL pointer with length 0);
2229    //   matches of "abc" are expected to be removed from the output.
2230    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2231    matcher->reset(&dataText);
2232
2233    utext_openUTF8(&replText, NULL, 0, &status);
2234    result = matcher->replaceFirst(&replText, NULL, status);
2235    REGEX_CHECK_STATUS;
2236    REGEX_ASSERT_UTEXT("...abc...abc..", result);
2237    utext_close(result);
2238    result = matcher->replaceFirst(&replText, &destText, status);
2239    REGEX_CHECK_STATUS;
2240    REGEX_ASSERT(result == &destText);
2241    REGEX_ASSERT_UTEXT("...abc...abc..", result);
2242
2243    result = matcher->replaceAll(&replText, NULL, status);
2244    REGEX_CHECK_STATUS;
2245    REGEX_ASSERT_UTEXT("........", result);
2246    utext_close(result);
2247    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2248    result = matcher->replaceAll(&replText, &destText, status);
2249    REGEX_CHECK_STATUS;
2250    REGEX_ASSERT(result == &destText);
2251    REGEX_ASSERT_UTEXT("........", result);
2252
2253    //
2254    // match whole string: the input is exactly "abc", so the expected result
2255    //   is just the replacement text.
2256    utext_openUTF8(&dataText, "abc", -1, &status);
2257    matcher->reset(&dataText);
2258
2259    utext_openUTF8(&replText, "xyz", -1, &status);
2260    result = matcher->replaceFirst(&replText, NULL, status);
2261    REGEX_CHECK_STATUS;
2262    REGEX_ASSERT_UTEXT("xyz", result);
2263    utext_close(result);
2264    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2265    result = matcher->replaceFirst(&replText, &destText, status);
2266    REGEX_CHECK_STATUS;
2267    REGEX_ASSERT(result == &destText);
2268    REGEX_ASSERT_UTEXT("xyz", result);
2269
2270    result = matcher->replaceAll(&replText, NULL, status);
2271    REGEX_CHECK_STATUS;
2272    REGEX_ASSERT_UTEXT("xyz", result);
2273    utext_close(result);
2274    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2275    result = matcher->replaceAll(&replText, &destText, status);
2276    REGEX_CHECK_STATUS;
2277    REGEX_ASSERT(result == &destText);
2278    REGEX_ASSERT_UTEXT("xyz", result);
2279
2280    //
2281    // Capture Group, simple case: a second pattern/matcher pair (pat2/matcher2) with
2282    //   one capture group, "a(..)", applied to "abcdefg"; $1 in the replacement refers to that group.
2283    utext_openUTF8(&re, "a(..)", -1, &status);
2284    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2285    REGEX_CHECK_STATUS;
2286
2287    utext_openUTF8(&dataText, "abcdefg", -1, &status);
2288    RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2289    REGEX_CHECK_STATUS;
2290
2291    utext_openUTF8(&replText, "$1$1", -1, &status);
2292    result = matcher2->replaceFirst(&replText, NULL, status);
2293    REGEX_CHECK_STATUS;
2294    REGEX_ASSERT_UTEXT("bcbcdefg", result);
2295    utext_close(result);
2296    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2297    result = matcher2->replaceFirst(&replText, &destText, status);
2298    REGEX_CHECK_STATUS;
2299    REGEX_ASSERT(result == &destText);
2300    REGEX_ASSERT_UTEXT("bcbcdefg", result);
2301    // A backslash-escaped dollar sign is expected to come out as a literal '$'; the unescaped $1 is substituted.
2302    utext_openUTF8(&replText, "The value of \\$1 is $1.", -1, &status);
2303    result = matcher2->replaceFirst(&replText, NULL, status);
2304    REGEX_CHECK_STATUS;
2305    REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result);
2306    utext_close(result);
2307    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2308    result = matcher2->replaceFirst(&replText, &destText, status);
2309    REGEX_CHECK_STATUS;
2310    REGEX_ASSERT(result == &destText);
2311    REGEX_ASSERT_UTEXT("The value of $1 is bc.defg", result);
2312    // A '$' that is not followed by a group number is expected to be copied through unchanged.
2313    utext_openUTF8(&replText, "$ by itself, no group number $$$", -1, &status);
2314    result = matcher2->replaceFirst(&replText, NULL, status);
2315    REGEX_CHECK_STATUS;
2316    REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result);
2317    utext_close(result);
2318    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2319    result = matcher2->replaceFirst(&replText, &destText, status);
2320    REGEX_CHECK_STATUS;
2321    REGEX_ASSERT(result == &destText);
2322    REGEX_ASSERT_UTEXT("$ by itself, no group number $$$defg", result);
2323    // '$' followed by a supplementary-plane digit: bytes 22..25 (the "xxxx" placeholder) get the UTF-8 bytes F0 9D 9F 8F.
2324    unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2325    //                                 012345678901234567890123456
2326    supplDigitChars[22] = 0xF0;
2327    supplDigitChars[23] = 0x9D;
2328    supplDigitChars[24] = 0x9F;
2329    supplDigitChars[25] = 0x8F;
2330    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2331    // The expected text below shows '$' + that digit being substituted like $1 (group 1 is "bc").
2332    result = matcher2->replaceFirst(&replText, NULL, status);
2333    REGEX_CHECK_STATUS;
2334    REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result);
2335    utext_close(result);
2336    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2337    result = matcher2->replaceFirst(&replText, &destText, status);
2338    REGEX_CHECK_STATUS;
2339    REGEX_ASSERT(result == &destText);
2340    REGEX_ASSERT_UTEXT("Supplemental Digit 1 bc.defg", result);
2341    // $5 names a group that "a(..)" does not have; both calls are expected to report U_INDEX_OUTOFBOUNDS_ERROR.
2342    utext_openUTF8(&replText, "bad capture group number $5...", -1, &status);
2343    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2344//    REGEX_ASSERT_UTEXT("abcdefg", result);
2345    utext_close(result);
2346    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2347    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2348    REGEX_ASSERT(result == &destText);
2349//    REGEX_ASSERT_UTEXT("abcdefg", result);
2350
2351    //
2352    // Replacement String with \u hex escapes
2353    //   (these two cases go back to the first matcher, i.e. the "abc" pattern)
2354    {
2355        utext_openUTF8(&dataText, "abc 1 abc 2 abc 3", -1, &status);
2356        utext_openUTF8(&replText, "--\\u0043--", -1, &status);
2357        matcher->reset(&dataText);
2358        // The four-hex-digit escape for U+0043 in the replacement is expected to come out as 'C'.
2359        result = matcher->replaceAll(&replText, NULL, status);
2360        REGEX_CHECK_STATUS;
2361        REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result);
2362        utext_close(result);
2363        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2364        result = matcher->replaceAll(&replText, &destText, status);
2365        REGEX_CHECK_STATUS;
2366        REGEX_ASSERT(result == &destText);
2367        REGEX_ASSERT_UTEXT("--C-- 1 --C-- 2 --C-- 3", result);
2368    }
2369    {
2370        utext_openUTF8(&dataText, "abc !", -1, &status);
2371        utext_openUTF8(&replText, "--\\U00010000--", -1, &status);
2372        matcher->reset(&dataText);
2373        // Build the expected UTF-8 text: the "xxxx" placeholder (bytes 2..5) is overwritten with F0 90 80 80.
2374        unsigned char expected[] = "--xxxx-- !"; // \U00010000, "LINEAR B SYLLABLE B008 A"
2375        //                          0123456789
2376        expected[2] = 0xF0;
2377        expected[3] = 0x90;
2378        expected[4] = 0x80;
2379        expected[5] = 0x80;
2380
2381        result = matcher->replaceAll(&replText, NULL, status);
2382        REGEX_CHECK_STATUS;
2383        REGEX_ASSERT_UTEXT((char *)expected, result);
2384        utext_close(result);
2385        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2386        result = matcher->replaceAll(&replText, &destText, status);
2387        REGEX_CHECK_STATUS;
2388        REGEX_ASSERT(result == &destText);
2389        REGEX_ASSERT_UTEXT((char *)expected, result);
2390    }
2391    // TODO:  need more thorough testing of capture substitutions.
2392
2393    // Bug 4057
2394    //   Checks where appendReplacement() starts copying after various find()/reset() sequences (cases below).
2395    {
2396        status = U_ZERO_ERROR;
2397        utext_openUTF8(&re, "ss(.*?)ee", -1, &status);
2398        utext_openUTF8(&dataText, "The matches start with ss and end with ee ss stuff ee fin", -1, &status);
2399        utext_openUTF8(&replText, "ooh", -1, &status);
2400
2401        RegexMatcher m(&re, 0, status);
2402        REGEX_CHECK_STATUS;
2403        // NOTE: inside this block the local UnicodeString 'result' shadows the function-level 'UText *result'.
2404        UnicodeString result;
2405        UText resultText = UTEXT_INITIALIZER;
2406        utext_openUnicodeString(&resultText, &result, &status);
2407
2408        // Multiple finds do NOT bump up the previous appendReplacement position.
2409        m.reset(&dataText);
2410        m.find();
2411        m.find();
2412        m.appendReplacement(&resultText, &replText, status);
2413        REGEX_CHECK_STATUS;
2414        REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
2415
2416        // After a reset into the interior of a string, appendReplacement still starts at beginning.
2417        status = U_ZERO_ERROR;
2418        result.truncate(0);
2419        utext_openUnicodeString(&resultText, &result, &status);
2420        m.reset(10, status);
2421        m.find();
2422        m.find();
2423        m.appendReplacement(&resultText, &replText, status);
2424        REGEX_CHECK_STATUS;
2425        REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
2426
2427        // find() at interior of string, appendReplacement still starts at beginning.
2428        status = U_ZERO_ERROR;
2429        result.truncate(0);
2430        utext_openUnicodeString(&resultText, &result, &status);
2431        m.reset();
2432        m.find(10, status);
2433        m.find();
2434        m.appendReplacement(&resultText, &replText, status);
2435        REGEX_CHECK_STATUS;
2436        REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh", &resultText);
2437        // appendTail() is expected to add the input that follows the last match (here " fin").
2438        m.appendTail(&resultText);
2439        REGEX_ASSERT_UTEXT("The matches start with ss and end with ee ooh fin", &resultText);
2440
2441        utext_close(&resultText);
2442    }
2443    // Release the heap-allocated matchers and patterns created above.
2444    delete matcher2;
2445    delete pat2;
2446    delete matcher;
2447    delete pat;
2448    // Close the local UText structs that were opened (and re-opened) during the test.
2449    utext_close(&dataText);
2450    utext_close(&replText);
2451    utext_close(&destText);
2452    utext_close(&re);
2453}
2454
2455
2456//---------------------------------------------------------------------------
2457//
2458//      API_Pattern_UTF8  Test that the API for class RegexPattern is
2459//                        present and nominally working.
2460//
2461//---------------------------------------------------------------------------
2462void RegexTest::API_Pattern_UTF8() {
2463    RegexPattern        pata;    // Test default constructor to not crash.
2464    RegexPattern        patb;
2465
2466    REGEX_ASSERT(pata == patb);
2467    REGEX_ASSERT(pata == pata);
2468
2469    UText         re1 = UTEXT_INITIALIZER;
2470    UText         re2 = UTEXT_INITIALIZER;
2471    UErrorCode    status = U_ZERO_ERROR;
2472    UParseError   pe;
2473
2474    utext_openUTF8(&re1, "abc[a-l][m-z]", -1, &status);
2475    utext_openUTF8(&re2, "def", -1, &status);
2476
2477    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2478    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2479    REGEX_CHECK_STATUS;
2480    REGEX_ASSERT(*pat1 == *pat1);
2481    REGEX_ASSERT(*pat1 != pata);
2482
2483    // Assign
2484    patb = *pat1;
2485    REGEX_ASSERT(patb == *pat1);
2486
2487    // Copy Construct
2488    RegexPattern patc(*pat1);
2489    REGEX_ASSERT(patc == *pat1);
2490    REGEX_ASSERT(patb == patc);
2491    REGEX_ASSERT(pat1 != pat2);
2492    patb = *pat2;
2493    REGEX_ASSERT(patb != patc);
2494    REGEX_ASSERT(patb == *pat2);
2495
2496    // Compile with no flags.
2497    RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2498    REGEX_ASSERT(*pat1a == *pat1);
2499
2500    REGEX_ASSERT(pat1a->flags() == 0);
2501
2502    // Compile with different flags should be not equal
2503    RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2504    REGEX_CHECK_STATUS;
2505
2506    REGEX_ASSERT(*pat1b != *pat1a);
2507    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2508    REGEX_ASSERT(pat1a->flags() == 0);
2509    delete pat1b;
2510
2511    // clone
2512    RegexPattern *pat1c = pat1->clone();
2513    REGEX_ASSERT(*pat1c == *pat1);
2514    REGEX_ASSERT(*pat1c != *pat2);
2515
2516    delete pat1c;
2517    delete pat1a;
2518    delete pat1;
2519    delete pat2;
2520
2521    utext_close(&re1);
2522    utext_close(&re2);
2523
2524
2525    //
2526    //   Verify that a matcher created from a cloned pattern works.
2527    //     (Jitterbug 3423)
2528    //
2529    {
2530        UErrorCode     status     = U_ZERO_ERROR;
2531        UText          pattern    = UTEXT_INITIALIZER;
2532        utext_openUTF8(&pattern, "\\p{L}+", -1, &status);
2533
2534        RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2535        RegexPattern  *pClone     = pSource->clone();
2536        delete         pSource;
2537        RegexMatcher  *mFromClone = pClone->matcher(status);
2538        REGEX_CHECK_STATUS;
2539
2540        UText          input      = UTEXT_INITIALIZER;
2541        utext_openUTF8(&input, "Hello World", -1, &status);
2542        mFromClone->reset(&input);
2543        REGEX_ASSERT(mFromClone->find() == TRUE);
2544        REGEX_ASSERT(mFromClone->group(status) == "Hello");
2545        REGEX_ASSERT(mFromClone->find() == TRUE);
2546        REGEX_ASSERT(mFromClone->group(status) == "World");
2547        REGEX_ASSERT(mFromClone->find() == FALSE);
2548        delete mFromClone;
2549        delete pClone;
2550
2551        utext_close(&input);
2552        utext_close(&pattern);
2553    }
2554
2555    //
2556    //   matches convenience API
2557    //
2558    {
2559        UErrorCode status  = U_ZERO_ERROR;
2560        UText      pattern = UTEXT_INITIALIZER;
2561        UText      input   = UTEXT_INITIALIZER;
2562
2563        utext_openUTF8(&input, "random input", -1, &status);
2564
2565        utext_openUTF8(&pattern, ".*", -1, &status);
2566        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2567        REGEX_CHECK_STATUS;
2568
2569        utext_openUTF8(&pattern, "abc", -1, &status);
2570        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2571        REGEX_CHECK_STATUS;
2572
2573        utext_openUTF8(&pattern, ".*nput", -1, &status);
2574        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2575        REGEX_CHECK_STATUS;
2576
2577        utext_openUTF8(&pattern, "random input", -1, &status);
2578        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2579        REGEX_CHECK_STATUS;
2580
2581        utext_openUTF8(&pattern, ".*u", -1, &status);
2582        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2583        REGEX_CHECK_STATUS;
2584
2585        utext_openUTF8(&input, "abc", -1, &status);
2586        utext_openUTF8(&pattern, "abc", -1, &status);
2587        status = U_INDEX_OUTOFBOUNDS_ERROR;
2588        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2589        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2590
2591        utext_close(&input);
2592        utext_close(&pattern);
2593    }
2594
2595
2596    //
2597    // Split()
2598    //
2599    status = U_ZERO_ERROR;
2600    utext_openUTF8(&re1, " +", -1, &status);
2601    pat1 = RegexPattern::compile(&re1, pe, status);
2602    REGEX_CHECK_STATUS;
2603    UnicodeString  fields[10];
2604
2605    int32_t n;
2606    n = pat1->split("Now is the time", fields, 10, status);
2607    REGEX_CHECK_STATUS;
2608    REGEX_ASSERT(n==4);
2609    REGEX_ASSERT(fields[0]=="Now");
2610    REGEX_ASSERT(fields[1]=="is");
2611    REGEX_ASSERT(fields[2]=="the");
2612    REGEX_ASSERT(fields[3]=="time");
2613    REGEX_ASSERT(fields[4]=="");
2614
2615    n = pat1->split("Now is the time", fields, 2, status);
2616    REGEX_CHECK_STATUS;
2617    REGEX_ASSERT(n==2);
2618    REGEX_ASSERT(fields[0]=="Now");
2619    REGEX_ASSERT(fields[1]=="is the time");
2620    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2621
2622    fields[1] = "*";
2623    status = U_ZERO_ERROR;
2624    n = pat1->split("Now is the time", fields, 1, status);
2625    REGEX_CHECK_STATUS;
2626    REGEX_ASSERT(n==1);
2627    REGEX_ASSERT(fields[0]=="Now is the time");
2628    REGEX_ASSERT(fields[1]=="*");
2629    status = U_ZERO_ERROR;
2630
2631    n = pat1->split("    Now       is the time   ", fields, 10, status);
2632    REGEX_CHECK_STATUS;
2633    REGEX_ASSERT(n==5);
2634    REGEX_ASSERT(fields[0]=="");
2635    REGEX_ASSERT(fields[1]=="Now");
2636    REGEX_ASSERT(fields[2]=="is");
2637    REGEX_ASSERT(fields[3]=="the");
2638    REGEX_ASSERT(fields[4]=="time");
2639    REGEX_ASSERT(fields[5]=="");
2640
2641    n = pat1->split("     ", fields, 10, status);
2642    REGEX_CHECK_STATUS;
2643    REGEX_ASSERT(n==1);
2644    REGEX_ASSERT(fields[0]=="");
2645
2646    fields[0] = "foo";
2647    n = pat1->split("", fields, 10, status);
2648    REGEX_CHECK_STATUS;
2649    REGEX_ASSERT(n==0);
2650    REGEX_ASSERT(fields[0]=="foo");
2651
2652    delete pat1;
2653
2654    //  split, with a pattern with (capture)
2655    utext_openUTF8(&re1, "<(\\w*)>", -1, &status);
2656    pat1 = RegexPattern::compile(&re1,  pe, status);
2657    REGEX_CHECK_STATUS;
2658
2659    status = U_ZERO_ERROR;
2660    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2661    REGEX_CHECK_STATUS;
2662    REGEX_ASSERT(n==6);
2663    REGEX_ASSERT(fields[0]=="");
2664    REGEX_ASSERT(fields[1]=="a");
2665    REGEX_ASSERT(fields[2]=="Now is ");
2666    REGEX_ASSERT(fields[3]=="b");
2667    REGEX_ASSERT(fields[4]=="the time");
2668    REGEX_ASSERT(fields[5]=="c");
2669    REGEX_ASSERT(fields[6]=="");
2670    REGEX_ASSERT(status==U_ZERO_ERROR);
2671
2672    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2673    REGEX_CHECK_STATUS;
2674    REGEX_ASSERT(n==6);
2675    REGEX_ASSERT(fields[0]=="  ");
2676    REGEX_ASSERT(fields[1]=="a");
2677    REGEX_ASSERT(fields[2]=="Now is ");
2678    REGEX_ASSERT(fields[3]=="b");
2679    REGEX_ASSERT(fields[4]=="the time");
2680    REGEX_ASSERT(fields[5]=="c");
2681    REGEX_ASSERT(fields[6]=="");
2682
2683    status = U_ZERO_ERROR;
2684    fields[6] = "foo";
2685    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
2686    REGEX_CHECK_STATUS;
2687    REGEX_ASSERT(n==6);
2688    REGEX_ASSERT(fields[0]=="  ");
2689    REGEX_ASSERT(fields[1]=="a");
2690    REGEX_ASSERT(fields[2]=="Now is ");
2691    REGEX_ASSERT(fields[3]=="b");
2692    REGEX_ASSERT(fields[4]=="the time");
2693    REGEX_ASSERT(fields[5]=="c");
2694    REGEX_ASSERT(fields[6]=="foo");
2695
2696    status = U_ZERO_ERROR;
2697    fields[5] = "foo";
2698    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
2699    REGEX_CHECK_STATUS;
2700    REGEX_ASSERT(n==5);
2701    REGEX_ASSERT(fields[0]=="  ");
2702    REGEX_ASSERT(fields[1]=="a");
2703    REGEX_ASSERT(fields[2]=="Now is ");
2704    REGEX_ASSERT(fields[3]=="b");
2705    REGEX_ASSERT(fields[4]=="the time<c>");
2706    REGEX_ASSERT(fields[5]=="foo");
2707
2708    status = U_ZERO_ERROR;
2709    fields[5] = "foo";
2710    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
2711    REGEX_CHECK_STATUS;
2712    REGEX_ASSERT(n==5);
2713    REGEX_ASSERT(fields[0]=="  ");
2714    REGEX_ASSERT(fields[1]=="a");
2715    REGEX_ASSERT(fields[2]=="Now is ");
2716    REGEX_ASSERT(fields[3]=="b");
2717    REGEX_ASSERT(fields[4]=="the time");
2718    REGEX_ASSERT(fields[5]=="foo");
2719
2720    status = U_ZERO_ERROR;
2721    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
2722    REGEX_CHECK_STATUS;
2723    REGEX_ASSERT(n==4);
2724    REGEX_ASSERT(fields[0]=="  ");
2725    REGEX_ASSERT(fields[1]=="a");
2726    REGEX_ASSERT(fields[2]=="Now is ");
2727    REGEX_ASSERT(fields[3]=="the time<c>");
2728    status = U_ZERO_ERROR;
2729    delete pat1;
2730
2731    utext_openUTF8(&re1, "([-,])", -1, &status);
2732    pat1 = RegexPattern::compile(&re1, pe, status);
2733    REGEX_CHECK_STATUS;
2734    n = pat1->split("1-10,20", fields, 10, status);
2735    REGEX_CHECK_STATUS;
2736    REGEX_ASSERT(n==5);
2737    REGEX_ASSERT(fields[0]=="1");
2738    REGEX_ASSERT(fields[1]=="-");
2739    REGEX_ASSERT(fields[2]=="10");
2740    REGEX_ASSERT(fields[3]==",");
2741    REGEX_ASSERT(fields[4]=="20");
2742    delete pat1;
2743
2744
2745    //
2746    // RegexPattern::pattern() and patternText()
2747    //
2748    pat1 = new RegexPattern();
2749    REGEX_ASSERT(pat1->pattern() == "");
2750    REGEX_ASSERT_UTEXT("", pat1->patternText());
2751    delete pat1;
2752
2753    utext_openUTF8(&re1, "(Hello, world)*", -1, &status);
2754    pat1 = RegexPattern::compile(&re1, pe, status);
2755    REGEX_CHECK_STATUS;
2756    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
2757    REGEX_ASSERT_UTEXT("(Hello, world)*", pat1->patternText());
2758    delete pat1;
2759
2760    utext_close(&re1);
2761}
2762
2763
2764//---------------------------------------------------------------------------
2765//
2766//      Extended       A more thorough check for features of regex patterns
2767//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
2769//                     A description of the test data format is included in that file.
2770//
2771//---------------------------------------------------------------------------
2772
2773const char *
2774RegexTest::getPath(char buffer[2048], const char *filename) {
2775    UErrorCode status=U_ZERO_ERROR;
2776    const char *testDataDirectory = IntlTest::getSourceTestData(status);
2777    if (U_FAILURE(status)) {
2778        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
2779        return NULL;
2780    }
2781
2782    strcpy(buffer, testDataDirectory);
2783    strcat(buffer, filename);
2784    return buffer;
2785}
2786
2787void RegexTest::Extended() {
2788    char tdd[2048];
2789    const char *srcPath;
2790    UErrorCode  status  = U_ZERO_ERROR;
2791    int32_t     lineNum = 0;
2792
2793    //
2794    //  Open and read the test data file.
2795    //
2796    srcPath=getPath(tdd, "regextst.txt");
2797    if(srcPath==NULL) {
2798        return; /* something went wrong, error already output */
2799    }
2800
2801    int32_t    len;
2802    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
2803    if (U_FAILURE(status)) {
2804        return; /* something went wrong, error already output */
2805    }
2806
2807    //
2808    //  Put the test data into a UnicodeString
2809    //
2810    UnicodeString testString(FALSE, testData, len);
2811
2812    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
2813    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
2814    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
2815
2816    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
2817    UnicodeString   testPattern;   // The pattern for test from the test file.
2818    UnicodeString   testFlags;     // the flags   for a test.
2819    UnicodeString   matchString;   // The marked up string to be used as input
2820
2821    if (U_FAILURE(status)){
2822        dataerrln("Construct RegexMatcher() error.");
2823        delete [] testData;
2824        return;
2825    }
2826
2827    //
2828    //  Loop over the test data file, once per line.
2829    //
2830    while (lineMat.find()) {
2831        lineNum++;
2832        if (U_FAILURE(status)) {
2833            errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
2834        }
2835
2836        status = U_ZERO_ERROR;
2837        UnicodeString testLine = lineMat.group(1, status);
2838        if (testLine.length() == 0) {
2839            continue;
2840        }
2841
2842        //
2843        // Parse the test line.  Skip blank and comment only lines.
2844        // Separate out the three main fields - pattern, flags, target.
2845        //
2846
2847        commentMat.reset(testLine);
2848        if (commentMat.lookingAt(status)) {
2849            // This line is a comment, or blank.
2850            continue;
2851        }
2852
2853        //
2854        //  Pull out the pattern field, remove it from the test file line.
2855        //
2856        quotedStuffMat.reset(testLine);
2857        if (quotedStuffMat.lookingAt(status)) {
2858            testPattern = quotedStuffMat.group(2, status);
2859            testLine.remove(0, quotedStuffMat.end(0, status));
2860        } else {
2861            errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
2862            continue;
2863        }
2864
2865
2866        //
2867        //  Pull out the flags from the test file line.
2868        //
2869        flagsMat.reset(testLine);
2870        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
2871        testFlags = flagsMat.group(1, status);
2872        if (flagsMat.group(2, status).length() > 0) {
2873            errln("Bad Match flag at line %d. Scanning %c\n",
2874                lineNum, flagsMat.group(2, status).charAt(0));
2875            continue;
2876        }
2877        testLine.remove(0, flagsMat.end(0, status));
2878
2879        //
2880        //  Pull out the match string, as a whole.
2881        //    We'll process the <tags> later.
2882        //
2883        quotedStuffMat.reset(testLine);
2884        if (quotedStuffMat.lookingAt(status)) {
2885            matchString = quotedStuffMat.group(2, status);
2886            testLine.remove(0, quotedStuffMat.end(0, status));
2887        } else {
2888            errln("Bad match string at test file line %d", lineNum);
2889            continue;
2890        }
2891
2892        //
2893        //  The only thing left from the input line should be an optional trailing comment.
2894        //
2895        commentMat.reset(testLine);
2896        if (commentMat.lookingAt(status) == FALSE) {
2897            errln("Line %d: unexpected characters at end of test line.", lineNum);
2898            continue;
2899        }
2900
2901        //
2902        //  Run the test
2903        //
2904        regex_find(testPattern, testFlags, matchString, lineNum);
2905    }
2906
2907    delete [] testData;
2908
2909}
2910
2911
2912
2913//---------------------------------------------------------------------------
2914//
2915//    regex_find(pattern, flags, inputString, lineNumber)
2916//
2917//         Function to run a single test from the Extended (data driven) tests.
2918//         See file test/testdata/regextst.txt for a description of the
2919//         pattern and inputString fields, and the allowed flags.
2920//         lineNumber is the source line in regextst.txt of the test.
2921//
2922//---------------------------------------------------------------------------
2923
2924
2925//  Set a value into a UVector at position specified by a decimal number in
2926//   a UnicodeString.   This is a utility function needed by the actual test function,
2927//   which follows.
2928static void set(UVector &vec, int32_t val, UnicodeString index) {
2929    UErrorCode  status=U_ZERO_ERROR;
2930    int32_t  idx = 0;
2931    for (int32_t i=0; i<index.length(); i++) {
2932        int32_t d=u_charDigitValue(index.charAt(i));
2933        if (d<0) {return;}
2934        idx = idx*10 + d;
2935    }
2936    while (vec.size()<idx+1) {vec.addElement(-1, status);}
2937    vec.setElementAt(val, idx);
2938}
2939
2940void RegexTest::regex_find(const UnicodeString &pattern,
2941                           const UnicodeString &flags,
2942                           const UnicodeString &inputString,
2943                           int32_t line) {
2944    UnicodeString       unEscapedInput;
2945    UnicodeString       deTaggedInput;
2946
2947    int32_t             patternUTF8Length,      inputUTF8Length;
2948    char                *patternChars  = NULL, *inputChars = NULL;
2949    UText               patternText    = UTEXT_INITIALIZER;
2950    UText               inputText      = UTEXT_INITIALIZER;
2951    UConverter          *UTF8Converter = NULL;
2952
2953    UErrorCode          status         = U_ZERO_ERROR;
2954    UParseError         pe;
2955    RegexPattern        *parsePat      = NULL;
2956    RegexMatcher        *parseMatcher  = NULL;
2957    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
2958    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
2959    UVector             groupStarts(status);
2960    UVector             groupEnds(status);
2961    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
2962    UBool               failed         = FALSE;
2963    int32_t             numFinds;
2964    int32_t             i;
2965    UBool               useMatchesFunc   = FALSE;
2966    UBool               useLookingAtFunc = FALSE;
2967    int32_t             regionStart      = -1;
2968    int32_t             regionEnd        = -1;
2969
2970    //
2971    //  Compile the caller's pattern
2972    //
2973    uint32_t bflags = 0;
2974    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
2975        bflags |= UREGEX_CASE_INSENSITIVE;
2976    }
2977    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
2978        bflags |= UREGEX_COMMENTS;
2979    }
2980    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
2981        bflags |= UREGEX_DOTALL;
2982    }
2983    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
2984        bflags |= UREGEX_MULTILINE;
2985    }
2986
2987    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
2988        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
2989    }
2990    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
2991        bflags |= UREGEX_UNIX_LINES;
2992    }
2993
2994
2995    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
2996    if (status != U_ZERO_ERROR) {
2997        #if UCONFIG_NO_BREAK_ITERATION==1
2998        // 'v' test flag means that the test pattern should not compile if ICU was configured
2999        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3000        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3001            goto cleanupAndReturn;
3002        }
3003        #endif
3004        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3005            // Expected pattern compilation error.
3006            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3007                logln("Pattern Compile returns \"%s\"", u_errorName(status));
3008            }
3009            goto cleanupAndReturn;
3010        } else {
3011            // Unexpected pattern compilation error.
3012            errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3013            goto cleanupAndReturn;
3014        }
3015    }
3016
3017    UTF8Converter = ucnv_open("UTF8", &status);
3018    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3019
3020    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3021    status = U_ZERO_ERROR; // buffer overflow
3022    patternChars = new char[patternUTF8Length+1];
3023    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3024    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3025
3026    if (status == U_ZERO_ERROR) {
3027        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3028
3029        if (status != U_ZERO_ERROR) {
3030#if UCONFIG_NO_BREAK_ITERATION==1
3031            // 'v' test flag means that the test pattern should not compile if ICU was configured
3032            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3033            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3034                goto cleanupAndReturn;
3035            }
3036#endif
3037            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3038                // Expected pattern compilation error.
3039                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3040                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3041                }
3042                goto cleanupAndReturn;
3043            } else {
3044                // Unexpected pattern compilation error.
3045                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3046                goto cleanupAndReturn;
3047            }
3048        }
3049    }
3050
3051    if (UTF8Pattern == NULL) {
3052        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3053        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for line %d", line);
3054        status = U_ZERO_ERROR;
3055    }
3056
3057    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3058        RegexPatternDump(callerPattern);
3059    }
3060
3061    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3062        errln("Expected, but did not get, a pattern compilation error.");
3063        goto cleanupAndReturn;
3064    }
3065
3066
3067    //
3068    // Number of times find() should be called on the test string, default to 1
3069    //
3070    numFinds = 1;
3071    for (i=2; i<=9; i++) {
3072        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3073            if (numFinds != 1) {
3074                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3075                goto cleanupAndReturn;
3076            }
3077            numFinds = i;
3078        }
3079    }
3080
3081    // 'M' flag.  Use matches() instead of find()
3082    if (flags.indexOf((UChar)0x4d) >= 0) {
3083        useMatchesFunc = TRUE;
3084    }
3085    if (flags.indexOf((UChar)0x4c) >= 0) {
3086        useLookingAtFunc = TRUE;
3087    }
3088
3089    //
3090    //  Find the tags in the input data, remove them, and record the group boundary
3091    //    positions.
3092    //
3093    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3094    REGEX_CHECK_STATUS_L(line);
3095
3096    unEscapedInput = inputString.unescape();
3097    parseMatcher = parsePat->matcher(unEscapedInput, status);
3098    REGEX_CHECK_STATUS_L(line);
3099    while(parseMatcher->find()) {
3100        parseMatcher->appendReplacement(deTaggedInput, "", status);
3101        REGEX_CHECK_STATUS;
3102        UnicodeString groupNum = parseMatcher->group(2, status);
3103        if (groupNum == "r") {
3104            // <r> or </r>, a region specification within the string
3105            if (parseMatcher->group(1, status) == "/") {
3106                regionEnd = deTaggedInput.length();
3107            } else {
3108                regionStart = deTaggedInput.length();
3109            }
3110        } else {
3111            // <digits> or </digits>, a group match boundary tag.
3112            if (parseMatcher->group(1, status) == "/") {
3113                set(groupEnds, deTaggedInput.length(), groupNum);
3114            } else {
3115                set(groupStarts, deTaggedInput.length(), groupNum);
3116            }
3117        }
3118    }
3119    parseMatcher->appendTail(deTaggedInput);
3120    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3121    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3122      errln("mismatched <r> tags");
3123      failed = TRUE;
3124      goto cleanupAndReturn;
3125    }
3126
3127
3128    //
3129    //  Configure the matcher according to the flags specified with this test.
3130    //
3131    matcher = callerPattern->matcher(deTaggedInput, status);
3132    REGEX_CHECK_STATUS_L(line);
3133    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3134        matcher->setTrace(TRUE);
3135    }
3136
3137    if (UTF8Pattern != NULL) {
3138        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3139        status = U_ZERO_ERROR; // buffer overflow
3140        inputChars = new char[inputUTF8Length+1];
3141        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3142        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3143
3144        if (status == U_ZERO_ERROR) {
3145            UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
3146            REGEX_CHECK_STATUS_L(line);
3147        }
3148
3149        if (UTF8Matcher == NULL) {
3150            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3151            logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for line %d", line);
3152            status = U_ZERO_ERROR;
3153        }
3154    }
3155
3156    if (regionStart>=0) {
3157       matcher->region(regionStart, regionEnd, status);
3158       REGEX_CHECK_STATUS_L(line);
3159       if (UTF8Matcher != NULL) {
3160           UTF8Matcher->region(regionStart, regionEnd, status);
3161           REGEX_CHECK_STATUS_L(line);
3162       }
3163    }
3164    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3165        matcher->useAnchoringBounds(FALSE);
3166        if (UTF8Matcher != NULL) {
3167            UTF8Matcher->useAnchoringBounds(FALSE);
3168        }
3169    }
3170    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3171        matcher->useTransparentBounds(TRUE);
3172        if (UTF8Matcher != NULL) {
3173            UTF8Matcher->useTransparentBounds(TRUE);
3174        }
3175    }
3176
3177
3178
3179    //
3180    // Do a find on the de-tagged input using the caller's pattern
3181    //     TODO: error on count>1 and not find().
3182    //           error on both matches() and lookingAt().
3183    //
3184    for (i=0; i<numFinds; i++) {
3185        if (useMatchesFunc) {
3186            isMatch = matcher->matches(status);
3187            if (UTF8Matcher != NULL) {
3188               isUTF8Match = UTF8Matcher->matches(status);
3189            }
3190        } else  if (useLookingAtFunc) {
3191            isMatch = matcher->lookingAt(status);
3192            if (UTF8Matcher != NULL) {
3193                isUTF8Match = UTF8Matcher->lookingAt(status);
3194            }
3195        } else {
3196            isMatch = matcher->find();
3197            if (UTF8Matcher != NULL) {
3198                isUTF8Match = UTF8Matcher->find();
3199            }
3200        }
3201    }
3202    matcher->setTrace(FALSE);
3203
3204    //
3205    // Match up the groups from the find() with the groups from the tags
3206    //
3207
3208    // number of tags should match number of groups from find operation.
3209    // matcher->groupCount does not include group 0, the entire match, hence the +1.
3210    //   G option in test means that capture group data is not available in the
3211    //     expected results, so the check needs to be suppressed.
3212    if (isMatch == FALSE && groupStarts.size() != 0) {
3213        errln("Error at line %d:  Match expected, but none found.", line);
3214        failed = TRUE;
3215        goto cleanupAndReturn;
3216    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3217        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3218        failed = TRUE;
3219        goto cleanupAndReturn;
3220    }
3221
3222    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3223        // Only check for match / no match.  Don't check capture groups.
3224        if (isMatch && groupStarts.size() == 0) {
3225            errln("Error at line %d:  No match expected, but one found.", line);
3226            failed = TRUE;
3227        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3228            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3229            failed = TRUE;
3230        }
3231        goto cleanupAndReturn;
3232    }
3233
3234    REGEX_CHECK_STATUS_L(line);
3235    for (i=0; i<=matcher->groupCount(); i++) {
3236        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3237        if (matcher->start(i, status) != expectedStart) {
3238            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3239                line, i, expectedStart, matcher->start(i, status));
3240            failed = TRUE;
3241            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3242        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStart) {
3243            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3244                  line, i, expectedStart, UTF8Matcher->start(i, status));
3245            failed = TRUE;
3246            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3247        }
3248
3249        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3250        if (matcher->end(i, status) != expectedEnd) {
3251            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3252                line, i, expectedEnd, matcher->end(i, status));
3253            failed = TRUE;
3254            // Error on end position;  keep going; real error is probably yet to come as group
3255            //   end positions work from end of the input data towards the front.
3256        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEnd) {
3257            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3258                  line, i, expectedEnd, UTF8Matcher->end(i, status));
3259            failed = TRUE;
3260            // Error on end position;  keep going; real error is probably yet to come as group
3261            //   end positions work from end of the input data towards the front.
3262        }
3263    }
3264    if ( matcher->groupCount()+1 < groupStarts.size()) {
3265        errln("Error at line %d: Expected %d capture groups, found %d.",
3266            line, groupStarts.size()-1, matcher->groupCount());
3267        failed = TRUE;
3268        }
3269    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3270        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3271              line, groupStarts.size()-1, UTF8Matcher->groupCount());
3272        failed = TRUE;
3273    }
3274
3275    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3276        matcher->requireEnd() == TRUE) {
3277        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3278        failed = TRUE;
3279    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3280        UTF8Matcher->requireEnd() == TRUE) {
3281        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3282        failed = TRUE;
3283    }
3284
3285    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3286        matcher->requireEnd() == FALSE) {
3287        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3288        failed = TRUE;
3289    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3290        UTF8Matcher->requireEnd() == FALSE) {
3291        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3292        failed = TRUE;
3293    }
3294
3295    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3296        matcher->hitEnd() == TRUE) {
3297        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3298        failed = TRUE;
3299    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3300               UTF8Matcher->hitEnd() == TRUE) {
3301        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3302        failed = TRUE;
3303    }
3304
3305    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3306        matcher->hitEnd() == FALSE) {
3307        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3308        failed = TRUE;
3309    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3310               UTF8Matcher->hitEnd() == FALSE) {
3311        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3312        failed = TRUE;
3313    }
3314
3315
3316cleanupAndReturn:
3317    if (failed) {
3318        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3319            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3320        // callerPattern->dump();
3321    }
3322    delete parseMatcher;
3323    delete parsePat;
3324    delete UTF8Matcher;
3325    delete UTF8Pattern;
3326    delete matcher;
3327    delete callerPattern;
3328
3329    utext_close(&inputText);
3330    delete[] inputChars;
3331    utext_close(&patternText);
3332    delete[] patternChars;
3333    ucnv_close(UTF8Converter);
3334}
3335
3336
3337
3338
3339//---------------------------------------------------------------------------
3340//
3341//      Errors     Check for error handling in patterns.
3342//
3343//---------------------------------------------------------------------------
3344void RegexTest::Errors() {
    // Feeds a series of deliberately malformed patterns to the regex compiler.
    //   Each REGEX_ERR entry gives a pattern followed by two integers and a
    //   UErrorCode.  The macro is defined outside this section; presumably the
    //   integers are the expected error line and column and the code is the
    //   expected compile status -- TODO confirm against the macro definition.
    //   (The "abc\ndef(*2)" entry, the only pattern containing a new line, is
    //   also the only one whose first integer is 2, which fits that reading.)

3345    // \escape sequences that aren't implemented yet.
3346    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3347
3348    // Missing close parentheses
3349    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3350    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3351    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3352
3353    // Extra close paren
3354    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3355    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    // Unbalanced the other way: only opening parentheses, none ever closed.
3356    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3357
3358    // Look-ahead, Look-behind
3359    //  TODO:  add tests for unbounded length look-behinds.
3360    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3361
3362    // Attempt to use non-default flags
    //   Compiles an otherwise valid pattern with a combination of match flags and
    //   requires the compile to report U_REGEX_UNIMPLEMENTED.  Which of the flags
    //   triggers that status cannot be determined from this function alone.
3363    {
3364        UParseError   pe;
3365        UErrorCode    status = U_ZERO_ERROR;
3366        int32_t       flags  = UREGEX_CANON_EQ |
3367                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
3368                               UREGEX_MULTILINE;
3369        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3370        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
        // Release whatever compile() handed back, whether or not the assertion held.
3371        delete pat1;
3372    }
3373
3374
3375    // Quantifiers are allowed only after something that can be quantified.
3376    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3377    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3378    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3379
3380    // Mal-formed {min,max} quantifiers
    //   Covers non-numeric bounds, max < min, a doubled comma, trailing junk
    //   after the max, and bounds that are numerically too large.
3381    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3382    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3383    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3384    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3385    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3386    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3387    REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3388    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3389    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3390
3391    // Ticket 5389
    //   A pattern that begins with a quantifier must be rejected as a syntax error.
3392    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3393
3394    // Invalid Back Reference \0
3395    //    For ICU 3.8 and earlier
3396    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3397    //
    //    Here the \0 is the last thing in the pattern, and the expected status
    //    is a bad escape sequence rather than a back-reference error.
3398    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3399
3400}
3401
3402
3403//-------------------------------------------------------------------------------
3404//
3405//  Read a text data file, convert it to UChars, and return the data
3406//    in one big UChar * buffer, which the caller must release with delete[].
3407//
3408//--------------------------------------------------------------------------------
3409UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3410                                     const char *defEncoding, UErrorCode &status) {
3411    UChar       *retPtr  = NULL;
3412    char        *fileBuf = NULL;
3413    UConverter* conv     = NULL;
3414    FILE        *f       = NULL;
3415
3416    ulen = 0;
3417    if (U_FAILURE(status)) {
3418        return retPtr;
3419    }
3420
3421    //
3422    //  Open the file.
3423    //
3424    f = fopen(fileName, "rb");
3425    if (f == 0) {
3426        dataerrln("Error opening test data file %s\n", fileName);
3427        status = U_FILE_ACCESS_ERROR;
3428        return NULL;
3429    }
3430    //
3431    //  Read it in
3432    //
3433    int32_t            fileSize;
3434    int32_t            amt_read;
3435
3436    fseek( f, 0, SEEK_END);
3437    fileSize = ftell(f);
3438    fileBuf = new char[fileSize];
3439    fseek(f, 0, SEEK_SET);
3440    amt_read = fread(fileBuf, 1, fileSize, f);
3441    if (amt_read != fileSize || fileSize <= 0) {
3442        errln("Error reading test data file.");
3443        goto cleanUpAndReturn;
3444    }
3445
3446    //
3447    // Look for a Unicode Signature (BOM) on the data just read
3448    //
3449    int32_t        signatureLength;
3450    const char *   fileBufC;
3451    const char*    encoding;
3452
3453    fileBufC = fileBuf;
3454    encoding = ucnv_detectUnicodeSignature(
3455        fileBuf, fileSize, &signatureLength, &status);
3456    if(encoding!=NULL ){
3457        fileBufC  += signatureLength;
3458        fileSize  -= signatureLength;
3459    } else {
3460        encoding = defEncoding;
3461        if (strcmp(encoding, "utf-8") == 0) {
3462            errln("file %s is missing its BOM", fileName);
3463        }
3464    }
3465
3466    //
3467    // Open a converter to take the rule file to UTF-16
3468    //
3469    conv = ucnv_open(encoding, &status);
3470    if (U_FAILURE(status)) {
3471        goto cleanUpAndReturn;
3472    }
3473
3474    //
3475    // Convert the rules to UChar.
3476    //  Preflight first to determine required buffer size.
3477    //
3478    ulen = ucnv_toUChars(conv,
3479        NULL,           //  dest,
3480        0,              //  destCapacity,
3481        fileBufC,
3482        fileSize,
3483        &status);
3484    if (status == U_BUFFER_OVERFLOW_ERROR) {
3485        // Buffer Overflow is expected from the preflight operation.
3486        status = U_ZERO_ERROR;
3487
3488        retPtr = new UChar[ulen+1];
3489        ucnv_toUChars(conv,
3490            retPtr,       //  dest,
3491            ulen+1,
3492            fileBufC,
3493            fileSize,
3494            &status);
3495    }
3496
3497cleanUpAndReturn:
3498    fclose(f);
3499    delete[] fileBuf;
3500    ucnv_close(conv);
3501    if (U_FAILURE(status)) {
3502        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3503        delete retPtr;
3504        retPtr = 0;
3505        ulen   = 0;
3506    };
3507    return retPtr;
3508}
3509
3510
3511//-------------------------------------------------------------------------------
3512//
3513//   PerlTests  - Run Perl's regular expression tests
3514//                The input file for this test is re_tests, the standard regular
3515//                expression test data distributed with the Perl source code.
3516//
3517//                Here is Perl's description of the test data file:
3518//
3519//        # The tests are in a separate file 't/op/re_tests'.
3520//        # Each line in that file is a separate test.
3521//        # There are five columns, separated by tabs.
3522//        #
3523//        # Column 1 contains the pattern, optionally enclosed in C<''>.
3524//        # Modifiers can be put after the closing C<'>.
3525//        #
3526//        # Column 2 contains the string to be matched.
3527//        #
3528//        # Column 3 contains the expected result:
3529//        #     y   expect a match
3530//        #     n   expect no match
3531//        #     c   expect an error
3532//        # B   test exposes a known bug in Perl, should be skipped
3533//        # b   test exposes a known bug in Perl, should be skipped if noamp
3534//        #
3535//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3536//        #
3537//        # Column 4 contains a string, usually C<$&>.
3538//        #
3539//        # Column 5 contains the expected result of double-quote
3540//        # interpolating that string after the match, or start of error message.
3541//        #
3542//        # Column 6, if present, contains a reason why the test is skipped.
3543//        # This is printed with "skipped", for harness to pick up.
3544//        #
3545//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3546//        #
3547//        # If you want to add a regular expression test that can't be expressed
3548//        # in this format, don't add it here: put it in op/pat.t instead.
3549//
3550//        For ICU, if field 3 contains an 'i', the test will be skipped.
3551//        The test exposes some known incompatibility between ICU and Perl regexps.
3552//        (The i is in addition to whatever was there before.)
3553//
3554//-------------------------------------------------------------------------------
3555void RegexTest::PerlTests() {
3556    char tdd[2048];
3557    const char *srcPath;
3558    UErrorCode  status = U_ZERO_ERROR;
3559    UParseError pe;
3560
3561    //
3562    //  Open and read the test data file.
3563    //
3564    srcPath=getPath(tdd, "re_tests.txt");
3565    if(srcPath==NULL) {
3566        return; /* something went wrong, error already output */
3567    }
3568
3569    int32_t    len;
3570    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3571    if (U_FAILURE(status)) {
3572        return; /* something went wrong, error already output */
3573    }
3574
3575    //
3576    //  Put the test data into a UnicodeString
3577    //
3578    UnicodeString testDataString(FALSE, testData, len);
3579
3580    //
3581    //  Regex to break the input file into lines, and strip the new lines.
3582    //     One line per match, capture group one is the desired data.
3583    //
3584    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3585    if (U_FAILURE(status)) {
3586        dataerrln("RegexPattern::compile() error");
3587        return;
3588    }
3589    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3590
3591    //
3592    //  Regex to split a test file line into fields.
3593    //    There are six fields, separated by tabs.
3594    //
3595    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3596
3597    //
3598    //  Regex to identify test patterns with flag settings, and to separate them.
3599    //    Test patterns with flags look like 'pattern'i
3600    //    Test patterns without flags are not quoted:   pattern
3601    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3602    //
3603    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3604    RegexMatcher* flagMat = flagPat->matcher(status);
3605
3606    //
3607    // The Perl tests reference several perl-isms, which are evaluated/substituted
3608    //   in the test data.  Not being perl, this must be done explicitly.  Here
3609    //   are string constants and REs for these constructs.
3610    //
3611    UnicodeString nulnulSrc("${nulnul}");
3612    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3613    nulnul = nulnul.unescape();
3614
3615    UnicodeString ffffSrc("${ffff}");
3616    UnicodeString ffff("\\uffff", -1, US_INV);
3617    ffff = ffff.unescape();
3618
3619    //  regexp for $-[0], $+[2], etc.
3620    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3621    RegexMatcher *groupsMat = groupsPat->matcher(status);
3622
3623    //  regexp for $0, $1, $2, etc.
3624    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3625    RegexMatcher *cgMat = cgPat->matcher(status);
3626
3627
3628    //
3629    // Main Loop for the Perl Tests, runs once per line from the
3630    //   test data file.
3631    //
3632    int32_t  lineNum = 0;
3633    int32_t  skippedUnimplementedCount = 0;
3634    while (lineMat->find()) {
3635        lineNum++;
3636
3637        //
3638        //  Get a line, break it into its fields, do the Perl
3639        //    variable substitutions.
3640        //
3641        UnicodeString line = lineMat->group(1, status);
3642        UnicodeString fields[7];
3643        fieldPat->split(line, fields, 7, status);
3644
3645        flagMat->reset(fields[0]);
3646        flagMat->matches(status);
3647        UnicodeString pattern  = flagMat->group(2, status);
3648        pattern.findAndReplace("${bang}", "!");
3649        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
3650        pattern.findAndReplace(ffffSrc, ffff);
3651
3652        //
3653        //  Identify patterns that include match flag settings,
3654        //    split off the flags, remove the extra quotes.
3655        //
3656        UnicodeString flagStr = flagMat->group(3, status);
3657        if (U_FAILURE(status)) {
3658            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3659            return;
3660        }
3661        int32_t flags = 0;
3662        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
3663        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
3664        const UChar UChar_m = 0x6d;
3665        const UChar UChar_x = 0x78;
3666        const UChar UChar_y = 0x79;
3667        if (flagStr.indexOf(UChar_i) != -1) {
3668            flags |= UREGEX_CASE_INSENSITIVE;
3669        }
3670        if (flagStr.indexOf(UChar_m) != -1) {
3671            flags |= UREGEX_MULTILINE;
3672        }
3673        if (flagStr.indexOf(UChar_x) != -1) {
3674            flags |= UREGEX_COMMENTS;
3675        }
3676
3677        //
3678        // Compile the test pattern.
3679        //
3680        status = U_ZERO_ERROR;
3681        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
3682        if (status == U_REGEX_UNIMPLEMENTED) {
3683            //
3684            // Test of a feature that is planned for ICU, but not yet implemented.
3685            //   skip the test.
3686            skippedUnimplementedCount++;
3687            delete testPat;
3688            status = U_ZERO_ERROR;
3689            continue;
3690        }
3691
3692        if (U_FAILURE(status)) {
3693            // Some tests are supposed to generate errors.
3694            //   Only report an error for tests that are supposed to succeed.
3695            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
3696                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
3697            {
3698                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
3699            }
3700            status = U_ZERO_ERROR;
3701            delete testPat;
3702            continue;
3703        }
3704
3705        if (fields[2].indexOf(UChar_i) >= 0) {
3706            // ICU should skip this test.
3707            delete testPat;
3708            continue;
3709        }
3710
3711        if (fields[2].indexOf(UChar_c) >= 0) {
3712            // This pattern should have caused a compilation error, but didn't/
3713            errln("line %d: Expected a pattern compile error, got success.", lineNum);
3714            delete testPat;
3715            continue;
3716        }
3717
3718        //
3719        // replace the Perl variables that appear in some of the
3720        //   match data strings.
3721        //
3722        UnicodeString matchString = fields[1];
3723        matchString.findAndReplace(nulnulSrc, nulnul);
3724        matchString.findAndReplace(ffffSrc,   ffff);
3725
3726        // Replace any \n in the match string with an actual new-line char.
3727        //  Don't do full unescape, as this unescapes more than Perl does, which
3728        //  causes other spurious failures in the tests.
3729        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
3730
3731
3732
3733        //
3734        // Run the test, check for expected match/don't match result.
3735        //
3736        RegexMatcher *testMat = testPat->matcher(matchString, status);
3737        UBool found = testMat->find();
3738        UBool expected = FALSE;
3739        if (fields[2].indexOf(UChar_y) >=0) {
3740            expected = TRUE;
3741        }
3742        if (expected != found) {
3743            errln("line %d: Expected %smatch, got %smatch",
3744                lineNum, expected?"":"no ", found?"":"no " );
3745            continue;
3746        }
3747
3748        // Don't try to check expected results if there is no match.
3749        //   (Some have stuff in the expected fields)
3750        if (!found) {
3751            delete testMat;
3752            delete testPat;
3753            continue;
3754        }
3755
3756        //
3757        // Interpret the Perl expression from the fourth field of the data file,
3758        // building up an ICU string from the results of the ICU match.
3759        //   The Perl expression will contain references to the results of
3760        //     a regex match, including the matched string, capture group strings,
3761        //     group starting and ending indicies, etc.
3762        //
3763        UnicodeString resultString;
3764        UnicodeString perlExpr = fields[3];
3765#if SUPPORT_MUTATING_INPUT_STRING
3766        groupsMat->reset(perlExpr);
3767        cgMat->reset(perlExpr);
3768#endif
3769
3770        while (perlExpr.length() > 0) {
3771#if !SUPPORT_MUTATING_INPUT_STRING
3772            //  Perferred usage.  Reset after any modification to input string.
3773            groupsMat->reset(perlExpr);
3774            cgMat->reset(perlExpr);
3775#endif
3776
3777            if (perlExpr.startsWith("$&")) {
3778                resultString.append(testMat->group(status));
3779                perlExpr.remove(0, 2);
3780            }
3781
3782            else if (groupsMat->lookingAt(status)) {
3783                // $-[0]   $+[2]  etc.
3784                UnicodeString digitString = groupsMat->group(2, status);
3785                int32_t t = 0;
3786                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
3787                UnicodeString plusOrMinus = groupsMat->group(1, status);
3788                int32_t matchPosition;
3789                if (plusOrMinus.compare("+") == 0) {
3790                    matchPosition = testMat->end(groupNum, status);
3791                } else {
3792                    matchPosition = testMat->start(groupNum, status);
3793                }
3794                if (matchPosition != -1) {
3795                    ICU_Utility::appendNumber(resultString, matchPosition);
3796                }
3797                perlExpr.remove(0, groupsMat->end(status));
3798            }
3799
3800            else if (cgMat->lookingAt(status)) {
3801                // $1, $2, $3, etc.
3802                UnicodeString digitString = cgMat->group(1, status);
3803                int32_t t = 0;
3804                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
3805                if (U_SUCCESS(status)) {
3806                    resultString.append(testMat->group(groupNum, status));
3807                    status = U_ZERO_ERROR;
3808                }
3809                perlExpr.remove(0, cgMat->end(status));
3810            }
3811
3812            else if (perlExpr.startsWith("@-")) {
3813                int32_t i;
3814                for (i=0; i<=testMat->groupCount(); i++) {
3815                    if (i>0) {
3816                        resultString.append(" ");
3817                    }
3818                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
3819                }
3820                perlExpr.remove(0, 2);
3821            }
3822
3823            else if (perlExpr.startsWith("@+")) {
3824                int32_t i;
3825                for (i=0; i<=testMat->groupCount(); i++) {
3826                    if (i>0) {
3827                        resultString.append(" ");
3828                    }
3829                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
3830                }
3831                perlExpr.remove(0, 2);
3832            }
3833
3834            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
3835                                                     //           or as an escaped sequence (e.g. \n)
3836                if (perlExpr.length() > 1) {
3837                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
3838                }
3839                UChar c = perlExpr.charAt(0);
3840                switch (c) {
3841                case 'n':   c = '\n'; break;
3842                // add any other escape sequences that show up in the test expected results.
3843                }
3844                resultString.append(c);
3845                perlExpr.remove(0, 1);
3846            }
3847
3848            else  {
3849                // Any characters from the perl expression that we don't explicitly
3850                //  recognize before here are assumed to be literals and copied
3851                //  as-is to the expected results.
3852                resultString.append(perlExpr.charAt(0));
3853                perlExpr.remove(0, 1);
3854            }
3855
3856            if (U_FAILURE(status)) {
3857                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
3858                break;
3859            }
3860        }
3861
3862        //
3863        // Expected Results Compare
3864        //
3865        UnicodeString expectedS(fields[4]);
3866        expectedS.findAndReplace(nulnulSrc, nulnul);
3867        expectedS.findAndReplace(ffffSrc,   ffff);
3868        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
3869
3870
3871        if (expectedS.compare(resultString) != 0) {
3872            err("Line %d: Incorrect perl expression results.", lineNum);
3873            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
3874        }
3875
3876        delete testMat;
3877        delete testPat;
3878    }
3879
3880    //
3881    // All done.  Clean up allocated stuff.
3882    //
3883    delete cgMat;
3884    delete cgPat;
3885
3886    delete groupsMat;
3887    delete groupsPat;
3888
3889    delete flagMat;
3890    delete flagPat;
3891
3892    delete lineMat;
3893    delete linePat;
3894
3895    delete fieldPat;
3896    delete [] testData;
3897
3898
3899    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
3900
3901}
3902
3903
3904//-------------------------------------------------------------------------------
3905//
3906//   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
3907//                  (instead of using UnicodeStrings) to test the alternate engine.
3908//                  The input file for this test is re_tests, the standard regular
3909//                  expression test data distributed with the Perl source code.
3910//                  See PerlTests() for more information.
3911//
3912//-------------------------------------------------------------------------------
3913void RegexTest::PerlTestsUTF8() {
3914    char tdd[2048];
3915    const char *srcPath;
3916    UErrorCode  status = U_ZERO_ERROR;
3917    UParseError pe;
3918    LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
3919    UText       patternText = UTEXT_INITIALIZER;
3920    char       *patternChars = NULL;
3921    int32_t     patternLength;
3922    int32_t     patternCapacity = 0;
3923    UText       inputText = UTEXT_INITIALIZER;
3924    char       *inputChars = NULL;
3925    int32_t     inputLength;
3926    int32_t     inputCapacity = 0;
3927
3928    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3929
3930    //
3931    //  Open and read the test data file.
3932    //
3933    srcPath=getPath(tdd, "re_tests.txt");
3934    if(srcPath==NULL) {
3935        return; /* something went wrong, error already output */
3936    }
3937
3938    int32_t    len;
3939    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3940    if (U_FAILURE(status)) {
3941        return; /* something went wrong, error already output */
3942    }
3943
3944    //
3945    //  Put the test data into a UnicodeString
3946    //
3947    UnicodeString testDataString(FALSE, testData, len);
3948
3949    //
3950    //  Regex to break the input file into lines, and strip the new lines.
3951    //     One line per match, capture group one is the desired data.
3952    //
3953    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3954    if (U_FAILURE(status)) {
3955        dataerrln("RegexPattern::compile() error");
3956        return;
3957    }
3958    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3959
3960    //
3961    //  Regex to split a test file line into fields.
3962    //    There are six fields, separated by tabs.
3963    //
3964    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3965
3966    //
3967    //  Regex to identify test patterns with flag settings, and to separate them.
3968    //    Test patterns with flags look like 'pattern'i
3969    //    Test patterns without flags are not quoted:   pattern
3970    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3971    //
3972    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3973    RegexMatcher* flagMat = flagPat->matcher(status);
3974
3975    //
3976    // The Perl tests reference several perl-isms, which are evaluated/substituted
3977    //   in the test data.  Not being perl, this must be done explicitly.  Here
3978    //   are string constants and REs for these constructs.
3979    //
3980    UnicodeString nulnulSrc("${nulnul}");
3981    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3982    nulnul = nulnul.unescape();
3983
3984    UnicodeString ffffSrc("${ffff}");
3985    UnicodeString ffff("\\uffff", -1, US_INV);
3986    ffff = ffff.unescape();
3987
3988    //  regexp for $-[0], $+[2], etc.
3989    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3990    RegexMatcher *groupsMat = groupsPat->matcher(status);
3991
3992    //  regexp for $0, $1, $2, etc.
3993    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3994    RegexMatcher *cgMat = cgPat->matcher(status);
3995
3996
3997    //
3998    // Main Loop for the Perl Tests, runs once per line from the
3999    //   test data file.
4000    //
4001    int32_t  lineNum = 0;
4002    int32_t  skippedUnimplementedCount = 0;
4003    while (lineMat->find()) {
4004        lineNum++;
4005
4006        //
4007        //  Get a line, break it into its fields, do the Perl
4008        //    variable substitutions.
4009        //
4010        UnicodeString line = lineMat->group(1, status);
4011        UnicodeString fields[7];
4012        fieldPat->split(line, fields, 7, status);
4013
4014        flagMat->reset(fields[0]);
4015        flagMat->matches(status);
4016        UnicodeString pattern  = flagMat->group(2, status);
4017        pattern.findAndReplace("${bang}", "!");
4018        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4019        pattern.findAndReplace(ffffSrc, ffff);
4020
4021        //
4022        //  Identify patterns that include match flag settings,
4023        //    split off the flags, remove the extra quotes.
4024        //
4025        UnicodeString flagStr = flagMat->group(3, status);
4026        if (U_FAILURE(status)) {
4027            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4028            return;
4029        }
4030        int32_t flags = 0;
4031        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4032        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4033        const UChar UChar_m = 0x6d;
4034        const UChar UChar_x = 0x78;
4035        const UChar UChar_y = 0x79;
4036        if (flagStr.indexOf(UChar_i) != -1) {
4037            flags |= UREGEX_CASE_INSENSITIVE;
4038        }
4039        if (flagStr.indexOf(UChar_m) != -1) {
4040            flags |= UREGEX_MULTILINE;
4041        }
4042        if (flagStr.indexOf(UChar_x) != -1) {
4043            flags |= UREGEX_COMMENTS;
4044        }
4045
4046        //
4047        // Put the pattern in a UTF-8 UText
4048        //
4049        status = U_ZERO_ERROR;
4050        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4051        if (status == U_BUFFER_OVERFLOW_ERROR) {
4052            status = U_ZERO_ERROR;
4053            delete[] patternChars;
4054            patternCapacity = patternLength + 1;
4055            patternChars = new char[patternCapacity];
4056            pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4057        }
4058        utext_openUTF8(&patternText, patternChars, patternLength, &status);
4059
4060        //
4061        // Compile the test pattern.
4062        //
4063        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4064        if (status == U_REGEX_UNIMPLEMENTED) {
4065            //
4066            // Test of a feature that is planned for ICU, but not yet implemented.
4067            //   skip the test.
4068            skippedUnimplementedCount++;
4069            delete testPat;
4070            status = U_ZERO_ERROR;
4071            continue;
4072        }
4073
4074        if (U_FAILURE(status)) {
4075            // Some tests are supposed to generate errors.
4076            //   Only report an error for tests that are supposed to succeed.
4077            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4078                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4079            {
4080                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4081            }
4082            status = U_ZERO_ERROR;
4083            delete testPat;
4084            continue;
4085        }
4086
4087        if (fields[2].indexOf(UChar_i) >= 0) {
4088            // ICU should skip this test.
4089            delete testPat;
4090            continue;
4091        }
4092
4093        if (fields[2].indexOf(UChar_c) >= 0) {
4094            // This pattern should have caused a compilation error, but didn't/
4095            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4096            delete testPat;
4097            continue;
4098        }
4099
4100
4101        //
4102        // replace the Perl variables that appear in some of the
4103        //   match data strings.
4104        //
4105        UnicodeString matchString = fields[1];
4106        matchString.findAndReplace(nulnulSrc, nulnul);
4107        matchString.findAndReplace(ffffSrc,   ffff);
4108
4109        // Replace any \n in the match string with an actual new-line char.
4110        //  Don't do full unescape, as this unescapes more than Perl does, which
4111        //  causes other spurious failures in the tests.
4112        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4113
4114        //
4115        // Put the input in a UTF-8 UText
4116        //
4117        status = U_ZERO_ERROR;
4118        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4119        if (status == U_BUFFER_OVERFLOW_ERROR) {
4120            status = U_ZERO_ERROR;
4121            delete[] inputChars;
4122            inputCapacity = inputLength + 1;
4123            inputChars = new char[inputCapacity];
4124            matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4125        }
4126        utext_openUTF8(&inputText, inputChars, inputLength, &status);
4127
4128        //
4129        // Run the test, check for expected match/don't match result.
4130        //
4131        RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
4132        UBool found = testMat->find();
4133        UBool expected = FALSE;
4134        if (fields[2].indexOf(UChar_y) >=0) {
4135            expected = TRUE;
4136        }
4137        if (expected != found) {
4138            errln("line %d: Expected %smatch, got %smatch",
4139                lineNum, expected?"":"no ", found?"":"no " );
4140            continue;
4141        }
4142
4143        // Don't try to check expected results if there is no match.
4144        //   (Some have stuff in the expected fields)
4145        if (!found) {
4146            delete testMat;
4147            delete testPat;
4148            continue;
4149        }
4150
4151        //
4152        // Interpret the Perl expression from the fourth field of the data file,
4153        // building up an ICU string from the results of the ICU match.
4154        //   The Perl expression will contain references to the results of
4155        //     a regex match, including the matched string, capture group strings,
4156        //     group starting and ending indicies, etc.
4157        //
4158        UnicodeString resultString;
4159        UnicodeString perlExpr = fields[3];
4160
4161        while (perlExpr.length() > 0) {
4162            groupsMat->reset(perlExpr);
4163            cgMat->reset(perlExpr);
4164
4165            if (perlExpr.startsWith("$&")) {
4166                resultString.append(testMat->group(status));
4167                perlExpr.remove(0, 2);
4168            }
4169
4170            else if (groupsMat->lookingAt(status)) {
4171                // $-[0]   $+[2]  etc.
4172                UnicodeString digitString = groupsMat->group(2, status);
4173                int32_t t = 0;
4174                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4175                UnicodeString plusOrMinus = groupsMat->group(1, status);
4176                int32_t matchPosition;
4177                if (plusOrMinus.compare("+") == 0) {
4178                    matchPosition = testMat->end(groupNum, status);
4179                } else {
4180                    matchPosition = testMat->start(groupNum, status);
4181                }
4182                if (matchPosition != -1) {
4183                    ICU_Utility::appendNumber(resultString, matchPosition);
4184                }
4185                perlExpr.remove(0, groupsMat->end(status));
4186            }
4187
4188            else if (cgMat->lookingAt(status)) {
4189                // $1, $2, $3, etc.
4190                UnicodeString digitString = cgMat->group(1, status);
4191                int32_t t = 0;
4192                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4193                if (U_SUCCESS(status)) {
4194                    resultString.append(testMat->group(groupNum, status));
4195                    status = U_ZERO_ERROR;
4196                }
4197                perlExpr.remove(0, cgMat->end(status));
4198            }
4199
4200            else if (perlExpr.startsWith("@-")) {
4201                int32_t i;
4202                for (i=0; i<=testMat->groupCount(); i++) {
4203                    if (i>0) {
4204                        resultString.append(" ");
4205                    }
4206                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4207                }
4208                perlExpr.remove(0, 2);
4209            }
4210
4211            else if (perlExpr.startsWith("@+")) {
4212                int32_t i;
4213                for (i=0; i<=testMat->groupCount(); i++) {
4214                    if (i>0) {
4215                        resultString.append(" ");
4216                    }
4217                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4218                }
4219                perlExpr.remove(0, 2);
4220            }
4221
4222            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4223                                                     //           or as an escaped sequence (e.g. \n)
4224                if (perlExpr.length() > 1) {
4225                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4226                }
4227                UChar c = perlExpr.charAt(0);
4228                switch (c) {
4229                case 'n':   c = '\n'; break;
4230                // add any other escape sequences that show up in the test expected results.
4231                }
4232                resultString.append(c);
4233                perlExpr.remove(0, 1);
4234            }
4235
4236            else  {
4237                // Any characters from the perl expression that we don't explicitly
4238                //  recognize before here are assumed to be literals and copied
4239                //  as-is to the expected results.
4240                resultString.append(perlExpr.charAt(0));
4241                perlExpr.remove(0, 1);
4242            }
4243
4244            if (U_FAILURE(status)) {
4245                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4246                break;
4247            }
4248        }
4249
4250        //
4251        // Expected Results Compare
4252        //
4253        UnicodeString expectedS(fields[4]);
4254        expectedS.findAndReplace(nulnulSrc, nulnul);
4255        expectedS.findAndReplace(ffffSrc,   ffff);
4256        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4257
4258
4259        if (expectedS.compare(resultString) != 0) {
4260            err("Line %d: Incorrect perl expression results.", lineNum);
4261            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4262        }
4263
4264        delete testMat;
4265        delete testPat;
4266    }
4267
4268    //
4269    // All done.  Clean up allocated stuff.
4270    //
4271    delete cgMat;
4272    delete cgPat;
4273
4274    delete groupsMat;
4275    delete groupsPat;
4276
4277    delete flagMat;
4278    delete flagPat;
4279
4280    delete lineMat;
4281    delete linePat;
4282
4283    delete fieldPat;
4284    delete [] testData;
4285
4286    utext_close(&patternText);
4287    utext_close(&inputText);
4288
4289    delete [] patternChars;
4290    delete [] inputChars;
4291
4292
4293    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4294
4295}
4296
4297
4298//--------------------------------------------------------------
4299//
4300//  Bug6149   Verify limits to heap expansion for backtrack stack.
4301//             Use this pattern,
4302//                 "(a?){1,}"
4303//             The zero-length match will repeat forever.
4304//                (That this goes into a loop is another bug)
4305//
4306//---------------------------------------------------------------
4307void RegexTest::Bug6149() {
4308    UnicodeString pattern("(a?){1,}");
4309    UnicodeString s("xyz");
4310    uint32_t flags = 0;
4311    UErrorCode status = U_ZERO_ERROR;
4312
4313    RegexMatcher  matcher(pattern, s, flags, status);
4314    UBool result = false;
4315    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4316    REGEX_ASSERT(result == FALSE);
4317 }
4318
4319
//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//
4326
4327struct callBackContext {
4328    RegexTest        *test;
4329    int32_t          maxCalls;
4330    int32_t          numCalls;
4331    int32_t          lastSteps;
4332    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4333};
4334
4335U_CDECL_BEGIN
4336static UBool U_CALLCONV
4337testCallBackFn(const void *context, int32_t steps) {
4338    callBackContext  *info = (callBackContext *)context;
4339    if (info->lastSteps+1 != steps) {
4340        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4341    }
4342    info->lastSteps = steps;
4343    info->numCalls++;
4344    return (info->numCalls < info->maxCalls);
4345}
4346U_CDECL_END
4347
4348void RegexTest::Callbacks() {
4349   {
4350        // Getter returns NULLs if no callback has been set
4351
4352        //   The variables that the getter will fill in.
4353        //   Init to non-null values so that the action of the getter can be seen.
4354        const void          *returnedContext = &returnedContext;
4355        URegexMatchCallback *returnedFn = &testCallBackFn;
4356
4357        UErrorCode status = U_ZERO_ERROR;
4358        RegexMatcher matcher("x", 0, status);
4359        REGEX_CHECK_STATUS;
4360        matcher.getMatchCallback(returnedFn, returnedContext, status);
4361        REGEX_CHECK_STATUS;
4362        REGEX_ASSERT(returnedFn == NULL);
4363        REGEX_ASSERT(returnedContext == NULL);
4364    }
4365
4366   {
4367        // Set and Get work
4368        callBackContext cbInfo = {this, 0, 0, 0};
4369        const void          *returnedContext;
4370        URegexMatchCallback *returnedFn;
4371        UErrorCode status = U_ZERO_ERROR;
4372        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4373        REGEX_CHECK_STATUS;
4374        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4375        REGEX_CHECK_STATUS;
4376        matcher.getMatchCallback(returnedFn, returnedContext, status);
4377        REGEX_CHECK_STATUS;
4378        REGEX_ASSERT(returnedFn == testCallBackFn);
4379        REGEX_ASSERT(returnedContext == &cbInfo);
4380
4381        // A short-running match shouldn't invoke the callback
4382        status = U_ZERO_ERROR;
4383        cbInfo.reset(1);
4384        UnicodeString s = "xxx";
4385        matcher.reset(s);
4386        REGEX_ASSERT(matcher.matches(status));
4387        REGEX_CHECK_STATUS;
4388        REGEX_ASSERT(cbInfo.numCalls == 0);
4389
4390        // A medium-length match that runs long enough to invoke the
4391        //   callback, but not so long that the callback aborts it.
4392        status = U_ZERO_ERROR;
4393        cbInfo.reset(4);
4394        s = "aaaaaaaaaaaaaaaaaaab";
4395        matcher.reset(s);
4396        REGEX_ASSERT(matcher.matches(status)==FALSE);
4397        REGEX_CHECK_STATUS;
4398        REGEX_ASSERT(cbInfo.numCalls > 0);
4399
4400        // A longer running match that the callback function will abort.
4401        status = U_ZERO_ERROR;
4402        cbInfo.reset(4);
4403        s = "aaaaaaaaaaaaaaaaaaaaaaab";
4404        matcher.reset(s);
4405        REGEX_ASSERT(matcher.matches(status)==FALSE);
4406        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4407        REGEX_ASSERT(cbInfo.numCalls == 4);
4408    }
4409
4410
4411}
4412
4413
4414//---------------------------------------------------------------------------
4415//
4416//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4417//                             UTexts. The pure-C implementation of UText
4418//                             has no mutable backing stores, but we can
4419//                             use UnicodeString here to test the functionality.
4420//
4421//---------------------------------------------------------------------------
4422void RegexTest::PreAllocatedUTextCAPI () {
4423    UErrorCode           status = U_ZERO_ERROR;
4424    URegularExpression  *re;
4425    UText                patternText = UTEXT_INITIALIZER;
4426    UnicodeString        buffer;
4427    UText                bufferText = UTEXT_INITIALIZER;
4428
4429    utext_openUnicodeString(&bufferText, &buffer, &status);
4430
4431    /*
4432     *  getText() and getUText()
4433     */
4434    {
4435        UText  text1 = UTEXT_INITIALIZER;
4436        UText  text2 = UTEXT_INITIALIZER;
4437        UChar  text2Chars[20];
4438        UText  *resultText;
4439
4440        status = U_ZERO_ERROR;
4441        utext_openUTF8(&text1, "abcccd", -1, &status);
4442        utext_openUTF8(&text2, "abcccxd", -1, &status);
4443        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4444        utext_openUChars(&text2, text2Chars, -1, &status);
4445
4446        utext_openUTF8(&patternText, "abc*d", -1, &status);
4447        re = uregex_openUText(&patternText, 0, NULL, &status);
4448
4449        /* First set a UText */
4450        uregex_setUText(re, &text1, &status);
4451        resultText = uregex_getUText(re, &bufferText, &status);
4452        REGEX_CHECK_STATUS;
4453        REGEX_ASSERT(resultText == &bufferText);
4454        utext_setNativeIndex(resultText, 0);
4455        utext_setNativeIndex(&text1, 0);
4456        REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4457
4458        resultText = uregex_getUText(re, &bufferText, &status);
4459        REGEX_CHECK_STATUS;
4460        REGEX_ASSERT(resultText == &bufferText);
4461        utext_setNativeIndex(resultText, 0);
4462        utext_setNativeIndex(&text1, 0);
4463        REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4464
4465        /* Then set a UChar * */
4466        uregex_setText(re, text2Chars, 7, &status);
4467        resultText = uregex_getUText(re, &bufferText, &status);
4468        REGEX_CHECK_STATUS;
4469        REGEX_ASSERT(resultText == &bufferText);
4470        utext_setNativeIndex(resultText, 0);
4471        utext_setNativeIndex(&text2, 0);
4472        REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
4473
4474        uregex_close(re);
4475        utext_close(&text1);
4476        utext_close(&text2);
4477    }
4478
4479    /*
4480     *  group()
4481     */
4482    {
4483        UChar    text1[80];
4484        UText   *actual;
4485        UBool    result;
4486        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4487
4488        status = U_ZERO_ERROR;
4489        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4490        REGEX_CHECK_STATUS;
4491
4492        uregex_setText(re, text1, -1, &status);
4493        result = uregex_find(re, 0, &status);
4494        REGEX_ASSERT(result==TRUE);
4495
4496        /*  Capture Group 0, the full match.  Should succeed.  */
4497        status = U_ZERO_ERROR;
4498        actual = uregex_groupUText(re, 0, &bufferText, &status);
4499        REGEX_CHECK_STATUS;
4500        REGEX_ASSERT(actual == &bufferText);
4501        REGEX_ASSERT_UTEXT("abc interior def", actual);
4502
4503        /*  Capture group #1.  Should succeed. */
4504        status = U_ZERO_ERROR;
4505        actual = uregex_groupUText(re, 1, &bufferText, &status);
4506        REGEX_CHECK_STATUS;
4507        REGEX_ASSERT(actual == &bufferText);
4508        REGEX_ASSERT_UTEXT(" interior ", actual);
4509
4510        /*  Capture group out of range.  Error. */
4511        status = U_ZERO_ERROR;
4512        actual = uregex_groupUText(re, 2, &bufferText, &status);
4513        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
4514        REGEX_ASSERT(actual == &bufferText);
4515
4516        uregex_close(re);
4517
4518    }
4519
4520    /*
4521     *  replaceFirst()
4522     */
4523    {
4524        UChar    text1[80];
4525        UChar    text2[80];
4526        UText    replText = UTEXT_INITIALIZER;
4527        UText   *result;
4528
4529        status = U_ZERO_ERROR;
4530        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
4531        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
4532        utext_openUTF8(&replText, "<$1>", -1, &status);
4533
4534        re = uregex_openC("x(.*?)x", 0, NULL, &status);
4535        REGEX_CHECK_STATUS;
4536
4537        /*  Normal case, with match */
4538        uregex_setText(re, text1, -1, &status);
4539        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4540        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4541        REGEX_CHECK_STATUS;
4542        REGEX_ASSERT(result == &bufferText);
4543        REGEX_ASSERT_UTEXT("Replace <aa> x1x x...x.", result);
4544
4545        /* No match.  Text should copy to output with no changes.  */
4546        uregex_setText(re, text2, -1, &status);
4547        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4548        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4549        REGEX_CHECK_STATUS;
4550        REGEX_ASSERT(result == &bufferText);
4551        REGEX_ASSERT_UTEXT("No match here.", result);
4552
4553        /* Unicode escapes */
4554        uregex_setText(re, text1, -1, &status);
4555        utext_openUTF8(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
4556        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4557        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4558        REGEX_CHECK_STATUS;
4559        REGEX_ASSERT(result == &bufferText);
4560        REGEX_ASSERT_UTEXT("Replace \\AaaB$a x1x x...x.", result);
4561
4562        uregex_close(re);
4563        utext_close(&replText);
4564    }
4565
4566
4567    /*
4568     *  replaceAll()
4569     */
4570    {
4571        UChar    text1[80];
4572        UChar    text2[80];
4573        UText    replText = UTEXT_INITIALIZER;
4574        UText   *result;
4575
4576        status = U_ZERO_ERROR;
4577        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
4578        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
4579        utext_openUTF8(&replText, "<$1>", -1, &status);
4580
4581        re = uregex_openC("x(.*?)x", 0, NULL, &status);
4582        REGEX_CHECK_STATUS;
4583
4584        /*  Normal case, with match */
4585        uregex_setText(re, text1, -1, &status);
4586        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4587        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4588        REGEX_CHECK_STATUS;
4589        REGEX_ASSERT(result == &bufferText);
4590        REGEX_ASSERT_UTEXT("Replace <aa> <1> <...>.", result);
4591
4592        /* No match.  Text should copy to output with no changes.  */
4593        uregex_setText(re, text2, -1, &status);
4594        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4595        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4596        REGEX_CHECK_STATUS;
4597        REGEX_ASSERT(result == &bufferText);
4598        REGEX_ASSERT_UTEXT("No match here.", result);
4599
4600        uregex_close(re);
4601        utext_close(&replText);
4602    }
4603
4604
4605    /*
4606     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
4607     *   so we don't need to test it here.
4608     */
4609
4610    utext_close(&bufferText);
4611    utext_close(&patternText);
4612}
4613
4614//--------------------------------------------------------------
4615//
4616//  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
4617//
4618//---------------------------------------------------------------
4619void RegexTest::Bug7651() {
4620    UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
4621    //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
4622    UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
4623    UnicodeString s("#ff @abcd This is test");
4624    RegexPattern  *REPattern = NULL;
4625    RegexMatcher  *REMatcher = NULL;
4626    UErrorCode status = U_ZERO_ERROR;
4627    UParseError pe;
4628
4629    REPattern = RegexPattern::compile(pattern1, 0, pe, status);
4630    REGEX_CHECK_STATUS;
4631    REMatcher = REPattern->matcher(s, status);
4632    REGEX_CHECK_STATUS;
4633    REGEX_ASSERT(REMatcher->find());
4634    REGEX_ASSERT(REMatcher->start(status) == 0);
4635    delete REPattern;
4636    delete REMatcher;
4637    status = U_ZERO_ERROR;
4638
4639    REPattern = RegexPattern::compile(pattern2, 0, pe, status);
4640    REGEX_CHECK_STATUS;
4641    REMatcher = REPattern->matcher(s, status);
4642    REGEX_CHECK_STATUS;
4643    REGEX_ASSERT(REMatcher->find());
4644    REGEX_ASSERT(REMatcher->start(status) == 0);
4645    delete REPattern;
4646    delete REMatcher;
4647    status = U_ZERO_ERROR;
4648 }
4649
4650#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
4651
4652