1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2010, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8//   regextst.cpp
9//
10//      ICU Regular Expressions test, part of intltest.
11//
12
13#include "intltest.h"
14#if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16#include "unicode/regex.h"
17#include "unicode/uchar.h"
18#include "unicode/ucnv.h"
19#include "unicode/ustring.h"
20#include "regextst.h"
21#include "uvector.h"
22#include "util.h"
23#include <stdlib.h>
24#include <string.h>
25#include <stdio.h>
26#include "cstring.h"
27#include "uinvchar.h"
28
29#define SUPPORT_MUTATING_INPUT_STRING   0
30
31//---------------------------------------------------------------------------
32//
33//  Test class boilerplate
34//
35//---------------------------------------------------------------------------
36RegexTest::RegexTest()
37{
38}
39
40
41RegexTest::~RegexTest()
42{
43}
44
45
46
47void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
48{
49    if (exec) logln("TestSuite RegexTest: ");
50    switch (index) {
51
52        case 0: name = "Basic";
53            if (exec) Basic();
54            break;
55        case 1: name = "API_Match";
56            if (exec) API_Match();
57            break;
58        case 2: name = "API_Replace";
59            if (exec) API_Replace();
60            break;
61        case 3: name = "API_Pattern";
62            if (exec) API_Pattern();
63            break;
64        case 4:
65#if !UCONFIG_NO_FILE_IO
66            name = "Extended";
67            if (exec) Extended();
68#else
69            name = "skip";
70#endif
71            break;
72        case 5: name = "Errors";
73            if (exec) Errors();
74            break;
75        case 6: name = "PerlTests";
76            if (exec) PerlTests();
77            break;
78        case 7: name = "Callbacks";
79            if (exec) Callbacks();
80            break;
81        case 8: name = "FindProgressCallbacks";
82            if (exec) FindProgressCallbacks();
83            break;
84        case 9: name = "Bug 6149";
85             if (exec) Bug6149();
86             break;
87        case 10: name = "UTextBasic";
88          if (exec) UTextBasic();
89          break;
90        case 11: name = "API_Match_UTF8";
91          if (exec) API_Match_UTF8();
92          break;
93        case 12: name = "API_Replace_UTF8";
94          if (exec) API_Replace_UTF8();
95          break;
96        case 13: name = "API_Pattern_UTF8";
97          if (exec) API_Pattern_UTF8();
98          break;
99        case 14: name = "PerlTestsUTF8";
100          if (exec) PerlTestsUTF8();
101          break;
102        case 15: name = "PreAllocatedUTextCAPI";
103          if (exec) PreAllocatedUTextCAPI();
104          break;
105        case 16: name = "Bug 7651";
106             if (exec) Bug7651();
107             break;
108        case 17: name = "Bug 7740";
109            if (exec) Bug7740();
110            break;
111
112        default: name = "";
113            break; //needed to end loop
114    }
115}
116
117
118/**
119 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
120 * into ASCII.
121 * @see utext_openUTF8
122 */
123static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
124
125static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
126#if U_CHARSET_FAMILY==U_ASCII_FAMILY
127  return utext_openUTF8(ut, inv, length, status);
128#else
129  char buf[1024];
130
131  uprv_aestrncpy((uint8_t*)buf, (const uint8_t*)inv, length);
132
133  return utext_openUTF8(ut, buf, length, status);
134#endif
135}
136
137//---------------------------------------------------------------------------
138//
139//   Error Checking / Reporting macros used in all of the tests.
140//
141//---------------------------------------------------------------------------
142
143static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
144  int64_t oldIndex = utext_getNativeIndex(text);
145  utext_setNativeIndex(text, 0);
146  char *bufPtr = buf;
147  UChar32 c = utext_next32From(text, 0);
148  while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
149    if (0x000020<=c && c<0x00007e) {
150      *bufPtr = c;
151    } else {
152#if 0
153      sprintf(bufPtr,"U+%04X", c);
154      bufPtr+= strlen(bufPtr)-1;
155#else
156      *bufPtr = '%';
157#endif
158    }
159    bufPtr++;
160    c = UTEXT_NEXT32(text);
161  }
162  *bufPtr = 0;
163#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
164  char *ebuf = (char*)malloc(bufLen);
165  uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
166  uprv_strncpy(buf, ebuf, bufLen);
167  free((void*)ebuf);
168#endif
169  utext_setNativeIndex(text, oldIndex);
170}
171
// Log the printable form of a UText, together with the source location and the
// text of the expression that produced it.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// If the local variable `status` holds a failure, report it and return from the
// enclosing (void) function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Report an error if `expr` evaluates to FALSE.  Execution continues afterwards.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    }; \
}

// Evaluate `expr` against a fresh local `status` (shadowing any outer one) and
// report an error unless it leaves `status` equal to `errcode`.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    }; \
}

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line number and
// does not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
}

// Report an error, including the caller-supplied line number, and return from the
// enclosing (void) function if `expr` evaluates to FALSE.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}
188
189/**
190 * @param expected expected text in UTF-8 (not platform) codepage
191 */
192void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
193    UErrorCode status = U_ZERO_ERROR;
194    UText expectedText = UTEXT_INITIALIZER;
195    utext_openUTF8(&expectedText, expected, -1, &status);
196    if(U_FAILURE(status)) {
197      errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
198      return;
199    }
200    if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
201      errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
202      return;
203    }
204    utext_setNativeIndex(actual, 0);
205    if (utext_compare(&expectedText, -1, actual, -1) != 0) {
206        char buf[201 /*21*/];
207        char expectedBuf[201];
208        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
209        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
210        errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
211    }
212    utext_close(&expectedText);
213}
214/**
215 * @param expected invariant (platform local text) input
216 */
217
218void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
219    UErrorCode status = U_ZERO_ERROR;
220    UText expectedText = UTEXT_INITIALIZER;
221    regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
222    if(U_FAILURE(status)) {
223      errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
224      return;
225    }
226    utext_setNativeIndex(actual, 0);
227    if (utext_compare(&expectedText, -1, actual, -1) != 0) {
228        char buf[201 /*21*/];
229        char expectedBuf[201];
230        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
231        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
232        errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
233    }
234    utext_close(&expectedText);
235}
236
/**
 * Check a UText against expected text given as UTF-8 bytes.
 * Forwards to assertUText() with the current source location.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Check a UText against expected text given as invariant characters.
 * Forwards to assertUTextInvariant() with the current source location.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
245
246
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Shorthand for quick tests of the lookingAt() and
//                       matches() functions.  Each use runs the check twice:
//                       once through doRegexLMTest() and once through
//                       doRegexLMTestUTF8().
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          Escape sequences in the input text are unescaped before matching;
//          the pattern is passed through as written.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
}
262
263UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
264    const UnicodeString pattern(pat, -1, US_INV);
265    const UnicodeString inputText(text, -1, US_INV);
266    UErrorCode          status  = U_ZERO_ERROR;
267    UParseError         pe;
268    RegexPattern        *REPattern = NULL;
269    RegexMatcher        *REMatcher = NULL;
270    UBool               retVal     = TRUE;
271
272    UnicodeString patString(pat, -1, US_INV);
273    REPattern = RegexPattern::compile(patString, 0, pe, status);
274    if (U_FAILURE(status)) {
275        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
276            line, u_errorName(status));
277        return FALSE;
278    }
279    if (line==376) { RegexPatternDump(REPattern);}
280
281    UnicodeString inputString(inputText);
282    UnicodeString unEscapedInput = inputString.unescape();
283    REMatcher = REPattern->matcher(unEscapedInput, status);
284    if (U_FAILURE(status)) {
285        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
286            line, u_errorName(status));
287        return FALSE;
288    }
289
290    UBool actualmatch;
291    actualmatch = REMatcher->lookingAt(status);
292    if (U_FAILURE(status)) {
293        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
294            line, u_errorName(status));
295        retVal =  FALSE;
296    }
297    if (actualmatch != looking) {
298        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
299        retVal = FALSE;
300    }
301
302    status = U_ZERO_ERROR;
303    actualmatch = REMatcher->matches(status);
304    if (U_FAILURE(status)) {
305        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
306            line, u_errorName(status));
307        retVal = FALSE;
308    }
309    if (actualmatch != match) {
310        errln("RegexTest: wrong return from matches() at line %d.\n", line);
311        retVal = FALSE;
312    }
313
314    if (retVal == FALSE) {
315        RegexPatternDump(REPattern);
316    }
317
318    delete REPattern;
319    delete REMatcher;
320    return retVal;
321}
322
323
324UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
325    UText               pattern    = UTEXT_INITIALIZER;
326    int32_t             inputUTF8Length;
327    char                *textChars = NULL;
328    UText               inputText  = UTEXT_INITIALIZER;
329    UErrorCode          status     = U_ZERO_ERROR;
330    UParseError         pe;
331    RegexPattern        *REPattern = NULL;
332    RegexMatcher        *REMatcher = NULL;
333    UBool               retVal     = TRUE;
334
335    regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
336    REPattern = RegexPattern::compile(&pattern, 0, pe, status);
337    if (U_FAILURE(status)) {
338        dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
339            line, u_errorName(status));
340        return FALSE;
341    }
342
343    UnicodeString inputString(text, -1, US_INV);
344    UnicodeString unEscapedInput = inputString.unescape();
345    LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
346    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
347
348    inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
349    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
350        // UTF-8 does not allow unpaired surrogates, so this could actually happen
351        logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
352        return TRUE; // not a failure of the Regex engine
353    }
354    status = U_ZERO_ERROR; // buffer overflow
355    textChars = new char[inputUTF8Length+1];
356    unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
357    utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
358
359    REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
360    if (U_FAILURE(status)) {
361        errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
362            line, u_errorName(status));
363        return FALSE;
364    }
365
366    UBool actualmatch;
367    actualmatch = REMatcher->lookingAt(status);
368    if (U_FAILURE(status)) {
369        errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
370            line, u_errorName(status));
371        retVal =  FALSE;
372    }
373    if (actualmatch != looking) {
374        errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
375        retVal = FALSE;
376    }
377
378    status = U_ZERO_ERROR;
379    actualmatch = REMatcher->matches(status);
380    if (U_FAILURE(status)) {
381        errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
382            line, u_errorName(status));
383        retVal = FALSE;
384    }
385    if (actualmatch != match) {
386        errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
387        retVal = FALSE;
388    }
389
390    if (retVal == FALSE) {
391        RegexPatternDump(REPattern);
392    }
393
394    delete REPattern;
395    delete REMatcher;
396    utext_close(&inputText);
397    utext_close(&pattern);
398    delete[] textChars;
399    return retVal;
400}
401
402
403
404//---------------------------------------------------------------------------
405//
406//    REGEX_ERR       Macro + invocation function to simplify writing tests
407//                       regex tests for incorrect patterns
408//
409//       usage:
410//          REGEX_ERR("pattern",   expected error line, column, expected status);
411//
412//---------------------------------------------------------------------------
413#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
414
415void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
416                          UErrorCode expectedStatus, int32_t line) {
417    UnicodeString       pattern(pat);
418
419    UErrorCode          status         = U_ZERO_ERROR;
420    UParseError         pe;
421    RegexPattern        *callerPattern = NULL;
422
423    //
424    //  Compile the caller's pattern
425    //
426    UnicodeString patString(pat);
427    callerPattern = RegexPattern::compile(patString, 0, pe, status);
428    if (status != expectedStatus) {
429        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
430    } else {
431        if (status != U_ZERO_ERROR) {
432            if (pe.line != errLine || pe.offset != errCol) {
433                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
434                    line, errLine, errCol, pe.line, pe.offset);
435            }
436        }
437    }
438
439    delete callerPattern;
440
441    //
442    //  Compile again, using a UTF-8-based UText
443    //
444    UText patternText = UTEXT_INITIALIZER;
445    regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
446    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
447    if (status != expectedStatus) {
448        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
449    } else {
450        if (status != U_ZERO_ERROR) {
451            if (pe.line != errLine || pe.offset != errCol) {
452                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
453                    line, errLine, errCol, pe.line, pe.offset);
454            }
455        }
456    }
457
458    delete callerPattern;
459    utext_close(&patternText);
460}
461
462
463
464//---------------------------------------------------------------------------
465//
466//      Basic      Check for basic functionality of regex pattern matching.
467//                 Avoid the use of REGEX_FIND test macro, which has
468//                 substantial dependencies on basic Regex functionality.
469//
470//---------------------------------------------------------------------------
471void RegexTest::Basic() {
472
473
474//
475// Debug - slide failing test cases early
476//
477#if 0
478    {
479        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
480        UParseError pe;
481        UErrorCode  status = U_ZERO_ERROR;
482        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
483        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
484        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
485    }
486    exit(1);
487#endif
488
489
490    //
491    // Pattern with parentheses
492    //
493    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
494    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
495    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
496
497    //
498    // Patterns with *
499    //
500    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
501    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
502    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
503    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
504    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
505
506    REGEX_TESTLM("a*", "",  TRUE, TRUE);
507    REGEX_TESTLM("a*", "b", TRUE, FALSE);
508
509
510    //
511    //  Patterns with "."
512    //
513    REGEX_TESTLM(".", "abc", TRUE, FALSE);
514    REGEX_TESTLM("...", "abc", TRUE, TRUE);
515    REGEX_TESTLM("....", "abc", FALSE, FALSE);
516    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
517    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
518    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
519    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
520    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
521
522    //
523    //  Patterns with * applied to chars at end of literal string
524    //
525    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
526    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
527
528    //
529    //  Supplemental chars match as single chars, not a pair of surrogates.
530    //
531    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
532    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
533    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
534
535
536    //
537    //  UnicodeSets in the pattern
538    //
539    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
540    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
541    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
542    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
543    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
544    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
545
546    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
547    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
548    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
549    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
550    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
551
552    //
553    //   OR operator in patterns
554    //
555    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
556    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
557    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
558    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
559
560    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
561    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
562    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
563    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
564    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
565    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
566
567    //
568    //  +
569    //
570    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
571    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
572    REGEX_TESTLM("b+", "", FALSE, FALSE);
573    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
574    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
575    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
576
577    //
578    //   ?
579    //
580    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
581    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
582    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
583    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
584    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
585    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
586    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
587    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
588    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
589
590    //
591    //  Escape sequences that become single literal chars, handled internally
592    //   by ICU's Unescape.
593    //
594
595    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
596    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
597    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
598    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
599    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
600    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
601    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
602    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
603    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
604    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
605
606    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
607    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
608
609    // Escape of special chars in patterns
610    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
611}
612
613
614//---------------------------------------------------------------------------
615//
616//    UTextBasic   Check for quirks that are specific to the UText
617//                 implementation.
618//
619//---------------------------------------------------------------------------
620void RegexTest::UTextBasic() {
621    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
622    UErrorCode status = U_ZERO_ERROR;
623    UText pattern = UTEXT_INITIALIZER;
624    utext_openUTF8(&pattern, str_abc, -1, &status);
625    RegexMatcher matcher(&pattern, 0, status);
626    REGEX_CHECK_STATUS;
627
628    UText input = UTEXT_INITIALIZER;
629    utext_openUTF8(&input, str_abc, -1, &status);
630    REGEX_CHECK_STATUS;
631    matcher.reset(&input);
632    REGEX_CHECK_STATUS;
633    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
634
635    matcher.reset(matcher.inputText());
636    REGEX_CHECK_STATUS;
637    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
638
639    utext_close(&pattern);
640    utext_close(&input);
641}
642
643
644//---------------------------------------------------------------------------
645//
646//      API_Match   Test that the API for class RegexMatcher
647//                  is present and nominally working, but excluding functions
648//                  implementing replace operations.
649//
650//---------------------------------------------------------------------------
651void RegexTest::API_Match() {
652    UParseError         pe;
653    UErrorCode          status=U_ZERO_ERROR;
654    int32_t             flags = 0;
655
656    //
657    // Debug - slide failing test cases early
658    //
659#if 0
660    {
661    }
662    return;
663#endif
664
665    //
666    // Simple pattern compilation
667    //
668    {
669        UnicodeString       re("abc");
670        RegexPattern        *pat2;
671        pat2 = RegexPattern::compile(re, flags, pe, status);
672        REGEX_CHECK_STATUS;
673
674        UnicodeString inStr1 = "abcdef this is a test";
675        UnicodeString instr2 = "not abc";
676        UnicodeString empty  = "";
677
678
679        //
680        // Matcher creation and reset.
681        //
682        RegexMatcher *m1 = pat2->matcher(inStr1, status);
683        REGEX_CHECK_STATUS;
684        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
685        REGEX_ASSERT(m1->input() == inStr1);
686        m1->reset(instr2);
687        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
688        REGEX_ASSERT(m1->input() == instr2);
689        m1->reset(inStr1);
690        REGEX_ASSERT(m1->input() == inStr1);
691        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
692        m1->reset(empty);
693        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
694        REGEX_ASSERT(m1->input() == empty);
695        REGEX_ASSERT(&m1->pattern() == pat2);
696
697        //
698        //  reset(pos, status)
699        //
700        m1->reset(inStr1);
701        m1->reset(4, status);
702        REGEX_CHECK_STATUS;
703        REGEX_ASSERT(m1->input() == inStr1);
704        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
705
706        m1->reset(-1, status);
707        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
708        status = U_ZERO_ERROR;
709
710        m1->reset(0, status);
711        REGEX_CHECK_STATUS;
712        status = U_ZERO_ERROR;
713
714        int32_t len = m1->input().length();
715        m1->reset(len-1, status);
716        REGEX_CHECK_STATUS;
717        status = U_ZERO_ERROR;
718
719        m1->reset(len, status);
720        REGEX_CHECK_STATUS;
721        status = U_ZERO_ERROR;
722
723        m1->reset(len+1, status);
724        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
725        status = U_ZERO_ERROR;
726
727        //
728        // match(pos, status)
729        //
730        m1->reset(instr2);
731        REGEX_ASSERT(m1->matches(4, status) == TRUE);
732        m1->reset();
733        REGEX_ASSERT(m1->matches(3, status) == FALSE);
734        m1->reset();
735        REGEX_ASSERT(m1->matches(5, status) == FALSE);
736        REGEX_ASSERT(m1->matches(4, status) == TRUE);
737        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
738        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
739
740        // Match() at end of string should fail, but should not
741        //  be an error.
742        status = U_ZERO_ERROR;
743        len = m1->input().length();
744        REGEX_ASSERT(m1->matches(len, status) == FALSE);
745        REGEX_CHECK_STATUS;
746
747        // Match beyond end of string should fail with an error.
748        status = U_ZERO_ERROR;
749        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
750        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
751
752        // Successful match at end of string.
753        {
754            status = U_ZERO_ERROR;
755            RegexMatcher m("A?", 0, status);  // will match zero length string.
756            REGEX_CHECK_STATUS;
757            m.reset(inStr1);
758            len = inStr1.length();
759            REGEX_ASSERT(m.matches(len, status) == TRUE);
760            REGEX_CHECK_STATUS;
761            m.reset(empty);
762            REGEX_ASSERT(m.matches(0, status) == TRUE);
763            REGEX_CHECK_STATUS;
764        }
765
766
767        //
768        // lookingAt(pos, status)
769        //
770        status = U_ZERO_ERROR;
771        m1->reset(instr2);  // "not abc"
772        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
773        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
774        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
775        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
776        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
777        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
778        status = U_ZERO_ERROR;
779        len = m1->input().length();
780        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
781        REGEX_CHECK_STATUS;
782        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
783        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
784
785        delete m1;
786        delete pat2;
787    }
788
789
790    //
791    // Capture Group.
792    //     RegexMatcher::start();
793    //     RegexMatcher::end();
794    //     RegexMatcher::groupCount();
795    //
796    {
797        int32_t             flags=0;
798        UParseError         pe;
799        UErrorCode          status=U_ZERO_ERROR;
800
801        UnicodeString       re("01(23(45)67)(.*)");
802        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
803        REGEX_CHECK_STATUS;
804        UnicodeString data = "0123456789";
805
806        RegexMatcher *matcher = pat->matcher(data, status);
807        REGEX_CHECK_STATUS;
808        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
809        static const int32_t matchStarts[] = {0,  2, 4, 8};
810        static const int32_t matchEnds[]   = {10, 8, 6, 10};
811        int32_t i;
812        for (i=0; i<4; i++) {
813            int32_t actualStart = matcher->start(i, status);
814            REGEX_CHECK_STATUS;
815            if (actualStart != matchStarts[i]) {
816                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
817                    __LINE__, i, matchStarts[i], actualStart);
818            }
819            int32_t actualEnd = matcher->end(i, status);
820            REGEX_CHECK_STATUS;
821            if (actualEnd != matchEnds[i]) {
822                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
823                    __LINE__, i, matchEnds[i], actualEnd);
824            }
825        }
826
827        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
828        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
829
830        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
831        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
832        matcher->reset();
833        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
834
835        matcher->lookingAt(status);
836        REGEX_ASSERT(matcher->group(status)    == "0123456789");
837        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
838        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
839        REGEX_ASSERT(matcher->group(2, status) == "45"        );
840        REGEX_ASSERT(matcher->group(3, status) == "89"        );
841        REGEX_CHECK_STATUS;
842        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
843        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
844        matcher->reset();
845        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
846
847        delete matcher;
848        delete pat;
849
850    }
851
852    //
853    //  find
854    //
855    {
856        int32_t             flags=0;
857        UParseError         pe;
858        UErrorCode          status=U_ZERO_ERROR;
859
860        UnicodeString       re("abc");
861        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
862        REGEX_CHECK_STATUS;
863        UnicodeString data = ".abc..abc...abc..";
864        //                    012345678901234567
865
866        RegexMatcher *matcher = pat->matcher(data, status);
867        REGEX_CHECK_STATUS;
868        REGEX_ASSERT(matcher->find());
869        REGEX_ASSERT(matcher->start(status) == 1);
870        REGEX_ASSERT(matcher->find());
871        REGEX_ASSERT(matcher->start(status) == 6);
872        REGEX_ASSERT(matcher->find());
873        REGEX_ASSERT(matcher->start(status) == 12);
874        REGEX_ASSERT(matcher->find() == FALSE);
875        REGEX_ASSERT(matcher->find() == FALSE);
876
877        matcher->reset();
878        REGEX_ASSERT(matcher->find());
879        REGEX_ASSERT(matcher->start(status) == 1);
880
881        REGEX_ASSERT(matcher->find(0, status));
882        REGEX_ASSERT(matcher->start(status) == 1);
883        REGEX_ASSERT(matcher->find(1, status));
884        REGEX_ASSERT(matcher->start(status) == 1);
885        REGEX_ASSERT(matcher->find(2, status));
886        REGEX_ASSERT(matcher->start(status) == 6);
887        REGEX_ASSERT(matcher->find(12, status));
888        REGEX_ASSERT(matcher->start(status) == 12);
889        REGEX_ASSERT(matcher->find(13, status) == FALSE);
890        REGEX_ASSERT(matcher->find(16, status) == FALSE);
891        REGEX_ASSERT(matcher->find(17, status) == FALSE);
892        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
893
894        status = U_ZERO_ERROR;
895        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
896        status = U_ZERO_ERROR;
897        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
898
899        REGEX_ASSERT(matcher->groupCount() == 0);
900
901        delete matcher;
902        delete pat;
903    }
904
905
906    //
907    //  find, with \G in pattern (true if at the end of a previous match).
908    //
909    {
910        int32_t             flags=0;
911        UParseError         pe;
912        UErrorCode          status=U_ZERO_ERROR;
913
914        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
915        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
916        REGEX_CHECK_STATUS;
917        UnicodeString data = ".abcabc.abc..";
918        //                    012345678901234567
919
920        RegexMatcher *matcher = pat->matcher(data, status);
921        REGEX_CHECK_STATUS;
922        REGEX_ASSERT(matcher->find());
923        REGEX_ASSERT(matcher->start(status) == 0);
924        REGEX_ASSERT(matcher->start(1, status) == -1);
925        REGEX_ASSERT(matcher->start(2, status) == 1);
926
927        REGEX_ASSERT(matcher->find());
928        REGEX_ASSERT(matcher->start(status) == 4);
929        REGEX_ASSERT(matcher->start(1, status) == 4);
930        REGEX_ASSERT(matcher->start(2, status) == -1);
931        REGEX_CHECK_STATUS;
932
933        delete matcher;
934        delete pat;
935    }
936
937    //
938    //   find with zero length matches, match position should bump ahead
939    //     to prevent loops.
940    //
941    {
942        int32_t                 i;
943        UErrorCode          status=U_ZERO_ERROR;
944        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
945                                                      //   using an always-true look-ahead.
946        REGEX_CHECK_STATUS;
947        UnicodeString s("    ");
948        m.reset(s);
949        for (i=0; ; i++) {
950            if (m.find() == FALSE) {
951                break;
952            }
953            REGEX_ASSERT(m.start(status) == i);
954            REGEX_ASSERT(m.end(status) == i);
955        }
956        REGEX_ASSERT(i==5);
957
958        // Check that the bump goes over surrogate pairs OK
959        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
960        s = s.unescape();
961        m.reset(s);
962        for (i=0; ; i+=2) {
963            if (m.find() == FALSE) {
964                break;
965            }
966            REGEX_ASSERT(m.start(status) == i);
967            REGEX_ASSERT(m.end(status) == i);
968        }
969        REGEX_ASSERT(i==10);
970    }
971    {
972        // find() loop breaking test.
973        //        with pattern of /.?/, should see a series of one char matches, then a single
974        //        match of zero length at the end of the input string.
975        int32_t                 i;
976        UErrorCode          status=U_ZERO_ERROR;
977        RegexMatcher        m(".?", 0, status);
978        REGEX_CHECK_STATUS;
979        UnicodeString s("    ");
980        m.reset(s);
981        for (i=0; ; i++) {
982            if (m.find() == FALSE) {
983                break;
984            }
985            REGEX_ASSERT(m.start(status) == i);
986            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
987        }
988        REGEX_ASSERT(i==5);
989    }
990
991
992    //
993    // Matchers with no input string behave as if they had an empty input string.
994    //
995
996    {
997        UErrorCode status = U_ZERO_ERROR;
998        RegexMatcher  m(".?", 0, status);
999        REGEX_CHECK_STATUS;
1000        REGEX_ASSERT(m.find());
1001        REGEX_ASSERT(m.start(status) == 0);
1002        REGEX_ASSERT(m.input() == "");
1003    }
1004    {
1005        UErrorCode status = U_ZERO_ERROR;
1006        RegexPattern  *p = RegexPattern::compile(".", 0, status);
1007        RegexMatcher  *m = p->matcher(status);
1008        REGEX_CHECK_STATUS;
1009
1010        REGEX_ASSERT(m->find() == FALSE);
1011        REGEX_ASSERT(m->input() == "");
1012        delete m;
1013        delete p;
1014    }
1015
1016    //
1017    // Regions
1018    //
1019    {
1020        UErrorCode status = U_ZERO_ERROR;
1021        UnicodeString testString("This is test data");
1022        RegexMatcher m(".*", testString,  0, status);
1023        REGEX_CHECK_STATUS;
1024        REGEX_ASSERT(m.regionStart() == 0);
1025        REGEX_ASSERT(m.regionEnd() == testString.length());
1026        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1027        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1028
1029        m.region(2,4, status);
1030        REGEX_CHECK_STATUS;
1031        REGEX_ASSERT(m.matches(status));
1032        REGEX_ASSERT(m.start(status)==2);
1033        REGEX_ASSERT(m.end(status)==4);
1034        REGEX_CHECK_STATUS;
1035
1036        m.reset();
1037        REGEX_ASSERT(m.regionStart() == 0);
1038        REGEX_ASSERT(m.regionEnd() == testString.length());
1039
1040        UnicodeString shorterString("short");
1041        m.reset(shorterString);
1042        REGEX_ASSERT(m.regionStart() == 0);
1043        REGEX_ASSERT(m.regionEnd() == shorterString.length());
1044
1045        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1046        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1047        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1048        REGEX_ASSERT(&m == &m.reset());
1049        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1050
1051        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1052        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1053        REGEX_ASSERT(&m == &m.reset());
1054        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1055
1056        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1057        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1058        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1059        REGEX_ASSERT(&m == &m.reset());
1060        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1061
1062        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1063        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1064        REGEX_ASSERT(&m == &m.reset());
1065        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1066
1067    }
1068
1069    //
1070    // hitEnd() and requireEnd()
1071    //
1072    {
1073        UErrorCode status = U_ZERO_ERROR;
1074        UnicodeString testString("aabb");
1075        RegexMatcher m1(".*", testString,  0, status);
1076        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1077        REGEX_ASSERT(m1.hitEnd() == TRUE);
1078        REGEX_ASSERT(m1.requireEnd() == FALSE);
1079        REGEX_CHECK_STATUS;
1080
1081        status = U_ZERO_ERROR;
1082        RegexMatcher m2("a*", testString, 0, status);
1083        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1084        REGEX_ASSERT(m2.hitEnd() == FALSE);
1085        REGEX_ASSERT(m2.requireEnd() == FALSE);
1086        REGEX_CHECK_STATUS;
1087
1088        status = U_ZERO_ERROR;
1089        RegexMatcher m3(".*$", testString, 0, status);
1090        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1091        REGEX_ASSERT(m3.hitEnd() == TRUE);
1092        REGEX_ASSERT(m3.requireEnd() == TRUE);
1093        REGEX_CHECK_STATUS;
1094    }
1095
1096
1097    //
1098    // Compilation error on reset with UChar *
1099    //   These were a hazard that people were stumbling over with runtime errors.
1100    //   Changed them to compiler errors by adding private methods that more closely
1101    //   matched the incorrect use of the functions.
1102    //
1103#if 0
1104    {
1105        UErrorCode status = U_ZERO_ERROR;
1106        UChar ucharString[20];
1107        RegexMatcher m(".", 0, status);
1108        m.reset(ucharString);  // should not compile.
1109
1110        RegexPattern *p = RegexPattern::compile(".", 0, status);
1111        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1112
1113        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1114    }
1115#endif
1116
1117    //
1118    //  Time Outs.
1119    //       Note:  These tests will need to be changed when the regexp engine is
1120    //              able to detect and cut short the exponential time behavior on
1121    //              this type of match.
1122    //
1123    {
1124        UErrorCode status = U_ZERO_ERROR;
1125        //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1127        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1128        RegexMatcher matcher("(a+)+b", testString, 0, status);
1129        REGEX_CHECK_STATUS;
1130        REGEX_ASSERT(matcher.getTimeLimit() == 0);
1131        matcher.setTimeLimit(100, status);
1132        REGEX_ASSERT(matcher.getTimeLimit() == 100);
1133        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1134        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1135    }
1136    {
1137        UErrorCode status = U_ZERO_ERROR;
1138        //   Few enough 'a's to slip in under the time limit.
1139        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1140        RegexMatcher matcher("(a+)+b", testString, 0, status);
1141        REGEX_CHECK_STATUS;
1142        matcher.setTimeLimit(100, status);
1143        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1144        REGEX_CHECK_STATUS;
1145    }
1146
1147    //
1148    //  Stack Limits
1149    //
1150    {
1151        UErrorCode status = U_ZERO_ERROR;
1152        UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1153
1154        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1155        //   of the '+', and makes the stack frames larger.
1156        RegexMatcher matcher("(A)+A$", testString, 0, status);
1157
1158        // With the default stack, this match should fail to run
1159        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1160        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1161
1162        // With unlimited stack, it should run
1163        status = U_ZERO_ERROR;
1164        matcher.setStackLimit(0, status);
1165        REGEX_CHECK_STATUS;
1166        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1167        REGEX_CHECK_STATUS;
1168        REGEX_ASSERT(matcher.getStackLimit() == 0);
1169
        // With a limited stack, the match should fail
1171        status = U_ZERO_ERROR;
1172        matcher.setStackLimit(10000, status);
1173        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1174        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1175        REGEX_ASSERT(matcher.getStackLimit() == 10000);
1176    }
1177
1178        // A pattern that doesn't save state should work with
1179        //   a minimal sized stack
1180    {
1181        UErrorCode status = U_ZERO_ERROR;
1182        UnicodeString testString = "abc";
1183        RegexMatcher matcher("abc", testString, 0, status);
1184        REGEX_CHECK_STATUS;
1185        matcher.setStackLimit(30, status);
1186        REGEX_CHECK_STATUS;
1187        REGEX_ASSERT(matcher.matches(status) == TRUE);
1188        REGEX_CHECK_STATUS;
1189        REGEX_ASSERT(matcher.getStackLimit() == 30);
1190
1191        // Negative stack sizes should fail
1192        status = U_ZERO_ERROR;
1193        matcher.setStackLimit(1000, status);
1194        REGEX_CHECK_STATUS;
1195        matcher.setStackLimit(-1, status);
1196        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1197        REGEX_ASSERT(matcher.getStackLimit() == 1000);
1198    }
1199
1200
1201}
1202
1203
1204
1205
1206
1207
1208//---------------------------------------------------------------------------
1209//
1210//      API_Replace        API test for class RegexMatcher, testing the
1211//                         Replace family of functions.
1212//
1213//---------------------------------------------------------------------------
1214void RegexTest::API_Replace() {
1215    //
1216    //  Replace
1217    //
1218    int32_t             flags=0;
1219    UParseError         pe;
1220    UErrorCode          status=U_ZERO_ERROR;
1221
1222    UnicodeString       re("abc");
1223    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1224    REGEX_CHECK_STATUS;
1225    UnicodeString data = ".abc..abc...abc..";
1226    //                    012345678901234567
1227    RegexMatcher *matcher = pat->matcher(data, status);
1228
1229    //
1230    //  Plain vanilla matches.
1231    //
1232    UnicodeString  dest;
1233    dest = matcher->replaceFirst("yz", status);
1234    REGEX_CHECK_STATUS;
1235    REGEX_ASSERT(dest == ".yz..abc...abc..");
1236
1237    dest = matcher->replaceAll("yz", status);
1238    REGEX_CHECK_STATUS;
1239    REGEX_ASSERT(dest == ".yz..yz...yz..");
1240
1241    //
1242    //  Plain vanilla non-matches.
1243    //
1244    UnicodeString d2 = ".abx..abx...abx..";
1245    matcher->reset(d2);
1246    dest = matcher->replaceFirst("yz", status);
1247    REGEX_CHECK_STATUS;
1248    REGEX_ASSERT(dest == ".abx..abx...abx..");
1249
1250    dest = matcher->replaceAll("yz", status);
1251    REGEX_CHECK_STATUS;
1252    REGEX_ASSERT(dest == ".abx..abx...abx..");
1253
1254    //
1255    // Empty source string
1256    //
1257    UnicodeString d3 = "";
1258    matcher->reset(d3);
1259    dest = matcher->replaceFirst("yz", status);
1260    REGEX_CHECK_STATUS;
1261    REGEX_ASSERT(dest == "");
1262
1263    dest = matcher->replaceAll("yz", status);
1264    REGEX_CHECK_STATUS;
1265    REGEX_ASSERT(dest == "");
1266
1267    //
1268    // Empty substitution string
1269    //
1270    matcher->reset(data);              // ".abc..abc...abc.."
1271    dest = matcher->replaceFirst("", status);
1272    REGEX_CHECK_STATUS;
1273    REGEX_ASSERT(dest == "...abc...abc..");
1274
1275    dest = matcher->replaceAll("", status);
1276    REGEX_CHECK_STATUS;
1277    REGEX_ASSERT(dest == "........");
1278
1279    //
1280    // match whole string
1281    //
1282    UnicodeString d4 = "abc";
1283    matcher->reset(d4);
1284    dest = matcher->replaceFirst("xyz", status);
1285    REGEX_CHECK_STATUS;
1286    REGEX_ASSERT(dest == "xyz");
1287
1288    dest = matcher->replaceAll("xyz", status);
1289    REGEX_CHECK_STATUS;
1290    REGEX_ASSERT(dest == "xyz");
1291
1292    //
1293    // Capture Group, simple case
1294    //
1295    UnicodeString       re2("a(..)");
1296    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1297    REGEX_CHECK_STATUS;
1298    UnicodeString d5 = "abcdefg";
1299    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1300    REGEX_CHECK_STATUS;
1301    dest = matcher2->replaceFirst("$1$1", status);
1302    REGEX_CHECK_STATUS;
1303    REGEX_ASSERT(dest == "bcbcdefg");
1304
1305    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1306    REGEX_CHECK_STATUS;
1307    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1308
1309    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1310    REGEX_CHECK_STATUS;
1311    REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1312
1313    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1314    replacement = replacement.unescape();
1315    dest = matcher2->replaceFirst(replacement, status);
1316    REGEX_CHECK_STATUS;
1317    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1318
1319    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1320
1321
1322    //
1323    // Replacement String with \u hex escapes
1324    //
1325    {
1326        UnicodeString  src = "abc 1 abc 2 abc 3";
1327        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1328        matcher->reset(src);
1329        UnicodeString  result = matcher->replaceAll(substitute, status);
1330        REGEX_CHECK_STATUS;
1331        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1332    }
1333    {
1334        UnicodeString  src = "abc !";
1335        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1336        matcher->reset(src);
1337        UnicodeString  result = matcher->replaceAll(substitute, status);
1338        REGEX_CHECK_STATUS;
1339        UnicodeString expected = UnicodeString("--");
1340        expected.append((UChar32)0x10000);
1341        expected.append("-- !");
1342        REGEX_ASSERT(result == expected);
1343    }
1344    // TODO:  need more through testing of capture substitutions.
1345
1346    // Bug 4057
1347    //
1348    {
1349        status = U_ZERO_ERROR;
1350        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1351        RegexMatcher m("ss(.*?)ee", 0, status);
1352        REGEX_CHECK_STATUS;
1353        UnicodeString result;
1354
1355        // Multiple finds do NOT bump up the previous appendReplacement postion.
1356        m.reset(s);
1357        m.find();
1358        m.find();
1359        m.appendReplacement(result, "ooh", status);
1360        REGEX_CHECK_STATUS;
1361        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1362
1363        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1364        status = U_ZERO_ERROR;
1365        result.truncate(0);
1366        m.reset(10, status);
1367        m.find();
1368        m.find();
1369        m.appendReplacement(result, "ooh", status);
1370        REGEX_CHECK_STATUS;
1371        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1372
1373        // find() at interior of string, appendReplacemnt still starts at beginning.
1374        status = U_ZERO_ERROR;
1375        result.truncate(0);
1376        m.reset();
1377        m.find(10, status);
1378        m.find();
1379        m.appendReplacement(result, "ooh", status);
1380        REGEX_CHECK_STATUS;
1381        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1382
1383        m.appendTail(result);
1384        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1385
1386    }
1387
1388    delete matcher2;
1389    delete pat2;
1390    delete matcher;
1391    delete pat;
1392}
1393
1394
1395//---------------------------------------------------------------------------
1396//
1397//      API_Pattern       Test that the API for class RegexPattern is
1398//                        present and nominally working.
1399//
1400//---------------------------------------------------------------------------
1401void RegexTest::API_Pattern() {
1402    RegexPattern        pata;    // Test default constructor to not crash.
1403    RegexPattern        patb;
1404
1405    REGEX_ASSERT(pata == patb);
1406    REGEX_ASSERT(pata == pata);
1407
1408    UnicodeString re1("abc[a-l][m-z]");
1409    UnicodeString re2("def");
1410    UErrorCode    status = U_ZERO_ERROR;
1411    UParseError   pe;
1412
1413    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1414    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1415    REGEX_CHECK_STATUS;
1416    REGEX_ASSERT(*pat1 == *pat1);
1417    REGEX_ASSERT(*pat1 != pata);
1418
1419    // Assign
1420    patb = *pat1;
1421    REGEX_ASSERT(patb == *pat1);
1422
1423    // Copy Construct
1424    RegexPattern patc(*pat1);
1425    REGEX_ASSERT(patc == *pat1);
1426    REGEX_ASSERT(patb == patc);
1427    REGEX_ASSERT(pat1 != pat2);
1428    patb = *pat2;
1429    REGEX_ASSERT(patb != patc);
1430    REGEX_ASSERT(patb == *pat2);
1431
1432    // Compile with no flags.
1433    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1434    REGEX_ASSERT(*pat1a == *pat1);
1435
1436    REGEX_ASSERT(pat1a->flags() == 0);
1437
1438    // Compile with different flags should be not equal
1439    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1440    REGEX_CHECK_STATUS;
1441
1442    REGEX_ASSERT(*pat1b != *pat1a);
1443    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1444    REGEX_ASSERT(pat1a->flags() == 0);
1445    delete pat1b;
1446
1447    // clone
1448    RegexPattern *pat1c = pat1->clone();
1449    REGEX_ASSERT(*pat1c == *pat1);
1450    REGEX_ASSERT(*pat1c != *pat2);
1451
1452    delete pat1c;
1453    delete pat1a;
1454    delete pat1;
1455    delete pat2;
1456
1457
1458    //
1459    //   Verify that a matcher created from a cloned pattern works.
1460    //     (Jitterbug 3423)
1461    //
1462    {
1463        UErrorCode     status     = U_ZERO_ERROR;
1464        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1465        RegexPattern  *pClone     = pSource->clone();
1466        delete         pSource;
1467        RegexMatcher  *mFromClone = pClone->matcher(status);
1468        REGEX_CHECK_STATUS;
1469        UnicodeString s = "Hello World";
1470        mFromClone->reset(s);
1471        REGEX_ASSERT(mFromClone->find() == TRUE);
1472        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1473        REGEX_ASSERT(mFromClone->find() == TRUE);
1474        REGEX_ASSERT(mFromClone->group(status) == "World");
1475        REGEX_ASSERT(mFromClone->find() == FALSE);
1476        delete mFromClone;
1477        delete pClone;
1478    }
1479
1480    //
1481    //   matches convenience API
1482    //
1483    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1484    REGEX_CHECK_STATUS;
1485    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1486    REGEX_CHECK_STATUS;
1487    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1488    REGEX_CHECK_STATUS;
1489    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1490    REGEX_CHECK_STATUS;
1491    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1492    REGEX_CHECK_STATUS;
1493    status = U_INDEX_OUTOFBOUNDS_ERROR;
1494    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1495    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1496
1497
1498    //
1499    // Split()
1500    //
1501    status = U_ZERO_ERROR;
1502    pat1 = RegexPattern::compile(" +",  pe, status);
1503    REGEX_CHECK_STATUS;
1504    UnicodeString  fields[10];
1505
1506    int32_t n;
1507    n = pat1->split("Now is the time", fields, 10, status);
1508    REGEX_CHECK_STATUS;
1509    REGEX_ASSERT(n==4);
1510    REGEX_ASSERT(fields[0]=="Now");
1511    REGEX_ASSERT(fields[1]=="is");
1512    REGEX_ASSERT(fields[2]=="the");
1513    REGEX_ASSERT(fields[3]=="time");
1514    REGEX_ASSERT(fields[4]=="");
1515
1516    n = pat1->split("Now is the time", fields, 2, status);
1517    REGEX_CHECK_STATUS;
1518    REGEX_ASSERT(n==2);
1519    REGEX_ASSERT(fields[0]=="Now");
1520    REGEX_ASSERT(fields[1]=="is the time");
1521    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1522
1523    fields[1] = "*";
1524    status = U_ZERO_ERROR;
1525    n = pat1->split("Now is the time", fields, 1, status);
1526    REGEX_CHECK_STATUS;
1527    REGEX_ASSERT(n==1);
1528    REGEX_ASSERT(fields[0]=="Now is the time");
1529    REGEX_ASSERT(fields[1]=="*");
1530    status = U_ZERO_ERROR;
1531
1532    n = pat1->split("    Now       is the time   ", fields, 10, status);
1533    REGEX_CHECK_STATUS;
1534    REGEX_ASSERT(n==5);
1535    REGEX_ASSERT(fields[0]=="");
1536    REGEX_ASSERT(fields[1]=="Now");
1537    REGEX_ASSERT(fields[2]=="is");
1538    REGEX_ASSERT(fields[3]=="the");
1539    REGEX_ASSERT(fields[4]=="time");
1540    REGEX_ASSERT(fields[5]=="");
1541
1542    n = pat1->split("     ", fields, 10, status);
1543    REGEX_CHECK_STATUS;
1544    REGEX_ASSERT(n==1);
1545    REGEX_ASSERT(fields[0]=="");
1546
1547    fields[0] = "foo";
1548    n = pat1->split("", fields, 10, status);
1549    REGEX_CHECK_STATUS;
1550    REGEX_ASSERT(n==0);
1551    REGEX_ASSERT(fields[0]=="foo");
1552
1553    delete pat1;
1554
1555    //  split, with a pattern with (capture)
1556    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1557    REGEX_CHECK_STATUS;
1558
1559    status = U_ZERO_ERROR;
1560    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1561    REGEX_CHECK_STATUS;
1562    REGEX_ASSERT(n==6);
1563    REGEX_ASSERT(fields[0]=="");
1564    REGEX_ASSERT(fields[1]=="a");
1565    REGEX_ASSERT(fields[2]=="Now is ");
1566    REGEX_ASSERT(fields[3]=="b");
1567    REGEX_ASSERT(fields[4]=="the time");
1568    REGEX_ASSERT(fields[5]=="c");
1569    REGEX_ASSERT(fields[6]=="");
1570    REGEX_ASSERT(status==U_ZERO_ERROR);
1571
1572    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1573    REGEX_CHECK_STATUS;
1574    REGEX_ASSERT(n==6);
1575    REGEX_ASSERT(fields[0]=="  ");
1576    REGEX_ASSERT(fields[1]=="a");
1577    REGEX_ASSERT(fields[2]=="Now is ");
1578    REGEX_ASSERT(fields[3]=="b");
1579    REGEX_ASSERT(fields[4]=="the time");
1580    REGEX_ASSERT(fields[5]=="c");
1581    REGEX_ASSERT(fields[6]=="");
1582
1583    status = U_ZERO_ERROR;
1584    fields[6] = "foo";
1585    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1586    REGEX_CHECK_STATUS;
1587    REGEX_ASSERT(n==6);
1588    REGEX_ASSERT(fields[0]=="  ");
1589    REGEX_ASSERT(fields[1]=="a");
1590    REGEX_ASSERT(fields[2]=="Now is ");
1591    REGEX_ASSERT(fields[3]=="b");
1592    REGEX_ASSERT(fields[4]=="the time");
1593    REGEX_ASSERT(fields[5]=="c");
1594    REGEX_ASSERT(fields[6]=="foo");
1595
1596    status = U_ZERO_ERROR;
1597    fields[5] = "foo";
1598    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1599    REGEX_CHECK_STATUS;
1600    REGEX_ASSERT(n==5);
1601    REGEX_ASSERT(fields[0]=="  ");
1602    REGEX_ASSERT(fields[1]=="a");
1603    REGEX_ASSERT(fields[2]=="Now is ");
1604    REGEX_ASSERT(fields[3]=="b");
1605    REGEX_ASSERT(fields[4]=="the time<c>");
1606    REGEX_ASSERT(fields[5]=="foo");
1607
1608    status = U_ZERO_ERROR;
1609    fields[5] = "foo";
1610    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1611    REGEX_CHECK_STATUS;
1612    REGEX_ASSERT(n==5);
1613    REGEX_ASSERT(fields[0]=="  ");
1614    REGEX_ASSERT(fields[1]=="a");
1615    REGEX_ASSERT(fields[2]=="Now is ");
1616    REGEX_ASSERT(fields[3]=="b");
1617    REGEX_ASSERT(fields[4]=="the time");
1618    REGEX_ASSERT(fields[5]=="foo");
1619
1620    status = U_ZERO_ERROR;
1621    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1622    REGEX_CHECK_STATUS;
1623    REGEX_ASSERT(n==4);
1624    REGEX_ASSERT(fields[0]=="  ");
1625    REGEX_ASSERT(fields[1]=="a");
1626    REGEX_ASSERT(fields[2]=="Now is ");
1627    REGEX_ASSERT(fields[3]=="the time<c>");
1628    status = U_ZERO_ERROR;
1629    delete pat1;
1630
1631    pat1 = RegexPattern::compile("([-,])",  pe, status);
1632    REGEX_CHECK_STATUS;
1633    n = pat1->split("1-10,20", fields, 10, status);
1634    REGEX_CHECK_STATUS;
1635    REGEX_ASSERT(n==5);
1636    REGEX_ASSERT(fields[0]=="1");
1637    REGEX_ASSERT(fields[1]=="-");
1638    REGEX_ASSERT(fields[2]=="10");
1639    REGEX_ASSERT(fields[3]==",");
1640    REGEX_ASSERT(fields[4]=="20");
1641    delete pat1;
1642
1643
1644    //
1645    // RegexPattern::pattern()
1646    //
1647    pat1 = new RegexPattern();
1648    REGEX_ASSERT(pat1->pattern() == "");
1649    delete pat1;
1650
1651    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1652    REGEX_CHECK_STATUS;
1653    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1654    delete pat1;
1655
1656
1657    //
1658    // classID functions
1659    //
1660    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1661    REGEX_CHECK_STATUS;
1662    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1663    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1664    UnicodeString Hello("Hello, world.");
1665    RegexMatcher *m = pat1->matcher(Hello, status);
1666    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1667    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1668    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1669    delete m;
1670    delete pat1;
1671
1672}
1673
1674//---------------------------------------------------------------------------
1675//
1676//      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1677//                       is present and working, but excluding functions
1678//                       implementing replace operations.
1679//
1680//---------------------------------------------------------------------------
1681void RegexTest::API_Match_UTF8() {
1682    UParseError         pe;
1683    UErrorCode          status=U_ZERO_ERROR;
1684    int32_t             flags = 0;
1685
1686    //
1687    // Debug - slide failing test cases early
1688    //
1689#if 0
1690    {
1691    }
1692    return;
1693#endif
1694
1695    //
1696    // Simple pattern compilation
1697    //
1698    {
1699        UText               re = UTEXT_INITIALIZER;
1700        regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1701        RegexPattern        *pat2;
1702        pat2 = RegexPattern::compile(&re, flags, pe, status);
1703        REGEX_CHECK_STATUS;
1704
1705        UText input1 = UTEXT_INITIALIZER;
1706        UText input2 = UTEXT_INITIALIZER;
1707        UText empty  = UTEXT_INITIALIZER;
1708        regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1709        REGEX_VERBOSE_TEXT(&input1);
1710        regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1711        REGEX_VERBOSE_TEXT(&input2);
1712        utext_openUChars(&empty, NULL, 0, &status);
1713
1714        int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1715        int32_t input2Len = strlen("not abc");
1716
1717
1718        //
1719        // Matcher creation and reset.
1720        //
1721        RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status);
1722        REGEX_CHECK_STATUS;
1723        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1724        const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1725        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1726        m1->reset(&input2);
1727        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1728        const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1729        REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1730        m1->reset(&input1);
1731        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1732        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1733        m1->reset(&empty);
1734        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1735        REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1736
1737        //
1738        //  reset(pos, status)
1739        //
1740        m1->reset(&input1);
1741        m1->reset(4, status);
1742        REGEX_CHECK_STATUS;
1743        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1744        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1745
1746        m1->reset(-1, status);
1747        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1748        status = U_ZERO_ERROR;
1749
1750        m1->reset(0, status);
1751        REGEX_CHECK_STATUS;
1752        status = U_ZERO_ERROR;
1753
1754        m1->reset(input1Len-1, status);
1755        REGEX_CHECK_STATUS;
1756        status = U_ZERO_ERROR;
1757
1758        m1->reset(input1Len, status);
1759        REGEX_CHECK_STATUS;
1760        status = U_ZERO_ERROR;
1761
1762        m1->reset(input1Len+1, status);
1763        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1764        status = U_ZERO_ERROR;
1765
1766        //
1767        // match(pos, status)
1768        //
1769        m1->reset(&input2);
1770        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1771        m1->reset();
1772        REGEX_ASSERT(m1->matches(3, status) == FALSE);
1773        m1->reset();
1774        REGEX_ASSERT(m1->matches(5, status) == FALSE);
1775        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1776        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1777        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1778
1779        // Match() at end of string should fail, but should not
1780        //  be an error.
1781        status = U_ZERO_ERROR;
1782        REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1783        REGEX_CHECK_STATUS;
1784
1785        // Match beyond end of string should fail with an error.
1786        status = U_ZERO_ERROR;
1787        REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1788        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1789
1790        // Successful match at end of string.
1791        {
1792            status = U_ZERO_ERROR;
1793            RegexMatcher m("A?", 0, status);  // will match zero length string.
1794            REGEX_CHECK_STATUS;
1795            m.reset(&input1);
1796            REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1797            REGEX_CHECK_STATUS;
1798            m.reset(&empty);
1799            REGEX_ASSERT(m.matches(0, status) == TRUE);
1800            REGEX_CHECK_STATUS;
1801        }
1802
1803
1804        //
1805        // lookingAt(pos, status)
1806        //
1807        status = U_ZERO_ERROR;
1808        m1->reset(&input2);  // "not abc"
1809        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1810        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1811        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1812        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1813        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1814        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1815        status = U_ZERO_ERROR;
1816        REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1817        REGEX_CHECK_STATUS;
1818        REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1819        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1820
1821        delete m1;
1822        delete pat2;
1823
1824        utext_close(&re);
1825        utext_close(&input1);
1826        utext_close(&input2);
1827        utext_close(&empty);
1828    }
1829
1830
1831    //
1832    // Capture Group.
1833    //     RegexMatcher::start();
1834    //     RegexMatcher::end();
1835    //     RegexMatcher::groupCount();
1836    //
1837    {
1838        int32_t             flags=0;
1839        UParseError         pe;
1840        UErrorCode          status=U_ZERO_ERROR;
1841        UText               re=UTEXT_INITIALIZER;
1842        const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1843        utext_openUTF8(&re, str_01234567_pat, -1, &status);
1844
1845        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1846        REGEX_CHECK_STATUS;
1847
1848        UText input = UTEXT_INITIALIZER;
1849        const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1850        utext_openUTF8(&input, str_0123456789, -1, &status);
1851
1852        RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1853        REGEX_CHECK_STATUS;
1854        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1855        static const int32_t matchStarts[] = {0,  2, 4, 8};
1856        static const int32_t matchEnds[]   = {10, 8, 6, 10};
1857        int32_t i;
1858        for (i=0; i<4; i++) {
1859            int32_t actualStart = matcher->start(i, status);
1860            REGEX_CHECK_STATUS;
1861            if (actualStart != matchStarts[i]) {
1862                errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
1863                      __FILE__, __LINE__, i, matchStarts[i], actualStart);
1864            }
1865            int32_t actualEnd = matcher->end(i, status);
1866            REGEX_CHECK_STATUS;
1867            if (actualEnd != matchEnds[i]) {
1868                errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
1869                      __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1870            }
1871        }
1872
1873        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1874        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1875
1876        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1877        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1878        matcher->reset();
1879        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1880
1881        matcher->lookingAt(status);
1882
1883        UnicodeString dest;
1884        UText destText = UTEXT_INITIALIZER;
1885        utext_openUnicodeString(&destText, &dest, &status);
1886        UText *result;
1887        //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1888        //	Test shallow-clone API
1889        int64_t   group_len;
1890        result = matcher->group((UText *)NULL, group_len, status);
1891        REGEX_CHECK_STATUS;
1892        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1893        utext_close(result);
1894        result = matcher->group(0, &destText, group_len, status);
1895        REGEX_CHECK_STATUS;
1896        REGEX_ASSERT(result == &destText);
1897        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1898        //  destText is now immutable, reopen it
1899        utext_close(&destText);
1900        utext_openUnicodeString(&destText, &dest, &status);
1901
1902        result = matcher->group(0, NULL, status);
1903        REGEX_CHECK_STATUS;
1904        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1905        utext_close(result);
1906        result = matcher->group(0, &destText, status);
1907        REGEX_CHECK_STATUS;
1908        REGEX_ASSERT(result == &destText);
1909        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1910
1911        result = matcher->group(1, NULL, status);
1912        REGEX_CHECK_STATUS;
1913        const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
1914        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
1915        utext_close(result);
1916        result = matcher->group(1, &destText, status);
1917        REGEX_CHECK_STATUS;
1918        REGEX_ASSERT(result == &destText);
1919        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
1920
1921        result = matcher->group(2, NULL, status);
1922        REGEX_CHECK_STATUS;
1923        const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
1924        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
1925        utext_close(result);
1926        result = matcher->group(2, &destText, status);
1927        REGEX_CHECK_STATUS;
1928        REGEX_ASSERT(result == &destText);
1929        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
1930
1931        result = matcher->group(3, NULL, status);
1932        REGEX_CHECK_STATUS;
1933        const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
1934        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
1935        utext_close(result);
1936        result = matcher->group(3, &destText, status);
1937        REGEX_CHECK_STATUS;
1938        REGEX_ASSERT(result == &destText);
1939        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
1940
1941        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1942        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1943        matcher->reset();
1944        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
1945
1946        delete matcher;
1947        delete pat;
1948
1949        utext_close(&destText);
1950        utext_close(&input);
1951        utext_close(&re);
1952    }
1953
1954    //
1955    //  find
1956    //
1957    {
1958        int32_t             flags=0;
1959        UParseError         pe;
1960        UErrorCode          status=U_ZERO_ERROR;
1961        UText               re=UTEXT_INITIALIZER;
1962        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
1963        utext_openUTF8(&re, str_abc, -1, &status);
1964
1965        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1966        REGEX_CHECK_STATUS;
1967        UText input = UTEXT_INITIALIZER;
1968        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
1969        utext_openUTF8(&input, str_abcabcabc, -1, &status);
1970        //                      012345678901234567
1971
1972        RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1973        REGEX_CHECK_STATUS;
1974        REGEX_ASSERT(matcher->find());
1975        REGEX_ASSERT(matcher->start(status) == 1);
1976        REGEX_ASSERT(matcher->find());
1977        REGEX_ASSERT(matcher->start(status) == 6);
1978        REGEX_ASSERT(matcher->find());
1979        REGEX_ASSERT(matcher->start(status) == 12);
1980        REGEX_ASSERT(matcher->find() == FALSE);
1981        REGEX_ASSERT(matcher->find() == FALSE);
1982
1983        matcher->reset();
1984        REGEX_ASSERT(matcher->find());
1985        REGEX_ASSERT(matcher->start(status) == 1);
1986
1987        REGEX_ASSERT(matcher->find(0, status));
1988        REGEX_ASSERT(matcher->start(status) == 1);
1989        REGEX_ASSERT(matcher->find(1, status));
1990        REGEX_ASSERT(matcher->start(status) == 1);
1991        REGEX_ASSERT(matcher->find(2, status));
1992        REGEX_ASSERT(matcher->start(status) == 6);
1993        REGEX_ASSERT(matcher->find(12, status));
1994        REGEX_ASSERT(matcher->start(status) == 12);
1995        REGEX_ASSERT(matcher->find(13, status) == FALSE);
1996        REGEX_ASSERT(matcher->find(16, status) == FALSE);
1997        REGEX_ASSERT(matcher->find(17, status) == FALSE);
1998        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1999
2000        status = U_ZERO_ERROR;
2001        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2002        status = U_ZERO_ERROR;
2003        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2004
2005        REGEX_ASSERT(matcher->groupCount() == 0);
2006
2007        delete matcher;
2008        delete pat;
2009
2010        utext_close(&input);
2011        utext_close(&re);
2012    }
2013
2014
2015    //
2016    //  find, with \G in pattern (true if at the end of a previous match).
2017    //
2018    {
2019        int32_t             flags=0;
2020        UParseError         pe;
2021        UErrorCode          status=U_ZERO_ERROR;
2022        UText               re=UTEXT_INITIALIZER;
2023        const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2024        utext_openUTF8(&re, str_Gabcabc, -1, &status);
2025
2026        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2027
2028        REGEX_CHECK_STATUS;
2029        UText input = UTEXT_INITIALIZER;
2030        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2031        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2032        //                      012345678901234567
2033
2034        RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
2035        REGEX_CHECK_STATUS;
2036        REGEX_ASSERT(matcher->find());
2037        REGEX_ASSERT(matcher->start(status) == 0);
2038        REGEX_ASSERT(matcher->start(1, status) == -1);
2039        REGEX_ASSERT(matcher->start(2, status) == 1);
2040
2041        REGEX_ASSERT(matcher->find());
2042        REGEX_ASSERT(matcher->start(status) == 4);
2043        REGEX_ASSERT(matcher->start(1, status) == 4);
2044        REGEX_ASSERT(matcher->start(2, status) == -1);
2045        REGEX_CHECK_STATUS;
2046
2047        delete matcher;
2048        delete pat;
2049
2050        utext_close(&input);
2051        utext_close(&re);
2052    }
2053
2054    //
2055    //   find with zero length matches, match position should bump ahead
2056    //     to prevent loops.
2057    //
2058    {
2059        int32_t                 i;
2060        UErrorCode          status=U_ZERO_ERROR;
2061        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2062                                                      //   using an always-true look-ahead.
2063        REGEX_CHECK_STATUS;
2064        UText s = UTEXT_INITIALIZER;
2065        utext_openUTF8(&s, "    ", -1, &status);
2066        m.reset(&s);
2067        for (i=0; ; i++) {
2068            if (m.find() == FALSE) {
2069                break;
2070            }
2071            REGEX_ASSERT(m.start(status) == i);
2072            REGEX_ASSERT(m.end(status) == i);
2073        }
2074        REGEX_ASSERT(i==5);
2075
2076        // Check that the bump goes over characters outside the BMP OK
2077        // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2078        unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2079        utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2080        m.reset(&s);
2081        for (i=0; ; i+=4) {
2082            if (m.find() == FALSE) {
2083                break;
2084            }
2085            REGEX_ASSERT(m.start(status) == i);
2086            REGEX_ASSERT(m.end(status) == i);
2087        }
2088        REGEX_ASSERT(i==20);
2089
2090        utext_close(&s);
2091    }
2092    {
2093        // find() loop breaking test.
2094        //        with pattern of /.?/, should see a series of one char matches, then a single
2095        //        match of zero length at the end of the input string.
2096        int32_t                 i;
2097        UErrorCode          status=U_ZERO_ERROR;
2098        RegexMatcher        m(".?", 0, status);
2099        REGEX_CHECK_STATUS;
2100        UText s = UTEXT_INITIALIZER;
2101        utext_openUTF8(&s, "    ", -1, &status);
2102        m.reset(&s);
2103        for (i=0; ; i++) {
2104            if (m.find() == FALSE) {
2105                break;
2106            }
2107            REGEX_ASSERT(m.start(status) == i);
2108            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2109        }
2110        REGEX_ASSERT(i==5);
2111
2112        utext_close(&s);
2113    }
2114
2115
2116    //
2117    // Matchers with no input string behave as if they had an empty input string.
2118    //
2119
2120    {
2121        UErrorCode status = U_ZERO_ERROR;
2122        RegexMatcher  m(".?", 0, status);
2123        REGEX_CHECK_STATUS;
2124        REGEX_ASSERT(m.find());
2125        REGEX_ASSERT(m.start(status) == 0);
2126        REGEX_ASSERT(m.input() == "");
2127    }
2128    {
2129        UErrorCode status = U_ZERO_ERROR;
2130        RegexPattern  *p = RegexPattern::compile(".", 0, status);
2131        RegexMatcher  *m = p->matcher(status);
2132        REGEX_CHECK_STATUS;
2133
2134        REGEX_ASSERT(m->find() == FALSE);
2135        REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2136        delete m;
2137        delete p;
2138    }
2139
2140    //
2141    // Regions
2142    //
2143    {
2144        UErrorCode status = U_ZERO_ERROR;
2145        UText testPattern = UTEXT_INITIALIZER;
2146        UText testText    = UTEXT_INITIALIZER;
2147        regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2148        REGEX_VERBOSE_TEXT(&testPattern);
2149        regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2150        REGEX_VERBOSE_TEXT(&testText);
2151
2152        RegexMatcher m(&testPattern, &testText, 0, status);
2153        REGEX_CHECK_STATUS;
2154        REGEX_ASSERT(m.regionStart() == 0);
2155        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2156        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2157        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2158
2159        m.region(2,4, status);
2160        REGEX_CHECK_STATUS;
2161        REGEX_ASSERT(m.matches(status));
2162        REGEX_ASSERT(m.start(status)==2);
2163        REGEX_ASSERT(m.end(status)==4);
2164        REGEX_CHECK_STATUS;
2165
2166        m.reset();
2167        REGEX_ASSERT(m.regionStart() == 0);
2168        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2169
2170        regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2171        REGEX_VERBOSE_TEXT(&testText);
2172        m.reset(&testText);
2173        REGEX_ASSERT(m.regionStart() == 0);
2174        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2175
2176        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2177        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2178        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2179        REGEX_ASSERT(&m == &m.reset());
2180        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2181
2182        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2183        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2184        REGEX_ASSERT(&m == &m.reset());
2185        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2186
2187        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2188        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2189        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2190        REGEX_ASSERT(&m == &m.reset());
2191        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2192
2193        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2194        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2195        REGEX_ASSERT(&m == &m.reset());
2196        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2197
2198        utext_close(&testText);
2199        utext_close(&testPattern);
2200    }
2201
2202    //
2203    // hitEnd() and requireEnd()
2204    //
2205    {
2206        UErrorCode status = U_ZERO_ERROR;
2207        UText testPattern = UTEXT_INITIALIZER;
2208        UText testText    = UTEXT_INITIALIZER;
2209        const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2210        const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2211        utext_openUTF8(&testPattern, str_, -1, &status);
2212        utext_openUTF8(&testText, str_aabb, -1, &status);
2213
2214        RegexMatcher m1(&testPattern, &testText,  0, status);
2215        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2216        REGEX_ASSERT(m1.hitEnd() == TRUE);
2217        REGEX_ASSERT(m1.requireEnd() == FALSE);
2218        REGEX_CHECK_STATUS;
2219
2220        status = U_ZERO_ERROR;
2221        const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2222        utext_openUTF8(&testPattern, str_a, -1, &status);
2223        RegexMatcher m2(&testPattern, &testText, 0, status);
2224        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2225        REGEX_ASSERT(m2.hitEnd() == FALSE);
2226        REGEX_ASSERT(m2.requireEnd() == FALSE);
2227        REGEX_CHECK_STATUS;
2228
2229        status = U_ZERO_ERROR;
2230        const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2231        utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2232        RegexMatcher m3(&testPattern, &testText, 0, status);
2233        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2234        REGEX_ASSERT(m3.hitEnd() == TRUE);
2235        REGEX_ASSERT(m3.requireEnd() == TRUE);
2236        REGEX_CHECK_STATUS;
2237
2238        utext_close(&testText);
2239        utext_close(&testPattern);
2240    }
2241}
2242
2243
2244//---------------------------------------------------------------------------
2245//
2246//      API_Replace_UTF8   API test for class RegexMatcher, testing the
2247//                         Replace family of functions.
2248//
2249//---------------------------------------------------------------------------
2250void RegexTest::API_Replace_UTF8() {
2251    //
2252    //  Replace
2253    //
2254    int32_t             flags=0;
2255    UParseError         pe;
2256    UErrorCode          status=U_ZERO_ERROR;
2257
2258    UText               re=UTEXT_INITIALIZER;
2259    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2260    REGEX_VERBOSE_TEXT(&re);
2261    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2262    REGEX_CHECK_STATUS;
2263
2264    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2265    //             012345678901234567
2266    UText dataText = UTEXT_INITIALIZER;
2267    utext_openUTF8(&dataText, data, -1, &status);
2268    REGEX_CHECK_STATUS;
2269    REGEX_VERBOSE_TEXT(&dataText);
2270    RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2271
2272    //
2273    //  Plain vanilla matches.
2274    //
2275    UnicodeString  dest;
2276    UText destText = UTEXT_INITIALIZER;
2277    utext_openUnicodeString(&destText, &dest, &status);
2278    UText *result;
2279
2280    UText replText = UTEXT_INITIALIZER;
2281
2282    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2283    utext_openUTF8(&replText, str_yz, -1, &status);
2284    REGEX_VERBOSE_TEXT(&replText);
2285    result = matcher->replaceFirst(&replText, NULL, status);
2286    REGEX_CHECK_STATUS;
2287    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2288    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2289    utext_close(result);
2290    result = matcher->replaceFirst(&replText, &destText, status);
2291    REGEX_CHECK_STATUS;
2292    REGEX_ASSERT(result == &destText);
2293    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2294
2295    result = matcher->replaceAll(&replText, NULL, status);
2296    REGEX_CHECK_STATUS;
2297    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2298    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2299    utext_close(result);
2300
2301    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2302    result = matcher->replaceAll(&replText, &destText, status);
2303    REGEX_CHECK_STATUS;
2304    REGEX_ASSERT(result == &destText);
2305    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2306
2307    //
2308    //  Plain vanilla non-matches.
2309    //
2310    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2311    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2312    matcher->reset(&dataText);
2313
2314    result = matcher->replaceFirst(&replText, NULL, status);
2315    REGEX_CHECK_STATUS;
2316    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2317    utext_close(result);
2318    result = matcher->replaceFirst(&replText, &destText, status);
2319    REGEX_CHECK_STATUS;
2320    REGEX_ASSERT(result == &destText);
2321    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2322
2323    result = matcher->replaceAll(&replText, NULL, status);
2324    REGEX_CHECK_STATUS;
2325    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2326    utext_close(result);
2327    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2328    result = matcher->replaceAll(&replText, &destText, status);
2329    REGEX_CHECK_STATUS;
2330    REGEX_ASSERT(result == &destText);
2331    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2332
2333    //
2334    // Empty source string
2335    //
2336    utext_openUTF8(&dataText, NULL, 0, &status);
2337    matcher->reset(&dataText);
2338
2339    result = matcher->replaceFirst(&replText, NULL, status);
2340    REGEX_CHECK_STATUS;
2341    REGEX_ASSERT_UTEXT_UTF8("", result);
2342    utext_close(result);
2343    result = matcher->replaceFirst(&replText, &destText, status);
2344    REGEX_CHECK_STATUS;
2345    REGEX_ASSERT(result == &destText);
2346    REGEX_ASSERT_UTEXT_UTF8("", result);
2347
2348    result = matcher->replaceAll(&replText, NULL, status);
2349    REGEX_CHECK_STATUS;
2350    REGEX_ASSERT_UTEXT_UTF8("", result);
2351    utext_close(result);
2352    result = matcher->replaceAll(&replText, &destText, status);
2353    REGEX_CHECK_STATUS;
2354    REGEX_ASSERT(result == &destText);
2355    REGEX_ASSERT_UTEXT_UTF8("", result);
2356
2357    //
2358    // Empty substitution string
2359    //
2360    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2361    matcher->reset(&dataText);
2362
2363    utext_openUTF8(&replText, NULL, 0, &status);
2364    result = matcher->replaceFirst(&replText, NULL, status);
2365    REGEX_CHECK_STATUS;
2366    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2367    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2368    utext_close(result);
2369    result = matcher->replaceFirst(&replText, &destText, status);
2370    REGEX_CHECK_STATUS;
2371    REGEX_ASSERT(result == &destText);
2372    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2373
2374    result = matcher->replaceAll(&replText, NULL, status);
2375    REGEX_CHECK_STATUS;
2376    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2377    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2378    utext_close(result);
2379    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2380    result = matcher->replaceAll(&replText, &destText, status);
2381    REGEX_CHECK_STATUS;
2382    REGEX_ASSERT(result == &destText);
2383    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2384
2385    //
2386    // match whole string
2387    //
2388    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2389    utext_openUTF8(&dataText, str_abc, -1, &status);
2390    matcher->reset(&dataText);
2391
2392    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2393    utext_openUTF8(&replText, str_xyz, -1, &status);
2394    result = matcher->replaceFirst(&replText, NULL, status);
2395    REGEX_CHECK_STATUS;
2396    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2397    utext_close(result);
2398    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2399    result = matcher->replaceFirst(&replText, &destText, status);
2400    REGEX_CHECK_STATUS;
2401    REGEX_ASSERT(result == &destText);
2402    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2403
2404    result = matcher->replaceAll(&replText, NULL, status);
2405    REGEX_CHECK_STATUS;
2406    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2407    utext_close(result);
2408    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2409    result = matcher->replaceAll(&replText, &destText, status);
2410    REGEX_CHECK_STATUS;
2411    REGEX_ASSERT(result == &destText);
2412    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2413
2414    //
2415    // Capture Group, simple case
2416    //
2417    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2418    utext_openUTF8(&re, str_add, -1, &status);
2419    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2420    REGEX_CHECK_STATUS;
2421
2422    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2423    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2424    RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2425    REGEX_CHECK_STATUS;
2426
2427    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2428    utext_openUTF8(&replText, str_11, -1, &status);
2429    result = matcher2->replaceFirst(&replText, NULL, status);
2430    REGEX_CHECK_STATUS;
2431    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2432    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2433    utext_close(result);
2434    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2435    result = matcher2->replaceFirst(&replText, &destText, status);
2436    REGEX_CHECK_STATUS;
2437    REGEX_ASSERT(result == &destText);
2438    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2439
2440    regextst_openUTF8FromInvariant(&replText, "The value of \\$1 is $1.", -1, &status);
2441    result = matcher2->replaceFirst(&replText, NULL, status);
2442    REGEX_CHECK_STATUS;
2443    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2444    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2445    utext_close(result);
2446    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2447    result = matcher2->replaceFirst(&replText, &destText, status);
2448    REGEX_CHECK_STATUS;
2449    REGEX_ASSERT(result == &destText);
2450    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2451
2452    const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2453    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2454    result = matcher2->replaceFirst(&replText, NULL, status);
2455    REGEX_CHECK_STATUS;
2456    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2457    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2458    utext_close(result);
2459    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2460    result = matcher2->replaceFirst(&replText, &destText, status);
2461    REGEX_CHECK_STATUS;
2462    REGEX_ASSERT(result == &destText);
2463    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2464
2465    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2466    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2467    //                                 012345678901234567890123456
2468    supplDigitChars[22] = 0xF0;
2469    supplDigitChars[23] = 0x9D;
2470    supplDigitChars[24] = 0x9F;
2471    supplDigitChars[25] = 0x8F;
2472    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2473
2474    result = matcher2->replaceFirst(&replText, NULL, status);
2475    REGEX_CHECK_STATUS;
2476    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2477    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2478    utext_close(result);
2479    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2480    result = matcher2->replaceFirst(&replText, &destText, status);
2481    REGEX_CHECK_STATUS;
2482    REGEX_ASSERT(result == &destText);
2483    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2484    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2485    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2486    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2487//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2488    utext_close(result);
2489    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2490    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2491    REGEX_ASSERT(result == &destText);
2492//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2493
2494    //
2495    // Replacement String with \u hex escapes
2496    //
2497    {
2498      const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2499      const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2500        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2501        utext_openUTF8(&replText, str_u0043, -1, &status);
2502        matcher->reset(&dataText);
2503
2504        result = matcher->replaceAll(&replText, NULL, status);
2505        REGEX_CHECK_STATUS;
2506        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2507        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2508        utext_close(result);
2509        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2510        result = matcher->replaceAll(&replText, &destText, status);
2511        REGEX_CHECK_STATUS;
2512        REGEX_ASSERT(result == &destText);
2513        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2514    }
2515    {
2516      const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2517        utext_openUTF8(&dataText, str_abc, -1, &status);
2518        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2519        utext_openUTF8(&replText, str_U00010000, -1, &status);
2520        matcher->reset(&dataText);
2521
2522        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2523        //                          0123456789
2524        expected[2] = 0xF0;
2525        expected[3] = 0x90;
2526        expected[4] = 0x80;
2527        expected[5] = 0x80;
2528
2529        result = matcher->replaceAll(&replText, NULL, status);
2530        REGEX_CHECK_STATUS;
2531        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2532        utext_close(result);
2533        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2534        result = matcher->replaceAll(&replText, &destText, status);
2535        REGEX_CHECK_STATUS;
2536        REGEX_ASSERT(result == &destText);
2537        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2538    }
    // TODO:  need more thorough testing of capture substitutions.
2540
2541    // Bug 4057
2542    //
2543    {
2544        status = U_ZERO_ERROR;
2545const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2546const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2547const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2548        utext_openUTF8(&re, str_ssee, -1, &status);
2549        utext_openUTF8(&dataText, str_blah, -1, &status);
2550        utext_openUTF8(&replText, str_ooh, -1, &status);
2551
2552        RegexMatcher m(&re, 0, status);
2553        REGEX_CHECK_STATUS;
2554
2555        UnicodeString result;
2556        UText resultText = UTEXT_INITIALIZER;
2557        utext_openUnicodeString(&resultText, &result, &status);
2558
        // Multiple finds do NOT bump up the previous appendReplacement position.
2560        m.reset(&dataText);
2561        m.find();
2562        m.find();
2563        m.appendReplacement(&resultText, &replText, status);
2564        REGEX_CHECK_STATUS;
2565        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2566        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2567
2568        // After a reset into the interior of a string, appendReplacement still starts at beginning.
2569        status = U_ZERO_ERROR;
2570        result.truncate(0);
2571        utext_openUnicodeString(&resultText, &result, &status);
2572        m.reset(10, status);
2573        m.find();
2574        m.find();
2575        m.appendReplacement(&resultText, &replText, status);
2576        REGEX_CHECK_STATUS;
2577        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2578        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2579
2580        // find() at interior of string, appendReplacement still starts at beginning.
2581        status = U_ZERO_ERROR;
2582        result.truncate(0);
2583        utext_openUnicodeString(&resultText, &result, &status);
2584        m.reset();
2585        m.find(10, status);
2586        m.find();
2587        m.appendReplacement(&resultText, &replText, status);
2588        REGEX_CHECK_STATUS;
2589        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2590        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2591
2592        m.appendTail(&resultText, status);
2593        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2594        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2595
2596        utext_close(&resultText);
2597    }
2598
2599    delete matcher2;
2600    delete pat2;
2601    delete matcher;
2602    delete pat;
2603
2604    utext_close(&dataText);
2605    utext_close(&replText);
2606    utext_close(&destText);
2607    utext_close(&re);
2608}
2609
2610
2611//---------------------------------------------------------------------------
2612//
2613//      API_Pattern_UTF8  Test that the API for class RegexPattern is
2614//                        present and nominally working.
2615//
2616//---------------------------------------------------------------------------
2617void RegexTest::API_Pattern_UTF8() {
2618    RegexPattern        pata;    // Test default constructor to not crash.
2619    RegexPattern        patb;
2620
2621    REGEX_ASSERT(pata == patb);
2622    REGEX_ASSERT(pata == pata);
2623
2624    UText         re1 = UTEXT_INITIALIZER;
2625    UText         re2 = UTEXT_INITIALIZER;
2626    UErrorCode    status = U_ZERO_ERROR;
2627    UParseError   pe;
2628
2629    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2630    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2631    utext_openUTF8(&re1, str_abcalmz, -1, &status);
2632    utext_openUTF8(&re2, str_def, -1, &status);
2633
2634    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2635    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2636    REGEX_CHECK_STATUS;
2637    REGEX_ASSERT(*pat1 == *pat1);
2638    REGEX_ASSERT(*pat1 != pata);
2639
2640    // Assign
2641    patb = *pat1;
2642    REGEX_ASSERT(patb == *pat1);
2643
2644    // Copy Construct
2645    RegexPattern patc(*pat1);
2646    REGEX_ASSERT(patc == *pat1);
2647    REGEX_ASSERT(patb == patc);
2648    REGEX_ASSERT(pat1 != pat2);
2649    patb = *pat2;
2650    REGEX_ASSERT(patb != patc);
2651    REGEX_ASSERT(patb == *pat2);
2652
2653    // Compile with no flags.
2654    RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2655    REGEX_ASSERT(*pat1a == *pat1);
2656
2657    REGEX_ASSERT(pat1a->flags() == 0);
2658
2659    // Compile with different flags should be not equal
2660    RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2661    REGEX_CHECK_STATUS;
2662
2663    REGEX_ASSERT(*pat1b != *pat1a);
2664    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2665    REGEX_ASSERT(pat1a->flags() == 0);
2666    delete pat1b;
2667
2668    // clone
2669    RegexPattern *pat1c = pat1->clone();
2670    REGEX_ASSERT(*pat1c == *pat1);
2671    REGEX_ASSERT(*pat1c != *pat2);
2672
2673    delete pat1c;
2674    delete pat1a;
2675    delete pat1;
2676    delete pat2;
2677
2678    utext_close(&re1);
2679    utext_close(&re2);
2680
2681
2682    //
2683    //   Verify that a matcher created from a cloned pattern works.
2684    //     (Jitterbug 3423)
2685    //
2686    {
2687        UErrorCode     status     = U_ZERO_ERROR;
2688        UText          pattern    = UTEXT_INITIALIZER;
2689        const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2690        utext_openUTF8(&pattern, str_pL, -1, &status);
2691
2692        RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2693        RegexPattern  *pClone     = pSource->clone();
2694        delete         pSource;
2695        RegexMatcher  *mFromClone = pClone->matcher(status);
2696        REGEX_CHECK_STATUS;
2697
2698        UText          input      = UTEXT_INITIALIZER;
2699        const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2700        utext_openUTF8(&input, str_HelloWorld, -1, &status);
2701        mFromClone->reset(&input);
2702        REGEX_ASSERT(mFromClone->find() == TRUE);
2703        REGEX_ASSERT(mFromClone->group(status) == "Hello");
2704        REGEX_ASSERT(mFromClone->find() == TRUE);
2705        REGEX_ASSERT(mFromClone->group(status) == "World");
2706        REGEX_ASSERT(mFromClone->find() == FALSE);
2707        delete mFromClone;
2708        delete pClone;
2709
2710        utext_close(&input);
2711        utext_close(&pattern);
2712    }
2713
2714    //
2715    //   matches convenience API
2716    //
2717    {
2718        UErrorCode status  = U_ZERO_ERROR;
2719        UText      pattern = UTEXT_INITIALIZER;
2720        UText      input   = UTEXT_INITIALIZER;
2721
2722        const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2723        utext_openUTF8(&input, str_randominput, -1, &status);
2724
2725        const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2726        utext_openUTF8(&pattern, str_dotstar, -1, &status);
2727        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2728        REGEX_CHECK_STATUS;
2729
2730        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2731        utext_openUTF8(&pattern, str_abc, -1, &status);
2732        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2733        REGEX_CHECK_STATUS;
2734
2735        const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2736        utext_openUTF8(&pattern, str_nput, -1, &status);
2737        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2738        REGEX_CHECK_STATUS;
2739
2740        utext_openUTF8(&pattern, str_randominput, -1, &status);
2741        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2742        REGEX_CHECK_STATUS;
2743
2744        const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2745        utext_openUTF8(&pattern, str_u, -1, &status);
2746        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2747        REGEX_CHECK_STATUS;
2748
2749        utext_openUTF8(&input, str_abc, -1, &status);
2750        utext_openUTF8(&pattern, str_abc, -1, &status);
2751        status = U_INDEX_OUTOFBOUNDS_ERROR;
2752        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2753        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2754
2755        utext_close(&input);
2756        utext_close(&pattern);
2757    }
2758
2759
2760    //
2761    // Split()
2762    //
2763    status = U_ZERO_ERROR;
2764    const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2765    utext_openUTF8(&re1, str_spaceplus, -1, &status);
2766    pat1 = RegexPattern::compile(&re1, pe, status);
2767    REGEX_CHECK_STATUS;
2768    UnicodeString  fields[10];
2769
2770    int32_t n;
2771    n = pat1->split("Now is the time", fields, 10, status);
2772    REGEX_CHECK_STATUS;
2773    REGEX_ASSERT(n==4);
2774    REGEX_ASSERT(fields[0]=="Now");
2775    REGEX_ASSERT(fields[1]=="is");
2776    REGEX_ASSERT(fields[2]=="the");
2777    REGEX_ASSERT(fields[3]=="time");
2778    REGEX_ASSERT(fields[4]=="");
2779
2780    n = pat1->split("Now is the time", fields, 2, status);
2781    REGEX_CHECK_STATUS;
2782    REGEX_ASSERT(n==2);
2783    REGEX_ASSERT(fields[0]=="Now");
2784    REGEX_ASSERT(fields[1]=="is the time");
2785    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2786
2787    fields[1] = "*";
2788    status = U_ZERO_ERROR;
2789    n = pat1->split("Now is the time", fields, 1, status);
2790    REGEX_CHECK_STATUS;
2791    REGEX_ASSERT(n==1);
2792    REGEX_ASSERT(fields[0]=="Now is the time");
2793    REGEX_ASSERT(fields[1]=="*");
2794    status = U_ZERO_ERROR;
2795
2796    n = pat1->split("    Now       is the time   ", fields, 10, status);
2797    REGEX_CHECK_STATUS;
2798    REGEX_ASSERT(n==5);
2799    REGEX_ASSERT(fields[0]=="");
2800    REGEX_ASSERT(fields[1]=="Now");
2801    REGEX_ASSERT(fields[2]=="is");
2802    REGEX_ASSERT(fields[3]=="the");
2803    REGEX_ASSERT(fields[4]=="time");
2804    REGEX_ASSERT(fields[5]=="");
2805
2806    n = pat1->split("     ", fields, 10, status);
2807    REGEX_CHECK_STATUS;
2808    REGEX_ASSERT(n==1);
2809    REGEX_ASSERT(fields[0]=="");
2810
2811    fields[0] = "foo";
2812    n = pat1->split("", fields, 10, status);
2813    REGEX_CHECK_STATUS;
2814    REGEX_ASSERT(n==0);
2815    REGEX_ASSERT(fields[0]=="foo");
2816
2817    delete pat1;
2818
2819    //  split, with a pattern with (capture)
2820    regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2821    pat1 = RegexPattern::compile(&re1,  pe, status);
2822    REGEX_CHECK_STATUS;
2823
2824    status = U_ZERO_ERROR;
2825    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2826    REGEX_CHECK_STATUS;
2827    REGEX_ASSERT(n==6);
2828    REGEX_ASSERT(fields[0]=="");
2829    REGEX_ASSERT(fields[1]=="a");
2830    REGEX_ASSERT(fields[2]=="Now is ");
2831    REGEX_ASSERT(fields[3]=="b");
2832    REGEX_ASSERT(fields[4]=="the time");
2833    REGEX_ASSERT(fields[5]=="c");
2834    REGEX_ASSERT(fields[6]=="");
2835    REGEX_ASSERT(status==U_ZERO_ERROR);
2836
2837    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2838    REGEX_CHECK_STATUS;
2839    REGEX_ASSERT(n==6);
2840    REGEX_ASSERT(fields[0]=="  ");
2841    REGEX_ASSERT(fields[1]=="a");
2842    REGEX_ASSERT(fields[2]=="Now is ");
2843    REGEX_ASSERT(fields[3]=="b");
2844    REGEX_ASSERT(fields[4]=="the time");
2845    REGEX_ASSERT(fields[5]=="c");
2846    REGEX_ASSERT(fields[6]=="");
2847
2848    status = U_ZERO_ERROR;
2849    fields[6] = "foo";
2850    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
2851    REGEX_CHECK_STATUS;
2852    REGEX_ASSERT(n==6);
2853    REGEX_ASSERT(fields[0]=="  ");
2854    REGEX_ASSERT(fields[1]=="a");
2855    REGEX_ASSERT(fields[2]=="Now is ");
2856    REGEX_ASSERT(fields[3]=="b");
2857    REGEX_ASSERT(fields[4]=="the time");
2858    REGEX_ASSERT(fields[5]=="c");
2859    REGEX_ASSERT(fields[6]=="foo");
2860
2861    status = U_ZERO_ERROR;
2862    fields[5] = "foo";
2863    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
2864    REGEX_CHECK_STATUS;
2865    REGEX_ASSERT(n==5);
2866    REGEX_ASSERT(fields[0]=="  ");
2867    REGEX_ASSERT(fields[1]=="a");
2868    REGEX_ASSERT(fields[2]=="Now is ");
2869    REGEX_ASSERT(fields[3]=="b");
2870    REGEX_ASSERT(fields[4]=="the time<c>");
2871    REGEX_ASSERT(fields[5]=="foo");
2872
2873    status = U_ZERO_ERROR;
2874    fields[5] = "foo";
2875    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
2876    REGEX_CHECK_STATUS;
2877    REGEX_ASSERT(n==5);
2878    REGEX_ASSERT(fields[0]=="  ");
2879    REGEX_ASSERT(fields[1]=="a");
2880    REGEX_ASSERT(fields[2]=="Now is ");
2881    REGEX_ASSERT(fields[3]=="b");
2882    REGEX_ASSERT(fields[4]=="the time");
2883    REGEX_ASSERT(fields[5]=="foo");
2884
2885    status = U_ZERO_ERROR;
2886    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
2887    REGEX_CHECK_STATUS;
2888    REGEX_ASSERT(n==4);
2889    REGEX_ASSERT(fields[0]=="  ");
2890    REGEX_ASSERT(fields[1]=="a");
2891    REGEX_ASSERT(fields[2]=="Now is ");
2892    REGEX_ASSERT(fields[3]=="the time<c>");
2893    status = U_ZERO_ERROR;
2894    delete pat1;
2895
2896    regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
2897    pat1 = RegexPattern::compile(&re1, pe, status);
2898    REGEX_CHECK_STATUS;
2899    n = pat1->split("1-10,20", fields, 10, status);
2900    REGEX_CHECK_STATUS;
2901    REGEX_ASSERT(n==5);
2902    REGEX_ASSERT(fields[0]=="1");
2903    REGEX_ASSERT(fields[1]=="-");
2904    REGEX_ASSERT(fields[2]=="10");
2905    REGEX_ASSERT(fields[3]==",");
2906    REGEX_ASSERT(fields[4]=="20");
2907    delete pat1;
2908
2909
2910    //
2911    // RegexPattern::pattern() and patternText()
2912    //
2913    pat1 = new RegexPattern();
2914    REGEX_ASSERT(pat1->pattern() == "");
2915    REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
2916    delete pat1;
2917
2918    regextst_openUTF8FromInvariant(&re1, "(Hello, world)*", -1, &status);
2919    pat1 = RegexPattern::compile(&re1, pe, status);
2920    REGEX_CHECK_STATUS;
2921    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
2922    REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
2923    delete pat1;
2924
2925    utext_close(&re1);
2926}
2927
2928
2929//---------------------------------------------------------------------------
2930//
2931//      Extended       A more thorough check for features of regex patterns
2932//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
2934//                     A description of the test data format is included in that file.
2935//
2936//---------------------------------------------------------------------------
2937
2938const char *
2939RegexTest::getPath(char buffer[2048], const char *filename) {
2940    UErrorCode status=U_ZERO_ERROR;
2941    const char *testDataDirectory = IntlTest::getSourceTestData(status);
2942    if (U_FAILURE(status)) {
2943        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
2944        return NULL;
2945    }
2946
2947    strcpy(buffer, testDataDirectory);
2948    strcat(buffer, filename);
2949    return buffer;
2950}
2951
2952void RegexTest::Extended() {
2953    char tdd[2048];
2954    const char *srcPath;
2955    UErrorCode  status  = U_ZERO_ERROR;
2956    int32_t     lineNum = 0;
2957
2958    //
2959    //  Open and read the test data file.
2960    //
2961    srcPath=getPath(tdd, "regextst.txt");
2962    if(srcPath==NULL) {
2963        return; /* something went wrong, error already output */
2964    }
2965
2966    int32_t    len;
2967    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
2968    if (U_FAILURE(status)) {
2969        return; /* something went wrong, error already output */
2970    }
2971
2972    //
2973    //  Put the test data into a UnicodeString
2974    //
2975    UnicodeString testString(FALSE, testData, len);
2976
2977    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
2978    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
2979    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
2980
2981    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
2982    UnicodeString   testPattern;   // The pattern for test from the test file.
2983    UnicodeString   testFlags;     // the flags   for a test.
2984    UnicodeString   matchString;   // The marked up string to be used as input
2985
2986    if (U_FAILURE(status)){
2987        dataerrln("Construct RegexMatcher() error.");
2988        delete [] testData;
2989        return;
2990    }
2991
2992    //
2993    //  Loop over the test data file, once per line.
2994    //
2995    while (lineMat.find()) {
2996        lineNum++;
2997        if (U_FAILURE(status)) {
2998          errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
2999        }
3000
3001        status = U_ZERO_ERROR;
3002        UnicodeString testLine = lineMat.group(1, status);
3003        if (testLine.length() == 0) {
3004            continue;
3005        }
3006
3007        //
3008        // Parse the test line.  Skip blank and comment only lines.
3009        // Separate out the three main fields - pattern, flags, target.
3010        //
3011
3012        commentMat.reset(testLine);
3013        if (commentMat.lookingAt(status)) {
3014            // This line is a comment, or blank.
3015            continue;
3016        }
3017
3018        //
3019        //  Pull out the pattern field, remove it from the test file line.
3020        //
3021        quotedStuffMat.reset(testLine);
3022        if (quotedStuffMat.lookingAt(status)) {
3023            testPattern = quotedStuffMat.group(2, status);
3024            testLine.remove(0, quotedStuffMat.end(0, status));
3025        } else {
3026            errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3027            continue;
3028        }
3029
3030
3031        //
3032        //  Pull out the flags from the test file line.
3033        //
3034        flagsMat.reset(testLine);
3035        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3036        testFlags = flagsMat.group(1, status);
3037        if (flagsMat.group(2, status).length() > 0) {
3038            errln("Bad Match flag at line %d. Scanning %c\n",
3039                lineNum, flagsMat.group(2, status).charAt(0));
3040            continue;
3041        }
3042        testLine.remove(0, flagsMat.end(0, status));
3043
3044        //
3045        //  Pull out the match string, as a whole.
3046        //    We'll process the <tags> later.
3047        //
3048        quotedStuffMat.reset(testLine);
3049        if (quotedStuffMat.lookingAt(status)) {
3050            matchString = quotedStuffMat.group(2, status);
3051            testLine.remove(0, quotedStuffMat.end(0, status));
3052        } else {
3053            errln("Bad match string at test file line %d", lineNum);
3054            continue;
3055        }
3056
3057        //
3058        //  The only thing left from the input line should be an optional trailing comment.
3059        //
3060        commentMat.reset(testLine);
3061        if (commentMat.lookingAt(status) == FALSE) {
3062            errln("Line %d: unexpected characters at end of test line.", lineNum);
3063            continue;
3064        }
3065
3066        //
3067        //  Run the test
3068        //
3069        regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3070    }
3071
3072    delete [] testData;
3073
3074}
3075
3076
3077
3078//---------------------------------------------------------------------------
3079//
3080//    regex_find(pattern, flags, inputString, lineNumber)
3081//
3082//         Function to run a single test from the Extended (data driven) tests.
3083//         See file test/testdata/regextst.txt for a description of the
3084//         pattern and inputString fields, and the allowed flags.
3085//         lineNumber is the source line in regextst.txt of the test.
3086//
3087//---------------------------------------------------------------------------
3088
3089
3090//  Set a value into a UVector at position specified by a decimal number in
3091//   a UnicodeString.   This is a utility function needed by the actual test function,
3092//   which follows.
3093static void set(UVector &vec, int32_t val, UnicodeString index) {
3094    UErrorCode  status=U_ZERO_ERROR;
3095    int32_t  idx = 0;
3096    for (int32_t i=0; i<index.length(); i++) {
3097        int32_t d=u_charDigitValue(index.charAt(i));
3098        if (d<0) {return;}
3099        idx = idx*10 + d;
3100    }
3101    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3102    vec.setElementAt(val, idx);
3103}
3104
3105static void setInt(UVector &vec, int32_t val, int32_t idx) {
3106    UErrorCode  status=U_ZERO_ERROR;
3107    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3108    vec.setElementAt(val, idx);
3109}
3110
3111static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3112{
3113    UBool couldFind = TRUE;
3114    UTEXT_SETNATIVEINDEX(utext, 0);
3115    int32_t i = 0;
3116    while (i < unistrOffset) {
3117        UChar32 c = UTEXT_NEXT32(utext);
3118        if (c != U_SENTINEL) {
3119            i += U16_LENGTH(c);
3120        } else {
3121            couldFind = FALSE;
3122            break;
3123        }
3124    }
3125    nativeIndex = UTEXT_GETNATIVEINDEX(utext);
3126    return couldFind;
3127}
3128
3129
3130void RegexTest::regex_find(const UnicodeString &pattern,
3131                           const UnicodeString &flags,
3132                           const UnicodeString &inputString,
3133                           const char *srcPath,
3134                           int32_t line) {
3135    UnicodeString       unEscapedInput;
3136    UnicodeString       deTaggedInput;
3137
3138    int32_t             patternUTF8Length,      inputUTF8Length;
3139    char                *patternChars  = NULL, *inputChars = NULL;
3140    UText               patternText    = UTEXT_INITIALIZER;
3141    UText               inputText      = UTEXT_INITIALIZER;
3142    UConverter          *UTF8Converter = NULL;
3143
3144    UErrorCode          status         = U_ZERO_ERROR;
3145    UParseError         pe;
3146    RegexPattern        *parsePat      = NULL;
3147    RegexMatcher        *parseMatcher  = NULL;
3148    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3149    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3150    UVector             groupStarts(status);
3151    UVector             groupEnds(status);
3152    UVector             groupStartsUTF8(status);
3153    UVector             groupEndsUTF8(status);
3154    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3155    UBool               failed         = FALSE;
3156    int32_t             numFinds;
3157    int32_t             i;
3158    UBool               useMatchesFunc   = FALSE;
3159    UBool               useLookingAtFunc = FALSE;
3160    int32_t             regionStart      = -1;
3161    int32_t             regionEnd        = -1;
3162    int32_t             regionStartUTF8  = -1;
3163    int32_t             regionEndUTF8    = -1;
3164
3165
3166    //
3167    //  Compile the caller's pattern
3168    //
3169    uint32_t bflags = 0;
3170    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3171        bflags |= UREGEX_CASE_INSENSITIVE;
3172    }
3173    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3174        bflags |= UREGEX_COMMENTS;
3175    }
3176    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3177        bflags |= UREGEX_DOTALL;
3178    }
3179    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3180        bflags |= UREGEX_MULTILINE;
3181    }
3182
3183    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3184        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3185    }
3186    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3187        bflags |= UREGEX_UNIX_LINES;
3188    }
3189
3190
3191    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3192    if (status != U_ZERO_ERROR) {
3193        #if UCONFIG_NO_BREAK_ITERATION==1
3194        // 'v' test flag means that the test pattern should not compile if ICU was configured
3195        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3196        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3197            goto cleanupAndReturn;
3198        }
3199        #endif
3200        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3201            // Expected pattern compilation error.
3202            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3203                logln("Pattern Compile returns \"%s\"", u_errorName(status));
3204            }
3205            goto cleanupAndReturn;
3206        } else {
3207            // Unexpected pattern compilation error.
3208            errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3209            goto cleanupAndReturn;
3210        }
3211    }
3212
3213    UTF8Converter = ucnv_open("UTF8", &status);
3214    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3215
3216    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3217    status = U_ZERO_ERROR; // buffer overflow
3218    patternChars = new char[patternUTF8Length+1];
3219    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3220    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3221
3222    if (status == U_ZERO_ERROR) {
3223        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3224
3225        if (status != U_ZERO_ERROR) {
3226#if UCONFIG_NO_BREAK_ITERATION==1
3227            // 'v' test flag means that the test pattern should not compile if ICU was configured
3228            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3229            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3230                goto cleanupAndReturn;
3231            }
3232#endif
3233            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3234                // Expected pattern compilation error.
3235                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3236                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3237                }
3238                goto cleanupAndReturn;
3239            } else {
3240                // Unexpected pattern compilation error.
3241                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3242                goto cleanupAndReturn;
3243            }
3244        }
3245    }
3246
3247    if (UTF8Pattern == NULL) {
3248        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3249        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3250        status = U_ZERO_ERROR;
3251    }
3252
3253    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3254        RegexPatternDump(callerPattern);
3255    }
3256
3257    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3258        errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3259        goto cleanupAndReturn;
3260    }
3261
3262
3263    //
3264    // Number of times find() should be called on the test string, default to 1
3265    //
3266    numFinds = 1;
3267    for (i=2; i<=9; i++) {
3268        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3269            if (numFinds != 1) {
3270                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3271                goto cleanupAndReturn;
3272            }
3273            numFinds = i;
3274        }
3275    }
3276
3277    // 'M' flag.  Use matches() instead of find()
3278    if (flags.indexOf((UChar)0x4d) >= 0) {
3279        useMatchesFunc = TRUE;
3280    }
3281    if (flags.indexOf((UChar)0x4c) >= 0) {
3282        useLookingAtFunc = TRUE;
3283    }
3284
3285    //
3286    //  Find the tags in the input data, remove them, and record the group boundary
3287    //    positions.
3288    //
3289    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3290    REGEX_CHECK_STATUS_L(line);
3291
3292    unEscapedInput = inputString.unescape();
3293    parseMatcher = parsePat->matcher(unEscapedInput, status);
3294    REGEX_CHECK_STATUS_L(line);
3295    while(parseMatcher->find()) {
3296        parseMatcher->appendReplacement(deTaggedInput, "", status);
3297        REGEX_CHECK_STATUS;
3298        UnicodeString groupNum = parseMatcher->group(2, status);
3299        if (groupNum == "r") {
3300            // <r> or </r>, a region specification within the string
3301            if (parseMatcher->group(1, status) == "/") {
3302                regionEnd = deTaggedInput.length();
3303            } else {
3304                regionStart = deTaggedInput.length();
3305            }
3306        } else {
3307            // <digits> or </digits>, a group match boundary tag.
3308            if (parseMatcher->group(1, status) == "/") {
3309                set(groupEnds, deTaggedInput.length(), groupNum);
3310            } else {
3311                set(groupStarts, deTaggedInput.length(), groupNum);
3312            }
3313        }
3314    }
3315    parseMatcher->appendTail(deTaggedInput);
3316    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3317    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3318      errln("mismatched <r> tags");
3319      failed = TRUE;
3320      goto cleanupAndReturn;
3321    }
3322
3323    //
3324    //  Configure the matcher according to the flags specified with this test.
3325    //
3326    matcher = callerPattern->matcher(deTaggedInput, status);
3327    REGEX_CHECK_STATUS_L(line);
3328    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3329        matcher->setTrace(TRUE);
3330    }
3331
3332    if (UTF8Pattern != NULL) {
3333        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3334        status = U_ZERO_ERROR; // buffer overflow
3335        inputChars = new char[inputUTF8Length+1];
3336        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3337        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3338
3339        if (status == U_ZERO_ERROR) {
3340            UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
3341            REGEX_CHECK_STATUS_L(line);
3342        }
3343
3344        if (UTF8Matcher == NULL) {
3345            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3346          logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3347            status = U_ZERO_ERROR;
3348        }
3349    }
3350
3351    //
3352    //  Generate native indices for UTF8 versions of region and capture group info
3353    //
3354    if (UTF8Matcher != NULL) {
3355        if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3356        if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3357
3358        //  Fill out the native index UVector info.
3359        //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3360        for (i=0; i<groupStarts.size(); i++) {
3361            int32_t  start = groupStarts.elementAti(i);
3362            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3363            if (start >= 0) {
3364                int32_t  startUTF8;
3365                if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3366                    errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3367                    failed = TRUE;
3368                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3369                }
3370                setInt(groupStartsUTF8, startUTF8, i);
3371            }
3372
3373            int32_t  end = groupEnds.elementAti(i);
3374            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3375            if (end >= 0) {
3376                int32_t  endUTF8;
3377                if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3378                    errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3379                    failed = TRUE;
3380                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3381                }
3382                setInt(groupEndsUTF8, endUTF8, i);
3383            }
3384        }
3385    }
3386
3387    if (regionStart>=0) {
3388       matcher->region(regionStart, regionEnd, status);
3389       REGEX_CHECK_STATUS_L(line);
3390       if (UTF8Matcher != NULL) {
3391           UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3392           REGEX_CHECK_STATUS_L(line);
3393       }
3394    }
3395    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3396        matcher->useAnchoringBounds(FALSE);
3397        if (UTF8Matcher != NULL) {
3398            UTF8Matcher->useAnchoringBounds(FALSE);
3399        }
3400    }
3401    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3402        matcher->useTransparentBounds(TRUE);
3403        if (UTF8Matcher != NULL) {
3404            UTF8Matcher->useTransparentBounds(TRUE);
3405        }
3406    }
3407
3408
3409
3410    //
3411    // Do a find on the de-tagged input using the caller's pattern
3412    //     TODO: error on count>1 and not find().
3413    //           error on both matches() and lookingAt().
3414    //
3415    for (i=0; i<numFinds; i++) {
3416        if (useMatchesFunc) {
3417            isMatch = matcher->matches(status);
3418            if (UTF8Matcher != NULL) {
3419               isUTF8Match = UTF8Matcher->matches(status);
3420            }
3421        } else  if (useLookingAtFunc) {
3422            isMatch = matcher->lookingAt(status);
3423            if (UTF8Matcher != NULL) {
3424                isUTF8Match = UTF8Matcher->lookingAt(status);
3425            }
3426        } else {
3427            isMatch = matcher->find();
3428            if (UTF8Matcher != NULL) {
3429                isUTF8Match = UTF8Matcher->find();
3430            }
3431        }
3432    }
3433    matcher->setTrace(FALSE);
3434
3435    //
3436    // Match up the groups from the find() with the groups from the tags
3437    //
3438
3439    // number of tags should match number of groups from find operation.
3440    // matcher->groupCount does not include group 0, the entire match, hence the +1.
3441    //   G option in test means that capture group data is not available in the
3442    //     expected results, so the check needs to be suppressed.
3443    if (isMatch == FALSE && groupStarts.size() != 0) {
3444        errln("Error at line %d:  Match expected, but none found.", line);
3445        failed = TRUE;
3446        goto cleanupAndReturn;
3447    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3448        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3449        failed = TRUE;
3450        goto cleanupAndReturn;
3451    }
3452
3453    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3454        // Only check for match / no match.  Don't check capture groups.
3455        if (isMatch && groupStarts.size() == 0) {
3456            errln("Error at line %d:  No match expected, but one found.", line);
3457            failed = TRUE;
3458        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3459            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3460            failed = TRUE;
3461        }
3462        goto cleanupAndReturn;
3463    }
3464
3465    REGEX_CHECK_STATUS_L(line);
3466    for (i=0; i<=matcher->groupCount(); i++) {
3467        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3468        int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3469        if (matcher->start(i, status) != expectedStart) {
3470            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3471                line, i, expectedStart, matcher->start(i, status));
3472            failed = TRUE;
3473            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3474        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3475            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3476                  line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3477            failed = TRUE;
3478            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3479        }
3480
3481        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3482        int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3483        if (matcher->end(i, status) != expectedEnd) {
3484            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3485                line, i, expectedEnd, matcher->end(i, status));
3486            failed = TRUE;
3487            // Error on end position;  keep going; real error is probably yet to come as group
3488            //   end positions work from end of the input data towards the front.
3489        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3490            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3491                  line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3492            failed = TRUE;
3493            // Error on end position;  keep going; real error is probably yet to come as group
3494            //   end positions work from end of the input data towards the front.
3495        }
3496    }
3497    if ( matcher->groupCount()+1 < groupStarts.size()) {
3498        errln("Error at line %d: Expected %d capture groups, found %d.",
3499            line, groupStarts.size()-1, matcher->groupCount());
3500        failed = TRUE;
3501        }
3502    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3503        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3504              line, groupStarts.size()-1, UTF8Matcher->groupCount());
3505        failed = TRUE;
3506    }
3507
3508    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3509        matcher->requireEnd() == TRUE) {
3510        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3511        failed = TRUE;
3512    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3513        UTF8Matcher->requireEnd() == TRUE) {
3514        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3515        failed = TRUE;
3516    }
3517
3518    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3519        matcher->requireEnd() == FALSE) {
3520        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3521        failed = TRUE;
3522    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3523        UTF8Matcher->requireEnd() == FALSE) {
3524        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3525        failed = TRUE;
3526    }
3527
3528    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3529        matcher->hitEnd() == TRUE) {
3530        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3531        failed = TRUE;
3532    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3533               UTF8Matcher->hitEnd() == TRUE) {
3534        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3535        failed = TRUE;
3536    }
3537
3538    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3539        matcher->hitEnd() == FALSE) {
3540        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3541        failed = TRUE;
3542    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3543               UTF8Matcher->hitEnd() == FALSE) {
3544        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3545        failed = TRUE;
3546    }
3547
3548
3549cleanupAndReturn:
3550    if (failed) {
3551        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3552            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3553        // callerPattern->dump();
3554    }
3555    delete parseMatcher;
3556    delete parsePat;
3557    delete UTF8Matcher;
3558    delete UTF8Pattern;
3559    delete matcher;
3560    delete callerPattern;
3561
3562    utext_close(&inputText);
3563    delete[] inputChars;
3564    utext_close(&patternText);
3565    delete[] patternChars;
3566    ucnv_close(UTF8Converter);
3567}
3568
3569
3570
3571
3572//---------------------------------------------------------------------------
3573//
3574//      Errors     Check for error handling in patterns.
3575//
3576//---------------------------------------------------------------------------
3577void RegexTest::Errors() {
3578    // \escape sequences that aren't implemented yet.
3579    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3580
3581    // Missing close parentheses
3582    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3583    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3584    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3585
3586    // Extra close paren
3587    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3588    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3589    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3590
3591    // Look-ahead, Look-behind
3592    //  TODO:  add tests for unbounded length look-behinds.
3593    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3594
3595    // Attempt to use non-default flags
3596    {
3597        UParseError   pe;
3598        UErrorCode    status = U_ZERO_ERROR;
3599        int32_t       flags  = UREGEX_CANON_EQ |
3600                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
3601                               UREGEX_MULTILINE;
3602        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3603        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3604        delete pat1;
3605    }
3606
3607
3608    // Quantifiers are allowed only after something that can be quantified.
3609    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3610    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3611    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3612
3613    // Mal-formed {min,max} quantifiers
3614    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3615    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3616    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3617    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3618    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3619    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3620    REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3621    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3622    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3623
3624    // Ticket 5389
3625    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3626
3627    // Invalid Back Reference \0
3628    //    For ICU 3.8 and earlier
3629    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3630    //
3631    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3632
3633}
3634
3635
3636//-------------------------------------------------------------------------------
3637//
3638//  Read a text data file, convert it to UChars, and return the data
//    in one big UChar * buffer, which the caller must release with delete[].
3640//
3641//--------------------------------------------------------------------------------
3642UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3643                                     const char *defEncoding, UErrorCode &status) {
3644    UChar       *retPtr  = NULL;
3645    char        *fileBuf = NULL;
3646    UConverter* conv     = NULL;
3647    FILE        *f       = NULL;
3648
3649    ulen = 0;
3650    if (U_FAILURE(status)) {
3651        return retPtr;
3652    }
3653
3654    //
3655    //  Open the file.
3656    //
3657    f = fopen(fileName, "rb");
3658    if (f == 0) {
3659        dataerrln("Error opening test data file %s\n", fileName);
3660        status = U_FILE_ACCESS_ERROR;
3661        return NULL;
3662    }
3663    //
3664    //  Read it in
3665    //
3666    int32_t            fileSize;
3667    int32_t            amt_read;
3668
3669    fseek( f, 0, SEEK_END);
3670    fileSize = ftell(f);
3671    fileBuf = new char[fileSize];
3672    fseek(f, 0, SEEK_SET);
3673    amt_read = fread(fileBuf, 1, fileSize, f);
3674    if (amt_read != fileSize || fileSize <= 0) {
3675        errln("Error reading test data file.");
3676        goto cleanUpAndReturn;
3677    }
3678
3679    //
3680    // Look for a Unicode Signature (BOM) on the data just read
3681    //
3682    int32_t        signatureLength;
3683    const char *   fileBufC;
3684    const char*    encoding;
3685
3686    fileBufC = fileBuf;
3687    encoding = ucnv_detectUnicodeSignature(
3688        fileBuf, fileSize, &signatureLength, &status);
3689    if(encoding!=NULL ){
3690        fileBufC  += signatureLength;
3691        fileSize  -= signatureLength;
3692    } else {
3693        encoding = defEncoding;
3694        if (strcmp(encoding, "utf-8") == 0) {
3695            errln("file %s is missing its BOM", fileName);
3696        }
3697    }
3698
3699    //
3700    // Open a converter to take the rule file to UTF-16
3701    //
3702    conv = ucnv_open(encoding, &status);
3703    if (U_FAILURE(status)) {
3704        goto cleanUpAndReturn;
3705    }
3706
3707    //
3708    // Convert the rules to UChar.
3709    //  Preflight first to determine required buffer size.
3710    //
3711    ulen = ucnv_toUChars(conv,
3712        NULL,           //  dest,
3713        0,              //  destCapacity,
3714        fileBufC,
3715        fileSize,
3716        &status);
3717    if (status == U_BUFFER_OVERFLOW_ERROR) {
3718        // Buffer Overflow is expected from the preflight operation.
3719        status = U_ZERO_ERROR;
3720
3721        retPtr = new UChar[ulen+1];
3722        ucnv_toUChars(conv,
3723            retPtr,       //  dest,
3724            ulen+1,
3725            fileBufC,
3726            fileSize,
3727            &status);
3728    }
3729
3730cleanUpAndReturn:
3731    fclose(f);
3732    delete[] fileBuf;
3733    ucnv_close(conv);
3734    if (U_FAILURE(status)) {
3735        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3736        delete retPtr;
3737        retPtr = 0;
3738        ulen   = 0;
3739    };
3740    return retPtr;
3741}
3742
3743
3744//-------------------------------------------------------------------------------
3745//
3746//   PerlTests  - Run Perl's regular expression tests
3747//                The input file for this test is re_tests, the standard regular
3748//                expression test data distributed with the Perl source code.
3749//
3750//                Here is Perl's description of the test data file:
3751//
3752//        # The tests are in a separate file 't/op/re_tests'.
3753//        # Each line in that file is a separate test.
3754//        # There are five columns, separated by tabs.
3755//        #
3756//        # Column 1 contains the pattern, optionally enclosed in C<''>.
3757//        # Modifiers can be put after the closing C<'>.
3758//        #
3759//        # Column 2 contains the string to be matched.
3760//        #
3761//        # Column 3 contains the expected result:
3762//        #     y   expect a match
3763//        #     n   expect no match
3764//        #     c   expect an error
3765//        # B   test exposes a known bug in Perl, should be skipped
3766//        # b   test exposes a known bug in Perl, should be skipped if noamp
3767//        #
3768//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3769//        #
3770//        # Column 4 contains a string, usually C<$&>.
3771//        #
3772//        # Column 5 contains the expected result of double-quote
3773//        # interpolating that string after the match, or start of error message.
3774//        #
3775//        # Column 6, if present, contains a reason why the test is skipped.
3776//        # This is printed with "skipped", for harness to pick up.
3777//        #
3778//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3779//        #
3780//        # If you want to add a regular expression test that can't be expressed
3781//        # in this format, don't add it here: put it in op/pat.t instead.
3782//
3783//        For ICU, if field 3 contains an 'i', the test will be skipped.
//        Such a test exposes some known incompatibility between ICU and Perl regexps.
3785//        (The i is in addition to whatever was there before.)
3786//
3787//-------------------------------------------------------------------------------
3788void RegexTest::PerlTests() {
3789    char tdd[2048];
3790    const char *srcPath;
3791    UErrorCode  status = U_ZERO_ERROR;
3792    UParseError pe;
3793
3794    //
3795    //  Open and read the test data file.
3796    //
3797    srcPath=getPath(tdd, "re_tests.txt");
3798    if(srcPath==NULL) {
3799        return; /* something went wrong, error already output */
3800    }
3801
3802    int32_t    len;
3803    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3804    if (U_FAILURE(status)) {
3805        return; /* something went wrong, error already output */
3806    }
3807
3808    //
3809    //  Put the test data into a UnicodeString
3810    //
3811    UnicodeString testDataString(FALSE, testData, len);
3812
3813    //
3814    //  Regex to break the input file into lines, and strip the new lines.
3815    //     One line per match, capture group one is the desired data.
3816    //
3817    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3818    if (U_FAILURE(status)) {
3819        dataerrln("RegexPattern::compile() error");
3820        return;
3821    }
3822    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3823
3824    //
3825    //  Regex to split a test file line into fields.
3826    //    There are six fields, separated by tabs.
3827    //
3828    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3829
3830    //
3831    //  Regex to identify test patterns with flag settings, and to separate them.
3832    //    Test patterns with flags look like 'pattern'i
3833    //    Test patterns without flags are not quoted:   pattern
3834    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3835    //
3836    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3837    RegexMatcher* flagMat = flagPat->matcher(status);
3838
3839    //
3840    // The Perl tests reference several perl-isms, which are evaluated/substituted
3841    //   in the test data.  Not being perl, this must be done explicitly.  Here
3842    //   are string constants and REs for these constructs.
3843    //
3844    UnicodeString nulnulSrc("${nulnul}");
3845    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3846    nulnul = nulnul.unescape();
3847
3848    UnicodeString ffffSrc("${ffff}");
3849    UnicodeString ffff("\\uffff", -1, US_INV);
3850    ffff = ffff.unescape();
3851
3852    //  regexp for $-[0], $+[2], etc.
3853    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3854    RegexMatcher *groupsMat = groupsPat->matcher(status);
3855
3856    //  regexp for $0, $1, $2, etc.
3857    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3858    RegexMatcher *cgMat = cgPat->matcher(status);
3859
3860
3861    //
3862    // Main Loop for the Perl Tests, runs once per line from the
3863    //   test data file.
3864    //
3865    int32_t  lineNum = 0;
3866    int32_t  skippedUnimplementedCount = 0;
3867    while (lineMat->find()) {
3868        lineNum++;
3869
3870        //
3871        //  Get a line, break it into its fields, do the Perl
3872        //    variable substitutions.
3873        //
3874        UnicodeString line = lineMat->group(1, status);
3875        UnicodeString fields[7];
3876        fieldPat->split(line, fields, 7, status);
3877
3878        flagMat->reset(fields[0]);
3879        flagMat->matches(status);
3880        UnicodeString pattern  = flagMat->group(2, status);
3881        pattern.findAndReplace("${bang}", "!");
3882        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
3883        pattern.findAndReplace(ffffSrc, ffff);
3884
3885        //
3886        //  Identify patterns that include match flag settings,
3887        //    split off the flags, remove the extra quotes.
3888        //
3889        UnicodeString flagStr = flagMat->group(3, status);
3890        if (U_FAILURE(status)) {
3891            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3892            return;
3893        }
3894        int32_t flags = 0;
3895        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
3896        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
3897        const UChar UChar_m = 0x6d;
3898        const UChar UChar_x = 0x78;
3899        const UChar UChar_y = 0x79;
3900        if (flagStr.indexOf(UChar_i) != -1) {
3901            flags |= UREGEX_CASE_INSENSITIVE;
3902        }
3903        if (flagStr.indexOf(UChar_m) != -1) {
3904            flags |= UREGEX_MULTILINE;
3905        }
3906        if (flagStr.indexOf(UChar_x) != -1) {
3907            flags |= UREGEX_COMMENTS;
3908        }
3909
3910        //
3911        // Compile the test pattern.
3912        //
3913        status = U_ZERO_ERROR;
3914        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
3915        if (status == U_REGEX_UNIMPLEMENTED) {
3916            //
3917            // Test of a feature that is planned for ICU, but not yet implemented.
3918            //   skip the test.
3919            skippedUnimplementedCount++;
3920            delete testPat;
3921            status = U_ZERO_ERROR;
3922            continue;
3923        }
3924
3925        if (U_FAILURE(status)) {
3926            // Some tests are supposed to generate errors.
3927            //   Only report an error for tests that are supposed to succeed.
3928            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
3929                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
3930            {
3931                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
3932            }
3933            status = U_ZERO_ERROR;
3934            delete testPat;
3935            continue;
3936        }
3937
3938        if (fields[2].indexOf(UChar_i) >= 0) {
3939            // ICU should skip this test.
3940            delete testPat;
3941            continue;
3942        }
3943
3944        if (fields[2].indexOf(UChar_c) >= 0) {
3945            // This pattern should have caused a compilation error, but didn't/
3946            errln("line %d: Expected a pattern compile error, got success.", lineNum);
3947            delete testPat;
3948            continue;
3949        }
3950
3951        //
3952        // replace the Perl variables that appear in some of the
3953        //   match data strings.
3954        //
3955        UnicodeString matchString = fields[1];
3956        matchString.findAndReplace(nulnulSrc, nulnul);
3957        matchString.findAndReplace(ffffSrc,   ffff);
3958
3959        // Replace any \n in the match string with an actual new-line char.
3960        //  Don't do full unescape, as this unescapes more than Perl does, which
3961        //  causes other spurious failures in the tests.
3962        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
3963
3964
3965
3966        //
3967        // Run the test, check for expected match/don't match result.
3968        //
3969        RegexMatcher *testMat = testPat->matcher(matchString, status);
3970        UBool found = testMat->find();
3971        UBool expected = FALSE;
3972        if (fields[2].indexOf(UChar_y) >=0) {
3973            expected = TRUE;
3974        }
3975        if (expected != found) {
3976            errln("line %d: Expected %smatch, got %smatch",
3977                lineNum, expected?"":"no ", found?"":"no " );
3978            continue;
3979        }
3980
3981        // Don't try to check expected results if there is no match.
3982        //   (Some have stuff in the expected fields)
3983        if (!found) {
3984            delete testMat;
3985            delete testPat;
3986            continue;
3987        }
3988
3989        //
3990        // Interpret the Perl expression from the fourth field of the data file,
3991        // building up an ICU string from the results of the ICU match.
3992        //   The Perl expression will contain references to the results of
3993        //     a regex match, including the matched string, capture group strings,
3994        //     group starting and ending indicies, etc.
3995        //
3996        UnicodeString resultString;
3997        UnicodeString perlExpr = fields[3];
3998#if SUPPORT_MUTATING_INPUT_STRING
3999        groupsMat->reset(perlExpr);
4000        cgMat->reset(perlExpr);
4001#endif
4002
4003        while (perlExpr.length() > 0) {
4004#if !SUPPORT_MUTATING_INPUT_STRING
4005            //  Perferred usage.  Reset after any modification to input string.
4006            groupsMat->reset(perlExpr);
4007            cgMat->reset(perlExpr);
4008#endif
4009
4010            if (perlExpr.startsWith("$&")) {
4011                resultString.append(testMat->group(status));
4012                perlExpr.remove(0, 2);
4013            }
4014
4015            else if (groupsMat->lookingAt(status)) {
4016                // $-[0]   $+[2]  etc.
4017                UnicodeString digitString = groupsMat->group(2, status);
4018                int32_t t = 0;
4019                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4020                UnicodeString plusOrMinus = groupsMat->group(1, status);
4021                int32_t matchPosition;
4022                if (plusOrMinus.compare("+") == 0) {
4023                    matchPosition = testMat->end(groupNum, status);
4024                } else {
4025                    matchPosition = testMat->start(groupNum, status);
4026                }
4027                if (matchPosition != -1) {
4028                    ICU_Utility::appendNumber(resultString, matchPosition);
4029                }
4030                perlExpr.remove(0, groupsMat->end(status));
4031            }
4032
4033            else if (cgMat->lookingAt(status)) {
4034                // $1, $2, $3, etc.
4035                UnicodeString digitString = cgMat->group(1, status);
4036                int32_t t = 0;
4037                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4038                if (U_SUCCESS(status)) {
4039                    resultString.append(testMat->group(groupNum, status));
4040                    status = U_ZERO_ERROR;
4041                }
4042                perlExpr.remove(0, cgMat->end(status));
4043            }
4044
4045            else if (perlExpr.startsWith("@-")) {
4046                int32_t i;
4047                for (i=0; i<=testMat->groupCount(); i++) {
4048                    if (i>0) {
4049                        resultString.append(" ");
4050                    }
4051                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4052                }
4053                perlExpr.remove(0, 2);
4054            }
4055
4056            else if (perlExpr.startsWith("@+")) {
4057                int32_t i;
4058                for (i=0; i<=testMat->groupCount(); i++) {
4059                    if (i>0) {
4060                        resultString.append(" ");
4061                    }
4062                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4063                }
4064                perlExpr.remove(0, 2);
4065            }
4066
4067            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4068                                                     //           or as an escaped sequence (e.g. \n)
4069                if (perlExpr.length() > 1) {
4070                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4071                }
4072                UChar c = perlExpr.charAt(0);
4073                switch (c) {
4074                case 'n':   c = '\n'; break;
4075                // add any other escape sequences that show up in the test expected results.
4076                }
4077                resultString.append(c);
4078                perlExpr.remove(0, 1);
4079            }
4080
4081            else  {
4082                // Any characters from the perl expression that we don't explicitly
4083                //  recognize before here are assumed to be literals and copied
4084                //  as-is to the expected results.
4085                resultString.append(perlExpr.charAt(0));
4086                perlExpr.remove(0, 1);
4087            }
4088
4089            if (U_FAILURE(status)) {
4090                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4091                break;
4092            }
4093        }
4094
4095        //
4096        // Expected Results Compare
4097        //
4098        UnicodeString expectedS(fields[4]);
4099        expectedS.findAndReplace(nulnulSrc, nulnul);
4100        expectedS.findAndReplace(ffffSrc,   ffff);
4101        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4102
4103
4104        if (expectedS.compare(resultString) != 0) {
4105            err("Line %d: Incorrect perl expression results.", lineNum);
4106            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4107        }
4108
4109        delete testMat;
4110        delete testPat;
4111    }
4112
4113    //
4114    // All done.  Clean up allocated stuff.
4115    //
4116    delete cgMat;
4117    delete cgPat;
4118
4119    delete groupsMat;
4120    delete groupsPat;
4121
4122    delete flagMat;
4123    delete flagPat;
4124
4125    delete lineMat;
4126    delete linePat;
4127
4128    delete fieldPat;
4129    delete [] testData;
4130
4131
4132    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4133
4134}
4135
4136
4137//-------------------------------------------------------------------------------
4138//
4139//   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4140//                  (instead of using UnicodeStrings) to test the alternate engine.
4141//                  The input file for this test is re_tests, the standard regular
4142//                  expression test data distributed with the Perl source code.
4143//                  See PerlTests() for more information.
4144//
4145//-------------------------------------------------------------------------------
4146void RegexTest::PerlTestsUTF8() {
4147    char tdd[2048];
4148    const char *srcPath;
4149    UErrorCode  status = U_ZERO_ERROR;
4150    UParseError pe;
4151    LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4152    UText       patternText = UTEXT_INITIALIZER;
4153    char       *patternChars = NULL;
4154    int32_t     patternLength;
4155    int32_t     patternCapacity = 0;
4156    UText       inputText = UTEXT_INITIALIZER;
4157    char       *inputChars = NULL;
4158    int32_t     inputLength;
4159    int32_t     inputCapacity = 0;
4160
4161    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4162
4163    //
4164    //  Open and read the test data file.
4165    //
4166    srcPath=getPath(tdd, "re_tests.txt");
4167    if(srcPath==NULL) {
4168        return; /* something went wrong, error already output */
4169    }
4170
4171    int32_t    len;
4172    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4173    if (U_FAILURE(status)) {
4174        return; /* something went wrong, error already output */
4175    }
4176
4177    //
4178    //  Put the test data into a UnicodeString
4179    //
4180    UnicodeString testDataString(FALSE, testData, len);
4181
4182    //
4183    //  Regex to break the input file into lines, and strip the new lines.
4184    //     One line per match, capture group one is the desired data.
4185    //
4186    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4187    if (U_FAILURE(status)) {
4188        dataerrln("RegexPattern::compile() error");
4189        return;
4190    }
4191    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4192
4193    //
4194    //  Regex to split a test file line into fields.
4195    //    There are six fields, separated by tabs.
4196    //
4197    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4198
4199    //
4200    //  Regex to identify test patterns with flag settings, and to separate them.
4201    //    Test patterns with flags look like 'pattern'i
4202    //    Test patterns without flags are not quoted:   pattern
4203    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4204    //
4205    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4206    RegexMatcher* flagMat = flagPat->matcher(status);
4207
4208    //
4209    // The Perl tests reference several perl-isms, which are evaluated/substituted
4210    //   in the test data.  Not being perl, this must be done explicitly.  Here
4211    //   are string constants and REs for these constructs.
4212    //
4213    UnicodeString nulnulSrc("${nulnul}");
4214    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4215    nulnul = nulnul.unescape();
4216
4217    UnicodeString ffffSrc("${ffff}");
4218    UnicodeString ffff("\\uffff", -1, US_INV);
4219    ffff = ffff.unescape();
4220
4221    //  regexp for $-[0], $+[2], etc.
4222    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4223    RegexMatcher *groupsMat = groupsPat->matcher(status);
4224
4225    //  regexp for $0, $1, $2, etc.
4226    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4227    RegexMatcher *cgMat = cgPat->matcher(status);
4228
4229
4230    //
4231    // Main Loop for the Perl Tests, runs once per line from the
4232    //   test data file.
4233    //
4234    int32_t  lineNum = 0;
4235    int32_t  skippedUnimplementedCount = 0;
4236    while (lineMat->find()) {
4237        lineNum++;
4238
4239        //
4240        //  Get a line, break it into its fields, do the Perl
4241        //    variable substitutions.
4242        //
4243        UnicodeString line = lineMat->group(1, status);
4244        UnicodeString fields[7];
4245        fieldPat->split(line, fields, 7, status);
4246
4247        flagMat->reset(fields[0]);
4248        flagMat->matches(status);
4249        UnicodeString pattern  = flagMat->group(2, status);
4250        pattern.findAndReplace("${bang}", "!");
4251        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4252        pattern.findAndReplace(ffffSrc, ffff);
4253
4254        //
4255        //  Identify patterns that include match flag settings,
4256        //    split off the flags, remove the extra quotes.
4257        //
4258        UnicodeString flagStr = flagMat->group(3, status);
4259        if (U_FAILURE(status)) {
4260            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4261            return;
4262        }
4263        int32_t flags = 0;
4264        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4265        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4266        const UChar UChar_m = 0x6d;
4267        const UChar UChar_x = 0x78;
4268        const UChar UChar_y = 0x79;
4269        if (flagStr.indexOf(UChar_i) != -1) {
4270            flags |= UREGEX_CASE_INSENSITIVE;
4271        }
4272        if (flagStr.indexOf(UChar_m) != -1) {
4273            flags |= UREGEX_MULTILINE;
4274        }
4275        if (flagStr.indexOf(UChar_x) != -1) {
4276            flags |= UREGEX_COMMENTS;
4277        }
4278
4279        //
4280        // Put the pattern in a UTF-8 UText
4281        //
4282        status = U_ZERO_ERROR;
4283        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4284        if (status == U_BUFFER_OVERFLOW_ERROR) {
4285            status = U_ZERO_ERROR;
4286            delete[] patternChars;
4287            patternCapacity = patternLength + 1;
4288            patternChars = new char[patternCapacity];
4289            pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4290        }
4291        utext_openUTF8(&patternText, patternChars, patternLength, &status);
4292
4293        //
4294        // Compile the test pattern.
4295        //
4296        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4297        if (status == U_REGEX_UNIMPLEMENTED) {
4298            //
4299            // Test of a feature that is planned for ICU, but not yet implemented.
4300            //   skip the test.
4301            skippedUnimplementedCount++;
4302            delete testPat;
4303            status = U_ZERO_ERROR;
4304            continue;
4305        }
4306
4307        if (U_FAILURE(status)) {
4308            // Some tests are supposed to generate errors.
4309            //   Only report an error for tests that are supposed to succeed.
4310            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4311                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4312            {
4313                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4314            }
4315            status = U_ZERO_ERROR;
4316            delete testPat;
4317            continue;
4318        }
4319
4320        if (fields[2].indexOf(UChar_i) >= 0) {
4321            // ICU should skip this test.
4322            delete testPat;
4323            continue;
4324        }
4325
4326        if (fields[2].indexOf(UChar_c) >= 0) {
4327            // This pattern should have caused a compilation error, but didn't/
4328            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4329            delete testPat;
4330            continue;
4331        }
4332
4333
4334        //
4335        // replace the Perl variables that appear in some of the
4336        //   match data strings.
4337        //
4338        UnicodeString matchString = fields[1];
4339        matchString.findAndReplace(nulnulSrc, nulnul);
4340        matchString.findAndReplace(ffffSrc,   ffff);
4341
4342        // Replace any \n in the match string with an actual new-line char.
4343        //  Don't do full unescape, as this unescapes more than Perl does, which
4344        //  causes other spurious failures in the tests.
4345        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4346
4347        //
4348        // Put the input in a UTF-8 UText
4349        //
4350        status = U_ZERO_ERROR;
4351        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4352        if (status == U_BUFFER_OVERFLOW_ERROR) {
4353            status = U_ZERO_ERROR;
4354            delete[] inputChars;
4355            inputCapacity = inputLength + 1;
4356            inputChars = new char[inputCapacity];
4357            matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4358        }
4359        utext_openUTF8(&inputText, inputChars, inputLength, &status);
4360
4361        //
4362        // Run the test, check for expected match/don't match result.
4363        //
4364        RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
4365        UBool found = testMat->find();
4366        UBool expected = FALSE;
4367        if (fields[2].indexOf(UChar_y) >=0) {
4368            expected = TRUE;
4369        }
4370        if (expected != found) {
4371            errln("line %d: Expected %smatch, got %smatch",
4372                lineNum, expected?"":"no ", found?"":"no " );
4373            continue;
4374        }
4375
4376        // Don't try to check expected results if there is no match.
4377        //   (Some have stuff in the expected fields)
4378        if (!found) {
4379            delete testMat;
4380            delete testPat;
4381            continue;
4382        }
4383
4384        //
4385        // Interpret the Perl expression from the fourth field of the data file,
4386        // building up an ICU string from the results of the ICU match.
4387        //   The Perl expression will contain references to the results of
4388        //     a regex match, including the matched string, capture group strings,
4389        //     group starting and ending indicies, etc.
4390        //
4391        UnicodeString resultString;
4392        UnicodeString perlExpr = fields[3];
4393
4394        while (perlExpr.length() > 0) {
4395            groupsMat->reset(perlExpr);
4396            cgMat->reset(perlExpr);
4397
4398            if (perlExpr.startsWith("$&")) {
4399                resultString.append(testMat->group(status));
4400                perlExpr.remove(0, 2);
4401            }
4402
4403            else if (groupsMat->lookingAt(status)) {
4404                // $-[0]   $+[2]  etc.
4405                UnicodeString digitString = groupsMat->group(2, status);
4406                int32_t t = 0;
4407                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4408                UnicodeString plusOrMinus = groupsMat->group(1, status);
4409                int32_t matchPosition;
4410                if (plusOrMinus.compare("+") == 0) {
4411                    matchPosition = testMat->end(groupNum, status);
4412                } else {
4413                    matchPosition = testMat->start(groupNum, status);
4414                }
4415                if (matchPosition != -1) {
4416                    ICU_Utility::appendNumber(resultString, matchPosition);
4417                }
4418                perlExpr.remove(0, groupsMat->end(status));
4419            }
4420
4421            else if (cgMat->lookingAt(status)) {
4422                // $1, $2, $3, etc.
4423                UnicodeString digitString = cgMat->group(1, status);
4424                int32_t t = 0;
4425                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4426                if (U_SUCCESS(status)) {
4427                    resultString.append(testMat->group(groupNum, status));
4428                    status = U_ZERO_ERROR;
4429                }
4430                perlExpr.remove(0, cgMat->end(status));
4431            }
4432
4433            else if (perlExpr.startsWith("@-")) {
4434                int32_t i;
4435                for (i=0; i<=testMat->groupCount(); i++) {
4436                    if (i>0) {
4437                        resultString.append(" ");
4438                    }
4439                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4440                }
4441                perlExpr.remove(0, 2);
4442            }
4443
4444            else if (perlExpr.startsWith("@+")) {
4445                int32_t i;
4446                for (i=0; i<=testMat->groupCount(); i++) {
4447                    if (i>0) {
4448                        resultString.append(" ");
4449                    }
4450                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4451                }
4452                perlExpr.remove(0, 2);
4453            }
4454
4455            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4456                                                     //           or as an escaped sequence (e.g. \n)
4457                if (perlExpr.length() > 1) {
4458                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4459                }
4460                UChar c = perlExpr.charAt(0);
4461                switch (c) {
4462                case 'n':   c = '\n'; break;
4463                // add any other escape sequences that show up in the test expected results.
4464                }
4465                resultString.append(c);
4466                perlExpr.remove(0, 1);
4467            }
4468
4469            else  {
4470                // Any characters from the perl expression that we don't explicitly
4471                //  recognize before here are assumed to be literals and copied
4472                //  as-is to the expected results.
4473                resultString.append(perlExpr.charAt(0));
4474                perlExpr.remove(0, 1);
4475            }
4476
4477            if (U_FAILURE(status)) {
4478                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4479                break;
4480            }
4481        }
4482
4483        //
4484        // Expected Results Compare
4485        //
4486        UnicodeString expectedS(fields[4]);
4487        expectedS.findAndReplace(nulnulSrc, nulnul);
4488        expectedS.findAndReplace(ffffSrc,   ffff);
4489        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4490
4491
4492        if (expectedS.compare(resultString) != 0) {
4493            err("Line %d: Incorrect perl expression results.", lineNum);
4494            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4495        }
4496
4497        delete testMat;
4498        delete testPat;
4499    }
4500
4501    //
4502    // All done.  Clean up allocated stuff.
4503    //
4504    delete cgMat;
4505    delete cgPat;
4506
4507    delete groupsMat;
4508    delete groupsPat;
4509
4510    delete flagMat;
4511    delete flagPat;
4512
4513    delete lineMat;
4514    delete linePat;
4515
4516    delete fieldPat;
4517    delete [] testData;
4518
4519    utext_close(&patternText);
4520    utext_close(&inputText);
4521
4522    delete [] patternChars;
4523    delete [] inputChars;
4524
4525
4526    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4527
4528}
4529
4530
4531//--------------------------------------------------------------
4532//
4533//  Bug6149   Verify limits to heap expansion for backtrack stack.
4534//             Use this pattern,
4535//                 "(a?){1,}"
4536//             The zero-length match will repeat forever.
4537//                (That this goes into a loop is another bug)
4538//
4539//---------------------------------------------------------------
4540void RegexTest::Bug6149() {
4541    UnicodeString pattern("(a?){1,}");
4542    UnicodeString s("xyz");
4543    uint32_t flags = 0;
4544    UErrorCode status = U_ZERO_ERROR;
4545
4546    RegexMatcher  matcher(pattern, s, flags, status);
4547    UBool result = false;
4548    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4549    REGEX_ASSERT(result == FALSE);
4550 }
4551
4552
4553//
4554//   Callbacks()    Test the callback function.
4555//                  When set, callbacks occur periodically during matching operations,
4556//                  giving the application code the ability to abort the operation
//                  before its normal completion.
4558//
4559
4560struct callBackContext {
4561    RegexTest        *test;
4562    int32_t          maxCalls;
4563    int32_t          numCalls;
4564    int32_t          lastSteps;
4565    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4566};
4567
4568U_CDECL_BEGIN
4569static UBool U_CALLCONV
4570testCallBackFn(const void *context, int32_t steps) {
4571    callBackContext  *info = (callBackContext *)context;
4572    if (info->lastSteps+1 != steps) {
4573        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4574    }
4575    info->lastSteps = steps;
4576    info->numCalls++;
4577    return (info->numCalls < info->maxCalls);
4578}
4579U_CDECL_END
4580
4581void RegexTest::Callbacks() {
4582   {
4583        // Getter returns NULLs if no callback has been set
4584
4585        //   The variables that the getter will fill in.
4586        //   Init to non-null values so that the action of the getter can be seen.
4587        const void          *returnedContext = &returnedContext;
4588        URegexMatchCallback *returnedFn = &testCallBackFn;
4589
4590        UErrorCode status = U_ZERO_ERROR;
4591        RegexMatcher matcher("x", 0, status);
4592        REGEX_CHECK_STATUS;
4593        matcher.getMatchCallback(returnedFn, returnedContext, status);
4594        REGEX_CHECK_STATUS;
4595        REGEX_ASSERT(returnedFn == NULL);
4596        REGEX_ASSERT(returnedContext == NULL);
4597    }
4598
4599   {
4600        // Set and Get work
4601        callBackContext cbInfo = {this, 0, 0, 0};
4602        const void          *returnedContext;
4603        URegexMatchCallback *returnedFn;
4604        UErrorCode status = U_ZERO_ERROR;
4605        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4606        REGEX_CHECK_STATUS;
4607        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4608        REGEX_CHECK_STATUS;
4609        matcher.getMatchCallback(returnedFn, returnedContext, status);
4610        REGEX_CHECK_STATUS;
4611        REGEX_ASSERT(returnedFn == testCallBackFn);
4612        REGEX_ASSERT(returnedContext == &cbInfo);
4613
4614        // A short-running match shouldn't invoke the callback
4615        status = U_ZERO_ERROR;
4616        cbInfo.reset(1);
4617        UnicodeString s = "xxx";
4618        matcher.reset(s);
4619        REGEX_ASSERT(matcher.matches(status));
4620        REGEX_CHECK_STATUS;
4621        REGEX_ASSERT(cbInfo.numCalls == 0);
4622
4623        // A medium-length match that runs long enough to invoke the
4624        //   callback, but not so long that the callback aborts it.
4625        status = U_ZERO_ERROR;
4626        cbInfo.reset(4);
4627        s = "aaaaaaaaaaaaaaaaaaab";
4628        matcher.reset(s);
4629        REGEX_ASSERT(matcher.matches(status)==FALSE);
4630        REGEX_CHECK_STATUS;
4631        REGEX_ASSERT(cbInfo.numCalls > 0);
4632
4633        // A longer running match that the callback function will abort.
4634        status = U_ZERO_ERROR;
4635        cbInfo.reset(4);
4636        s = "aaaaaaaaaaaaaaaaaaaaaaab";
4637        matcher.reset(s);
4638        REGEX_ASSERT(matcher.matches(status)==FALSE);
4639        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4640        REGEX_ASSERT(cbInfo.numCalls == 4);
4641    }
4642
4643
4644}
4645
4646
4647//
4648//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4652//
4653
4654struct progressCallBackContext {
4655    RegexTest        *test;
4656    int64_t          lastIndex;
4657    int32_t          maxCalls;
4658    int32_t          numCalls;
4659    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4660};
4661
4662U_CDECL_BEGIN
4663static UBool U_CALLCONV
4664testProgressCallBackFn(const void *context, int64_t matchIndex) {
4665    progressCallBackContext  *info = (progressCallBackContext *)context;
4666    info->numCalls++;
4667    info->lastIndex = matchIndex;
4668//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4669    return (info->numCalls < info->maxCalls);
4670}
4671U_CDECL_END
4672
4673void RegexTest::FindProgressCallbacks() {
4674   {
4675        // Getter returns NULLs if no callback has been set
4676
4677        //   The variables that the getter will fill in.
4678        //   Init to non-null values so that the action of the getter can be seen.
4679        const void                  *returnedContext = &returnedContext;
4680        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4681
4682        UErrorCode status = U_ZERO_ERROR;
4683        RegexMatcher matcher("x", 0, status);
4684        REGEX_CHECK_STATUS;
4685        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4686        REGEX_CHECK_STATUS;
4687        REGEX_ASSERT(returnedFn == NULL);
4688        REGEX_ASSERT(returnedContext == NULL);
4689    }
4690
4691   {
4692        // Set and Get work
4693        progressCallBackContext cbInfo = {this, 0, 0, 0};
4694        const void                  *returnedContext;
4695        URegexFindProgressCallback  *returnedFn;
4696        UErrorCode status = U_ZERO_ERROR;
4697        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4698        REGEX_CHECK_STATUS;
4699        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4700        REGEX_CHECK_STATUS;
4701        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4702        REGEX_CHECK_STATUS;
4703        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4704        REGEX_ASSERT(returnedContext == &cbInfo);
4705
4706        // A short-running match should NOT invoke the callback.
4707        status = U_ZERO_ERROR;
4708        cbInfo.reset(100);
4709        UnicodeString s = "abxxx";
4710        matcher.reset(s);
4711#if 0
4712        matcher.setTrace(TRUE);
4713#endif
4714        REGEX_ASSERT(matcher.find(0, status));
4715        REGEX_CHECK_STATUS;
4716        REGEX_ASSERT(cbInfo.numCalls == 0);
4717
4718        // A medium running match that causes matcher.find() to invoke our callback for each index.
4719        status = U_ZERO_ERROR;
4720        s = "aaaaaaaaaaaaaaaaaaab";
4721        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4722        matcher.reset(s);
4723        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4724        REGEX_CHECK_STATUS;
4725        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4726
4727        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4728        status = U_ZERO_ERROR;
4729        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4730        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4731        matcher.reset(s1);
4732        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4733        REGEX_CHECK_STATUS;
4734        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4735
4736#if 0
4737        // Now a match that will succeed, but after an interruption
4738        status = U_ZERO_ERROR;
4739        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4740        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4741        matcher.reset(s2);
4742        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4743        REGEX_CHECK_STATUS;
4744        // Now retry the match from where left off
4745        cbInfo.maxCalls = 100; //  No callback limit
4746        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4747        REGEX_CHECK_STATUS;
4748#endif
4749    }
4750
4751
4752}
4753
4754
4755//---------------------------------------------------------------------------
4756//
4757//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4758//                             UTexts. The pure-C implementation of UText
4759//                             has no mutable backing stores, but we can
4760//                             use UnicodeString here to test the functionality.
4761//
4762//---------------------------------------------------------------------------
4763void RegexTest::PreAllocatedUTextCAPI () {
4764    UErrorCode           status = U_ZERO_ERROR;
4765    URegularExpression  *re;
4766    UText                patternText = UTEXT_INITIALIZER;
4767    UnicodeString        buffer;
4768    UText                bufferText = UTEXT_INITIALIZER;
4769
4770    utext_openUnicodeString(&bufferText, &buffer, &status);
4771
4772    /*
4773     *  getText() and getUText()
4774     */
4775    {
4776        UText  text1 = UTEXT_INITIALIZER;
4777        UText  text2 = UTEXT_INITIALIZER;
4778        UChar  text2Chars[20];
4779        UText  *resultText;
4780
4781        status = U_ZERO_ERROR;
4782        regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4783        regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4784        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4785        utext_openUChars(&text2, text2Chars, -1, &status);
4786
4787        regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4788        re = uregex_openUText(&patternText, 0, NULL, &status);
4789
4790        /* First set a UText */
4791        uregex_setUText(re, &text1, &status);
4792        resultText = uregex_getUText(re, &bufferText, &status);
4793        REGEX_CHECK_STATUS;
4794        REGEX_ASSERT(resultText == &bufferText);
4795        utext_setNativeIndex(resultText, 0);
4796        utext_setNativeIndex(&text1, 0);
4797        REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4798
4799        resultText = uregex_getUText(re, &bufferText, &status);
4800        REGEX_CHECK_STATUS;
4801        REGEX_ASSERT(resultText == &bufferText);
4802        utext_setNativeIndex(resultText, 0);
4803        utext_setNativeIndex(&text1, 0);
4804        REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4805
4806        /* Then set a UChar * */
4807        uregex_setText(re, text2Chars, 7, &status);
4808        resultText = uregex_getUText(re, &bufferText, &status);
4809        REGEX_CHECK_STATUS;
4810        REGEX_ASSERT(resultText == &bufferText);
4811        utext_setNativeIndex(resultText, 0);
4812        utext_setNativeIndex(&text2, 0);
4813        REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
4814
4815        uregex_close(re);
4816        utext_close(&text1);
4817        utext_close(&text2);
4818    }
4819
4820    /*
4821     *  group()
4822     */
4823    {
4824        UChar    text1[80];
4825        UText   *actual;
4826        UBool    result;
4827        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4828
4829        status = U_ZERO_ERROR;
4830        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4831        REGEX_CHECK_STATUS;
4832
4833        uregex_setText(re, text1, -1, &status);
4834        result = uregex_find(re, 0, &status);
4835        REGEX_ASSERT(result==TRUE);
4836
4837        /*  Capture Group 0, the full match.  Should succeed.  */
4838        status = U_ZERO_ERROR;
4839        actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4840        REGEX_CHECK_STATUS;
4841        REGEX_ASSERT(actual == &bufferText);
4842        REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4843
4844        /*  Capture group #1.  Should succeed. */
4845        status = U_ZERO_ERROR;
4846        actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
4847        REGEX_CHECK_STATUS;
4848        REGEX_ASSERT(actual == &bufferText);
4849        REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
4850
4851        /*  Capture group out of range.  Error. */
4852        status = U_ZERO_ERROR;
4853        actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
4854        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
4855        REGEX_ASSERT(actual == &bufferText);
4856
4857        uregex_close(re);
4858
4859    }
4860
4861    /*
4862     *  replaceFirst()
4863     */
4864    {
4865        UChar    text1[80];
4866        UChar    text2[80];
4867        UText    replText = UTEXT_INITIALIZER;
4868        UText   *result;
4869
4870        status = U_ZERO_ERROR;
4871        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
4872        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
4873        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
4874
4875        re = uregex_openC("x(.*?)x", 0, NULL, &status);
4876        REGEX_CHECK_STATUS;
4877
4878        /*  Normal case, with match */
4879        uregex_setText(re, text1, -1, &status);
4880        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4881        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4882        REGEX_CHECK_STATUS;
4883        REGEX_ASSERT(result == &bufferText);
4884        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
4885
4886        /* No match.  Text should copy to output with no changes.  */
4887        uregex_setText(re, text2, -1, &status);
4888        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4889        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4890        REGEX_CHECK_STATUS;
4891        REGEX_ASSERT(result == &bufferText);
4892        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
4893
4894        /* Unicode escapes */
4895        uregex_setText(re, text1, -1, &status);
4896        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
4897        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4898        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4899        REGEX_CHECK_STATUS;
4900        REGEX_ASSERT(result == &bufferText);
4901        REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
4902
4903        uregex_close(re);
4904        utext_close(&replText);
4905    }
4906
4907
4908    /*
4909     *  replaceAll()
4910     */
4911    {
4912        UChar    text1[80];
4913        UChar    text2[80];
4914        UText    replText = UTEXT_INITIALIZER;
4915        UText   *result;
4916
4917        status = U_ZERO_ERROR;
4918        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
4919        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
4920        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
4921
4922        re = uregex_openC("x(.*?)x", 0, NULL, &status);
4923        REGEX_CHECK_STATUS;
4924
4925        /*  Normal case, with match */
4926        uregex_setText(re, text1, -1, &status);
4927        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4928        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4929        REGEX_CHECK_STATUS;
4930        REGEX_ASSERT(result == &bufferText);
4931        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
4932
4933        /* No match.  Text should copy to output with no changes.  */
4934        uregex_setText(re, text2, -1, &status);
4935        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4936        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4937        REGEX_CHECK_STATUS;
4938        REGEX_ASSERT(result == &bufferText);
4939        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
4940
4941        uregex_close(re);
4942        utext_close(&replText);
4943    }
4944
4945
4946    /*
4947     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
4948     *   so we don't need to test it here.
4949     */
4950
4951    utext_close(&bufferText);
4952    utext_close(&patternText);
4953}
4954
4955//--------------------------------------------------------------
4956//
4957//  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
4958//
4959//---------------------------------------------------------------
4960void RegexTest::Bug7651() {
4961    UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
4962    //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
4963    //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
4964    UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
4965    UnicodeString s("#ff @abcd This is test");
4966    RegexPattern  *REPattern = NULL;
4967    RegexMatcher  *REMatcher = NULL;
4968    UErrorCode status = U_ZERO_ERROR;
4969    UParseError pe;
4970
4971    REPattern = RegexPattern::compile(pattern1, 0, pe, status);
4972    REGEX_CHECK_STATUS;
4973    REMatcher = REPattern->matcher(s, status);
4974    REGEX_CHECK_STATUS;
4975    REGEX_ASSERT(REMatcher->find());
4976    REGEX_ASSERT(REMatcher->start(status) == 0);
4977    delete REPattern;
4978    delete REMatcher;
4979    status = U_ZERO_ERROR;
4980
4981    REPattern = RegexPattern::compile(pattern2, 0, pe, status);
4982    REGEX_CHECK_STATUS;
4983    REMatcher = REPattern->matcher(s, status);
4984    REGEX_CHECK_STATUS;
4985    REGEX_ASSERT(REMatcher->find());
4986    REGEX_ASSERT(REMatcher->start(status) == 0);
4987    delete REPattern;
4988    delete REMatcher;
4989    status = U_ZERO_ERROR;
4990 }
4991
4992void RegexTest::Bug7740() {
4993    UErrorCode status = U_ZERO_ERROR;
4994    UnicodeString pattern = "(a)";
4995    UnicodeString text = "abcdef";
4996    RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
4997    REGEX_CHECK_STATUS;
4998    REGEX_ASSERT(m->lookingAt(status));
4999    REGEX_CHECK_STATUS;
5000    status = U_ILLEGAL_ARGUMENT_ERROR;
5001    UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5002    REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5003    REGEX_ASSERT(s == "");
5004    delete m;
5005}
5006
5007
5008
5009
5010#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5011
5012