1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9//
10//   regextst.cpp
11//
12//      ICU Regular Expressions test, part of intltest.
13//
14
15/*
16     NOTE!!
17
18     PLEASE be careful about ASCII assumptions in this test.
19     This test is one of the worst repeat offenders.
20     If you have questions, contact someone on the ICU PMC
21     who has access to an EBCDIC system.
22
23 */
24
25#include "intltest.h"
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31
32#include "unicode/localpointer.h"
33#include "unicode/regex.h"
34#include "unicode/uchar.h"
35#include "unicode/ucnv.h"
36#include "unicode/uniset.h"
37#include "unicode/uregex.h"
38#include "unicode/usetiter.h"
39#include "unicode/ustring.h"
40#include "unicode/utext.h"
41#include "unicode/utf16.h"
42#include "cstr.h"
43#include "regextst.h"
44#include "regexcmp.h"
45#include "uvector.h"
46#include "util.h"
47#include "cmemory.h"
48#include "cstring.h"
49#include "uinvchar.h"
50
51#define SUPPORT_MUTATING_INPUT_STRING   0
52
53//---------------------------------------------------------------------------
54//
55//  Test class boilerplate
56//
57//---------------------------------------------------------------------------
58RegexTest::RegexTest()
59{
60}
61
62
63RegexTest::~RegexTest()
64{
65}
66
67
68
/**
 * Test dispatch entry point: lists every test case of this suite.
 *
 * @param index  index of the requested test case
 * @param exec   when true, a suite banner is logged before dispatching
 * @param name   reference through which the test case name is reported
 *               (set by the TESTCASE_AUTO machinery -- presumably; the macros
 *               are defined outside this file, confirm in intltest.h)
 *
 * NOTE(review): the TESTCASE_AUTO entries appear to be numbered in listing
 * order, so reordering them would change which index maps to which test.
 */
void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if (exec) logln("TestSuite RegexTest: ");
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(Basic);
    TESTCASE_AUTO(API_Match);
    TESTCASE_AUTO(API_Replace);
    TESTCASE_AUTO(API_Pattern);
    // Extended is only listed when file I/O is available in this build.
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(Extended);
#endif
    TESTCASE_AUTO(Errors);
    TESTCASE_AUTO(PerlTests);
    TESTCASE_AUTO(Callbacks);
    TESTCASE_AUTO(FindProgressCallbacks);
    TESTCASE_AUTO(Bug6149);
    TESTCASE_AUTO(UTextBasic);
    TESTCASE_AUTO(API_Match_UTF8);
    TESTCASE_AUTO(API_Replace_UTF8);
    TESTCASE_AUTO(API_Pattern_UTF8);
    TESTCASE_AUTO(PerlTestsUTF8);
    TESTCASE_AUTO(PreAllocatedUTextCAPI);
    TESTCASE_AUTO(Bug7651);
    TESTCASE_AUTO(Bug7740);
    TESTCASE_AUTO(Bug8479);
    TESTCASE_AUTO(Bug7029);
    TESTCASE_AUTO(CheckInvBufSize);
    TESTCASE_AUTO(Bug9283);
    TESTCASE_AUTO(Bug10459);
    TESTCASE_AUTO(TestCaseInsensitiveStarters);
    TESTCASE_AUTO(TestBug11049);
    TESTCASE_AUTO(TestBug11371);
    TESTCASE_AUTO(TestBug11480);
    TESTCASE_AUTO(NamedCapture);
    TESTCASE_AUTO(NamedCaptureLimits);
    TESTCASE_AUTO(TestBug12884);
    TESTCASE_AUTO(TestBug13631);
    TESTCASE_AUTO_END;
}
108
109
110/**
111 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
112 * into ASCII.
113 * @see utext_openUTF8
114 */
115static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
116
117//---------------------------------------------------------------------------
118//
119//   Error Checking / Reporting macros used in all of the tests.
120//
121//---------------------------------------------------------------------------
122
123static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
124  int64_t oldIndex = utext_getNativeIndex(text);
125  utext_setNativeIndex(text, 0);
126  char *bufPtr = buf;
127  UChar32 c = utext_next32From(text, 0);
128  while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
129    if (0x000020<=c && c<0x00007e) {
130      *bufPtr = c;
131    } else {
132#if 0
133      sprintf(bufPtr,"U+%04X", c);
134      bufPtr+= strlen(bufPtr)-1;
135#else
136      *bufPtr = '%';
137#endif
138    }
139    bufPtr++;
140    c = UTEXT_NEXT32(text);
141  }
142  *bufPtr = 0;
143#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
144  char *ebuf = (char*)malloc(bufLen);
145  uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
146  uprv_strncpy(buf, ebuf, bufLen);
147  free((void*)ebuf);
148#endif
149  utext_setNativeIndex(text, oldIndex);
150}
151
152
153static char ASSERT_BUF[1024];
154
155const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
156  if(message.length()==0) {
157    strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
158  } else {
159    UnicodeString buf;
160    IntlTest::prettify(message,buf);
161    if(buf.length()==0) {
162      strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
163    } else {
164      buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
165      if(ASSERT_BUF[0]==0) {
166        ASSERT_BUF[0]=0;
167        for(int32_t i=0;i<buf.length();i++) {
168          UChar ch = buf[i];
169          sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
170        }
171      }
172    }
173  }
174  ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
175  return ASSERT_BUF;
176}
177
// Logs (via logln) a printable rendering of the UText 'text', tagged with the
// source file, line and the argument expression.  Output is limited by the
// 200-byte local buffer.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the in-scope UErrorCode 'status' is a failure, reports it with
// dataerrln() and RETURNS from the enclosing (void) function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports an error with errln() if 'expr' evaluates to FALSE.  Does not
// return; the test continues after a failure.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates 'expr' with a fresh local 'status' (which shadows any outer
// variable of that name, so 'expr' must refer to 'status') and reports with
// dataerrln() if the resulting status differs from 'errcode'.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line number
// and does NOT return from the enclosing function.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also reports the caller-supplied line number and
// RETURNS from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Reports an error with errln() if the UnicodeString 'actual' differs from
// 'expected'.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
201
202
203static UBool testUTextEqual(UText *uta, UText *utb) {
204    UChar32 ca = 0;
205    UChar32 cb = 0;
206    utext_setNativeIndex(uta, 0);
207    utext_setNativeIndex(utb, 0);
208    do {
209        ca = utext_next32(uta);
210        cb = utext_next32(utb);
211        if (ca != cb) {
212            break;
213        }
214    } while (ca != U_SENTINEL);
215    return ca == cb;
216}
217
218
219/**
220 * @param expected expected text in UTF-8 (not platform) codepage
221 */
222void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
223    UErrorCode status = U_ZERO_ERROR;
224    UText expectedText = UTEXT_INITIALIZER;
225    utext_openUTF8(&expectedText, expected, -1, &status);
226    if(U_FAILURE(status)) {
227      errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
228      return;
229    }
230    if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
231      errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
232      return;
233    }
234    utext_setNativeIndex(actual, 0);
235    if (!testUTextEqual(&expectedText, actual)) {
236        char buf[201 /*21*/];
237        char expectedBuf[201];
238        utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
239        utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
240        errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
241    }
242    utext_close(&expectedText);
243}
244/**
245 * @param expected invariant (platform local text) input
246 */
247
248void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
249    UErrorCode status = U_ZERO_ERROR;
250    UText expectedText = UTEXT_INITIALIZER;
251    regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
252    if(U_FAILURE(status)) {
253      errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
254      return;
255    }
256    utext_setNativeIndex(actual, 0);
257    if (!testUTextEqual(&expectedText, actual)) {
258        char buf[201 /*21*/];
259        char expectedBuf[201];
260        utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
261        utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
262        errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
263    }
264    utext_close(&expectedText);
265}
266
/**
 * Checks the UText 'actual' against 'expected' by calling assertUText() with
 * the current file and line.
 * Assumes 'expected' is UTF-8 input.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Checks the UText 'actual' against 'expected' by calling
 * assertUTextInvariant() with the current file and line.
 * Assumes 'expected' is invariant-character input.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
275
276/**
277 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
278 * passed into utext_openUTF8. An error will be given if
279 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
280 */
281
282#define INV_BUFSIZ 2048 /* increase this if too small */
283
284static int64_t inv_next=0;
285
286#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
287static char inv_buf[INV_BUFSIZ];
288#endif
289
290static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
291  if(length==-1) length=strlen(inv);
292#if U_CHARSET_FAMILY==U_ASCII_FAMILY
293  inv_next+=length;
294  return utext_openUTF8(ut, inv, length, status);
295#else
296  if(inv_next+length+1>INV_BUFSIZ) {
297    fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
298            __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
299    *status = U_MEMORY_ALLOCATION_ERROR;
300    return NULL;
301  }
302
303  unsigned char *buf = (unsigned char*)inv_buf+inv_next;
304  uprv_aestrncpy(buf, (const uint8_t*)inv, length);
305  inv_next+=length;
306
307#if 0
308  fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
309#endif
310
311  return utext_openUTF8(ut, (const char*)buf, length, status);
312#endif
313}
314
315
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation functions to simplify writing quick tests
//                       for the lookingAt() and matches() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Each use runs the check twice: once through doRegexLMTest() and
//          once through doRegexLMTestUTF8().
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
331
332UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
333    const UnicodeString pattern(pat, -1, US_INV);
334    const UnicodeString inputText(text, -1, US_INV);
335    UErrorCode          status  = U_ZERO_ERROR;
336    UParseError         pe;
337    RegexPattern        *REPattern = NULL;
338    RegexMatcher        *REMatcher = NULL;
339    UBool               retVal     = TRUE;
340
341    UnicodeString patString(pat, -1, US_INV);
342    REPattern = RegexPattern::compile(patString, 0, pe, status);
343    if (U_FAILURE(status)) {
344        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
345            line, u_errorName(status));
346        return FALSE;
347    }
348    if (line==376) { REPattern->dumpPattern();}
349
350    UnicodeString inputString(inputText);
351    UnicodeString unEscapedInput = inputString.unescape();
352    REMatcher = REPattern->matcher(unEscapedInput, status);
353    if (U_FAILURE(status)) {
354        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
355            line, u_errorName(status));
356        return FALSE;
357    }
358
359    UBool actualmatch;
360    actualmatch = REMatcher->lookingAt(status);
361    if (U_FAILURE(status)) {
362        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
363            line, u_errorName(status));
364        retVal =  FALSE;
365    }
366    if (actualmatch != looking) {
367        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
368        retVal = FALSE;
369    }
370
371    status = U_ZERO_ERROR;
372    actualmatch = REMatcher->matches(status);
373    if (U_FAILURE(status)) {
374        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
375            line, u_errorName(status));
376        retVal = FALSE;
377    }
378    if (actualmatch != match) {
379        errln("RegexTest: wrong return from matches() at line %d.\n", line);
380        retVal = FALSE;
381    }
382
383    if (retVal == FALSE) {
384        REPattern->dumpPattern();
385    }
386
387    delete REPattern;
388    delete REMatcher;
389    return retVal;
390}
391
392
393UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
394    UText               pattern    = UTEXT_INITIALIZER;
395    int32_t             inputUTF8Length;
396    char                *textChars = NULL;
397    UText               inputText  = UTEXT_INITIALIZER;
398    UErrorCode          status     = U_ZERO_ERROR;
399    UParseError         pe;
400    RegexPattern        *REPattern = NULL;
401    RegexMatcher        *REMatcher = NULL;
402    UBool               retVal     = TRUE;
403
404    regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
405    REPattern = RegexPattern::compile(&pattern, 0, pe, status);
406    if (U_FAILURE(status)) {
407        dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
408            line, u_errorName(status));
409        return FALSE;
410    }
411
412    UnicodeString inputString(text, -1, US_INV);
413    UnicodeString unEscapedInput = inputString.unescape();
414    LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
415    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
416
417    inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
418    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
419        // UTF-8 does not allow unpaired surrogates, so this could actually happen
420        logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
421        return TRUE; // not a failure of the Regex engine
422    }
423    status = U_ZERO_ERROR; // buffer overflow
424    textChars = new char[inputUTF8Length+1];
425    unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
426    utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
427
428    REMatcher = &REPattern->matcher(status)->reset(&inputText);
429    if (U_FAILURE(status)) {
430        errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
431            line, u_errorName(status));
432        return FALSE;
433    }
434
435    UBool actualmatch;
436    actualmatch = REMatcher->lookingAt(status);
437    if (U_FAILURE(status)) {
438        errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
439            line, u_errorName(status));
440        retVal =  FALSE;
441    }
442    if (actualmatch != looking) {
443        errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
444        retVal = FALSE;
445    }
446
447    status = U_ZERO_ERROR;
448    actualmatch = REMatcher->matches(status);
449    if (U_FAILURE(status)) {
450        errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
451            line, u_errorName(status));
452        retVal = FALSE;
453    }
454    if (actualmatch != match) {
455        errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
456        retVal = FALSE;
457    }
458
459    if (retVal == FALSE) {
460        REPattern->dumpPattern();
461    }
462
463    delete REPattern;
464    delete REMatcher;
465    utext_close(&inputText);
466    utext_close(&pattern);
467    delete[] textChars;
468    return retVal;
469}
470
471
472
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing
//                    regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       Note: the macro expansion already ends with a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
483
484void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
485                          UErrorCode expectedStatus, int32_t line) {
486    UnicodeString       pattern(pat);
487
488    UErrorCode          status         = U_ZERO_ERROR;
489    UParseError         pe;
490    RegexPattern        *callerPattern = NULL;
491
492    //
493    //  Compile the caller's pattern
494    //
495    UnicodeString patString(pat);
496    callerPattern = RegexPattern::compile(patString, 0, pe, status);
497    if (status != expectedStatus) {
498        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
499    } else {
500        if (status != U_ZERO_ERROR) {
501            if (pe.line != errLine || pe.offset != errCol) {
502                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
503                    line, errLine, errCol, pe.line, pe.offset);
504            }
505        }
506    }
507
508    delete callerPattern;
509
510    //
511    //  Compile again, using a UTF-8-based UText
512    //
513    UText patternText = UTEXT_INITIALIZER;
514    regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
515    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
516    if (status != expectedStatus) {
517        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
518    } else {
519        if (status != U_ZERO_ERROR) {
520            if (pe.line != errLine || pe.offset != errCol) {
521                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
522                    line, errLine, errCol, pe.line, pe.offset);
523            }
524        }
525    }
526
527    delete callerPattern;
528    utext_close(&patternText);
529}
530
531
532
533//---------------------------------------------------------------------------
534//
535//      Basic      Check for basic functionality of regex pattern matching.
536//                 Avoid the use of REGEX_FIND test macro, which has
537//                 substantial dependencies on basic Regex functionality.
538//
539//---------------------------------------------------------------------------
// Basic smoke tests.  Every REGEX_TESTLM line gives a pattern, an input text
// (which is unescaped before matching), the expected lookingAt() result and
// the expected matches() result.
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
// (scratch area, compiled out; note that it ends in exit(1) when enabled)
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
686
687
688//---------------------------------------------------------------------------
689//
690//    UTextBasic   Check for quirks that are specific to the UText
691//                 implementation.
692//
693//---------------------------------------------------------------------------
694void RegexTest::UTextBasic() {
695    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
696    UErrorCode status = U_ZERO_ERROR;
697    UText pattern = UTEXT_INITIALIZER;
698    utext_openUTF8(&pattern, str_abc, -1, &status);
699    RegexMatcher matcher(&pattern, 0, status);
700    REGEX_CHECK_STATUS;
701
702    UText input = UTEXT_INITIALIZER;
703    utext_openUTF8(&input, str_abc, -1, &status);
704    REGEX_CHECK_STATUS;
705    matcher.reset(&input);
706    REGEX_CHECK_STATUS;
707    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
708
709    matcher.reset(matcher.inputText());
710    REGEX_CHECK_STATUS;
711    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
712
713    utext_close(&pattern);
714    utext_close(&input);
715}
716
717
718//---------------------------------------------------------------------------
719//
720//      API_Match   Test that the API for class RegexMatcher
721//                  is present and nominally working, but excluding functions
722//                  implementing replace operations.
723//
724//---------------------------------------------------------------------------
725void RegexTest::API_Match() {
726    UParseError         pe;
727    UErrorCode          status=U_ZERO_ERROR;
728    int32_t             flags = 0;
729
730    //
731    // Debug - slide failing test cases early
732    //
733#if 0
734    {
735    }
736    return;
737#endif
738
739    //
740    // Simple pattern compilation
741    //
742    {
743        UnicodeString       re("abc");
744        RegexPattern        *pat2;
745        pat2 = RegexPattern::compile(re, flags, pe, status);
746        REGEX_CHECK_STATUS;
747
748        UnicodeString inStr1 = "abcdef this is a test";
749        UnicodeString instr2 = "not abc";
750        UnicodeString empty  = "";
751
752
753        //
754        // Matcher creation and reset.
755        //
756        RegexMatcher *m1 = pat2->matcher(inStr1, status);
757        REGEX_CHECK_STATUS;
758        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
759        REGEX_ASSERT(m1->input() == inStr1);
760        m1->reset(instr2);
761        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
762        REGEX_ASSERT(m1->input() == instr2);
763        m1->reset(inStr1);
764        REGEX_ASSERT(m1->input() == inStr1);
765        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
766        m1->reset(empty);
767        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
768        REGEX_ASSERT(m1->input() == empty);
769        REGEX_ASSERT(&m1->pattern() == pat2);
770
771        //
772        //  reset(pos, status)
773        //
774        m1->reset(inStr1);
775        m1->reset(4, status);
776        REGEX_CHECK_STATUS;
777        REGEX_ASSERT(m1->input() == inStr1);
778        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
779
780        m1->reset(-1, status);
781        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
782        status = U_ZERO_ERROR;
783
784        m1->reset(0, status);
785        REGEX_CHECK_STATUS;
786        status = U_ZERO_ERROR;
787
788        int32_t len = m1->input().length();
789        m1->reset(len-1, status);
790        REGEX_CHECK_STATUS;
791        status = U_ZERO_ERROR;
792
793        m1->reset(len, status);
794        REGEX_CHECK_STATUS;
795        status = U_ZERO_ERROR;
796
797        m1->reset(len+1, status);
798        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
799        status = U_ZERO_ERROR;
800
801        //
802        // match(pos, status)
803        //
804        m1->reset(instr2);
805        REGEX_ASSERT(m1->matches(4, status) == TRUE);
806        m1->reset();
807        REGEX_ASSERT(m1->matches(3, status) == FALSE);
808        m1->reset();
809        REGEX_ASSERT(m1->matches(5, status) == FALSE);
810        REGEX_ASSERT(m1->matches(4, status) == TRUE);
811        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
812        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
813
814        // Match() at end of string should fail, but should not
815        //  be an error.
816        status = U_ZERO_ERROR;
817        len = m1->input().length();
818        REGEX_ASSERT(m1->matches(len, status) == FALSE);
819        REGEX_CHECK_STATUS;
820
821        // Match beyond end of string should fail with an error.
822        status = U_ZERO_ERROR;
823        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
824        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
825
826        // Successful match at end of string.
827        {
828            status = U_ZERO_ERROR;
829            RegexMatcher m("A?", 0, status);  // will match zero length string.
830            REGEX_CHECK_STATUS;
831            m.reset(inStr1);
832            len = inStr1.length();
833            REGEX_ASSERT(m.matches(len, status) == TRUE);
834            REGEX_CHECK_STATUS;
835            m.reset(empty);
836            REGEX_ASSERT(m.matches(0, status) == TRUE);
837            REGEX_CHECK_STATUS;
838        }
839
840
841        //
842        // lookingAt(pos, status)
843        //
844        status = U_ZERO_ERROR;
845        m1->reset(instr2);  // "not abc"
846        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
847        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
848        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
849        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
850        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
851        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
852        status = U_ZERO_ERROR;
853        len = m1->input().length();
854        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
855        REGEX_CHECK_STATUS;
856        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
857        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
858
859        delete m1;
860        delete pat2;
861    }
862
863
864    //
865    // Capture Group.
866    //     RegexMatcher::start();
867    //     RegexMatcher::end();
868    //     RegexMatcher::groupCount();
869    //
870    {
871        int32_t             flags=0;
872        UParseError         pe;
873        UErrorCode          status=U_ZERO_ERROR;
874
875        UnicodeString       re("01(23(45)67)(.*)");
876        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
877        REGEX_CHECK_STATUS;
878        UnicodeString data = "0123456789";
879
880        RegexMatcher *matcher = pat->matcher(data, status);
881        REGEX_CHECK_STATUS;
882        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
883        static const int32_t matchStarts[] = {0,  2, 4, 8};
884        static const int32_t matchEnds[]   = {10, 8, 6, 10};
885        int32_t i;
886        for (i=0; i<4; i++) {
887            int32_t actualStart = matcher->start(i, status);
888            REGEX_CHECK_STATUS;
889            if (actualStart != matchStarts[i]) {
890                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
891                    __LINE__, i, matchStarts[i], actualStart);
892            }
893            int32_t actualEnd = matcher->end(i, status);
894            REGEX_CHECK_STATUS;
895            if (actualEnd != matchEnds[i]) {
896                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
897                    __LINE__, i, matchEnds[i], actualEnd);
898            }
899        }
900
901        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
902        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
903
904        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
905        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
906        matcher->reset();
907        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
908
909        matcher->lookingAt(status);
910        REGEX_ASSERT(matcher->group(status)    == "0123456789");
911        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
912        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
913        REGEX_ASSERT(matcher->group(2, status) == "45"        );
914        REGEX_ASSERT(matcher->group(3, status) == "89"        );
915        REGEX_CHECK_STATUS;
916        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
917        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
918        matcher->reset();
919        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
920
921        delete matcher;
922        delete pat;
923
924    }
925
926    //
927    //  find
928    //
929    {
930        int32_t             flags=0;
931        UParseError         pe;
932        UErrorCode          status=U_ZERO_ERROR;
933
934        UnicodeString       re("abc");
935        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
936        REGEX_CHECK_STATUS;
937        UnicodeString data = ".abc..abc...abc..";
938        //                    012345678901234567
939
940        RegexMatcher *matcher = pat->matcher(data, status);
941        REGEX_CHECK_STATUS;
942        REGEX_ASSERT(matcher->find());
943        REGEX_ASSERT(matcher->start(status) == 1);
944        REGEX_ASSERT(matcher->find());
945        REGEX_ASSERT(matcher->start(status) == 6);
946        REGEX_ASSERT(matcher->find());
947        REGEX_ASSERT(matcher->start(status) == 12);
948        REGEX_ASSERT(matcher->find() == FALSE);
949        REGEX_ASSERT(matcher->find() == FALSE);
950
951        matcher->reset();
952        REGEX_ASSERT(matcher->find());
953        REGEX_ASSERT(matcher->start(status) == 1);
954
955        REGEX_ASSERT(matcher->find(0, status));
956        REGEX_ASSERT(matcher->start(status) == 1);
957        REGEX_ASSERT(matcher->find(1, status));
958        REGEX_ASSERT(matcher->start(status) == 1);
959        REGEX_ASSERT(matcher->find(2, status));
960        REGEX_ASSERT(matcher->start(status) == 6);
961        REGEX_ASSERT(matcher->find(12, status));
962        REGEX_ASSERT(matcher->start(status) == 12);
963        REGEX_ASSERT(matcher->find(13, status) == FALSE);
964        REGEX_ASSERT(matcher->find(16, status) == FALSE);
965        REGEX_ASSERT(matcher->find(17, status) == FALSE);
966        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
967
968        status = U_ZERO_ERROR;
969        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
970        status = U_ZERO_ERROR;
971        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
972
973        REGEX_ASSERT(matcher->groupCount() == 0);
974
975        delete matcher;
976        delete pat;
977    }
978
979
980    //
981    //  find, with \G in pattern (true if at the end of a previous match).
982    //
983    {
984        int32_t             flags=0;
985        UParseError         pe;
986        UErrorCode          status=U_ZERO_ERROR;
987
988        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
989        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
990        REGEX_CHECK_STATUS;
991        UnicodeString data = ".abcabc.abc..";
992        //                    012345678901234567
993
994        RegexMatcher *matcher = pat->matcher(data, status);
995        REGEX_CHECK_STATUS;
996        REGEX_ASSERT(matcher->find());
997        REGEX_ASSERT(matcher->start(status) == 0);
998        REGEX_ASSERT(matcher->start(1, status) == -1);
999        REGEX_ASSERT(matcher->start(2, status) == 1);
1000
1001        REGEX_ASSERT(matcher->find());
1002        REGEX_ASSERT(matcher->start(status) == 4);
1003        REGEX_ASSERT(matcher->start(1, status) == 4);
1004        REGEX_ASSERT(matcher->start(2, status) == -1);
1005        REGEX_CHECK_STATUS;
1006
1007        delete matcher;
1008        delete pat;
1009    }
1010
1011    //
1012    //   find with zero length matches, match position should bump ahead
1013    //     to prevent loops.
1014    //
1015    {
1016        int32_t                 i;
1017        UErrorCode          status=U_ZERO_ERROR;
1018        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1019                                                      //   using an always-true look-ahead.
1020        REGEX_CHECK_STATUS;
1021        UnicodeString s("    ");
1022        m.reset(s);
1023        for (i=0; ; i++) {
1024            if (m.find() == FALSE) {
1025                break;
1026            }
1027            REGEX_ASSERT(m.start(status) == i);
1028            REGEX_ASSERT(m.end(status) == i);
1029        }
1030        REGEX_ASSERT(i==5);
1031
1032        // Check that the bump goes over surrogate pairs OK
1033        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1034        s = s.unescape();
1035        m.reset(s);
1036        for (i=0; ; i+=2) {
1037            if (m.find() == FALSE) {
1038                break;
1039            }
1040            REGEX_ASSERT(m.start(status) == i);
1041            REGEX_ASSERT(m.end(status) == i);
1042        }
1043        REGEX_ASSERT(i==10);
1044    }
1045    {
1046        // find() loop breaking test.
1047        //        with pattern of /.?/, should see a series of one char matches, then a single
1048        //        match of zero length at the end of the input string.
1049        int32_t                 i;
1050        UErrorCode          status=U_ZERO_ERROR;
1051        RegexMatcher        m(".?", 0, status);
1052        REGEX_CHECK_STATUS;
1053        UnicodeString s("    ");
1054        m.reset(s);
1055        for (i=0; ; i++) {
1056            if (m.find() == FALSE) {
1057                break;
1058            }
1059            REGEX_ASSERT(m.start(status) == i);
1060            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1061        }
1062        REGEX_ASSERT(i==5);
1063    }
1064
1065
1066    //
1067    // Matchers with no input string behave as if they had an empty input string.
1068    //
1069
1070    {
1071        UErrorCode status = U_ZERO_ERROR;
1072        RegexMatcher  m(".?", 0, status);
1073        REGEX_CHECK_STATUS;
1074        REGEX_ASSERT(m.find());
1075        REGEX_ASSERT(m.start(status) == 0);
1076        REGEX_ASSERT(m.input() == "");
1077    }
1078    {
1079        UErrorCode status = U_ZERO_ERROR;
1080        RegexPattern  *p = RegexPattern::compile(".", 0, status);
1081        RegexMatcher  *m = p->matcher(status);
1082        REGEX_CHECK_STATUS;
1083
1084        REGEX_ASSERT(m->find() == FALSE);
1085        REGEX_ASSERT(m->input() == "");
1086        delete m;
1087        delete p;
1088    }
1089
1090    //
1091    // Regions
1092    //
1093    {
1094        UErrorCode status = U_ZERO_ERROR;
1095        UnicodeString testString("This is test data");
1096        RegexMatcher m(".*", testString,  0, status);
1097        REGEX_CHECK_STATUS;
1098        REGEX_ASSERT(m.regionStart() == 0);
1099        REGEX_ASSERT(m.regionEnd() == testString.length());
1100        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1101        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1102
1103        m.region(2,4, status);
1104        REGEX_CHECK_STATUS;
1105        REGEX_ASSERT(m.matches(status));
1106        REGEX_ASSERT(m.start(status)==2);
1107        REGEX_ASSERT(m.end(status)==4);
1108        REGEX_CHECK_STATUS;
1109
1110        m.reset();
1111        REGEX_ASSERT(m.regionStart() == 0);
1112        REGEX_ASSERT(m.regionEnd() == testString.length());
1113
1114        UnicodeString shorterString("short");
1115        m.reset(shorterString);
1116        REGEX_ASSERT(m.regionStart() == 0);
1117        REGEX_ASSERT(m.regionEnd() == shorterString.length());
1118
1119        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1120        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1121        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1122        REGEX_ASSERT(&m == &m.reset());
1123        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1124
1125        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1126        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1127        REGEX_ASSERT(&m == &m.reset());
1128        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1129
1130        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1131        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1132        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1133        REGEX_ASSERT(&m == &m.reset());
1134        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1135
1136        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1137        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1138        REGEX_ASSERT(&m == &m.reset());
1139        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1140
1141    }
1142
1143    //
1144    // hitEnd() and requireEnd()
1145    //
1146    {
1147        UErrorCode status = U_ZERO_ERROR;
1148        UnicodeString testString("aabb");
1149        RegexMatcher m1(".*", testString,  0, status);
1150        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1151        REGEX_ASSERT(m1.hitEnd() == TRUE);
1152        REGEX_ASSERT(m1.requireEnd() == FALSE);
1153        REGEX_CHECK_STATUS;
1154
1155        status = U_ZERO_ERROR;
1156        RegexMatcher m2("a*", testString, 0, status);
1157        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1158        REGEX_ASSERT(m2.hitEnd() == FALSE);
1159        REGEX_ASSERT(m2.requireEnd() == FALSE);
1160        REGEX_CHECK_STATUS;
1161
1162        status = U_ZERO_ERROR;
1163        RegexMatcher m3(".*$", testString, 0, status);
1164        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1165        REGEX_ASSERT(m3.hitEnd() == TRUE);
1166        REGEX_ASSERT(m3.requireEnd() == TRUE);
1167        REGEX_CHECK_STATUS;
1168    }
1169
1170
1171    //
1172    // Compilation error on reset with UChar *
1173    //   These were a hazard that people were stumbling over with runtime errors.
1174    //   Changed them to compiler errors by adding private methods that more closely
1175    //   matched the incorrect use of the functions.
1176    //
1177#if 0
1178    {
1179        UErrorCode status = U_ZERO_ERROR;
1180        UChar ucharString[20];
1181        RegexMatcher m(".", 0, status);
1182        m.reset(ucharString);  // should not compile.
1183
1184        RegexPattern *p = RegexPattern::compile(".", 0, status);
1185        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1186
1187        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1188    }
1189#endif
1190
1191    //
1192    //  Time Outs.
1193    //       Note:  These tests will need to be changed when the regexp engine is
1194    //              able to detect and cut short the exponential time behavior on
1195    //              this type of match.
1196    //
1197    {
1198        UErrorCode status = U_ZERO_ERROR;
1199        //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1201        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1202        RegexMatcher matcher("(a+)+b", testString, 0, status);
1203        REGEX_CHECK_STATUS;
1204        REGEX_ASSERT(matcher.getTimeLimit() == 0);
1205        matcher.setTimeLimit(100, status);
1206        REGEX_ASSERT(matcher.getTimeLimit() == 100);
1207        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1208        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1209    }
1210    {
1211        UErrorCode status = U_ZERO_ERROR;
1212        //   Few enough 'a's to slip in under the time limit.
1213        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1214        RegexMatcher matcher("(a+)+b", testString, 0, status);
1215        REGEX_CHECK_STATUS;
1216        matcher.setTimeLimit(100, status);
1217        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1218        REGEX_CHECK_STATUS;
1219    }
1220
1221    //
1222    //  Stack Limits
1223    //
1224    {
1225        UErrorCode status = U_ZERO_ERROR;
1226        UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1227
1228        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1229        //   of the '+', and makes the stack frames larger.
1230        RegexMatcher matcher("(A)+A$", testString, 0, status);
1231
1232        // With the default stack, this match should fail to run
1233        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1234        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1235
1236        // With unlimited stack, it should run
1237        status = U_ZERO_ERROR;
1238        matcher.setStackLimit(0, status);
1239        REGEX_CHECK_STATUS;
1240        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1241        REGEX_CHECK_STATUS;
1242        REGEX_ASSERT(matcher.getStackLimit() == 0);
1243
        // With a limited stack, the match should fail
1245        status = U_ZERO_ERROR;
1246        matcher.setStackLimit(10000, status);
1247        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1248        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1249        REGEX_ASSERT(matcher.getStackLimit() == 10000);
1250    }
1251
1252        // A pattern that doesn't save state should work with
1253        //   a minimal sized stack
1254    {
1255        UErrorCode status = U_ZERO_ERROR;
1256        UnicodeString testString = "abc";
1257        RegexMatcher matcher("abc", testString, 0, status);
1258        REGEX_CHECK_STATUS;
1259        matcher.setStackLimit(30, status);
1260        REGEX_CHECK_STATUS;
1261        REGEX_ASSERT(matcher.matches(status) == TRUE);
1262        REGEX_CHECK_STATUS;
1263        REGEX_ASSERT(matcher.getStackLimit() == 30);
1264
1265        // Negative stack sizes should fail
1266        status = U_ZERO_ERROR;
1267        matcher.setStackLimit(1000, status);
1268        REGEX_CHECK_STATUS;
1269        matcher.setStackLimit(-1, status);
1270        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1271        REGEX_ASSERT(matcher.getStackLimit() == 1000);
1272    }
1273
1274
1275}
1276
1277
1278
1279
1280
1281
1282//---------------------------------------------------------------------------
1283//
1284//      API_Replace        API test for class RegexMatcher, testing the
1285//                         Replace family of functions.
1286//
1287//---------------------------------------------------------------------------
1288void RegexTest::API_Replace() {
1289    //
1290    //  Replace
1291    //
1292    int32_t             flags=0;
1293    UParseError         pe;
1294    UErrorCode          status=U_ZERO_ERROR;
1295
1296    UnicodeString       re("abc");
1297    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1298    REGEX_CHECK_STATUS;
1299    UnicodeString data = ".abc..abc...abc..";
1300    //                    012345678901234567
1301    RegexMatcher *matcher = pat->matcher(data, status);
1302
1303    //
1304    //  Plain vanilla matches.
1305    //
1306    UnicodeString  dest;
1307    dest = matcher->replaceFirst("yz", status);
1308    REGEX_CHECK_STATUS;
1309    REGEX_ASSERT(dest == ".yz..abc...abc..");
1310
1311    dest = matcher->replaceAll("yz", status);
1312    REGEX_CHECK_STATUS;
1313    REGEX_ASSERT(dest == ".yz..yz...yz..");
1314
1315    //
1316    //  Plain vanilla non-matches.
1317    //
1318    UnicodeString d2 = ".abx..abx...abx..";
1319    matcher->reset(d2);
1320    dest = matcher->replaceFirst("yz", status);
1321    REGEX_CHECK_STATUS;
1322    REGEX_ASSERT(dest == ".abx..abx...abx..");
1323
1324    dest = matcher->replaceAll("yz", status);
1325    REGEX_CHECK_STATUS;
1326    REGEX_ASSERT(dest == ".abx..abx...abx..");
1327
1328    //
1329    // Empty source string
1330    //
1331    UnicodeString d3 = "";
1332    matcher->reset(d3);
1333    dest = matcher->replaceFirst("yz", status);
1334    REGEX_CHECK_STATUS;
1335    REGEX_ASSERT(dest == "");
1336
1337    dest = matcher->replaceAll("yz", status);
1338    REGEX_CHECK_STATUS;
1339    REGEX_ASSERT(dest == "");
1340
1341    //
1342    // Empty substitution string
1343    //
1344    matcher->reset(data);              // ".abc..abc...abc.."
1345    dest = matcher->replaceFirst("", status);
1346    REGEX_CHECK_STATUS;
1347    REGEX_ASSERT(dest == "...abc...abc..");
1348
1349    dest = matcher->replaceAll("", status);
1350    REGEX_CHECK_STATUS;
1351    REGEX_ASSERT(dest == "........");
1352
1353    //
1354    // match whole string
1355    //
1356    UnicodeString d4 = "abc";
1357    matcher->reset(d4);
1358    dest = matcher->replaceFirst("xyz", status);
1359    REGEX_CHECK_STATUS;
1360    REGEX_ASSERT(dest == "xyz");
1361
1362    dest = matcher->replaceAll("xyz", status);
1363    REGEX_CHECK_STATUS;
1364    REGEX_ASSERT(dest == "xyz");
1365
1366    //
1367    // Capture Group, simple case
1368    //
1369    UnicodeString       re2("a(..)");
1370    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1371    REGEX_CHECK_STATUS;
1372    UnicodeString d5 = "abcdefg";
1373    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1374    REGEX_CHECK_STATUS;
1375    dest = matcher2->replaceFirst("$1$1", status);
1376    REGEX_CHECK_STATUS;
1377    REGEX_ASSERT(dest == "bcbcdefg");
1378
1379    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1380    REGEX_CHECK_STATUS;
1381    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1382
1383    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1384    REGEX_ASSERT(U_FAILURE(status));
1385    status = U_ZERO_ERROR;
1386
1387    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1388    replacement = replacement.unescape();
1389    dest = matcher2->replaceFirst(replacement, status);
1390    REGEX_CHECK_STATUS;
1391    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1392
1393    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1394
1395
1396    //
1397    // Replacement String with \u hex escapes
1398    //
1399    {
1400        UnicodeString  src = "abc 1 abc 2 abc 3";
1401        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1402        matcher->reset(src);
1403        UnicodeString  result = matcher->replaceAll(substitute, status);
1404        REGEX_CHECK_STATUS;
1405        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1406    }
1407    {
1408        UnicodeString  src = "abc !";
1409        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1410        matcher->reset(src);
1411        UnicodeString  result = matcher->replaceAll(substitute, status);
1412        REGEX_CHECK_STATUS;
1413        UnicodeString expected = UnicodeString("--");
1414        expected.append((UChar32)0x10000);
1415        expected.append("-- !");
1416        REGEX_ASSERT(result == expected);
1417    }
1418    // TODO:  need more through testing of capture substitutions.
1419
1420    // Bug 4057
1421    //
1422    {
1423        status = U_ZERO_ERROR;
1424        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1425        RegexMatcher m("ss(.*?)ee", 0, status);
1426        REGEX_CHECK_STATUS;
1427        UnicodeString result;
1428
1429        // Multiple finds do NOT bump up the previous appendReplacement postion.
1430        m.reset(s);
1431        m.find();
1432        m.find();
1433        m.appendReplacement(result, "ooh", status);
1434        REGEX_CHECK_STATUS;
1435        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1436
1437        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1438        status = U_ZERO_ERROR;
1439        result.truncate(0);
1440        m.reset(10, status);
1441        m.find();
1442        m.find();
1443        m.appendReplacement(result, "ooh", status);
1444        REGEX_CHECK_STATUS;
1445        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1446
1447        // find() at interior of string, appendReplacemnt still starts at beginning.
1448        status = U_ZERO_ERROR;
1449        result.truncate(0);
1450        m.reset();
1451        m.find(10, status);
1452        m.find();
1453        m.appendReplacement(result, "ooh", status);
1454        REGEX_CHECK_STATUS;
1455        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1456
1457        m.appendTail(result);
1458        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1459
1460    }
1461
1462    delete matcher2;
1463    delete pat2;
1464    delete matcher;
1465    delete pat;
1466}
1467
1468
1469//---------------------------------------------------------------------------
1470//
1471//      API_Pattern       Test that the API for class RegexPattern is
1472//                        present and nominally working.
1473//
1474//---------------------------------------------------------------------------
1475void RegexTest::API_Pattern() {
1476    RegexPattern        pata;    // Test default constructor to not crash.
1477    RegexPattern        patb;
1478
1479    REGEX_ASSERT(pata == patb);
1480    REGEX_ASSERT(pata == pata);
1481
1482    UnicodeString re1("abc[a-l][m-z]");
1483    UnicodeString re2("def");
1484    UErrorCode    status = U_ZERO_ERROR;
1485    UParseError   pe;
1486
1487    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1488    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1489    REGEX_CHECK_STATUS;
1490    REGEX_ASSERT(*pat1 == *pat1);
1491    REGEX_ASSERT(*pat1 != pata);
1492
1493    // Assign
1494    patb = *pat1;
1495    REGEX_ASSERT(patb == *pat1);
1496
1497    // Copy Construct
1498    RegexPattern patc(*pat1);
1499    REGEX_ASSERT(patc == *pat1);
1500    REGEX_ASSERT(patb == patc);
1501    REGEX_ASSERT(pat1 != pat2);
1502    patb = *pat2;
1503    REGEX_ASSERT(patb != patc);
1504    REGEX_ASSERT(patb == *pat2);
1505
1506    // Compile with no flags.
1507    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1508    REGEX_ASSERT(*pat1a == *pat1);
1509
1510    REGEX_ASSERT(pat1a->flags() == 0);
1511
1512    // Compile with different flags should be not equal
1513    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1514    REGEX_CHECK_STATUS;
1515
1516    REGEX_ASSERT(*pat1b != *pat1a);
1517    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1518    REGEX_ASSERT(pat1a->flags() == 0);
1519    delete pat1b;
1520
1521    // clone
1522    RegexPattern *pat1c = pat1->clone();
1523    REGEX_ASSERT(*pat1c == *pat1);
1524    REGEX_ASSERT(*pat1c != *pat2);
1525
1526    delete pat1c;
1527    delete pat1a;
1528    delete pat1;
1529    delete pat2;
1530
1531
1532    //
1533    //   Verify that a matcher created from a cloned pattern works.
1534    //     (Jitterbug 3423)
1535    //
1536    {
1537        UErrorCode     status     = U_ZERO_ERROR;
1538        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1539        RegexPattern  *pClone     = pSource->clone();
1540        delete         pSource;
1541        RegexMatcher  *mFromClone = pClone->matcher(status);
1542        REGEX_CHECK_STATUS;
1543        UnicodeString s = "Hello World";
1544        mFromClone->reset(s);
1545        REGEX_ASSERT(mFromClone->find() == TRUE);
1546        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1547        REGEX_ASSERT(mFromClone->find() == TRUE);
1548        REGEX_ASSERT(mFromClone->group(status) == "World");
1549        REGEX_ASSERT(mFromClone->find() == FALSE);
1550        delete mFromClone;
1551        delete pClone;
1552    }
1553
1554    //
1555    //   matches convenience API
1556    //
1557    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1558    REGEX_CHECK_STATUS;
1559    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1560    REGEX_CHECK_STATUS;
1561    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1562    REGEX_CHECK_STATUS;
1563    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1564    REGEX_CHECK_STATUS;
1565    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1566    REGEX_CHECK_STATUS;
1567    status = U_INDEX_OUTOFBOUNDS_ERROR;
1568    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1569    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1570
1571
1572    //
1573    // Split()
1574    //
1575    status = U_ZERO_ERROR;
1576    pat1 = RegexPattern::compile(" +",  pe, status);
1577    REGEX_CHECK_STATUS;
1578    UnicodeString  fields[10];
1579
1580    int32_t n;
1581    n = pat1->split("Now is the time", fields, 10, status);
1582    REGEX_CHECK_STATUS;
1583    REGEX_ASSERT(n==4);
1584    REGEX_ASSERT(fields[0]=="Now");
1585    REGEX_ASSERT(fields[1]=="is");
1586    REGEX_ASSERT(fields[2]=="the");
1587    REGEX_ASSERT(fields[3]=="time");
1588    REGEX_ASSERT(fields[4]=="");
1589
1590    n = pat1->split("Now is the time", fields, 2, status);
1591    REGEX_CHECK_STATUS;
1592    REGEX_ASSERT(n==2);
1593    REGEX_ASSERT(fields[0]=="Now");
1594    REGEX_ASSERT(fields[1]=="is the time");
1595    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1596
1597    fields[1] = "*";
1598    status = U_ZERO_ERROR;
1599    n = pat1->split("Now is the time", fields, 1, status);
1600    REGEX_CHECK_STATUS;
1601    REGEX_ASSERT(n==1);
1602    REGEX_ASSERT(fields[0]=="Now is the time");
1603    REGEX_ASSERT(fields[1]=="*");
1604    status = U_ZERO_ERROR;
1605
1606    n = pat1->split("    Now       is the time   ", fields, 10, status);
1607    REGEX_CHECK_STATUS;
1608    REGEX_ASSERT(n==6);
1609    REGEX_ASSERT(fields[0]=="");
1610    REGEX_ASSERT(fields[1]=="Now");
1611    REGEX_ASSERT(fields[2]=="is");
1612    REGEX_ASSERT(fields[3]=="the");
1613    REGEX_ASSERT(fields[4]=="time");
1614    REGEX_ASSERT(fields[5]=="");
1615
1616    n = pat1->split("     ", fields, 10, status);
1617    REGEX_CHECK_STATUS;
1618    REGEX_ASSERT(n==2);
1619    REGEX_ASSERT(fields[0]=="");
1620    REGEX_ASSERT(fields[1]=="");
1621
1622    fields[0] = "foo";
1623    n = pat1->split("", fields, 10, status);
1624    REGEX_CHECK_STATUS;
1625    REGEX_ASSERT(n==0);
1626    REGEX_ASSERT(fields[0]=="foo");
1627
1628    delete pat1;
1629
1630    //  split, with a pattern with (capture)
1631    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1632    REGEX_CHECK_STATUS;
1633
1634    status = U_ZERO_ERROR;
1635    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1636    REGEX_CHECK_STATUS;
1637    REGEX_ASSERT(n==7);
1638    REGEX_ASSERT(fields[0]=="");
1639    REGEX_ASSERT(fields[1]=="a");
1640    REGEX_ASSERT(fields[2]=="Now is ");
1641    REGEX_ASSERT(fields[3]=="b");
1642    REGEX_ASSERT(fields[4]=="the time");
1643    REGEX_ASSERT(fields[5]=="c");
1644    REGEX_ASSERT(fields[6]=="");
1645    REGEX_ASSERT(status==U_ZERO_ERROR);
1646
1647    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1648    REGEX_CHECK_STATUS;
1649    REGEX_ASSERT(n==7);
1650    REGEX_ASSERT(fields[0]=="  ");
1651    REGEX_ASSERT(fields[1]=="a");
1652    REGEX_ASSERT(fields[2]=="Now is ");
1653    REGEX_ASSERT(fields[3]=="b");
1654    REGEX_ASSERT(fields[4]=="the time");
1655    REGEX_ASSERT(fields[5]=="c");
1656    REGEX_ASSERT(fields[6]=="");
1657
1658    status = U_ZERO_ERROR;
1659    fields[6] = "foo";
1660    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1661    REGEX_CHECK_STATUS;
1662    REGEX_ASSERT(n==6);
1663    REGEX_ASSERT(fields[0]=="  ");
1664    REGEX_ASSERT(fields[1]=="a");
1665    REGEX_ASSERT(fields[2]=="Now is ");
1666    REGEX_ASSERT(fields[3]=="b");
1667    REGEX_ASSERT(fields[4]=="the time");
1668    REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1669    REGEX_ASSERT(fields[6]=="foo");
1670
1671    status = U_ZERO_ERROR;
1672    fields[5] = "foo";
1673    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1674    REGEX_CHECK_STATUS;
1675    REGEX_ASSERT(n==5);
1676    REGEX_ASSERT(fields[0]=="  ");
1677    REGEX_ASSERT(fields[1]=="a");
1678    REGEX_ASSERT(fields[2]=="Now is ");
1679    REGEX_ASSERT(fields[3]=="b");
1680    REGEX_ASSERT(fields[4]=="the time<c>");
1681    REGEX_ASSERT(fields[5]=="foo");
1682
1683    status = U_ZERO_ERROR;
1684    fields[5] = "foo";
1685    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1686    REGEX_CHECK_STATUS;
1687    REGEX_ASSERT(n==5);
1688    REGEX_ASSERT(fields[0]=="  ");
1689    REGEX_ASSERT(fields[1]=="a");
1690    REGEX_ASSERT(fields[2]=="Now is ");
1691    REGEX_ASSERT(fields[3]=="b");
1692    REGEX_ASSERT(fields[4]=="the time");
1693    REGEX_ASSERT(fields[5]=="foo");
1694
1695    status = U_ZERO_ERROR;
1696    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1697    REGEX_CHECK_STATUS;
1698    REGEX_ASSERT(n==4);
1699    REGEX_ASSERT(fields[0]=="  ");
1700    REGEX_ASSERT(fields[1]=="a");
1701    REGEX_ASSERT(fields[2]=="Now is ");
1702    REGEX_ASSERT(fields[3]=="the time<c>");
1703    status = U_ZERO_ERROR;
1704    delete pat1;
1705
1706    pat1 = RegexPattern::compile("([-,])",  pe, status);
1707    REGEX_CHECK_STATUS;
1708    n = pat1->split("1-10,20", fields, 10, status);
1709    REGEX_CHECK_STATUS;
1710    REGEX_ASSERT(n==5);
1711    REGEX_ASSERT(fields[0]=="1");
1712    REGEX_ASSERT(fields[1]=="-");
1713    REGEX_ASSERT(fields[2]=="10");
1714    REGEX_ASSERT(fields[3]==",");
1715    REGEX_ASSERT(fields[4]=="20");
1716    delete pat1;
1717
1718    // Test split of string with empty trailing fields
1719    pat1 = RegexPattern::compile(",", pe, status);
1720    REGEX_CHECK_STATUS;
1721    n = pat1->split("a,b,c,", fields, 10, status);
1722    REGEX_CHECK_STATUS;
1723    REGEX_ASSERT(n==4);
1724    REGEX_ASSERT(fields[0]=="a");
1725    REGEX_ASSERT(fields[1]=="b");
1726    REGEX_ASSERT(fields[2]=="c");
1727    REGEX_ASSERT(fields[3]=="");
1728
1729    n = pat1->split("a,,,", fields, 10, status);
1730    REGEX_CHECK_STATUS;
1731    REGEX_ASSERT(n==4);
1732    REGEX_ASSERT(fields[0]=="a");
1733    REGEX_ASSERT(fields[1]=="");
1734    REGEX_ASSERT(fields[2]=="");
1735    REGEX_ASSERT(fields[3]=="");
1736    delete pat1;
1737
1738    // Split Separator with zero length match.
1739    pat1 = RegexPattern::compile(":?", pe, status);
1740    REGEX_CHECK_STATUS;
1741    n = pat1->split("abc", fields, 10, status);
1742    REGEX_CHECK_STATUS;
1743    REGEX_ASSERT(n==5);
1744    REGEX_ASSERT(fields[0]=="");
1745    REGEX_ASSERT(fields[1]=="a");
1746    REGEX_ASSERT(fields[2]=="b");
1747    REGEX_ASSERT(fields[3]=="c");
1748    REGEX_ASSERT(fields[4]=="");
1749
1750    delete pat1;
1751
1752    //
1753    // RegexPattern::pattern()
1754    //
1755    pat1 = new RegexPattern();
1756    REGEX_ASSERT(pat1->pattern() == "");
1757    delete pat1;
1758
1759    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1760    REGEX_CHECK_STATUS;
1761    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1762    delete pat1;
1763
1764
1765    //
1766    // classID functions
1767    //
1768    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1769    REGEX_CHECK_STATUS;
1770    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1771    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1772    UnicodeString Hello("Hello, world.");
1773    RegexMatcher *m = pat1->matcher(Hello, status);
1774    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1775    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1776    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1777    delete m;
1778    delete pat1;
1779
1780}
1781
1782//---------------------------------------------------------------------------
1783//
1784//      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1785//                       is present and working, but excluding functions
1786//                       implementing replace operations.
1787//
1788//---------------------------------------------------------------------------
1789void RegexTest::API_Match_UTF8() {
1790    UParseError         pe;
1791    UErrorCode          status=U_ZERO_ERROR;
1792    int32_t             flags = 0;
1793
1794    //
1795    // Debug - slide failing test cases early
1796    //
1797#if 0
1798    {
1799    }
1800    return;
1801#endif
1802
1803    //
1804    // Simple pattern compilation
1805    //
1806    {
1807        UText               re = UTEXT_INITIALIZER;
1808        regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1809        REGEX_VERBOSE_TEXT(&re);
1810        RegexPattern        *pat2;
1811        pat2 = RegexPattern::compile(&re, flags, pe, status);
1812        REGEX_CHECK_STATUS;
1813
1814        UText input1 = UTEXT_INITIALIZER;
1815        UText input2 = UTEXT_INITIALIZER;
1816        UText empty  = UTEXT_INITIALIZER;
1817        regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1818        REGEX_VERBOSE_TEXT(&input1);
1819        regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1820        REGEX_VERBOSE_TEXT(&input2);
1821        utext_openUChars(&empty, NULL, 0, &status);
1822
1823        int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1824        int32_t input2Len = strlen("not abc");
1825
1826
1827        //
1828        // Matcher creation and reset.
1829        //
1830        RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1831        REGEX_CHECK_STATUS;
1832        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1833        const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1834        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1835        m1->reset(&input2);
1836        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1837        const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1838        REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1839        m1->reset(&input1);
1840        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1841        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1842        m1->reset(&empty);
1843        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1844        REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1845
1846        //
1847        //  reset(pos, status)
1848        //
1849        m1->reset(&input1);
1850        m1->reset(4, status);
1851        REGEX_CHECK_STATUS;
1852        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1853        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1854
1855        m1->reset(-1, status);
1856        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1857        status = U_ZERO_ERROR;
1858
1859        m1->reset(0, status);
1860        REGEX_CHECK_STATUS;
1861        status = U_ZERO_ERROR;
1862
1863        m1->reset(input1Len-1, status);
1864        REGEX_CHECK_STATUS;
1865        status = U_ZERO_ERROR;
1866
1867        m1->reset(input1Len, status);
1868        REGEX_CHECK_STATUS;
1869        status = U_ZERO_ERROR;
1870
1871        m1->reset(input1Len+1, status);
1872        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1873        status = U_ZERO_ERROR;
1874
1875        //
1876        // match(pos, status)
1877        //
1878        m1->reset(&input2);
1879        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1880        m1->reset();
1881        REGEX_ASSERT(m1->matches(3, status) == FALSE);
1882        m1->reset();
1883        REGEX_ASSERT(m1->matches(5, status) == FALSE);
1884        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1885        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1886        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1887
1888        // Match() at end of string should fail, but should not
1889        //  be an error.
1890        status = U_ZERO_ERROR;
1891        REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1892        REGEX_CHECK_STATUS;
1893
1894        // Match beyond end of string should fail with an error.
1895        status = U_ZERO_ERROR;
1896        REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1897        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1898
1899        // Successful match at end of string.
1900        {
1901            status = U_ZERO_ERROR;
1902            RegexMatcher m("A?", 0, status);  // will match zero length string.
1903            REGEX_CHECK_STATUS;
1904            m.reset(&input1);
1905            REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1906            REGEX_CHECK_STATUS;
1907            m.reset(&empty);
1908            REGEX_ASSERT(m.matches(0, status) == TRUE);
1909            REGEX_CHECK_STATUS;
1910        }
1911
1912
1913        //
1914        // lookingAt(pos, status)
1915        //
1916        status = U_ZERO_ERROR;
1917        m1->reset(&input2);  // "not abc"
1918        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1919        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1920        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1921        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1922        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1923        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1924        status = U_ZERO_ERROR;
1925        REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1926        REGEX_CHECK_STATUS;
1927        REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1928        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1929
1930        delete m1;
1931        delete pat2;
1932
1933        utext_close(&re);
1934        utext_close(&input1);
1935        utext_close(&input2);
1936        utext_close(&empty);
1937    }
1938
1939
1940    //
1941    // Capture Group.
1942    //     RegexMatcher::start();
1943    //     RegexMatcher::end();
1944    //     RegexMatcher::groupCount();
1945    //
1946    {
1947        int32_t             flags=0;
1948        UParseError         pe;
1949        UErrorCode          status=U_ZERO_ERROR;
1950        UText               re=UTEXT_INITIALIZER;
1951        const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1952        utext_openUTF8(&re, str_01234567_pat, -1, &status);
1953
1954        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1955        REGEX_CHECK_STATUS;
1956
1957        UText input = UTEXT_INITIALIZER;
1958        const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1959        utext_openUTF8(&input, str_0123456789, -1, &status);
1960
1961        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1962        REGEX_CHECK_STATUS;
1963        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1964        static const int32_t matchStarts[] = {0,  2, 4, 8};
1965        static const int32_t matchEnds[]   = {10, 8, 6, 10};
1966        int32_t i;
1967        for (i=0; i<4; i++) {
1968            int32_t actualStart = matcher->start(i, status);
1969            REGEX_CHECK_STATUS;
1970            if (actualStart != matchStarts[i]) {
1971                errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
1972                      __FILE__, __LINE__, i, matchStarts[i], actualStart);
1973            }
1974            int32_t actualEnd = matcher->end(i, status);
1975            REGEX_CHECK_STATUS;
1976            if (actualEnd != matchEnds[i]) {
1977                errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
1978                      __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1979            }
1980        }
1981
1982        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1983        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1984
1985        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1986        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1987        matcher->reset();
1988        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1989
1990        matcher->lookingAt(status);
1991
1992        UnicodeString dest;
1993        UText destText = UTEXT_INITIALIZER;
1994        utext_openUnicodeString(&destText, &dest, &status);
1995        UText *result;
1996        //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1997        //  Test shallow-clone API
1998        int64_t   group_len;
1999        result = matcher->group((UText *)NULL, group_len, status);
2000        REGEX_CHECK_STATUS;
2001        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2002        utext_close(result);
2003        result = matcher->group(0, &destText, group_len, status);
2004        REGEX_CHECK_STATUS;
2005        REGEX_ASSERT(result == &destText);
2006        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2007        //  destText is now immutable, reopen it
2008        utext_close(&destText);
2009        utext_openUnicodeString(&destText, &dest, &status);
2010
2011        int64_t length;
2012        result = matcher->group(0, NULL, length, status);
2013        REGEX_CHECK_STATUS;
2014        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2015        utext_close(result);
2016        result = matcher->group(0, &destText, length, status);
2017        REGEX_CHECK_STATUS;
2018        REGEX_ASSERT(result == &destText);
2019        REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2020        REGEX_ASSERT(length == 10);
2021        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2022
2023        // Capture Group 1 == "234567"
2024        result = matcher->group(1, NULL, length, status);
2025        REGEX_CHECK_STATUS;
2026        REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2027        REGEX_ASSERT(length == 6);
2028        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2029        utext_close(result);
2030
2031        result = matcher->group(1, &destText, length, status);
2032        REGEX_CHECK_STATUS;
2033        REGEX_ASSERT(result == &destText);
2034        REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2035        REGEX_ASSERT(length == 6);
2036        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2037        utext_close(result);
2038
2039        // Capture Group 2 == "45"
2040        result = matcher->group(2, NULL, length, status);
2041        REGEX_CHECK_STATUS;
2042        REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2043        REGEX_ASSERT(length == 2);
2044        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2045        utext_close(result);
2046
2047        result = matcher->group(2, &destText, length, status);
2048        REGEX_CHECK_STATUS;
2049        REGEX_ASSERT(result == &destText);
2050        REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2051        REGEX_ASSERT(length == 2);
2052        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2053        utext_close(result);
2054
2055        // Capture Group 3 == "89"
2056        result = matcher->group(3, NULL, length, status);
2057        REGEX_CHECK_STATUS;
2058        REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2059        REGEX_ASSERT(length == 2);
2060        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2061        utext_close(result);
2062
2063        result = matcher->group(3, &destText, length, status);
2064        REGEX_CHECK_STATUS;
2065        REGEX_ASSERT(result == &destText);
2066        REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2067        REGEX_ASSERT(length == 2);
2068        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2069        utext_close(result);
2070
2071        // Capture Group number out of range.
2072        status = U_ZERO_ERROR;
2073        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2074        status = U_ZERO_ERROR;
2075        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2076        status = U_ZERO_ERROR;
2077        matcher->reset();
2078        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2079
2080        delete matcher;
2081        delete pat;
2082
2083        utext_close(&destText);
2084        utext_close(&input);
2085        utext_close(&re);
2086    }
2087
2088    //
2089    //  find
2090    //
2091    {
2092        int32_t             flags=0;
2093        UParseError         pe;
2094        UErrorCode          status=U_ZERO_ERROR;
2095        UText               re=UTEXT_INITIALIZER;
2096        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2097        utext_openUTF8(&re, str_abc, -1, &status);
2098
2099        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2100        REGEX_CHECK_STATUS;
2101        UText input = UTEXT_INITIALIZER;
2102        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2103        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2104        //                      012345678901234567
2105
2106        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2107        REGEX_CHECK_STATUS;
2108        REGEX_ASSERT(matcher->find());
2109        REGEX_ASSERT(matcher->start(status) == 1);
2110        REGEX_ASSERT(matcher->find());
2111        REGEX_ASSERT(matcher->start(status) == 6);
2112        REGEX_ASSERT(matcher->find());
2113        REGEX_ASSERT(matcher->start(status) == 12);
2114        REGEX_ASSERT(matcher->find() == FALSE);
2115        REGEX_ASSERT(matcher->find() == FALSE);
2116
2117        matcher->reset();
2118        REGEX_ASSERT(matcher->find());
2119        REGEX_ASSERT(matcher->start(status) == 1);
2120
2121        REGEX_ASSERT(matcher->find(0, status));
2122        REGEX_ASSERT(matcher->start(status) == 1);
2123        REGEX_ASSERT(matcher->find(1, status));
2124        REGEX_ASSERT(matcher->start(status) == 1);
2125        REGEX_ASSERT(matcher->find(2, status));
2126        REGEX_ASSERT(matcher->start(status) == 6);
2127        REGEX_ASSERT(matcher->find(12, status));
2128        REGEX_ASSERT(matcher->start(status) == 12);
2129        REGEX_ASSERT(matcher->find(13, status) == FALSE);
2130        REGEX_ASSERT(matcher->find(16, status) == FALSE);
2131        REGEX_ASSERT(matcher->find(17, status) == FALSE);
2132        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2133
2134        status = U_ZERO_ERROR;
2135        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136        status = U_ZERO_ERROR;
2137        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2138
2139        REGEX_ASSERT(matcher->groupCount() == 0);
2140
2141        delete matcher;
2142        delete pat;
2143
2144        utext_close(&input);
2145        utext_close(&re);
2146    }
2147
2148
2149    //
2150    //  find, with \G in pattern (true if at the end of a previous match).
2151    //
2152    {
2153        int32_t             flags=0;
2154        UParseError         pe;
2155        UErrorCode          status=U_ZERO_ERROR;
2156        UText               re=UTEXT_INITIALIZER;
2157        const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2158        utext_openUTF8(&re, str_Gabcabc, -1, &status);
2159
2160        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2161
2162        REGEX_CHECK_STATUS;
2163        UText input = UTEXT_INITIALIZER;
2164        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2165        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2166        //                      012345678901234567
2167
2168        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2169        REGEX_CHECK_STATUS;
2170        REGEX_ASSERT(matcher->find());
2171        REGEX_ASSERT(matcher->start(status) == 0);
2172        REGEX_ASSERT(matcher->start(1, status) == -1);
2173        REGEX_ASSERT(matcher->start(2, status) == 1);
2174
2175        REGEX_ASSERT(matcher->find());
2176        REGEX_ASSERT(matcher->start(status) == 4);
2177        REGEX_ASSERT(matcher->start(1, status) == 4);
2178        REGEX_ASSERT(matcher->start(2, status) == -1);
2179        REGEX_CHECK_STATUS;
2180
2181        delete matcher;
2182        delete pat;
2183
2184        utext_close(&input);
2185        utext_close(&re);
2186    }
2187
2188    //
2189    //   find with zero length matches, match position should bump ahead
2190    //     to prevent loops.
2191    //
2192    {
2193        int32_t                 i;
2194        UErrorCode          status=U_ZERO_ERROR;
2195        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2196                                                      //   using an always-true look-ahead.
2197        REGEX_CHECK_STATUS;
2198        UText s = UTEXT_INITIALIZER;
2199        utext_openUTF8(&s, "    ", -1, &status);
2200        m.reset(&s);
2201        for (i=0; ; i++) {
2202            if (m.find() == FALSE) {
2203                break;
2204            }
2205            REGEX_ASSERT(m.start(status) == i);
2206            REGEX_ASSERT(m.end(status) == i);
2207        }
2208        REGEX_ASSERT(i==5);
2209
2210        // Check that the bump goes over characters outside the BMP OK
2211        // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2212        unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2213        utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2214        m.reset(&s);
2215        for (i=0; ; i+=4) {
2216            if (m.find() == FALSE) {
2217                break;
2218            }
2219            REGEX_ASSERT(m.start(status) == i);
2220            REGEX_ASSERT(m.end(status) == i);
2221        }
2222        REGEX_ASSERT(i==20);
2223
2224        utext_close(&s);
2225    }
2226    {
2227        // find() loop breaking test.
2228        //        with pattern of /.?/, should see a series of one char matches, then a single
2229        //        match of zero length at the end of the input string.
2230        int32_t                 i;
2231        UErrorCode          status=U_ZERO_ERROR;
2232        RegexMatcher        m(".?", 0, status);
2233        REGEX_CHECK_STATUS;
2234        UText s = UTEXT_INITIALIZER;
2235        utext_openUTF8(&s, "    ", -1, &status);
2236        m.reset(&s);
2237        for (i=0; ; i++) {
2238            if (m.find() == FALSE) {
2239                break;
2240            }
2241            REGEX_ASSERT(m.start(status) == i);
2242            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2243        }
2244        REGEX_ASSERT(i==5);
2245
2246        utext_close(&s);
2247    }
2248
2249
2250    //
2251    // Matchers with no input string behave as if they had an empty input string.
2252    //
2253
2254    {
2255        UErrorCode status = U_ZERO_ERROR;
2256        RegexMatcher  m(".?", 0, status);
2257        REGEX_CHECK_STATUS;
2258        REGEX_ASSERT(m.find());
2259        REGEX_ASSERT(m.start(status) == 0);
2260        REGEX_ASSERT(m.input() == "");
2261    }
2262    {
2263        UErrorCode status = U_ZERO_ERROR;
2264        RegexPattern  *p = RegexPattern::compile(".", 0, status);
2265        RegexMatcher  *m = p->matcher(status);
2266        REGEX_CHECK_STATUS;
2267
2268        REGEX_ASSERT(m->find() == FALSE);
2269        REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2270        delete m;
2271        delete p;
2272    }
2273
2274    //
2275    // Regions
2276    //
2277    {
2278        UErrorCode status = U_ZERO_ERROR;
2279        UText testPattern = UTEXT_INITIALIZER;
2280        UText testText    = UTEXT_INITIALIZER;
2281        regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2282        REGEX_VERBOSE_TEXT(&testPattern);
2283        regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2284        REGEX_VERBOSE_TEXT(&testText);
2285
2286        RegexMatcher m(&testPattern, &testText, 0, status);
2287        REGEX_CHECK_STATUS;
2288        REGEX_ASSERT(m.regionStart() == 0);
2289        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2290        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2291        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2292
2293        m.region(2,4, status);
2294        REGEX_CHECK_STATUS;
2295        REGEX_ASSERT(m.matches(status));
2296        REGEX_ASSERT(m.start(status)==2);
2297        REGEX_ASSERT(m.end(status)==4);
2298        REGEX_CHECK_STATUS;
2299
2300        m.reset();
2301        REGEX_ASSERT(m.regionStart() == 0);
2302        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2303
2304        regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2305        REGEX_VERBOSE_TEXT(&testText);
2306        m.reset(&testText);
2307        REGEX_ASSERT(m.regionStart() == 0);
2308        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2309
2310        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2311        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2312        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2313        REGEX_ASSERT(&m == &m.reset());
2314        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2315
2316        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2317        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2318        REGEX_ASSERT(&m == &m.reset());
2319        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2320
2321        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2322        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2323        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2324        REGEX_ASSERT(&m == &m.reset());
2325        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2326
2327        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2328        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2329        REGEX_ASSERT(&m == &m.reset());
2330        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2331
2332        utext_close(&testText);
2333        utext_close(&testPattern);
2334    }
2335
2336    //
2337    // hitEnd() and requireEnd()
2338    //
2339    {
2340        UErrorCode status = U_ZERO_ERROR;
2341        UText testPattern = UTEXT_INITIALIZER;
2342        UText testText    = UTEXT_INITIALIZER;
2343        const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2344        const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2345        utext_openUTF8(&testPattern, str_, -1, &status);
2346        utext_openUTF8(&testText, str_aabb, -1, &status);
2347
2348        RegexMatcher m1(&testPattern, &testText,  0, status);
2349        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2350        REGEX_ASSERT(m1.hitEnd() == TRUE);
2351        REGEX_ASSERT(m1.requireEnd() == FALSE);
2352        REGEX_CHECK_STATUS;
2353
2354        status = U_ZERO_ERROR;
2355        const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2356        utext_openUTF8(&testPattern, str_a, -1, &status);
2357        RegexMatcher m2(&testPattern, &testText, 0, status);
2358        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2359        REGEX_ASSERT(m2.hitEnd() == FALSE);
2360        REGEX_ASSERT(m2.requireEnd() == FALSE);
2361        REGEX_CHECK_STATUS;
2362
2363        status = U_ZERO_ERROR;
2364        const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2365        utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2366        RegexMatcher m3(&testPattern, &testText, 0, status);
2367        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2368        REGEX_ASSERT(m3.hitEnd() == TRUE);
2369        REGEX_ASSERT(m3.requireEnd() == TRUE);
2370        REGEX_CHECK_STATUS;
2371
2372        utext_close(&testText);
2373        utext_close(&testPattern);
2374    }
2375}
2376
2377
2378//---------------------------------------------------------------------------
2379//
2380//      API_Replace_UTF8   API test for class RegexMatcher, testing the
2381//                         Replace family of functions.
2382//
2383//---------------------------------------------------------------------------
2384void RegexTest::API_Replace_UTF8() {
2385    //
2386    //  Replace
2387    //
2388    int32_t             flags=0;
2389    UParseError         pe;
2390    UErrorCode          status=U_ZERO_ERROR;
2391
2392    UText               re=UTEXT_INITIALIZER;
2393    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2394    REGEX_VERBOSE_TEXT(&re);
2395    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2396    REGEX_CHECK_STATUS;
2397
2398    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2399    //             012345678901234567
2400    UText dataText = UTEXT_INITIALIZER;
2401    utext_openUTF8(&dataText, data, -1, &status);
2402    REGEX_CHECK_STATUS;
2403    REGEX_VERBOSE_TEXT(&dataText);
2404    RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2405
2406    //
2407    //  Plain vanilla matches.
2408    //
2409    UnicodeString  dest;
2410    UText destText = UTEXT_INITIALIZER;
2411    utext_openUnicodeString(&destText, &dest, &status);
2412    UText *result;
2413
2414    UText replText = UTEXT_INITIALIZER;
2415
2416    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2417    utext_openUTF8(&replText, str_yz, -1, &status);
2418    REGEX_VERBOSE_TEXT(&replText);
2419    result = matcher->replaceFirst(&replText, NULL, status);
2420    REGEX_CHECK_STATUS;
2421    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2422    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2423    utext_close(result);
2424    result = matcher->replaceFirst(&replText, &destText, status);
2425    REGEX_CHECK_STATUS;
2426    REGEX_ASSERT(result == &destText);
2427    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2428
2429    result = matcher->replaceAll(&replText, NULL, status);
2430    REGEX_CHECK_STATUS;
2431    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2432    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2433    utext_close(result);
2434
2435    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2436    result = matcher->replaceAll(&replText, &destText, status);
2437    REGEX_CHECK_STATUS;
2438    REGEX_ASSERT(result == &destText);
2439    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2440
2441    //
2442    //  Plain vanilla non-matches.
2443    //
2444    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2445    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2446    matcher->reset(&dataText);
2447
2448    result = matcher->replaceFirst(&replText, NULL, status);
2449    REGEX_CHECK_STATUS;
2450    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2451    utext_close(result);
2452    result = matcher->replaceFirst(&replText, &destText, status);
2453    REGEX_CHECK_STATUS;
2454    REGEX_ASSERT(result == &destText);
2455    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2456
2457    result = matcher->replaceAll(&replText, NULL, status);
2458    REGEX_CHECK_STATUS;
2459    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2460    utext_close(result);
2461    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2462    result = matcher->replaceAll(&replText, &destText, status);
2463    REGEX_CHECK_STATUS;
2464    REGEX_ASSERT(result == &destText);
2465    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2466
2467    //
2468    // Empty source string
2469    //
2470    utext_openUTF8(&dataText, NULL, 0, &status);
2471    matcher->reset(&dataText);
2472
2473    result = matcher->replaceFirst(&replText, NULL, status);
2474    REGEX_CHECK_STATUS;
2475    REGEX_ASSERT_UTEXT_UTF8("", result);
2476    utext_close(result);
2477    result = matcher->replaceFirst(&replText, &destText, status);
2478    REGEX_CHECK_STATUS;
2479    REGEX_ASSERT(result == &destText);
2480    REGEX_ASSERT_UTEXT_UTF8("", result);
2481
2482    result = matcher->replaceAll(&replText, NULL, status);
2483    REGEX_CHECK_STATUS;
2484    REGEX_ASSERT_UTEXT_UTF8("", result);
2485    utext_close(result);
2486    result = matcher->replaceAll(&replText, &destText, status);
2487    REGEX_CHECK_STATUS;
2488    REGEX_ASSERT(result == &destText);
2489    REGEX_ASSERT_UTEXT_UTF8("", result);
2490
2491    //
2492    // Empty substitution string
2493    //
2494    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2495    matcher->reset(&dataText);
2496
2497    utext_openUTF8(&replText, NULL, 0, &status);
2498    result = matcher->replaceFirst(&replText, NULL, status);
2499    REGEX_CHECK_STATUS;
2500    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2501    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2502    utext_close(result);
2503    result = matcher->replaceFirst(&replText, &destText, status);
2504    REGEX_CHECK_STATUS;
2505    REGEX_ASSERT(result == &destText);
2506    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2507
2508    result = matcher->replaceAll(&replText, NULL, status);
2509    REGEX_CHECK_STATUS;
2510    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2511    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2512    utext_close(result);
2513    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2514    result = matcher->replaceAll(&replText, &destText, status);
2515    REGEX_CHECK_STATUS;
2516    REGEX_ASSERT(result == &destText);
2517    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2518
2519    //
2520    // match whole string
2521    //
2522    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2523    utext_openUTF8(&dataText, str_abc, -1, &status);
2524    matcher->reset(&dataText);
2525
2526    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2527    utext_openUTF8(&replText, str_xyz, -1, &status);
2528    result = matcher->replaceFirst(&replText, NULL, status);
2529    REGEX_CHECK_STATUS;
2530    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2531    utext_close(result);
2532    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2533    result = matcher->replaceFirst(&replText, &destText, status);
2534    REGEX_CHECK_STATUS;
2535    REGEX_ASSERT(result == &destText);
2536    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2537
2538    result = matcher->replaceAll(&replText, NULL, status);
2539    REGEX_CHECK_STATUS;
2540    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2541    utext_close(result);
2542    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2543    result = matcher->replaceAll(&replText, &destText, status);
2544    REGEX_CHECK_STATUS;
2545    REGEX_ASSERT(result == &destText);
2546    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2547
2548    //
2549    // Capture Group, simple case
2550    //
2551    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2552    utext_openUTF8(&re, str_add, -1, &status);
2553    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2554    REGEX_CHECK_STATUS;
2555
2556    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2557    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2558    RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2559    REGEX_CHECK_STATUS;
2560
2561    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2562    utext_openUTF8(&replText, str_11, -1, &status);
2563    result = matcher2->replaceFirst(&replText, NULL, status);
2564    REGEX_CHECK_STATUS;
2565    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2566    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2567    utext_close(result);
2568    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2569    result = matcher2->replaceFirst(&replText, &destText, status);
2570    REGEX_CHECK_STATUS;
2571    REGEX_ASSERT(result == &destText);
2572    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2573
2574    const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2575    utext_openUTF8(&replText, str_v, -1, &status);
2576    REGEX_VERBOSE_TEXT(&replText);
2577    result = matcher2->replaceFirst(&replText, NULL, status);
2578    REGEX_CHECK_STATUS;
2579    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2580    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2581    utext_close(result);
2582    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2583    result = matcher2->replaceFirst(&replText, &destText, status);
2584    REGEX_CHECK_STATUS;
2585    REGEX_ASSERT(result == &destText);
2586    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2587
2588    const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2589               0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2590               0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2591    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2592    result = matcher2->replaceFirst(&replText, NULL, status);
2593    REGEX_CHECK_STATUS;
2594    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2595    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2596    utext_close(result);
2597    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2598    result = matcher2->replaceFirst(&replText, &destText, status);
2599    REGEX_CHECK_STATUS;
2600    REGEX_ASSERT(result == &destText);
2601    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2602
2603    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2604    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2605    //                                 012345678901234567890123456
2606    supplDigitChars[22] = 0xF0;
2607    supplDigitChars[23] = 0x9D;
2608    supplDigitChars[24] = 0x9F;
2609    supplDigitChars[25] = 0x8F;
2610    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2611
2612    result = matcher2->replaceFirst(&replText, NULL, status);
2613    REGEX_CHECK_STATUS;
2614    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2615    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2616    utext_close(result);
2617    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2618    result = matcher2->replaceFirst(&replText, &destText, status);
2619    REGEX_CHECK_STATUS;
2620    REGEX_ASSERT(result == &destText);
2621    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2622    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2623    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2624    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2625//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2626    utext_close(result);
2627    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2628    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2629    REGEX_ASSERT(result == &destText);
2630//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2631
2632    //
2633    // Replacement String with \u hex escapes
2634    //
2635    {
2636      const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2637      const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2638        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2639        utext_openUTF8(&replText, str_u0043, -1, &status);
2640        matcher->reset(&dataText);
2641
2642        result = matcher->replaceAll(&replText, NULL, status);
2643        REGEX_CHECK_STATUS;
2644        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2645        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2646        utext_close(result);
2647        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2648        result = matcher->replaceAll(&replText, &destText, status);
2649        REGEX_CHECK_STATUS;
2650        REGEX_ASSERT(result == &destText);
2651        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2652    }
2653    {
2654      const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2655        utext_openUTF8(&dataText, str_abc, -1, &status);
2656        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2657        utext_openUTF8(&replText, str_U00010000, -1, &status);
2658        matcher->reset(&dataText);
2659
2660        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2661        //                          0123456789
2662        expected[2] = 0xF0;
2663        expected[3] = 0x90;
2664        expected[4] = 0x80;
2665        expected[5] = 0x80;
2666
2667        result = matcher->replaceAll(&replText, NULL, status);
2668        REGEX_CHECK_STATUS;
2669        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2670        utext_close(result);
2671        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2672        result = matcher->replaceAll(&replText, &destText, status);
2673        REGEX_CHECK_STATUS;
2674        REGEX_ASSERT(result == &destText);
2675        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2676    }
    // TODO:  need more thorough testing of capture substitutions.
2678
2679    // Bug 4057
2680    //
2681    {
2682        status = U_ZERO_ERROR;
2683const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2684const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2685const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2686        utext_openUTF8(&re, str_ssee, -1, &status);
2687        utext_openUTF8(&dataText, str_blah, -1, &status);
2688        utext_openUTF8(&replText, str_ooh, -1, &status);
2689
2690        RegexMatcher m(&re, 0, status);
2691        REGEX_CHECK_STATUS;
2692
2693        UnicodeString result;
2694        UText resultText = UTEXT_INITIALIZER;
2695        utext_openUnicodeString(&resultText, &result, &status);
2696
        // Multiple finds do NOT bump up the previous appendReplacement position.
2698        m.reset(&dataText);
2699        m.find();
2700        m.find();
2701        m.appendReplacement(&resultText, &replText, status);
2702        REGEX_CHECK_STATUS;
2703        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2704        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2705
2706        // After a reset into the interior of a string, appendReplacement still starts at beginning.
2707        status = U_ZERO_ERROR;
2708        result.truncate(0);
2709        utext_openUnicodeString(&resultText, &result, &status);
2710        m.reset(10, status);
2711        m.find();
2712        m.find();
2713        m.appendReplacement(&resultText, &replText, status);
2714        REGEX_CHECK_STATUS;
2715        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2716        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2717
2718        // find() at interior of string, appendReplacement still starts at beginning.
2719        status = U_ZERO_ERROR;
2720        result.truncate(0);
2721        utext_openUnicodeString(&resultText, &result, &status);
2722        m.reset();
2723        m.find(10, status);
2724        m.find();
2725        m.appendReplacement(&resultText, &replText, status);
2726        REGEX_CHECK_STATUS;
2727        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2728        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2729
2730        m.appendTail(&resultText, status);
2731        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2732        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2733
2734        utext_close(&resultText);
2735    }
2736
2737    delete matcher2;
2738    delete pat2;
2739    delete matcher;
2740    delete pat;
2741
2742    utext_close(&dataText);
2743    utext_close(&replText);
2744    utext_close(&destText);
2745    utext_close(&re);
2746}
2747
2748
2749//---------------------------------------------------------------------------
2750//
2751//      API_Pattern_UTF8  Test that the API for class RegexPattern is
2752//                        present and nominally working.
2753//
2754//---------------------------------------------------------------------------
2755void RegexTest::API_Pattern_UTF8() {
2756    RegexPattern        pata;    // Test default constructor to not crash.
2757    RegexPattern        patb;
2758
2759    REGEX_ASSERT(pata == patb);
2760    REGEX_ASSERT(pata == pata);
2761
2762    UText         re1 = UTEXT_INITIALIZER;
2763    UText         re2 = UTEXT_INITIALIZER;
2764    UErrorCode    status = U_ZERO_ERROR;
2765    UParseError   pe;
2766
2767    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2768    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2769    utext_openUTF8(&re1, str_abcalmz, -1, &status);
2770    utext_openUTF8(&re2, str_def, -1, &status);
2771
2772    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2773    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2774    REGEX_CHECK_STATUS;
2775    REGEX_ASSERT(*pat1 == *pat1);
2776    REGEX_ASSERT(*pat1 != pata);
2777
2778    // Assign
2779    patb = *pat1;
2780    REGEX_ASSERT(patb == *pat1);
2781
2782    // Copy Construct
2783    RegexPattern patc(*pat1);
2784    REGEX_ASSERT(patc == *pat1);
2785    REGEX_ASSERT(patb == patc);
2786    REGEX_ASSERT(pat1 != pat2);
2787    patb = *pat2;
2788    REGEX_ASSERT(patb != patc);
2789    REGEX_ASSERT(patb == *pat2);
2790
2791    // Compile with no flags.
2792    RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2793    REGEX_ASSERT(*pat1a == *pat1);
2794
2795    REGEX_ASSERT(pat1a->flags() == 0);
2796
2797    // Compile with different flags should be not equal
2798    RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2799    REGEX_CHECK_STATUS;
2800
2801    REGEX_ASSERT(*pat1b != *pat1a);
2802    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2803    REGEX_ASSERT(pat1a->flags() == 0);
2804    delete pat1b;
2805
2806    // clone
2807    RegexPattern *pat1c = pat1->clone();
2808    REGEX_ASSERT(*pat1c == *pat1);
2809    REGEX_ASSERT(*pat1c != *pat2);
2810
2811    delete pat1c;
2812    delete pat1a;
2813    delete pat1;
2814    delete pat2;
2815
2816    utext_close(&re1);
2817    utext_close(&re2);
2818
2819
2820    //
2821    //   Verify that a matcher created from a cloned pattern works.
2822    //     (Jitterbug 3423)
2823    //
2824    {
2825        UErrorCode     status     = U_ZERO_ERROR;
2826        UText          pattern    = UTEXT_INITIALIZER;
2827        const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2828        utext_openUTF8(&pattern, str_pL, -1, &status);
2829
2830        RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2831        RegexPattern  *pClone     = pSource->clone();
2832        delete         pSource;
2833        RegexMatcher  *mFromClone = pClone->matcher(status);
2834        REGEX_CHECK_STATUS;
2835
2836        UText          input      = UTEXT_INITIALIZER;
2837        const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2838        utext_openUTF8(&input, str_HelloWorld, -1, &status);
2839        mFromClone->reset(&input);
2840        REGEX_ASSERT(mFromClone->find() == TRUE);
2841        REGEX_ASSERT(mFromClone->group(status) == "Hello");
2842        REGEX_ASSERT(mFromClone->find() == TRUE);
2843        REGEX_ASSERT(mFromClone->group(status) == "World");
2844        REGEX_ASSERT(mFromClone->find() == FALSE);
2845        delete mFromClone;
2846        delete pClone;
2847
2848        utext_close(&input);
2849        utext_close(&pattern);
2850    }
2851
2852    //
2853    //   matches convenience API
2854    //
2855    {
2856        UErrorCode status  = U_ZERO_ERROR;
2857        UText      pattern = UTEXT_INITIALIZER;
2858        UText      input   = UTEXT_INITIALIZER;
2859
2860        const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2861        utext_openUTF8(&input, str_randominput, -1, &status);
2862
2863        const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2864        utext_openUTF8(&pattern, str_dotstar, -1, &status);
2865        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2866        REGEX_CHECK_STATUS;
2867
2868        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2869        utext_openUTF8(&pattern, str_abc, -1, &status);
2870        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2871        REGEX_CHECK_STATUS;
2872
2873        const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2874        utext_openUTF8(&pattern, str_nput, -1, &status);
2875        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2876        REGEX_CHECK_STATUS;
2877
2878        utext_openUTF8(&pattern, str_randominput, -1, &status);
2879        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2880        REGEX_CHECK_STATUS;
2881
2882        const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2883        utext_openUTF8(&pattern, str_u, -1, &status);
2884        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2885        REGEX_CHECK_STATUS;
2886
2887        utext_openUTF8(&input, str_abc, -1, &status);
2888        utext_openUTF8(&pattern, str_abc, -1, &status);
2889        status = U_INDEX_OUTOFBOUNDS_ERROR;
2890        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2891        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2892
2893        utext_close(&input);
2894        utext_close(&pattern);
2895    }
2896
2897
2898    //
2899    // Split()
2900    //
2901    status = U_ZERO_ERROR;
2902    const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2903    utext_openUTF8(&re1, str_spaceplus, -1, &status);
2904    pat1 = RegexPattern::compile(&re1, pe, status);
2905    REGEX_CHECK_STATUS;
2906    UnicodeString  fields[10];
2907
2908    int32_t n;
2909    n = pat1->split("Now is the time", fields, 10, status);
2910    REGEX_CHECK_STATUS;
2911    REGEX_ASSERT(n==4);
2912    REGEX_ASSERT(fields[0]=="Now");
2913    REGEX_ASSERT(fields[1]=="is");
2914    REGEX_ASSERT(fields[2]=="the");
2915    REGEX_ASSERT(fields[3]=="time");
2916    REGEX_ASSERT(fields[4]=="");
2917
2918    n = pat1->split("Now is the time", fields, 2, status);
2919    REGEX_CHECK_STATUS;
2920    REGEX_ASSERT(n==2);
2921    REGEX_ASSERT(fields[0]=="Now");
2922    REGEX_ASSERT(fields[1]=="is the time");
2923    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2924
2925    fields[1] = "*";
2926    status = U_ZERO_ERROR;
2927    n = pat1->split("Now is the time", fields, 1, status);
2928    REGEX_CHECK_STATUS;
2929    REGEX_ASSERT(n==1);
2930    REGEX_ASSERT(fields[0]=="Now is the time");
2931    REGEX_ASSERT(fields[1]=="*");
2932    status = U_ZERO_ERROR;
2933
2934    n = pat1->split("    Now       is the time   ", fields, 10, status);
2935    REGEX_CHECK_STATUS;
2936    REGEX_ASSERT(n==6);
2937    REGEX_ASSERT(fields[0]=="");
2938    REGEX_ASSERT(fields[1]=="Now");
2939    REGEX_ASSERT(fields[2]=="is");
2940    REGEX_ASSERT(fields[3]=="the");
2941    REGEX_ASSERT(fields[4]=="time");
2942    REGEX_ASSERT(fields[5]=="");
2943    REGEX_ASSERT(fields[6]=="");
2944
2945    fields[2] = "*";
2946    n = pat1->split("     ", fields, 10, status);
2947    REGEX_CHECK_STATUS;
2948    REGEX_ASSERT(n==2);
2949    REGEX_ASSERT(fields[0]=="");
2950    REGEX_ASSERT(fields[1]=="");
2951    REGEX_ASSERT(fields[2]=="*");
2952
2953    fields[0] = "foo";
2954    n = pat1->split("", fields, 10, status);
2955    REGEX_CHECK_STATUS;
2956    REGEX_ASSERT(n==0);
2957    REGEX_ASSERT(fields[0]=="foo");
2958
2959    delete pat1;
2960
2961    //  split, with a pattern with (capture)
2962    regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2963    pat1 = RegexPattern::compile(&re1,  pe, status);
2964    REGEX_CHECK_STATUS;
2965
2966    status = U_ZERO_ERROR;
2967    fields[6] = fields[7] = "*";
2968    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2969    REGEX_CHECK_STATUS;
2970    REGEX_ASSERT(n==7);
2971    REGEX_ASSERT(fields[0]=="");
2972    REGEX_ASSERT(fields[1]=="a");
2973    REGEX_ASSERT(fields[2]=="Now is ");
2974    REGEX_ASSERT(fields[3]=="b");
2975    REGEX_ASSERT(fields[4]=="the time");
2976    REGEX_ASSERT(fields[5]=="c");
2977    REGEX_ASSERT(fields[6]=="");
2978    REGEX_ASSERT(fields[7]=="*");
2979    REGEX_ASSERT(status==U_ZERO_ERROR);
2980
2981    fields[6] = fields[7] = "*";
2982    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2983    REGEX_CHECK_STATUS;
2984    REGEX_ASSERT(n==7);
2985    REGEX_ASSERT(fields[0]=="  ");
2986    REGEX_ASSERT(fields[1]=="a");
2987    REGEX_ASSERT(fields[2]=="Now is ");
2988    REGEX_ASSERT(fields[3]=="b");
2989    REGEX_ASSERT(fields[4]=="the time");
2990    REGEX_ASSERT(fields[5]=="c");
2991    REGEX_ASSERT(fields[6]=="");
2992    REGEX_ASSERT(fields[7]=="*");
2993
2994    status = U_ZERO_ERROR;
2995    fields[6] = "foo";
2996    n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2997    REGEX_CHECK_STATUS;
2998    REGEX_ASSERT(n==6);
2999    REGEX_ASSERT(fields[0]=="  ");
3000    REGEX_ASSERT(fields[1]=="a");
3001    REGEX_ASSERT(fields[2]=="Now is ");
3002    REGEX_ASSERT(fields[3]=="b");
3003    REGEX_ASSERT(fields[4]=="the time");
3004    REGEX_ASSERT(fields[5]==" ");
3005    REGEX_ASSERT(fields[6]=="foo");
3006
3007    status = U_ZERO_ERROR;
3008    fields[5] = "foo";
3009    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3010    REGEX_CHECK_STATUS;
3011    REGEX_ASSERT(n==5);
3012    REGEX_ASSERT(fields[0]=="  ");
3013    REGEX_ASSERT(fields[1]=="a");
3014    REGEX_ASSERT(fields[2]=="Now is ");
3015    REGEX_ASSERT(fields[3]=="b");
3016    REGEX_ASSERT(fields[4]=="the time<c>");
3017    REGEX_ASSERT(fields[5]=="foo");
3018
3019    status = U_ZERO_ERROR;
3020    fields[5] = "foo";
3021    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3022    REGEX_CHECK_STATUS;
3023    REGEX_ASSERT(n==5);
3024    REGEX_ASSERT(fields[0]=="  ");
3025    REGEX_ASSERT(fields[1]=="a");
3026    REGEX_ASSERT(fields[2]=="Now is ");
3027    REGEX_ASSERT(fields[3]=="b");
3028    REGEX_ASSERT(fields[4]=="the time");
3029    REGEX_ASSERT(fields[5]=="foo");
3030
3031    status = U_ZERO_ERROR;
3032    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3033    REGEX_CHECK_STATUS;
3034    REGEX_ASSERT(n==4);
3035    REGEX_ASSERT(fields[0]=="  ");
3036    REGEX_ASSERT(fields[1]=="a");
3037    REGEX_ASSERT(fields[2]=="Now is ");
3038    REGEX_ASSERT(fields[3]=="the time<c>");
3039    status = U_ZERO_ERROR;
3040    delete pat1;
3041
3042    regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3043    pat1 = RegexPattern::compile(&re1, pe, status);
3044    REGEX_CHECK_STATUS;
3045    n = pat1->split("1-10,20", fields, 10, status);
3046    REGEX_CHECK_STATUS;
3047    REGEX_ASSERT(n==5);
3048    REGEX_ASSERT(fields[0]=="1");
3049    REGEX_ASSERT(fields[1]=="-");
3050    REGEX_ASSERT(fields[2]=="10");
3051    REGEX_ASSERT(fields[3]==",");
3052    REGEX_ASSERT(fields[4]=="20");
3053    delete pat1;
3054
3055
3056    //
3057    // split of a UText based string, with library allocating output UTexts.
3058    //
3059    {
3060        status = U_ZERO_ERROR;
3061        RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3062        UnicodeString stringToSplit("first:second:third");
3063        UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3064        REGEX_CHECK_STATUS;
3065
3066        UText *splits[10] = {NULL};
3067        int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3068        REGEX_CHECK_STATUS;
3069        REGEX_ASSERT(numFields == 5);
3070        REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3071        REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3072        REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3073        REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3074        REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3075        REGEX_ASSERT(splits[5] == NULL);
3076
3077        for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3078            if (splits[i]) {
3079                utext_close(splits[i]);
3080                splits[i] = NULL;
3081            }
3082        }
3083        utext_close(textToSplit);
3084    }
3085
3086
3087    //
3088    // RegexPattern::pattern() and patternText()
3089    //
3090    pat1 = new RegexPattern();
3091    REGEX_ASSERT(pat1->pattern() == "");
3092    REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3093    delete pat1;
3094    const char *helloWorldInvariant = "(Hello, world)*";
3095    regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3096    pat1 = RegexPattern::compile(&re1, pe, status);
3097    REGEX_CHECK_STATUS;
3098    REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3099    REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3100    delete pat1;
3101
3102    utext_close(&re1);
3103}
3104
3105
3106//---------------------------------------------------------------------------
3107//
3108//      Extended       A more thorough check for features of regex patterns
3109//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3111//                     A description of the test data format is included in that file.
3112//
3113//---------------------------------------------------------------------------
3114
3115const char *
3116RegexTest::getPath(char buffer[2048], const char *filename) {
3117    UErrorCode status=U_ZERO_ERROR;
3118    const char *testDataDirectory = IntlTest::getSourceTestData(status);
3119    if (U_FAILURE(status)) {
3120        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3121        return NULL;
3122    }
3123
3124    strcpy(buffer, testDataDirectory);
3125    strcat(buffer, filename);
3126    return buffer;
3127}
3128
3129void RegexTest::Extended() {
3130    char tdd[2048];
3131    const char *srcPath;
3132    UErrorCode  status  = U_ZERO_ERROR;
3133    int32_t     lineNum = 0;
3134
3135    //
3136    //  Open and read the test data file.
3137    //
3138    srcPath=getPath(tdd, "regextst.txt");
3139    if(srcPath==NULL) {
3140        return; /* something went wrong, error already output */
3141    }
3142
3143    int32_t    len;
3144    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3145    if (U_FAILURE(status)) {
3146        return; /* something went wrong, error already output */
3147    }
3148
3149    //
3150    //  Put the test data into a UnicodeString
3151    //
3152    UnicodeString testString(FALSE, testData, len);
3153
3154    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3155    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3156    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3157
3158    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3159    UnicodeString   testPattern;   // The pattern for test from the test file.
3160    UnicodeString   testFlags;     // the flags   for a test.
3161    UnicodeString   matchString;   // The marked up string to be used as input
3162
3163    if (U_FAILURE(status)){
3164        dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3165        delete [] testData;
3166        return;
3167    }
3168
3169    //
3170    //  Loop over the test data file, once per line.
3171    //
3172    while (lineMat.find()) {
3173        lineNum++;
3174        if (U_FAILURE(status)) {
3175          errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3176        }
3177
3178        status = U_ZERO_ERROR;
3179        UnicodeString testLine = lineMat.group(1, status);
3180        if (testLine.length() == 0) {
3181            continue;
3182        }
3183
3184        //
3185        // Parse the test line.  Skip blank and comment only lines.
3186        // Separate out the three main fields - pattern, flags, target.
3187        //
3188
3189        commentMat.reset(testLine);
3190        if (commentMat.lookingAt(status)) {
3191            // This line is a comment, or blank.
3192            continue;
3193        }
3194
3195        //
3196        //  Pull out the pattern field, remove it from the test file line.
3197        //
3198        quotedStuffMat.reset(testLine);
3199        if (quotedStuffMat.lookingAt(status)) {
3200            testPattern = quotedStuffMat.group(2, status);
3201            testLine.remove(0, quotedStuffMat.end(0, status));
3202        } else {
3203            errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3204            continue;
3205        }
3206
3207
3208        //
3209        //  Pull out the flags from the test file line.
3210        //
3211        flagsMat.reset(testLine);
3212        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3213        testFlags = flagsMat.group(1, status);
3214        if (flagsMat.group(2, status).length() > 0) {
3215            errln("Bad Match flag at line %d. Scanning %c\n",
3216                lineNum, flagsMat.group(2, status).charAt(0));
3217            continue;
3218        }
3219        testLine.remove(0, flagsMat.end(0, status));
3220
3221        //
3222        //  Pull out the match string, as a whole.
3223        //    We'll process the <tags> later.
3224        //
3225        quotedStuffMat.reset(testLine);
3226        if (quotedStuffMat.lookingAt(status)) {
3227            matchString = quotedStuffMat.group(2, status);
3228            testLine.remove(0, quotedStuffMat.end(0, status));
3229        } else {
3230            errln("Bad match string at test file line %d", lineNum);
3231            continue;
3232        }
3233
3234        //
3235        //  The only thing left from the input line should be an optional trailing comment.
3236        //
3237        commentMat.reset(testLine);
3238        if (commentMat.lookingAt(status) == FALSE) {
3239            errln("Line %d: unexpected characters at end of test line.", lineNum);
3240            continue;
3241        }
3242
3243        //
3244        //  Run the test
3245        //
3246        regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3247    }
3248
3249    delete [] testData;
3250
3251}
3252
3253
3254
//---------------------------------------------------------------------------
//
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file; it is used in messages.
//         lineNumber is the source line in regextst.txt of the test.
//
//---------------------------------------------------------------------------
3265
3266
3267//  Set a value into a UVector at position specified by a decimal number in
3268//   a UnicodeString.   This is a utility function needed by the actual test function,
3269//   which follows.
3270static void set(UVector &vec, int32_t val, UnicodeString index) {
3271    UErrorCode  status=U_ZERO_ERROR;
3272    int32_t  idx = 0;
3273    for (int32_t i=0; i<index.length(); i++) {
3274        int32_t d=u_charDigitValue(index.charAt(i));
3275        if (d<0) {return;}
3276        idx = idx*10 + d;
3277    }
3278    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3279    vec.setElementAt(val, idx);
3280}
3281
3282static void setInt(UVector &vec, int32_t val, int32_t idx) {
3283    UErrorCode  status=U_ZERO_ERROR;
3284    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3285    vec.setElementAt(val, idx);
3286}
3287
3288static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3289{
3290    UBool couldFind = TRUE;
3291    UTEXT_SETNATIVEINDEX(utext, 0);
3292    int32_t i = 0;
3293    while (i < unistrOffset) {
3294        UChar32 c = UTEXT_NEXT32(utext);
3295        if (c != U_SENTINEL) {
3296            i += U16_LENGTH(c);
3297        } else {
3298            couldFind = FALSE;
3299            break;
3300        }
3301    }
3302    nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3303    return couldFind;
3304}
3305
3306
3307void RegexTest::regex_find(const UnicodeString &pattern,
3308                           const UnicodeString &flags,
3309                           const UnicodeString &inputString,
3310                           const char *srcPath,
3311                           int32_t line) {
3312    UnicodeString       unEscapedInput;
3313    UnicodeString       deTaggedInput;
3314
3315    int32_t             patternUTF8Length,      inputUTF8Length;
3316    char                *patternChars  = NULL, *inputChars = NULL;
3317    UText               patternText    = UTEXT_INITIALIZER;
3318    UText               inputText      = UTEXT_INITIALIZER;
3319    UConverter          *UTF8Converter = NULL;
3320
3321    UErrorCode          status         = U_ZERO_ERROR;
3322    UParseError         pe;
3323    RegexPattern        *parsePat      = NULL;
3324    RegexMatcher        *parseMatcher  = NULL;
3325    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3326    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3327    UVector             groupStarts(status);
3328    UVector             groupEnds(status);
3329    UVector             groupStartsUTF8(status);
3330    UVector             groupEndsUTF8(status);
3331    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3332    UBool               failed         = FALSE;
3333    int32_t             numFinds;
3334    int32_t             i;
3335    UBool               useMatchesFunc   = FALSE;
3336    UBool               useLookingAtFunc = FALSE;
3337    int32_t             regionStart      = -1;
3338    int32_t             regionEnd        = -1;
3339    int32_t             regionStartUTF8  = -1;
3340    int32_t             regionEndUTF8    = -1;
3341
3342
3343    //
3344    //  Compile the caller's pattern
3345    //
3346    uint32_t bflags = 0;
3347    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3348        bflags |= UREGEX_CASE_INSENSITIVE;
3349    }
3350    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3351        bflags |= UREGEX_COMMENTS;
3352    }
3353    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3354        bflags |= UREGEX_DOTALL;
3355    }
3356    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3357        bflags |= UREGEX_MULTILINE;
3358    }
3359
3360    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3361        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3362    }
3363    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3364        bflags |= UREGEX_UNIX_LINES;
3365    }
3366    if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3367        bflags |= UREGEX_LITERAL;
3368    }
3369
3370
3371    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3372    if (status != U_ZERO_ERROR) {
3373        #if UCONFIG_NO_BREAK_ITERATION==1
3374        // 'v' test flag means that the test pattern should not compile if ICU was configured
3375        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3376        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3377            goto cleanupAndReturn;
3378        }
3379        #endif
3380        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3381            // Expected pattern compilation error.
3382            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3383                logln("Pattern Compile returns \"%s\"", u_errorName(status));
3384            }
3385            goto cleanupAndReturn;
3386        } else {
3387            // Unexpected pattern compilation error.
3388            dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3389            goto cleanupAndReturn;
3390        }
3391    }
3392
3393    UTF8Converter = ucnv_open("UTF8", &status);
3394    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3395
3396    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3397    status = U_ZERO_ERROR; // buffer overflow
3398    patternChars = new char[patternUTF8Length+1];
3399    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3400    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3401
3402    if (status == U_ZERO_ERROR) {
3403        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3404
3405        if (status != U_ZERO_ERROR) {
3406#if UCONFIG_NO_BREAK_ITERATION==1
3407            // 'v' test flag means that the test pattern should not compile if ICU was configured
3408            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3409            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3410                goto cleanupAndReturn;
3411            }
3412#endif
3413            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3414                // Expected pattern compilation error.
3415                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3416                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3417                }
3418                goto cleanupAndReturn;
3419            } else {
3420                // Unexpected pattern compilation error.
3421                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3422                goto cleanupAndReturn;
3423            }
3424        }
3425    }
3426
3427    if (UTF8Pattern == NULL) {
3428        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3429        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3430        status = U_ZERO_ERROR;
3431    }
3432
3433    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3434        callerPattern->dumpPattern();
3435    }
3436
3437    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3438        errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3439        goto cleanupAndReturn;
3440    }
3441
3442
3443    //
3444    // Number of times find() should be called on the test string, default to 1
3445    //
3446    numFinds = 1;
3447    for (i=2; i<=9; i++) {
3448        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3449            if (numFinds != 1) {
3450                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3451                goto cleanupAndReturn;
3452            }
3453            numFinds = i;
3454        }
3455    }
3456
3457    // 'M' flag.  Use matches() instead of find()
3458    if (flags.indexOf((UChar)0x4d) >= 0) {
3459        useMatchesFunc = TRUE;
3460    }
3461    if (flags.indexOf((UChar)0x4c) >= 0) {
3462        useLookingAtFunc = TRUE;
3463    }
3464
3465    //
3466    //  Find the tags in the input data, remove them, and record the group boundary
3467    //    positions.
3468    //
3469    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3470    REGEX_CHECK_STATUS_L(line);
3471
3472    unEscapedInput = inputString.unescape();
3473    parseMatcher = parsePat->matcher(unEscapedInput, status);
3474    REGEX_CHECK_STATUS_L(line);
3475    while(parseMatcher->find()) {
3476        parseMatcher->appendReplacement(deTaggedInput, "", status);
3477        REGEX_CHECK_STATUS;
3478        UnicodeString groupNum = parseMatcher->group(2, status);
3479        if (groupNum == "r") {
3480            // <r> or </r>, a region specification within the string
3481            if (parseMatcher->group(1, status) == "/") {
3482                regionEnd = deTaggedInput.length();
3483            } else {
3484                regionStart = deTaggedInput.length();
3485            }
3486        } else {
3487            // <digits> or </digits>, a group match boundary tag.
3488            if (parseMatcher->group(1, status) == "/") {
3489                set(groupEnds, deTaggedInput.length(), groupNum);
3490            } else {
3491                set(groupStarts, deTaggedInput.length(), groupNum);
3492            }
3493        }
3494    }
3495    parseMatcher->appendTail(deTaggedInput);
3496    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3497    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3498      errln("mismatched <r> tags");
3499      failed = TRUE;
3500      goto cleanupAndReturn;
3501    }
3502
3503    //
3504    //  Configure the matcher according to the flags specified with this test.
3505    //
3506    matcher = callerPattern->matcher(deTaggedInput, status);
3507    REGEX_CHECK_STATUS_L(line);
3508    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3509        matcher->setTrace(TRUE);
3510    }
3511
3512    if (UTF8Pattern != NULL) {
3513        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3514        status = U_ZERO_ERROR; // buffer overflow
3515        inputChars = new char[inputUTF8Length+1];
3516        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3517        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3518
3519        if (status == U_ZERO_ERROR) {
3520            UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3521            REGEX_CHECK_STATUS_L(line);
3522        }
3523
3524        if (UTF8Matcher == NULL) {
3525            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3526            logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3527            status = U_ZERO_ERROR;
3528        }
3529    }
3530
3531    //
3532    //  Generate native indices for UTF8 versions of region and capture group info
3533    //
3534    if (UTF8Matcher != NULL) {
3535        if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3536            UTF8Matcher->setTrace(TRUE);
3537        }
3538        if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3539        if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3540
3541        //  Fill out the native index UVector info.
3542        //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3543        for (i=0; i<groupStarts.size(); i++) {
3544            int32_t  start = groupStarts.elementAti(i);
3545            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3546            if (start >= 0) {
3547                int32_t  startUTF8;
3548                if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3549                    errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3550                    failed = TRUE;
3551                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3552                }
3553                setInt(groupStartsUTF8, startUTF8, i);
3554            }
3555
3556            int32_t  end = groupEnds.elementAti(i);
3557            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3558            if (end >= 0) {
3559                int32_t  endUTF8;
3560                if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3561                    errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3562                    failed = TRUE;
3563                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3564                }
3565                setInt(groupEndsUTF8, endUTF8, i);
3566            }
3567        }
3568    }
3569
3570    if (regionStart>=0) {
3571       matcher->region(regionStart, regionEnd, status);
3572       REGEX_CHECK_STATUS_L(line);
3573       if (UTF8Matcher != NULL) {
3574           UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3575           REGEX_CHECK_STATUS_L(line);
3576       }
3577    }
3578    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3579        matcher->useAnchoringBounds(FALSE);
3580        if (UTF8Matcher != NULL) {
3581            UTF8Matcher->useAnchoringBounds(FALSE);
3582        }
3583    }
3584    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3585        matcher->useTransparentBounds(TRUE);
3586        if (UTF8Matcher != NULL) {
3587            UTF8Matcher->useTransparentBounds(TRUE);
3588        }
3589    }
3590
3591
3592
3593    //
3594    // Do a find on the de-tagged input using the caller's pattern
3595    //     TODO: error on count>1 and not find().
3596    //           error on both matches() and lookingAt().
3597    //
3598    for (i=0; i<numFinds; i++) {
3599        if (useMatchesFunc) {
3600            isMatch = matcher->matches(status);
3601            if (UTF8Matcher != NULL) {
3602               isUTF8Match = UTF8Matcher->matches(status);
3603            }
3604        } else  if (useLookingAtFunc) {
3605            isMatch = matcher->lookingAt(status);
3606            if (UTF8Matcher != NULL) {
3607                isUTF8Match = UTF8Matcher->lookingAt(status);
3608            }
3609        } else {
3610            isMatch = matcher->find();
3611            if (UTF8Matcher != NULL) {
3612                isUTF8Match = UTF8Matcher->find();
3613            }
3614        }
3615    }
3616    matcher->setTrace(FALSE);
3617    if (UTF8Matcher) {
3618        UTF8Matcher->setTrace(FALSE);
3619    }
3620    if (U_FAILURE(status)) {
3621        errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3622    }
3623
3624    //
3625    // Match up the groups from the find() with the groups from the tags
3626    //
3627
3628    // number of tags should match number of groups from find operation.
3629    // matcher->groupCount does not include group 0, the entire match, hence the +1.
3630    //   G option in test means that capture group data is not available in the
3631    //     expected results, so the check needs to be suppressed.
3632    if (isMatch == FALSE && groupStarts.size() != 0) {
3633        dataerrln("Error at line %d:  Match expected, but none found.", line);
3634        failed = TRUE;
3635        goto cleanupAndReturn;
3636    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3637        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3638        failed = TRUE;
3639        goto cleanupAndReturn;
3640    }
3641    if (isMatch && groupStarts.size() == 0) {
3642        errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3643        failed = TRUE;
3644    }
3645    if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3646        errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3647        failed = TRUE;
3648    }
3649
3650    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3651        // Only check for match / no match.  Don't check capture groups.
3652        goto cleanupAndReturn;
3653    }
3654
3655    REGEX_CHECK_STATUS_L(line);
3656    for (i=0; i<=matcher->groupCount(); i++) {
3657        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3658        int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3659        if (matcher->start(i, status) != expectedStart) {
3660            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3661                line, i, expectedStart, matcher->start(i, status));
3662            failed = TRUE;
3663            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3664        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3665            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3666                  line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3667            failed = TRUE;
3668            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3669        }
3670
3671        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3672        int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3673        if (matcher->end(i, status) != expectedEnd) {
3674            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3675                line, i, expectedEnd, matcher->end(i, status));
3676            failed = TRUE;
3677            // Error on end position;  keep going; real error is probably yet to come as group
3678            //   end positions work from end of the input data towards the front.
3679        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3680            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3681                  line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3682            failed = TRUE;
3683            // Error on end position;  keep going; real error is probably yet to come as group
3684            //   end positions work from end of the input data towards the front.
3685        }
3686    }
3687    if ( matcher->groupCount()+1 < groupStarts.size()) {
3688        errln("Error at line %d: Expected %d capture groups, found %d.",
3689            line, groupStarts.size()-1, matcher->groupCount());
3690        failed = TRUE;
3691        }
3692    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3693        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3694              line, groupStarts.size()-1, UTF8Matcher->groupCount());
3695        failed = TRUE;
3696    }
3697
3698    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3699        matcher->requireEnd() == TRUE) {
3700        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3701        failed = TRUE;
3702    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3703        UTF8Matcher->requireEnd() == TRUE) {
3704        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3705        failed = TRUE;
3706    }
3707
3708    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3709        matcher->requireEnd() == FALSE) {
3710        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3711        failed = TRUE;
3712    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3713        UTF8Matcher->requireEnd() == FALSE) {
3714        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3715        failed = TRUE;
3716    }
3717
3718    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3719        matcher->hitEnd() == TRUE) {
3720        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3721        failed = TRUE;
3722    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3723               UTF8Matcher->hitEnd() == TRUE) {
3724        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3725        failed = TRUE;
3726    }
3727
3728    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3729        matcher->hitEnd() == FALSE) {
3730        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3731        failed = TRUE;
3732    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3733               UTF8Matcher->hitEnd() == FALSE) {
3734        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3735        failed = TRUE;
3736    }
3737
3738
3739cleanupAndReturn:
3740    if (failed) {
3741        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3742            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3743        // callerPattern->dump();
3744    }
3745    delete parseMatcher;
3746    delete parsePat;
3747    delete UTF8Matcher;
3748    delete UTF8Pattern;
3749    delete matcher;
3750    delete callerPattern;
3751
3752    utext_close(&inputText);
3753    delete[] inputChars;
3754    utext_close(&patternText);
3755    delete[] patternChars;
3756    ucnv_close(UTF8Converter);
3757}
3758
3759
3760
3761
3762//---------------------------------------------------------------------------
3763//
3764//      Errors     Check for error handling in patterns.
3765//
3766//---------------------------------------------------------------------------
3767void RegexTest::Errors() {
3768    // \escape sequences that aren't implemented yet.
3769    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3770
3771    // Missing close parentheses
3772    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3773    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3774    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3775
3776    // Extra close paren
3777    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3778    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3779    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3780
3781    // Look-ahead, Look-behind
3782    //  TODO:  add tests for unbounded length look-behinds.
3783    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3784
3785    // Attempt to use non-default flags
3786    {
3787        UParseError   pe;
3788        UErrorCode    status = U_ZERO_ERROR;
3789        int32_t       flags  = UREGEX_CANON_EQ |
3790                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
3791                               UREGEX_MULTILINE;
3792        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3793        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3794        delete pat1;
3795    }
3796
3797
3798    // Quantifiers are allowed only after something that can be quantified.
3799    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3800    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3801    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3802
3803    // Mal-formed {min,max} quantifiers
3804    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3805    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3806    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3807    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3808    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3809    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3810    REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3811    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3812    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3813
3814    // Ticket 5389
3815    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3816
3817    // Invalid Back Reference \0
3818    //    For ICU 3.8 and earlier
3819    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3820    //
3821    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3822
3823}
3824
3825
3826//-------------------------------------------------------------------------------
3827//
3828//  Read a text data file, convert it to UChars, and return the data
3829//    in one big UChar * buffer, which the caller must delete.
3830//
3831//--------------------------------------------------------------------------------
3832UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3833                                     const char *defEncoding, UErrorCode &status) {
3834    UChar       *retPtr  = NULL;
3835    char        *fileBuf = NULL;
3836    UConverter* conv     = NULL;
3837    FILE        *f       = NULL;
3838
3839    ulen = 0;
3840    if (U_FAILURE(status)) {
3841        return retPtr;
3842    }
3843
3844    //
3845    //  Open the file.
3846    //
3847    f = fopen(fileName, "rb");
3848    if (f == 0) {
3849        dataerrln("Error opening test data file %s\n", fileName);
3850        status = U_FILE_ACCESS_ERROR;
3851        return NULL;
3852    }
3853    //
3854    //  Read it in
3855    //
3856    int32_t            fileSize;
3857    int32_t            amt_read;
3858
3859    fseek( f, 0, SEEK_END);
3860    fileSize = ftell(f);
3861    fileBuf = new char[fileSize];
3862    fseek(f, 0, SEEK_SET);
3863    amt_read = fread(fileBuf, 1, fileSize, f);
3864    if (amt_read != fileSize || fileSize <= 0) {
3865        errln("Error reading test data file.");
3866        goto cleanUpAndReturn;
3867    }
3868
3869    //
3870    // Look for a Unicode Signature (BOM) on the data just read
3871    //
3872    int32_t        signatureLength;
3873    const char *   fileBufC;
3874    const char*    encoding;
3875
3876    fileBufC = fileBuf;
3877    encoding = ucnv_detectUnicodeSignature(
3878        fileBuf, fileSize, &signatureLength, &status);
3879    if(encoding!=NULL ){
3880        fileBufC  += signatureLength;
3881        fileSize  -= signatureLength;
3882    } else {
3883        encoding = defEncoding;
3884        if (strcmp(encoding, "utf-8") == 0) {
3885            errln("file %s is missing its BOM", fileName);
3886        }
3887    }
3888
3889    //
3890    // Open a converter to take the rule file to UTF-16
3891    //
3892    conv = ucnv_open(encoding, &status);
3893    if (U_FAILURE(status)) {
3894        goto cleanUpAndReturn;
3895    }
3896
3897    //
3898    // Convert the rules to UChar.
3899    //  Preflight first to determine required buffer size.
3900    //
3901    ulen = ucnv_toUChars(conv,
3902        NULL,           //  dest,
3903        0,              //  destCapacity,
3904        fileBufC,
3905        fileSize,
3906        &status);
3907    if (status == U_BUFFER_OVERFLOW_ERROR) {
3908        // Buffer Overflow is expected from the preflight operation.
3909        status = U_ZERO_ERROR;
3910
3911        retPtr = new UChar[ulen+1];
3912        ucnv_toUChars(conv,
3913            retPtr,       //  dest,
3914            ulen+1,
3915            fileBufC,
3916            fileSize,
3917            &status);
3918    }
3919
3920cleanUpAndReturn:
3921    fclose(f);
3922    delete[] fileBuf;
3923    ucnv_close(conv);
3924    if (U_FAILURE(status)) {
3925        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3926        delete []retPtr;
3927        retPtr = 0;
3928        ulen   = 0;
3929    };
3930    return retPtr;
3931}
3932
3933
3934//-------------------------------------------------------------------------------
3935//
3936//   PerlTests  - Run Perl's regular expression tests
3937//                The input file for this test is re_tests, the standard regular
3938//                expression test data distributed with the Perl source code.
3939//
3940//                Here is Perl's description of the test data file:
3941//
3942//        # The tests are in a separate file 't/op/re_tests'.
3943//        # Each line in that file is a separate test.
3944//        # There are five columns, separated by tabs.
3945//        #
3946//        # Column 1 contains the pattern, optionally enclosed in C<''>.
3947//        # Modifiers can be put after the closing C<'>.
3948//        #
3949//        # Column 2 contains the string to be matched.
3950//        #
3951//        # Column 3 contains the expected result:
3952//        #     y   expect a match
3953//        #     n   expect no match
3954//        #     c   expect an error
3955//        # B   test exposes a known bug in Perl, should be skipped
3956//        # b   test exposes a known bug in Perl, should be skipped if noamp
3957//        #
3958//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3959//        #
3960//        # Column 4 contains a string, usually C<$&>.
3961//        #
3962//        # Column 5 contains the expected result of double-quote
3963//        # interpolating that string after the match, or start of error message.
3964//        #
3965//        # Column 6, if present, contains a reason why the test is skipped.
3966//        # This is printed with "skipped", for harness to pick up.
3967//        #
3968//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3969//        #
3970//        # If you want to add a regular expression test that can't be expressed
3971//        # in this format, don't add it here: put it in op/pat.t instead.
3972//
3973//        For ICU, if field 3 contains an 'i', the test will be skipped.
//        Such a test exposes some known incompatibility between ICU and Perl regexps.
3975//        (The i is in addition to whatever was there before.)
3976//
3977//-------------------------------------------------------------------------------
3978void RegexTest::PerlTests() {
3979    char tdd[2048];
3980    const char *srcPath;
3981    UErrorCode  status = U_ZERO_ERROR;
3982    UParseError pe;
3983
3984    //
3985    //  Open and read the test data file.
3986    //
3987    srcPath=getPath(tdd, "re_tests.txt");
3988    if(srcPath==NULL) {
3989        return; /* something went wrong, error already output */
3990    }
3991
3992    int32_t    len;
3993    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3994    if (U_FAILURE(status)) {
3995        return; /* something went wrong, error already output */
3996    }
3997
3998    //
3999    //  Put the test data into a UnicodeString
4000    //
4001    UnicodeString testDataString(FALSE, testData, len);
4002
4003    //
4004    //  Regex to break the input file into lines, and strip the new lines.
4005    //     One line per match, capture group one is the desired data.
4006    //
4007    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4008    if (U_FAILURE(status)) {
4009        dataerrln("RegexPattern::compile() error");
4010        return;
4011    }
4012    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4013
4014    //
4015    //  Regex to split a test file line into fields.
4016    //    There are six fields, separated by tabs.
4017    //
4018    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4019
4020    //
4021    //  Regex to identify test patterns with flag settings, and to separate them.
4022    //    Test patterns with flags look like 'pattern'i
4023    //    Test patterns without flags are not quoted:   pattern
4024    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4025    //
4026    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4027    RegexMatcher* flagMat = flagPat->matcher(status);
4028
4029    //
4030    // The Perl tests reference several perl-isms, which are evaluated/substituted
4031    //   in the test data.  Not being perl, this must be done explicitly.  Here
4032    //   are string constants and REs for these constructs.
4033    //
4034    UnicodeString nulnulSrc("${nulnul}");
4035    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4036    nulnul = nulnul.unescape();
4037
4038    UnicodeString ffffSrc("${ffff}");
4039    UnicodeString ffff("\\uffff", -1, US_INV);
4040    ffff = ffff.unescape();
4041
4042    //  regexp for $-[0], $+[2], etc.
4043    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4044    RegexMatcher *groupsMat = groupsPat->matcher(status);
4045
4046    //  regexp for $0, $1, $2, etc.
4047    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4048    RegexMatcher *cgMat = cgPat->matcher(status);
4049
4050
4051    //
4052    // Main Loop for the Perl Tests, runs once per line from the
4053    //   test data file.
4054    //
4055    int32_t  lineNum = 0;
4056    int32_t  skippedUnimplementedCount = 0;
4057    while (lineMat->find()) {
4058        lineNum++;
4059
4060        //
4061        //  Get a line, break it into its fields, do the Perl
4062        //    variable substitutions.
4063        //
4064        UnicodeString line = lineMat->group(1, status);
4065        UnicodeString fields[7];
4066        fieldPat->split(line, fields, 7, status);
4067
4068        flagMat->reset(fields[0]);
4069        flagMat->matches(status);
4070        UnicodeString pattern  = flagMat->group(2, status);
4071        pattern.findAndReplace("${bang}", "!");
4072        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4073        pattern.findAndReplace(ffffSrc, ffff);
4074
4075        //
4076        //  Identify patterns that include match flag settings,
4077        //    split off the flags, remove the extra quotes.
4078        //
4079        UnicodeString flagStr = flagMat->group(3, status);
4080        if (U_FAILURE(status)) {
4081            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4082            return;
4083        }
4084        int32_t flags = 0;
4085        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4086        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4087        const UChar UChar_m = 0x6d;
4088        const UChar UChar_x = 0x78;
4089        const UChar UChar_y = 0x79;
4090        if (flagStr.indexOf(UChar_i) != -1) {
4091            flags |= UREGEX_CASE_INSENSITIVE;
4092        }
4093        if (flagStr.indexOf(UChar_m) != -1) {
4094            flags |= UREGEX_MULTILINE;
4095        }
4096        if (flagStr.indexOf(UChar_x) != -1) {
4097            flags |= UREGEX_COMMENTS;
4098        }
4099
4100        //
4101        // Compile the test pattern.
4102        //
4103        status = U_ZERO_ERROR;
4104        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4105        if (status == U_REGEX_UNIMPLEMENTED) {
4106            //
4107            // Test of a feature that is planned for ICU, but not yet implemented.
4108            //   skip the test.
4109            skippedUnimplementedCount++;
4110            delete testPat;
4111            status = U_ZERO_ERROR;
4112            continue;
4113        }
4114
4115        if (U_FAILURE(status)) {
4116            // Some tests are supposed to generate errors.
4117            //   Only report an error for tests that are supposed to succeed.
4118            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4119                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4120            {
4121                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4122            }
4123            status = U_ZERO_ERROR;
4124            delete testPat;
4125            continue;
4126        }
4127
4128        if (fields[2].indexOf(UChar_i) >= 0) {
4129            // ICU should skip this test.
4130            delete testPat;
4131            continue;
4132        }
4133
4134        if (fields[2].indexOf(UChar_c) >= 0) {
4135            // This pattern should have caused a compilation error, but didn't/
4136            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4137            delete testPat;
4138            continue;
4139        }
4140
4141        //
4142        // replace the Perl variables that appear in some of the
4143        //   match data strings.
4144        //
4145        UnicodeString matchString = fields[1];
4146        matchString.findAndReplace(nulnulSrc, nulnul);
4147        matchString.findAndReplace(ffffSrc,   ffff);
4148
4149        // Replace any \n in the match string with an actual new-line char.
4150        //  Don't do full unescape, as this unescapes more than Perl does, which
4151        //  causes other spurious failures in the tests.
4152        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4153
4154
4155
4156        //
4157        // Run the test, check for expected match/don't match result.
4158        //
4159        RegexMatcher *testMat = testPat->matcher(matchString, status);
4160        UBool found = testMat->find();
4161        UBool expected = FALSE;
4162        if (fields[2].indexOf(UChar_y) >=0) {
4163            expected = TRUE;
4164        }
4165        if (expected != found) {
4166            errln("line %d: Expected %smatch, got %smatch",
4167                lineNum, expected?"":"no ", found?"":"no " );
4168            continue;
4169        }
4170
4171        // Don't try to check expected results if there is no match.
4172        //   (Some have stuff in the expected fields)
4173        if (!found) {
4174            delete testMat;
4175            delete testPat;
4176            continue;
4177        }
4178
4179        //
4180        // Interpret the Perl expression from the fourth field of the data file,
4181        // building up an ICU string from the results of the ICU match.
4182        //   The Perl expression will contain references to the results of
4183        //     a regex match, including the matched string, capture group strings,
4184        //     group starting and ending indicies, etc.
4185        //
4186        UnicodeString resultString;
4187        UnicodeString perlExpr = fields[3];
4188#if SUPPORT_MUTATING_INPUT_STRING
4189        groupsMat->reset(perlExpr);
4190        cgMat->reset(perlExpr);
4191#endif
4192
4193        while (perlExpr.length() > 0) {
4194#if !SUPPORT_MUTATING_INPUT_STRING
4195            //  Perferred usage.  Reset after any modification to input string.
4196            groupsMat->reset(perlExpr);
4197            cgMat->reset(perlExpr);
4198#endif
4199
4200            if (perlExpr.startsWith("$&")) {
4201                resultString.append(testMat->group(status));
4202                perlExpr.remove(0, 2);
4203            }
4204
4205            else if (groupsMat->lookingAt(status)) {
4206                // $-[0]   $+[2]  etc.
4207                UnicodeString digitString = groupsMat->group(2, status);
4208                int32_t t = 0;
4209                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4210                UnicodeString plusOrMinus = groupsMat->group(1, status);
4211                int32_t matchPosition;
4212                if (plusOrMinus.compare("+") == 0) {
4213                    matchPosition = testMat->end(groupNum, status);
4214                } else {
4215                    matchPosition = testMat->start(groupNum, status);
4216                }
4217                if (matchPosition != -1) {
4218                    ICU_Utility::appendNumber(resultString, matchPosition);
4219                }
4220                perlExpr.remove(0, groupsMat->end(status));
4221            }
4222
4223            else if (cgMat->lookingAt(status)) {
4224                // $1, $2, $3, etc.
4225                UnicodeString digitString = cgMat->group(1, status);
4226                int32_t t = 0;
4227                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4228                if (U_SUCCESS(status)) {
4229                    resultString.append(testMat->group(groupNum, status));
4230                    status = U_ZERO_ERROR;
4231                }
4232                perlExpr.remove(0, cgMat->end(status));
4233            }
4234
4235            else if (perlExpr.startsWith("@-")) {
4236                int32_t i;
4237                for (i=0; i<=testMat->groupCount(); i++) {
4238                    if (i>0) {
4239                        resultString.append(" ");
4240                    }
4241                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4242                }
4243                perlExpr.remove(0, 2);
4244            }
4245
4246            else if (perlExpr.startsWith("@+")) {
4247                int32_t i;
4248                for (i=0; i<=testMat->groupCount(); i++) {
4249                    if (i>0) {
4250                        resultString.append(" ");
4251                    }
4252                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4253                }
4254                perlExpr.remove(0, 2);
4255            }
4256
4257            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4258                                                     //           or as an escaped sequence (e.g. \n)
4259                if (perlExpr.length() > 1) {
4260                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4261                }
4262                UChar c = perlExpr.charAt(0);
4263                switch (c) {
4264                case 'n':   c = '\n'; break;
4265                // add any other escape sequences that show up in the test expected results.
4266                }
4267                resultString.append(c);
4268                perlExpr.remove(0, 1);
4269            }
4270
4271            else  {
4272                // Any characters from the perl expression that we don't explicitly
4273                //  recognize before here are assumed to be literals and copied
4274                //  as-is to the expected results.
4275                resultString.append(perlExpr.charAt(0));
4276                perlExpr.remove(0, 1);
4277            }
4278
4279            if (U_FAILURE(status)) {
4280                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4281                break;
4282            }
4283        }
4284
4285        //
4286        // Expected Results Compare
4287        //
4288        UnicodeString expectedS(fields[4]);
4289        expectedS.findAndReplace(nulnulSrc, nulnul);
4290        expectedS.findAndReplace(ffffSrc,   ffff);
4291        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4292
4293
4294        if (expectedS.compare(resultString) != 0) {
4295            err("Line %d: Incorrect perl expression results.", lineNum);
4296            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4297        }
4298
4299        delete testMat;
4300        delete testPat;
4301    }
4302
4303    //
4304    // All done.  Clean up allocated stuff.
4305    //
4306    delete cgMat;
4307    delete cgPat;
4308
4309    delete groupsMat;
4310    delete groupsPat;
4311
4312    delete flagMat;
4313    delete flagPat;
4314
4315    delete lineMat;
4316    delete linePat;
4317
4318    delete fieldPat;
4319    delete [] testData;
4320
4321
4322    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4323
4324}
4325
4326
4327//-------------------------------------------------------------------------------
4328//
4329//   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4330//                  (instead of using UnicodeStrings) to test the alternate engine.
4331//                  The input file for this test is re_tests, the standard regular
4332//                  expression test data distributed with the Perl source code.
4333//                  See PerlTests() for more information.
4334//
4335//-------------------------------------------------------------------------------
4336void RegexTest::PerlTestsUTF8() {
4337    char tdd[2048];
4338    const char *srcPath;
4339    UErrorCode  status = U_ZERO_ERROR;
4340    UParseError pe;
4341    LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4342    UText       patternText = UTEXT_INITIALIZER;
4343    char       *patternChars = NULL;
4344    int32_t     patternLength;
4345    int32_t     patternCapacity = 0;
4346    UText       inputText = UTEXT_INITIALIZER;
4347    char       *inputChars = NULL;
4348    int32_t     inputLength;
4349    int32_t     inputCapacity = 0;
4350
4351    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4352
4353    //
4354    //  Open and read the test data file.
4355    //
4356    srcPath=getPath(tdd, "re_tests.txt");
4357    if(srcPath==NULL) {
4358        return; /* something went wrong, error already output */
4359    }
4360
4361    int32_t    len;
4362    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4363    if (U_FAILURE(status)) {
4364        return; /* something went wrong, error already output */
4365    }
4366
4367    //
4368    //  Put the test data into a UnicodeString
4369    //
4370    UnicodeString testDataString(FALSE, testData, len);
4371
4372    //
4373    //  Regex to break the input file into lines, and strip the new lines.
4374    //     One line per match, capture group one is the desired data.
4375    //
4376    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4377    if (U_FAILURE(status)) {
4378        dataerrln("RegexPattern::compile() error");
4379        return;
4380    }
4381    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4382
4383    //
4384    //  Regex to split a test file line into fields.
4385    //    There are six fields, separated by tabs.
4386    //
4387    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4388
4389    //
4390    //  Regex to identify test patterns with flag settings, and to separate them.
4391    //    Test patterns with flags look like 'pattern'i
4392    //    Test patterns without flags are not quoted:   pattern
4393    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4394    //
4395    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4396    RegexMatcher* flagMat = flagPat->matcher(status);
4397
4398    //
4399    // The Perl tests reference several perl-isms, which are evaluated/substituted
4400    //   in the test data.  Not being perl, this must be done explicitly.  Here
4401    //   are string constants and REs for these constructs.
4402    //
4403    UnicodeString nulnulSrc("${nulnul}");
4404    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4405    nulnul = nulnul.unescape();
4406
4407    UnicodeString ffffSrc("${ffff}");
4408    UnicodeString ffff("\\uffff", -1, US_INV);
4409    ffff = ffff.unescape();
4410
4411    //  regexp for $-[0], $+[2], etc.
4412    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4413    RegexMatcher *groupsMat = groupsPat->matcher(status);
4414
4415    //  regexp for $0, $1, $2, etc.
4416    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4417    RegexMatcher *cgMat = cgPat->matcher(status);
4418
4419
4420    //
4421    // Main Loop for the Perl Tests, runs once per line from the
4422    //   test data file.
4423    //
4424    int32_t  lineNum = 0;
4425    int32_t  skippedUnimplementedCount = 0;
4426    while (lineMat->find()) {
4427        lineNum++;
4428
4429        //
4430        //  Get a line, break it into its fields, do the Perl
4431        //    variable substitutions.
4432        //
4433        UnicodeString line = lineMat->group(1, status);
4434        UnicodeString fields[7];
4435        fieldPat->split(line, fields, 7, status);
4436
4437        flagMat->reset(fields[0]);
4438        flagMat->matches(status);
4439        UnicodeString pattern  = flagMat->group(2, status);
4440        pattern.findAndReplace("${bang}", "!");
4441        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4442        pattern.findAndReplace(ffffSrc, ffff);
4443
4444        //
4445        //  Identify patterns that include match flag settings,
4446        //    split off the flags, remove the extra quotes.
4447        //
4448        UnicodeString flagStr = flagMat->group(3, status);
4449        if (U_FAILURE(status)) {
4450            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4451            return;
4452        }
4453        int32_t flags = 0;
4454        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4455        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4456        const UChar UChar_m = 0x6d;
4457        const UChar UChar_x = 0x78;
4458        const UChar UChar_y = 0x79;
4459        if (flagStr.indexOf(UChar_i) != -1) {
4460            flags |= UREGEX_CASE_INSENSITIVE;
4461        }
4462        if (flagStr.indexOf(UChar_m) != -1) {
4463            flags |= UREGEX_MULTILINE;
4464        }
4465        if (flagStr.indexOf(UChar_x) != -1) {
4466            flags |= UREGEX_COMMENTS;
4467        }
4468
4469        //
4470        // Put the pattern in a UTF-8 UText
4471        //
4472        status = U_ZERO_ERROR;
4473        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4474        if (status == U_BUFFER_OVERFLOW_ERROR) {
4475            status = U_ZERO_ERROR;
4476            delete[] patternChars;
4477            patternCapacity = patternLength + 1;
4478            patternChars = new char[patternCapacity];
4479            pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4480        }
4481        utext_openUTF8(&patternText, patternChars, patternLength, &status);
4482
4483        //
4484        // Compile the test pattern.
4485        //
4486        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4487        if (status == U_REGEX_UNIMPLEMENTED) {
4488            //
4489            // Test of a feature that is planned for ICU, but not yet implemented.
4490            //   skip the test.
4491            skippedUnimplementedCount++;
4492            delete testPat;
4493            status = U_ZERO_ERROR;
4494            continue;
4495        }
4496
4497        if (U_FAILURE(status)) {
4498            // Some tests are supposed to generate errors.
4499            //   Only report an error for tests that are supposed to succeed.
4500            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4501                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4502            {
4503                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4504            }
4505            status = U_ZERO_ERROR;
4506            delete testPat;
4507            continue;
4508        }
4509
4510        if (fields[2].indexOf(UChar_i) >= 0) {
4511            // ICU should skip this test.
4512            delete testPat;
4513            continue;
4514        }
4515
4516        if (fields[2].indexOf(UChar_c) >= 0) {
4517            // This pattern should have caused a compilation error, but didn't/
4518            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4519            delete testPat;
4520            continue;
4521        }
4522
4523
4524        //
4525        // replace the Perl variables that appear in some of the
4526        //   match data strings.
4527        //
4528        UnicodeString matchString = fields[1];
4529        matchString.findAndReplace(nulnulSrc, nulnul);
4530        matchString.findAndReplace(ffffSrc,   ffff);
4531
4532        // Replace any \n in the match string with an actual new-line char.
4533        //  Don't do full unescape, as this unescapes more than Perl does, which
4534        //  causes other spurious failures in the tests.
4535        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4536
4537        //
4538        // Put the input in a UTF-8 UText
4539        //
4540        status = U_ZERO_ERROR;
4541        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4542        if (status == U_BUFFER_OVERFLOW_ERROR) {
4543            status = U_ZERO_ERROR;
4544            delete[] inputChars;
4545            inputCapacity = inputLength + 1;
4546            inputChars = new char[inputCapacity];
4547            matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4548        }
4549        utext_openUTF8(&inputText, inputChars, inputLength, &status);
4550
4551        //
4552        // Run the test, check for expected match/don't match result.
4553        //
4554        RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4555        UBool found = testMat->find();
4556        UBool expected = FALSE;
4557        if (fields[2].indexOf(UChar_y) >=0) {
4558            expected = TRUE;
4559        }
4560        if (expected != found) {
4561            errln("line %d: Expected %smatch, got %smatch",
4562                lineNum, expected?"":"no ", found?"":"no " );
4563            continue;
4564        }
4565
4566        // Don't try to check expected results if there is no match.
4567        //   (Some have stuff in the expected fields)
4568        if (!found) {
4569            delete testMat;
4570            delete testPat;
4571            continue;
4572        }
4573
4574        //
4575        // Interpret the Perl expression from the fourth field of the data file,
4576        // building up an ICU string from the results of the ICU match.
4577        //   The Perl expression will contain references to the results of
4578        //     a regex match, including the matched string, capture group strings,
4579        //     group starting and ending indicies, etc.
4580        //
4581        UnicodeString resultString;
4582        UnicodeString perlExpr = fields[3];
4583
4584        while (perlExpr.length() > 0) {
4585            groupsMat->reset(perlExpr);
4586            cgMat->reset(perlExpr);
4587
4588            if (perlExpr.startsWith("$&")) {
4589                resultString.append(testMat->group(status));
4590                perlExpr.remove(0, 2);
4591            }
4592
4593            else if (groupsMat->lookingAt(status)) {
4594                // $-[0]   $+[2]  etc.
4595                UnicodeString digitString = groupsMat->group(2, status);
4596                int32_t t = 0;
4597                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4598                UnicodeString plusOrMinus = groupsMat->group(1, status);
4599                int32_t matchPosition;
4600                if (plusOrMinus.compare("+") == 0) {
4601                    matchPosition = testMat->end(groupNum, status);
4602                } else {
4603                    matchPosition = testMat->start(groupNum, status);
4604                }
4605                if (matchPosition != -1) {
4606                    ICU_Utility::appendNumber(resultString, matchPosition);
4607                }
4608                perlExpr.remove(0, groupsMat->end(status));
4609            }
4610
4611            else if (cgMat->lookingAt(status)) {
4612                // $1, $2, $3, etc.
4613                UnicodeString digitString = cgMat->group(1, status);
4614                int32_t t = 0;
4615                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4616                if (U_SUCCESS(status)) {
4617                    resultString.append(testMat->group(groupNum, status));
4618                    status = U_ZERO_ERROR;
4619                }
4620                perlExpr.remove(0, cgMat->end(status));
4621            }
4622
4623            else if (perlExpr.startsWith("@-")) {
4624                int32_t i;
4625                for (i=0; i<=testMat->groupCount(); i++) {
4626                    if (i>0) {
4627                        resultString.append(" ");
4628                    }
4629                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4630                }
4631                perlExpr.remove(0, 2);
4632            }
4633
4634            else if (perlExpr.startsWith("@+")) {
4635                int32_t i;
4636                for (i=0; i<=testMat->groupCount(); i++) {
4637                    if (i>0) {
4638                        resultString.append(" ");
4639                    }
4640                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4641                }
4642                perlExpr.remove(0, 2);
4643            }
4644
4645            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4646                                                     //           or as an escaped sequence (e.g. \n)
4647                if (perlExpr.length() > 1) {
4648                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4649                }
4650                UChar c = perlExpr.charAt(0);
4651                switch (c) {
4652                case 'n':   c = '\n'; break;
4653                // add any other escape sequences that show up in the test expected results.
4654                }
4655                resultString.append(c);
4656                perlExpr.remove(0, 1);
4657            }
4658
4659            else  {
4660                // Any characters from the perl expression that we don't explicitly
4661                //  recognize before here are assumed to be literals and copied
4662                //  as-is to the expected results.
4663                resultString.append(perlExpr.charAt(0));
4664                perlExpr.remove(0, 1);
4665            }
4666
4667            if (U_FAILURE(status)) {
4668                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4669                break;
4670            }
4671        }
4672
4673        //
4674        // Expected Results Compare
4675        //
4676        UnicodeString expectedS(fields[4]);
4677        expectedS.findAndReplace(nulnulSrc, nulnul);
4678        expectedS.findAndReplace(ffffSrc,   ffff);
4679        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4680
4681
4682        if (expectedS.compare(resultString) != 0) {
4683            err("Line %d: Incorrect perl expression results.", lineNum);
4684            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4685        }
4686
4687        delete testMat;
4688        delete testPat;
4689    }
4690
4691    //
4692    // All done.  Clean up allocated stuff.
4693    //
4694    delete cgMat;
4695    delete cgPat;
4696
4697    delete groupsMat;
4698    delete groupsPat;
4699
4700    delete flagMat;
4701    delete flagPat;
4702
4703    delete lineMat;
4704    delete linePat;
4705
4706    delete fieldPat;
4707    delete [] testData;
4708
4709    utext_close(&patternText);
4710    utext_close(&inputText);
4711
4712    delete [] patternChars;
4713    delete [] inputChars;
4714
4715
4716    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4717
4718}
4719
4720
4721//--------------------------------------------------------------
4722//
4723//  Bug6149   Verify limits to heap expansion for backtrack stack.
4724//             Use this pattern,
4725//                 "(a?){1,8000000}"
//             Note: this was an unbounded upper bound, but that now has loop-breaking enabled.
4727//                   This test is likely to be fragile, as further optimizations stop
4728//                   more cases of pointless looping in the match engine.
4729//
4730//---------------------------------------------------------------
void RegexTest::Bug6149() {
    // Match a pattern with a very large bounded repeat count against a short
    // input that contains no 'a' at all.
    UnicodeString pattern("(a?){1,8000000}");
    UnicodeString s("xyz");
    uint32_t flags = 0;
    UErrorCode status = U_ZERO_ERROR;

    RegexMatcher  matcher(pattern, s, flags, status);
    UBool result = false;
    // The match attempt is expected to end with U_REGEX_STACK_OVERFLOW rather
    // than grow the backtrack stack without limit.
    // NOTE(review): REGEX_ASSERT_FAIL is defined elsewhere in this file; which
    // "status" variable the expression ends up using depends on that macro's
    // expansion - confirm before renaming or moving the local above.
    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
    // A match that was cut short must not be reported as successful.
    REGEX_ASSERT(result == FALSE);
 }
4742
4743
4744//
4745//   Callbacks()    Test the callback function.
4746//                  When set, callbacks occur periodically during matching operations,
4747//                  giving the application code the ability to abort the operation
//                  before its normal completion.
4749//
4750
4751struct callBackContext {
4752    RegexTest        *test;
4753    int32_t          maxCalls;
4754    int32_t          numCalls;
4755    int32_t          lastSteps;
4756    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4757};
4758
4759U_CDECL_BEGIN
4760static UBool U_CALLCONV
4761testCallBackFn(const void *context, int32_t steps) {
4762    callBackContext  *info = (callBackContext *)context;
4763    if (info->lastSteps+1 != steps) {
4764        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4765    }
4766    info->lastSteps = steps;
4767    info->numCalls++;
4768    return (info->numCalls < info->maxCalls);
4769}
4770U_CDECL_END
4771
// Exercises RegexMatcher::setMatchCallback() / getMatchCallback() and checks
// how often testCallBackFn() is invoked for matches of increasing cost.
void RegexTest::Callbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void          *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        //   cbInfo is the context handed to testCallBackFn() on every callback.
        callBackContext cbInfo = {this, 0, 0, 0};
        const void          *returnedContext;
        URegexMatchCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        //   With a limit of 1 the very first callback would already ask to stop,
        //   so a successful match with numCalls == 0 shows it was never called.
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        //   callback, but not so long that the callback aborts it.
        //   The callback asks to stop on its 4th invocation, so this input must
        //   need at least one but fewer than four callbacks.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        //   The abort is expected to surface as U_REGEX_STOPPED_BY_CALLER after
        //   exactly the 4 permitted callbacks.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);

        // A longer running find that the callback function will abort.
        //   Same input and limit as above, but driven through find() instead of matches().
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}
4845
4846
4847//
4848//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4852//
4853
4854struct progressCallBackContext {
4855    RegexTest        *test;
4856    int64_t          lastIndex;
4857    int32_t          maxCalls;
4858    int32_t          numCalls;
4859    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4860};
4861
4862// call-back function for find().
4863// Return TRUE to continue the find().
4864// Return FALSE to stop the find().
4865U_CDECL_BEGIN
4866static UBool U_CALLCONV
4867testProgressCallBackFn(const void *context, int64_t matchIndex) {
4868    progressCallBackContext  *info = (progressCallBackContext *)context;
4869    info->numCalls++;
4870    info->lastIndex = matchIndex;
4871//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4872    return (info->numCalls < info->maxCalls);
4873}
4874U_CDECL_END
4875
// Exercises RegexMatcher::setFindProgressCallback() / getFindProgressCallback()
// and checks when testProgressCallBackFn() is invoked during find().
void RegexTest::FindProgressCallbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void                  *returnedContext = &returnedContext;
        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        //   cbInfo is the context handed to testProgressCallBackFn() on every callback.
        progressCallBackContext cbInfo = {this, 0, 0, 0};
        const void                  *returnedContext;
        URegexFindProgressCallback  *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
        REGEX_CHECK_STATUS;
        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A find that matches on the initial position does NOT invoke the callback.
        status = U_ZERO_ERROR;
        cbInfo.reset(100);
        UnicodeString s = "aaxxx";
        matcher.reset(s);
#if 0
        matcher.setTrace(TRUE);
#endif
        REGEX_ASSERT(matcher.find(0, status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium running find() that causes matcher.find() to invoke our callback for each index,
        //   but not so many times that we interrupt the operation.
        // NOTE(review): the limit installed below is exactly s.length(), not a
        //   larger value as the trailing comment says; the find must therefore
        //   finish in fewer callbacks than that for the status check to pass.
        status = U_ZERO_ERROR;
        s = "aaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
        //   The callback asks to stop on invocation number (length - 5); the find is
        //   expected to end with U_REGEX_STOPPED_BY_CALLER after exactly that many calls.
        status = U_ZERO_ERROR;
        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
        matcher.reset(s1);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

        // Now a match that will succeed, but after an interruption
        status = U_ZERO_ERROR;
        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
        matcher.reset(s2);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        // Now retry the match from where left off
        //   cbInfo.lastIndex holds the index passed to the last callback before the
        //   interruption.  Only maxCalls is raised; numCalls keeps its current value.
        cbInfo.maxCalls = 100; //  No callback limit
        status = U_ZERO_ERROR;
        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
        REGEX_CHECK_STATUS;
    }


}
4956
4957
4958//---------------------------------------------------------------------------
4959//
4960//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4961//                             UTexts. The pure-C implementation of UText
4962//                             has no mutable backing stores, but we can
4963//                             use UnicodeString here to test the functionality.
4964//
4965//---------------------------------------------------------------------------
4966void RegexTest::PreAllocatedUTextCAPI () {
4967    UErrorCode           status = U_ZERO_ERROR;
4968    URegularExpression  *re;
4969    UText                patternText = UTEXT_INITIALIZER;
4970    UnicodeString        buffer;
4971    UText                bufferText = UTEXT_INITIALIZER;
4972
4973    utext_openUnicodeString(&bufferText, &buffer, &status);
4974
4975    /*
4976     *  getText() and getUText()
4977     */
4978    {
4979        UText  text1 = UTEXT_INITIALIZER;
4980        UText  text2 = UTEXT_INITIALIZER;
4981        UChar  text2Chars[20];
4982        UText  *resultText;
4983
4984        status = U_ZERO_ERROR;
4985        regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4986        regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4987        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4988        utext_openUChars(&text2, text2Chars, -1, &status);
4989
4990        regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4991        re = uregex_openUText(&patternText, 0, NULL, &status);
4992
4993        /* First set a UText */
4994        uregex_setUText(re, &text1, &status);
4995        resultText = uregex_getUText(re, &bufferText, &status);
4996        REGEX_CHECK_STATUS;
4997        REGEX_ASSERT(resultText == &bufferText);
4998        utext_setNativeIndex(resultText, 0);
4999        utext_setNativeIndex(&text1, 0);
5000        REGEX_ASSERT(testUTextEqual(resultText, &text1));
5001
5002        resultText = uregex_getUText(re, &bufferText, &status);
5003        REGEX_CHECK_STATUS;
5004        REGEX_ASSERT(resultText == &bufferText);
5005        utext_setNativeIndex(resultText, 0);
5006        utext_setNativeIndex(&text1, 0);
5007        REGEX_ASSERT(testUTextEqual(resultText, &text1));
5008
5009        /* Then set a UChar * */
5010        uregex_setText(re, text2Chars, 7, &status);
5011        resultText = uregex_getUText(re, &bufferText, &status);
5012        REGEX_CHECK_STATUS;
5013        REGEX_ASSERT(resultText == &bufferText);
5014        utext_setNativeIndex(resultText, 0);
5015        utext_setNativeIndex(&text2, 0);
5016        REGEX_ASSERT(testUTextEqual(resultText, &text2));
5017
5018        uregex_close(re);
5019        utext_close(&text1);
5020        utext_close(&text2);
5021    }
5022
5023    /*
5024     *  group()
5025     */
5026    {
5027        UChar    text1[80];
5028        UText   *actual;
5029        UBool    result;
5030        int64_t  length = 0;
5031
5032        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5033        //                  012345678901234567890123456789012345678901234567
5034        //                  0         1         2         3         4
5035
5036        status = U_ZERO_ERROR;
5037        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5038        REGEX_CHECK_STATUS;
5039
5040        uregex_setText(re, text1, -1, &status);
5041        result = uregex_find(re, 0, &status);
5042        REGEX_ASSERT(result==TRUE);
5043
5044        /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5045        status = U_ZERO_ERROR;
5046        actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5047        REGEX_CHECK_STATUS;
5048        REGEX_ASSERT(actual == &bufferText);
5049        REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5050        REGEX_ASSERT(length == 16);
5051        REGEX_ASSERT(utext_nativeLength(actual) == 47);
5052
5053        /*  Capture group #1.  Should succeed, matching " interior ". */
5054        status = U_ZERO_ERROR;
5055        actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5056        REGEX_CHECK_STATUS;
5057        REGEX_ASSERT(actual == &bufferText);
5058        REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5059        REGEX_ASSERT(length == 10);
5060        REGEX_ASSERT(utext_nativeLength(actual) == 47);
5061
5062        /*  Capture group out of range.  Error. */
5063        status = U_ZERO_ERROR;
5064        actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5065        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5066        REGEX_ASSERT(actual == &bufferText);
5067        uregex_close(re);
5068
5069    }
5070
5071    /*
5072     *  replaceFirst()
5073     */
5074    {
5075        UChar    text1[80];
5076        UChar    text2[80];
5077        UText    replText = UTEXT_INITIALIZER;
5078        UText   *result;
5079        status = U_ZERO_ERROR;
5080        utext_openUnicodeString(&bufferText, &buffer, &status);
5081
5082        status = U_ZERO_ERROR;
5083        u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5084        u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5085        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5086
5087        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5088        REGEX_CHECK_STATUS;
5089
5090        /*  Normal case, with match */
5091        uregex_setText(re, text1, -1, &status);
5092        REGEX_CHECK_STATUS;
5093        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5094        REGEX_CHECK_STATUS;
5095        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5096        REGEX_CHECK_STATUS;
5097        REGEX_ASSERT(result == &bufferText);
5098        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5099
5100        /* No match.  Text should copy to output with no changes.  */
5101        uregex_setText(re, text2, -1, &status);
5102        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5103        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5104        REGEX_CHECK_STATUS;
5105        REGEX_ASSERT(result == &bufferText);
5106        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5107
5108        /* Unicode escapes */
5109        uregex_setText(re, text1, -1, &status);
5110        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5111        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5112        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5113        REGEX_CHECK_STATUS;
5114        REGEX_ASSERT(result == &bufferText);
5115        REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5116
5117        uregex_close(re);
5118        utext_close(&replText);
5119    }
5120
5121
5122    /*
5123     *  replaceAll()
5124     */
5125    {
5126        UChar    text1[80];
5127        UChar    text2[80];
5128        UText    replText = UTEXT_INITIALIZER;
5129        UText   *result;
5130
5131        status = U_ZERO_ERROR;
5132        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5133        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5134        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5135
5136        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5137        REGEX_CHECK_STATUS;
5138
5139        /*  Normal case, with match */
5140        uregex_setText(re, text1, -1, &status);
5141        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5142        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5143        REGEX_CHECK_STATUS;
5144        REGEX_ASSERT(result == &bufferText);
5145        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5146
5147        /* No match.  Text should copy to output with no changes.  */
5148        uregex_setText(re, text2, -1, &status);
5149        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5150        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5151        REGEX_CHECK_STATUS;
5152        REGEX_ASSERT(result == &bufferText);
5153        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5154
5155        uregex_close(re);
5156        utext_close(&replText);
5157    }
5158
5159
5160    /*
5161     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5162     *   so we don't need to test it here.
5163     */
5164
5165    utext_close(&bufferText);
5166    utext_close(&patternText);
5167}
5168
5169
5170//--------------------------------------------------------------
5171//
5172//  NamedCapture   Check basic named capture group functionality
5173//
5174//--------------------------------------------------------------
5175void RegexTest::NamedCapture() {
5176    UErrorCode status = U_ZERO_ERROR;
5177    RegexPattern *pat = RegexPattern::compile(UnicodeString(
5178            "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5179    REGEX_CHECK_STATUS;
5180    int32_t group = pat->groupNumberFromName("five", -1, status);
5181    REGEX_CHECK_STATUS;
5182    REGEX_ASSERT(5 == group);
5183    group = pat->groupNumberFromName("three", -1, status);
5184    REGEX_CHECK_STATUS;
5185    REGEX_ASSERT(3 == group);
5186
5187    status = U_ZERO_ERROR;
5188    group = pat->groupNumberFromName(UnicodeString("six"), status);
5189    REGEX_CHECK_STATUS;
5190    REGEX_ASSERT(6 == group);
5191
5192    status = U_ZERO_ERROR;
5193    group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5194    U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5195
5196    status = U_ZERO_ERROR;
5197
5198    // After copying a pattern, named capture should still work in the copy.
5199    RegexPattern *copiedPat = new RegexPattern(*pat);
5200    REGEX_ASSERT(*copiedPat == *pat);
5201    delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5202
5203    group = copiedPat->groupNumberFromName("five", -1, status);
5204    REGEX_CHECK_STATUS;
5205    REGEX_ASSERT(5 == group);
5206    group = copiedPat->groupNumberFromName("three", -1, status);
5207    REGEX_CHECK_STATUS;
5208    REGEX_ASSERT(3 == group);
5209    delete copiedPat;
5210
5211    // ReplaceAll with named capture group.
5212    status = U_ZERO_ERROR;
5213    UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5214    RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5215    REGEX_CHECK_STATUS;
5216    // m.pattern().dumpPattern();
5217    UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5218    REGEX_CHECK_STATUS;
5219    REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5220    delete m;
5221
5222    // ReplaceAll, allowed capture group numbers.
5223    text = UnicodeString("abcmxyz");
5224    m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5225    REGEX_CHECK_STATUS;
5226
5227    status = U_ZERO_ERROR;
5228    replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5229    REGEX_CHECK_STATUS;
5230    REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5231
5232    status = U_ZERO_ERROR;
5233    replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5234    REGEX_CHECK_STATUS;
5235    REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5236
5237    status = U_ZERO_ERROR;
5238    replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5239    REGEX_CHECK_STATUS;
5240    REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5241
5242    status = U_ZERO_ERROR;
5243    replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5244    REGEX_CHECK_STATUS;
5245    REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5246
5247    status = U_ZERO_ERROR;
5248    replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5249    REGEX_CHECK_STATUS;
5250    REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5251
5252    status = U_ZERO_ERROR;
5253    replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5254    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5255
5256    status = U_ZERO_ERROR;
5257    replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5258    REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5259    REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5260
5261    status = U_ZERO_ERROR;
5262    replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5263    REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5264    REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5265
5266    status = U_ZERO_ERROR;
5267    replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5268    REGEX_CHECK_STATUS;
5269    REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5270
5271    status = U_ZERO_ERROR;
5272    replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5273    REGEX_CHECK_STATUS;
5274    REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5275
5276    status = U_ZERO_ERROR;
5277    replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5278    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5279
5280    status = U_ZERO_ERROR;
5281    replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5282    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5283
5284    status = U_ZERO_ERROR;
5285    replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5286    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5287
5288    status = U_ZERO_ERROR;
5289    replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5290    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5291
5292    delete m;
5293
5294    // Repeat the above replaceAll() tests using the plain C API, which
5295    //  has a separate implementation internally.
5296    //  TODO: factor out the test data.
5297
5298    status = U_ZERO_ERROR;
5299    URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5300    REGEX_CHECK_STATUS;
5301    text = UnicodeString("abcmxyz");
5302    uregex_setText(re, text.getBuffer(), text.length(), &status);
5303    REGEX_CHECK_STATUS;
5304
5305    UChar resultBuf[100];
5306    int32_t resultLength;
5307    UnicodeString repl;
5308
5309    status = U_ZERO_ERROR;
5310    repl = UnicodeString("<$0>");
5311    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5312    REGEX_CHECK_STATUS;
5313    REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5314
5315    status = U_ZERO_ERROR;
5316    repl = UnicodeString("<$1>");
5317    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5318    REGEX_CHECK_STATUS;
5319    REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5320
5321    status = U_ZERO_ERROR;
5322    repl = UnicodeString("<${one}>");
5323    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5324    REGEX_CHECK_STATUS;
5325    REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5326
5327    status = U_ZERO_ERROR;
5328    repl = UnicodeString("<$2>");
5329    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5330    REGEX_CHECK_STATUS;
5331    REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5332
5333    status = U_ZERO_ERROR;
5334    repl = UnicodeString("<$3>");
5335    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5336    REGEX_CHECK_STATUS;
5337    REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5338
5339    status = U_ZERO_ERROR;
5340    repl = UnicodeString("<$4>");
5341    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5342    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5343
5344    status = U_ZERO_ERROR;
5345    repl = UnicodeString("<$04>");
5346    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5347    REGEX_CHECK_STATUS;
5348    REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5349
5350    status = U_ZERO_ERROR;
5351    repl = UnicodeString("<$000016>");
5352    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5353    REGEX_CHECK_STATUS;
5354    REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5355
5356    status = U_ZERO_ERROR;
5357    repl = UnicodeString("<$3$2$1${one}>");
5358    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5359    REGEX_CHECK_STATUS;
5360    REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5361
5362    status = U_ZERO_ERROR;
5363    repl = UnicodeString("$3$2$1${one}");
5364    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5365    REGEX_CHECK_STATUS;
5366    REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5367
5368    status = U_ZERO_ERROR;
5369    repl = UnicodeString("<${noSuchName}>");
5370    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5371    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5372
5373    status = U_ZERO_ERROR;
5374    repl = UnicodeString("<${invalid-name}>");
5375    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5376    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5377
5378    status = U_ZERO_ERROR;
5379    repl = UnicodeString("<${one");
5380    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5381    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5382
5383    status = U_ZERO_ERROR;
5384    repl = UnicodeString("$not a capture group");
5385    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5386    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5387
5388    uregex_close(re);
5389}
5390
5391//--------------------------------------------------------------
5392//
5393//  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5394//                       The point is not so much what the exact limit is,
5395//                       but that a largish number doesn't hit bad non-linear performance,
5396//                       and that exceeding the limit fails cleanly.
5397//
5398//--------------------------------------------------------------
5399void RegexTest::NamedCaptureLimits() {
5400    if (quick) {
5401        logln("Skipping test. Runs in exhuastive mode only.");
5402        return;
5403    }
5404    const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5405    const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5406    char nnbuf[100];
5407    UnicodeString pattern;
5408    int32_t nn;
5409
5410    for (nn=1; nn<goodLimit; nn++) {
5411        sprintf(nnbuf, "(?<nn%d>)", nn);
5412        pattern.append(UnicodeString(nnbuf, -1, US_INV));
5413    }
5414    UErrorCode status = U_ZERO_ERROR;
5415    RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5416    REGEX_CHECK_STATUS;
5417    for (nn=1; nn<goodLimit; nn++) {
5418        sprintf(nnbuf, "nn%d", nn);
5419        int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5420        REGEX_ASSERT(nn == groupNum);
5421        if (nn != groupNum) {
5422            break;
5423        }
5424    }
5425    delete pat;
5426
5427    pattern.remove();
5428    for (nn=1; nn<failLimit; nn++) {
5429        sprintf(nnbuf, "(?<nn%d>)", nn);
5430        pattern.append(UnicodeString(nnbuf, -1, US_INV));
5431    }
5432    status = U_ZERO_ERROR;
5433    pat = RegexPattern::compile(pattern, 0, status);
5434    REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5435    delete pat;
5436}
5437
5438
5439//--------------------------------------------------------------
5440//
5441//  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5442//
5443//---------------------------------------------------------------
5444void RegexTest::Bug7651() {
5445    UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5446    //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5447    //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5448    UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5449    UnicodeString s("#ff @abcd This is test");
5450    RegexPattern  *REPattern = NULL;
5451    RegexMatcher  *REMatcher = NULL;
5452    UErrorCode status = U_ZERO_ERROR;
5453    UParseError pe;
5454
5455    REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5456    REGEX_CHECK_STATUS;
5457    REMatcher = REPattern->matcher(s, status);
5458    REGEX_CHECK_STATUS;
5459    REGEX_ASSERT(REMatcher->find());
5460    REGEX_ASSERT(REMatcher->start(status) == 0);
5461    delete REPattern;
5462    delete REMatcher;
5463    status = U_ZERO_ERROR;
5464
5465    REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5466    REGEX_CHECK_STATUS;
5467    REMatcher = REPattern->matcher(s, status);
5468    REGEX_CHECK_STATUS;
5469    REGEX_ASSERT(REMatcher->find());
5470    REGEX_ASSERT(REMatcher->start(status) == 0);
5471    delete REPattern;
5472    delete REMatcher;
5473    status = U_ZERO_ERROR;
5474 }
5475
5476void RegexTest::Bug7740() {
5477    UErrorCode status = U_ZERO_ERROR;
5478    UnicodeString pattern = "(a)";
5479    UnicodeString text = "abcdef";
5480    RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5481    REGEX_CHECK_STATUS;
5482    REGEX_ASSERT(m->lookingAt(status));
5483    REGEX_CHECK_STATUS;
5484    status = U_ILLEGAL_ARGUMENT_ERROR;
5485    UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5486    REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5487    REGEX_ASSERT(s == "");
5488    delete m;
5489}
5490
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5492
5493void RegexTest::Bug8479() {
5494    UErrorCode status = U_ZERO_ERROR;
5495
5496    RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5497    REGEX_CHECK_STATUS;
5498    if (U_SUCCESS(status))
5499    {
5500        UnicodeString str;
5501        str.setToBogus();
5502        pMatcher->reset(str);
5503        status = U_ZERO_ERROR;
5504        pMatcher->matches(status);
5505        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5506        delete pMatcher;
5507    }
5508}
5509
5510
5511// Bug 7029
5512void RegexTest::Bug7029() {
5513    UErrorCode status = U_ZERO_ERROR;
5514
5515    RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5516    UnicodeString text = "abc.def";
5517    UnicodeString splits[10];
5518    REGEX_CHECK_STATUS;
5519    int32_t numFields = pMatcher->split(text, splits, 10, status);
5520    REGEX_CHECK_STATUS;
5521    REGEX_ASSERT(numFields == 8);
5522    delete pMatcher;
5523}
5524
5525// Bug 9283
//   This test is checking for the existence of any supplemental characters that case-fold
//   to a BMP character.
//
//   At the time of this writing there are none. If any should appear in a subsequent release
//   of Unicode, the code in regular expressions compilation that determines the longest
//   possible match for a literal string will need to be enhanced.
5532//
5533//   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5534//   for details on what to do in case of a failure of this test.
5535//
5536void RegexTest::Bug9283() {
5537#if !UCONFIG_NO_NORMALIZATION
5538    UErrorCode status = U_ZERO_ERROR;
5539    UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5540    REGEX_CHECK_STATUS;
5541    int32_t index;
5542    UChar32 c;
5543    for (index=0; ; index++) {
5544        c = supplementalsWithCaseFolding.charAt(index);
5545        if (c == -1) {
5546            break;
5547        }
5548        UnicodeString cf = UnicodeString(c).foldCase();
5549        REGEX_ASSERT(cf.length() >= 2);
5550    }
5551#endif /* #if !UCONFIG_NO_NORMALIZATION */
5552}
5553
5554
5555void RegexTest::CheckInvBufSize() {
5556  if(inv_next>=INV_BUFSIZ) {
5557    errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5558          __FILE__, INV_BUFSIZ, inv_next);
5559  } else {
5560    logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5561  }
5562}
5563
5564
5565void RegexTest::Bug10459() {
5566    UErrorCode status = U_ZERO_ERROR;
5567    UnicodeString patternString("(txt)");
5568    UnicodeString txtString("txt");
5569
5570    UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5571    REGEX_CHECK_STATUS;
5572    UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5573    REGEX_CHECK_STATUS;
5574
5575    URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5576    REGEX_CHECK_STATUS;
5577
5578    uregex_setUText(icu_re, utext_txt, &status);
5579    REGEX_CHECK_STATUS;
5580
5581    // The bug was that calling uregex_group() before doing a matching operation
5582    //   was causing a segfault. Only for Regular Expressions created from UText.
5583    //   It should set an U_REGEX_INVALID_STATE.
5584
5585    UChar buf[100];
5586    int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5587    REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5588    REGEX_ASSERT(len == 0);
5589
5590    uregex_close(icu_re);
5591    utext_close(utext_pat);
5592    utext_close(utext_txt);
5593}
5594
5595void RegexTest::TestCaseInsensitiveStarters() {
5596    // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5597    //  become stale because of new Unicode characters.
5598    // If it is stale, rerun the generation tool
5599    //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5600    // and replace the embedded data in i18n/regexcmp.cpp
5601
5602    for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5603        if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5604            continue;
5605        }
5606        UnicodeSet s(cp, cp);
5607        s.closeOver(USET_CASE_INSENSITIVE);
5608        UnicodeSetIterator setIter(s);
5609        while (setIter.next()) {
5610            if (!setIter.isString()) {
5611                continue;
5612            }
5613            const UnicodeString &str = setIter.getString();
5614            UChar32 firstChar = str.char32At(0);
5615            UnicodeSet starters;
5616            RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5617            if (!starters.contains(cp)) {
5618                errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5619                return;
5620            }
5621        }
5622    }
5623}
5624
5625
5626void RegexTest::TestBug11049() {
5627    // Original bug report: pattern with match start consisting of one of several individual characters,
5628    //  and the text being matched ending with a supplementary character. find() would read past the
5629    //  end of the input text when searching for potential match starting points.
5630
5631    // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5632    // detect the bad read.
5633
5634    TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5635    TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5636
5637    // Test again with a pattern starting with a single character,
5638    // which takes a different code path than starting with an OR expression,
5639    // but with similar logic.
5640    TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5641    TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5642}
5643
5644// Run a single test case from TestBug11049(). Internal function.
5645void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5646    UErrorCode status = U_ZERO_ERROR;
5647    UnicodeString patternString = UnicodeString(pattern).unescape();
5648    LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5649
5650    UnicodeString dataString = UnicodeString(data).unescape();
5651    UChar *exactBuffer = new UChar[dataString.length()];
5652    dataString.extract(exactBuffer, dataString.length(), status);
5653    UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5654
5655    LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5656    REGEX_CHECK_STATUS;
5657    matcher->reset(ut);
5658    UBool result = matcher->find();
5659    if (result != expectMatch) {
5660        errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5661              __FILE__, lineNumber, expectMatch, result, pattern, data);
5662    }
5663
5664    // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5665    //   off-by-one on find() with match at the last code point.
5666    //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5667    //   because string.unescape() will only shrink it.
5668    char * utf8Buffer = new char[uprv_strlen(data)+1];
5669    u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5670    REGEX_CHECK_STATUS;
5671    ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5672    REGEX_CHECK_STATUS;
5673    matcher->reset(ut);
5674    result = matcher->find();
5675    if (result != expectMatch) {
5676        errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5677              __FILE__, lineNumber, expectMatch, result, pattern, data);
5678    }
5679    delete [] utf8Buffer;
5680
5681    utext_close(ut);
5682    delete [] exactBuffer;
5683}
5684
5685
5686void RegexTest::TestBug11371() {
5687    if (quick) {
5688        logln("Skipping test. Runs in exhuastive mode only.");
5689        return;
5690    }
5691    UErrorCode status = U_ZERO_ERROR;
5692    UnicodeString patternString;
5693
5694    for (int i=0; i<8000000; i++) {
5695        patternString.append(UnicodeString("()"));
5696    }
5697    LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5698    if (status != U_REGEX_PATTERN_TOO_BIG) {
5699        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5700              __FILE__, __LINE__, u_errorName(status));
5701    }
5702
5703    status = U_ZERO_ERROR;
5704    patternString = "(";
5705    for (int i=0; i<20000000; i++) {
5706        patternString.append(UnicodeString("A++"));
5707    }
5708    patternString.append(UnicodeString("){0}B++"));
5709    LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5710    if (status != U_REGEX_PATTERN_TOO_BIG) {
5711        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5712              __FILE__, __LINE__, u_errorName(status));
5713    }
5714
5715    // Pattern with too much string data, such that string indexes overflow operand data field size
5716    // in compiled instruction.
5717    status = U_ZERO_ERROR;
5718    patternString = "";
5719    while (patternString.length() < 0x00ffffff) {
5720        patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5721    }
5722    patternString.append(UnicodeString("X? trailing string"));
5723    LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5724    if (status != U_REGEX_PATTERN_TOO_BIG) {
5725        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5726              __FILE__, __LINE__, u_errorName(status));
5727    }
5728}
5729
5730void RegexTest::TestBug11480() {
5731    // C API, get capture group of a group that does not participate in the match.
5732    //        (Returns a zero length string, with nul termination,
5733    //         indistinguishable from a group with a zero length match.)
5734
5735    UErrorCode status = U_ZERO_ERROR;
5736    URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5737    REGEX_CHECK_STATUS;
5738    UnicodeString text = UNICODE_STRING_SIMPLE("A");
5739    uregex_setText(re, text.getBuffer(), text.length(), &status);
5740    REGEX_CHECK_STATUS;
5741    REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5742    UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5743    int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5744    REGEX_ASSERT(length == 0);
5745    REGEX_ASSERT(buf[0] == 13);
5746    REGEX_ASSERT(buf[1] == 0);
5747    REGEX_ASSERT(buf[2] == 13);
5748    uregex_close(re);
5749
5750    // UText C++ API, length of match is 0 for non-participating matches.
5751    UText ut = UTEXT_INITIALIZER;
5752    utext_openUnicodeString(&ut, &text, &status);
5753    RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5754    REGEX_CHECK_STATUS;
5755    matcher.reset(&ut);
5756    REGEX_ASSERT(matcher.lookingAt(0, status));
5757
5758    // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5759    int64_t groupLen = -666;
5760    UText group = UTEXT_INITIALIZER;
5761    matcher.group(1, &group, groupLen, status);
5762    REGEX_CHECK_STATUS;
5763    REGEX_ASSERT(groupLen == 1);
5764    REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5765
5766    // Capture group 2, the (B), does not participate in the match.
5767    matcher.group(2, &group, groupLen, status);
5768    REGEX_CHECK_STATUS;
5769    REGEX_ASSERT(groupLen == 0);
5770    REGEX_ASSERT(matcher.start(2, status) == -1);
5771    REGEX_CHECK_STATUS;
5772}
5773
5774void RegexTest::TestBug12884() {
5775    // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5776    UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5777    UnicodeString text(u"hello");
5778    UErrorCode status = U_ZERO_ERROR;
5779    RegexMatcher m(pattern, text, 0, status);
5780    REGEX_CHECK_STATUS;
5781    m.setTimeLimit(5, status);
5782    m.find(status);
5783    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5784
5785    // Non-greedy loops. They take a different code path during matching.
5786    UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5787    status = U_ZERO_ERROR;
5788    RegexMatcher ngM(ngPattern, text, 0, status);
5789    REGEX_CHECK_STATUS;
5790    ngM.setTimeLimit(5, status);
5791    ngM.find(status);
5792    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5793
5794    // UText, wrapping non-UTF-16 text, also takes a different execution path.
5795    const char *text8 = u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
5796                          "carácter, sin importar la plataforma, sin importar el programa,"
5797                          "sin importar el idioma.";
5798    status = U_ZERO_ERROR;
5799    LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5800    REGEX_CHECK_STATUS;
5801    m.reset(ut.getAlias());
5802    m.find(status);
5803    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5804
5805    status = U_ZERO_ERROR;
5806    ngM.reset(ut.getAlias());
5807    ngM.find(status);
5808    REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5809}
5810
// Bug 13631. A find() of a pattern with a zero-length look-behind assertion
//            can cause a read past the end of the input text.
//            The failure is seen when running this test with Clang's Address Sanitizer.
5814
5815void RegexTest::TestBug13631() {
5816    const UChar *pats[] = { u"(?<!^)",
5817                            u"(?<=^)",
5818                            nullptr
5819                          };
5820    for (const UChar **pat=pats; *pat; ++pat) {
5821        UErrorCode status = U_ZERO_ERROR;
5822        UnicodeString upat(*pat);
5823        RegexMatcher matcher(upat, 0, status);
5824        const UChar s =u'a';
5825        UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5826        REGEX_CHECK_STATUS;
5827        matcher.reset(ut);
5828        while (matcher.find()) {
5829        }
5830        utext_close(ut);
5831    }
5832}
5833
5834
5835#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5836