1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8//   regextst.cpp
9//
10//      ICU Regular Expressions test, part of intltest.
11//
12
13/*
14     NOTE!!
15
16     PLEASE be careful about ASCII assumptions in this test.
17     This test is one of the worst repeat offenders.
18     If you have questions, contact someone on the ICU PMC
19     who has access to an EBCDIC system.
20
21 */
22
23#include "intltest.h"
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26#include <stdlib.h>
27#include <stdio.h>
28#include <string.h>
29
30#include "unicode/localpointer.h"
31#include "unicode/regex.h"
32#include "unicode/uchar.h"
33#include "unicode/ucnv.h"
34#include "unicode/uniset.h"
35#include "unicode/uregex.h"
36#include "unicode/usetiter.h"
37#include "unicode/ustring.h"
38#include "unicode/utext.h"
39
40#include "regextst.h"
41#include "regexcmp.h"
42#include "uvector.h"
43#include "util.h"
44#include "cmemory.h"
45#include "cstring.h"
46#include "uinvchar.h"
47
48#define SUPPORT_MUTATING_INPUT_STRING   0
49
50//---------------------------------------------------------------------------
51//
52//  Test class boilerplate
53//
54//---------------------------------------------------------------------------
55RegexTest::RegexTest()
56{
57}
58
59
60RegexTest::~RegexTest()
61{
62}
63
64
65
66void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
67{
68    if (exec) logln("TestSuite RegexTest: ");
69    switch (index) {
70
71        case 0: name = "Basic";
72            if (exec) Basic();
73            break;
74        case 1: name = "API_Match";
75            if (exec) API_Match();
76            break;
77        case 2: name = "API_Replace";
78            if (exec) API_Replace();
79            break;
80        case 3: name = "API_Pattern";
81            if (exec) API_Pattern();
82            break;
83        case 4:
84#if !UCONFIG_NO_FILE_IO
85            name = "Extended";
86            if (exec) Extended();
87#else
88            name = "skip";
89#endif
90            break;
91        case 5: name = "Errors";
92            if (exec) Errors();
93            break;
94        case 6: name = "PerlTests";
95            if (exec) PerlTests();
96            break;
97        case 7: name = "Callbacks";
98            if (exec) Callbacks();
99            break;
100        case 8: name = "FindProgressCallbacks";
101            if (exec) FindProgressCallbacks();
102            break;
103        case 9: name = "Bug 6149";
104             if (exec) Bug6149();
105             break;
106        case 10: name = "UTextBasic";
107          if (exec) UTextBasic();
108          break;
109        case 11: name = "API_Match_UTF8";
110          if (exec) API_Match_UTF8();
111          break;
112        case 12: name = "API_Replace_UTF8";
113          if (exec) API_Replace_UTF8();
114          break;
115        case 13: name = "API_Pattern_UTF8";
116          if (exec) API_Pattern_UTF8();
117          break;
118        case 14: name = "PerlTestsUTF8";
119          if (exec) PerlTestsUTF8();
120          break;
121        case 15: name = "PreAllocatedUTextCAPI";
122          if (exec) PreAllocatedUTextCAPI();
123          break;
124        case 16: name = "Bug 7651";
125             if (exec) Bug7651();
126             break;
127        case 17: name = "Bug 7740";
128            if (exec) Bug7740();
129            break;
130        case 18: name = "Bug 8479";
131            if (exec) Bug8479();
132            break;
133        case 19: name = "Bug 7029";
134            if (exec) Bug7029();
135            break;
136        case 20: name = "CheckInvBufSize";
137            if (exec) CheckInvBufSize();
138            break;
139        case 21: name = "Bug 9283";
140            if (exec) Bug9283();
141            break;
142        case 22: name = "Bug10459";
143            if (exec) Bug10459();
144            break;
145        case 23: name = "TestCaseInsensitiveStarters";
146            if (exec) TestCaseInsensitiveStarters();
147            break;
148        case 24: name = "TestBug11049";
149            if (exec) TestBug11049();
150            break;
151        case 25: name = "TestBug11371";
152            if (exec) TestBug11371();
153            break;
154        case 26: name = "TestBug11480";
155            if (exec) TestBug11480();
156            break;
157        case 27: name = "NamedCapture";
158            if (exec) NamedCapture();
159            break;
160        case 28: name = "NamedCaptureLimits";
161            if (exec) NamedCaptureLimits();
162            break;
163        default: name = "";
164            break; //needed to end loop
165    }
166}
167
168
169
170/**
171 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
172 * into ASCII.
173 * @see utext_openUTF8
174 */
175static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
176
177//---------------------------------------------------------------------------
178//
179//   Error Checking / Reporting macros used in all of the tests.
180//
181//---------------------------------------------------------------------------
182
183static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
184  int64_t oldIndex = utext_getNativeIndex(text);
185  utext_setNativeIndex(text, 0);
186  char *bufPtr = buf;
187  UChar32 c = utext_next32From(text, 0);
188  while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
189    if (0x000020<=c && c<0x00007e) {
190      *bufPtr = c;
191    } else {
192#if 0
193      sprintf(bufPtr,"U+%04X", c);
194      bufPtr+= strlen(bufPtr)-1;
195#else
196      *bufPtr = '%';
197#endif
198    }
199    bufPtr++;
200    c = UTEXT_NEXT32(text);
201  }
202  *bufPtr = 0;
203#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
204  char *ebuf = (char*)malloc(bufLen);
205  uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
206  uprv_strncpy(buf, ebuf, bufLen);
207  free((void*)ebuf);
208#endif
209  utext_setNativeIndex(text, oldIndex);
210}
211
212
213static char ASSERT_BUF[1024];
214
215const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
216  if(message.length()==0) {
217    strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
218  } else {
219    UnicodeString buf;
220    IntlTest::prettify(message,buf);
221    if(buf.length()==0) {
222      strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
223    } else {
224      buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
225      if(ASSERT_BUF[0]==0) {
226        ASSERT_BUF[0]=0;
227        for(int32_t i=0;i<buf.length();i++) {
228          UChar ch = buf[i];
229          sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
230        }
231      }
232    }
233  }
234  ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
235  return ASSERT_BUF;
236}
237
// Logs (verbose output) a printable rendering of a UText, tagged with file, line
// and the spelled-out argument expression.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// Reports a data error and returns from the enclosing function if the local
// variable `status` holds a failure code.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Reports an error (without returning) when expr evaluates to FALSE.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    }; \
}

// Evaluates expr with a fresh local `status` (shadowing any outer one) and
// reports a data error unless status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    }; \
}

// Like REGEX_CHECK_STATUS but also reports a caller-supplied line; does not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n", \
              __LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT but also reports a caller-supplied line, and returns from
// the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}

// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    }; \
}
261
262
263static UBool testUTextEqual(UText *uta, UText *utb) {
264    UChar32 ca = 0;
265    UChar32 cb = 0;
266    utext_setNativeIndex(uta, 0);
267    utext_setNativeIndex(utb, 0);
268    do {
269        ca = utext_next32(uta);
270        cb = utext_next32(utb);
271        if (ca != cb) {
272            break;
273        }
274    } while (ca != U_SENTINEL);
275    return ca == cb;
276}
277
278
279/**
280 * @param expected expected text in UTF-8 (not platform) codepage
281 */
282void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
283    UErrorCode status = U_ZERO_ERROR;
284    UText expectedText = UTEXT_INITIALIZER;
285    utext_openUTF8(&expectedText, expected, -1, &status);
286    if(U_FAILURE(status)) {
287      errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
288      return;
289    }
290    if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
291      errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
292      return;
293    }
294    utext_setNativeIndex(actual, 0);
295    if (!testUTextEqual(&expectedText, actual)) {
296        char buf[201 /*21*/];
297        char expectedBuf[201];
298        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
299        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
300        errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
301    }
302    utext_close(&expectedText);
303}
304/**
305 * @param expected invariant (platform local text) input
306 */
307
308void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
309    UErrorCode status = U_ZERO_ERROR;
310    UText expectedText = UTEXT_INITIALIZER;
311    regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
312    if(U_FAILURE(status)) {
313      errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
314      return;
315    }
316    utext_setNativeIndex(actual, 0);
317    if (!testUTextEqual(&expectedText, actual)) {
318        char buf[201 /*21*/];
319        char expectedBuf[201];
320        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
321        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
322        errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
323    }
324    utext_close(&expectedText);
325}
326
/**
 * Checks a UText against expected text given as UTF-8;
 * forwards to assertUText() together with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Checks a UText against expected text given as invariant characters;
 * forwards to assertUTextInvariant() together with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
335
336/**
337 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
338 * passed into utext_openUTF8. An error will be given if
339 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
340 */
341
342#define INV_BUFSIZ 2048 /* increase this if too small */
343
344static int64_t inv_next=0;
345
346#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
347static char inv_buf[INV_BUFSIZ];
348#endif
349
350static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
351  if(length==-1) length=strlen(inv);
352#if U_CHARSET_FAMILY==U_ASCII_FAMILY
353  inv_next+=length;
354  return utext_openUTF8(ut, inv, length, status);
355#else
356  if(inv_next+length+1>INV_BUFSIZ) {
357    fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
358            __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
359    *status = U_MEMORY_ALLOCATION_ERROR;
360    return NULL;
361  }
362
363  unsigned char *buf = (unsigned char*)inv_buf+inv_next;
364  uprv_aestrncpy(buf, (const uint8_t*)inv, length);
365  inv_next+=length;
366
367#if 0
368  fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
369#endif
370
371  return utext_openUTF8(ut, (const char*)buf, length, status);
372#endif
373}
374
375
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Convenience macro for quick lookingAt() / matches() checks.
//                       Each use runs the case twice: once through doRegexLMTest()
//                       and once through doRegexLMTestUTF8().
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
}
391
392UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
393    const UnicodeString pattern(pat, -1, US_INV);
394    const UnicodeString inputText(text, -1, US_INV);
395    UErrorCode          status  = U_ZERO_ERROR;
396    UParseError         pe;
397    RegexPattern        *REPattern = NULL;
398    RegexMatcher        *REMatcher = NULL;
399    UBool               retVal     = TRUE;
400
401    UnicodeString patString(pat, -1, US_INV);
402    REPattern = RegexPattern::compile(patString, 0, pe, status);
403    if (U_FAILURE(status)) {
404        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
405            line, u_errorName(status));
406        return FALSE;
407    }
408    if (line==376) { REPattern->dumpPattern();}
409
410    UnicodeString inputString(inputText);
411    UnicodeString unEscapedInput = inputString.unescape();
412    REMatcher = REPattern->matcher(unEscapedInput, status);
413    if (U_FAILURE(status)) {
414        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
415            line, u_errorName(status));
416        return FALSE;
417    }
418
419    UBool actualmatch;
420    actualmatch = REMatcher->lookingAt(status);
421    if (U_FAILURE(status)) {
422        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
423            line, u_errorName(status));
424        retVal =  FALSE;
425    }
426    if (actualmatch != looking) {
427        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
428        retVal = FALSE;
429    }
430
431    status = U_ZERO_ERROR;
432    actualmatch = REMatcher->matches(status);
433    if (U_FAILURE(status)) {
434        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
435            line, u_errorName(status));
436        retVal = FALSE;
437    }
438    if (actualmatch != match) {
439        errln("RegexTest: wrong return from matches() at line %d.\n", line);
440        retVal = FALSE;
441    }
442
443    if (retVal == FALSE) {
444        REPattern->dumpPattern();
445    }
446
447    delete REPattern;
448    delete REMatcher;
449    return retVal;
450}
451
452
453UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
454    UText               pattern    = UTEXT_INITIALIZER;
455    int32_t             inputUTF8Length;
456    char                *textChars = NULL;
457    UText               inputText  = UTEXT_INITIALIZER;
458    UErrorCode          status     = U_ZERO_ERROR;
459    UParseError         pe;
460    RegexPattern        *REPattern = NULL;
461    RegexMatcher        *REMatcher = NULL;
462    UBool               retVal     = TRUE;
463
464    regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
465    REPattern = RegexPattern::compile(&pattern, 0, pe, status);
466    if (U_FAILURE(status)) {
467        dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
468            line, u_errorName(status));
469        return FALSE;
470    }
471
472    UnicodeString inputString(text, -1, US_INV);
473    UnicodeString unEscapedInput = inputString.unescape();
474    LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
475    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
476
477    inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
478    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
479        // UTF-8 does not allow unpaired surrogates, so this could actually happen
480        logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
481        return TRUE; // not a failure of the Regex engine
482    }
483    status = U_ZERO_ERROR; // buffer overflow
484    textChars = new char[inputUTF8Length+1];
485    unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
486    utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
487
488    REMatcher = &REPattern->matcher(status)->reset(&inputText);
489    if (U_FAILURE(status)) {
490        errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
491            line, u_errorName(status));
492        return FALSE;
493    }
494
495    UBool actualmatch;
496    actualmatch = REMatcher->lookingAt(status);
497    if (U_FAILURE(status)) {
498        errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
499            line, u_errorName(status));
500        retVal =  FALSE;
501    }
502    if (actualmatch != looking) {
503        errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
504        retVal = FALSE;
505    }
506
507    status = U_ZERO_ERROR;
508    actualmatch = REMatcher->matches(status);
509    if (U_FAILURE(status)) {
510        errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
511            line, u_errorName(status));
512        retVal = FALSE;
513    }
514    if (actualmatch != match) {
515        errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
516        retVal = FALSE;
517    }
518
519    if (retVal == FALSE) {
520        REPattern->dumpPattern();
521    }
522
523    delete REPattern;
524    delete REMatcher;
525    utext_close(&inputText);
526    utext_close(&pattern);
527    delete[] textChars;
528    return retVal;
529}
530
531
532
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Convenience macro for tests of incorrect patterns.
//                    Forwards to regex_err() together with the call site's line.
//                    The expansion itself ends with a semicolon.
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) \
    regex_err(pat, line, col, status, __LINE__);
543
544void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
545                          UErrorCode expectedStatus, int32_t line) {
546    UnicodeString       pattern(pat);
547
548    UErrorCode          status         = U_ZERO_ERROR;
549    UParseError         pe;
550    RegexPattern        *callerPattern = NULL;
551
552    //
553    //  Compile the caller's pattern
554    //
555    UnicodeString patString(pat);
556    callerPattern = RegexPattern::compile(patString, 0, pe, status);
557    if (status != expectedStatus) {
558        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
559    } else {
560        if (status != U_ZERO_ERROR) {
561            if (pe.line != errLine || pe.offset != errCol) {
562                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
563                    line, errLine, errCol, pe.line, pe.offset);
564            }
565        }
566    }
567
568    delete callerPattern;
569
570    //
571    //  Compile again, using a UTF-8-based UText
572    //
573    UText patternText = UTEXT_INITIALIZER;
574    regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
575    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
576    if (status != expectedStatus) {
577        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
578    } else {
579        if (status != U_ZERO_ERROR) {
580            if (pe.line != errLine || pe.offset != errCol) {
581                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
582                    line, errLine, errCol, pe.line, pe.offset);
583            }
584        }
585    }
586
587    delete callerPattern;
588    utext_close(&patternText);
589}
590
591
592
593//---------------------------------------------------------------------------
594//
595//      Basic      Check for basic functionality of regex pattern matching.
596//                 Avoid the use of REGEX_FIND test macro, which has
597//                 substantial dependencies on basic Regex functionality.
598//
599//---------------------------------------------------------------------------
600void RegexTest::Basic() {
601
602
603//
604// Debug - slide failing test cases early
605//
606#if 0
607    {
608        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
609        UParseError pe;
610        UErrorCode  status = U_ZERO_ERROR;
611        RegexPattern *pattern;
612        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
613        pattern->dumpPattern();
614        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
615        UBool result = m->find();
616        printf("result = %d\n", result);
617        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
618        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
619    }
620    exit(1);
621#endif
622
623
624    //
625    // Pattern with parentheses
626    //
627    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
628    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
629    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
630
631    //
632    // Patterns with *
633    //
634    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
635    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
636    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
637    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
638    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
639
640    REGEX_TESTLM("a*", "",  TRUE, TRUE);
641    REGEX_TESTLM("a*", "b", TRUE, FALSE);
642
643
644    //
645    //  Patterns with "."
646    //
647    REGEX_TESTLM(".", "abc", TRUE, FALSE);
648    REGEX_TESTLM("...", "abc", TRUE, TRUE);
649    REGEX_TESTLM("....", "abc", FALSE, FALSE);
650    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
651    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
652    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
653    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
654    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
655
656    //
657    //  Patterns with * applied to chars at end of literal string
658    //
659    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
660    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
661
662    //
663    //  Supplemental chars match as single chars, not a pair of surrogates.
664    //
665    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
666    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
667    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
668
669
670    //
671    //  UnicodeSets in the pattern
672    //
673    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
674    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
675    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
676    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
677    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
678    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
679
680    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
681    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
682    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
683    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
684    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
685
686    //
687    //   OR operator in patterns
688    //
689    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
690    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
691    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
692    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
693
694    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
695    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
696    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
697    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
698    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
699    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
700
701    //
702    //  +
703    //
704    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
705    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
706    REGEX_TESTLM("b+", "", FALSE, FALSE);
707    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
708    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
709    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
710
711    //
712    //   ?
713    //
714    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
715    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
716    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
717    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
718    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
719    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
720    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
721    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
722    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
723
724    //
725    //  Escape sequences that become single literal chars, handled internally
726    //   by ICU's Unescape.
727    //
728
729    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
730    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
731    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
732    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
733    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
734    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
735    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
736    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
737    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
738    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
739
740    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
741    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
742
743    // Escape of special chars in patterns
744    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
745}
746
747
748//---------------------------------------------------------------------------
749//
750//    UTextBasic   Check for quirks that are specific to the UText
751//                 implementation.
752//
753//---------------------------------------------------------------------------
754void RegexTest::UTextBasic() {
755    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
756    UErrorCode status = U_ZERO_ERROR;
757    UText pattern = UTEXT_INITIALIZER;
758    utext_openUTF8(&pattern, str_abc, -1, &status);
759    RegexMatcher matcher(&pattern, 0, status);
760    REGEX_CHECK_STATUS;
761
762    UText input = UTEXT_INITIALIZER;
763    utext_openUTF8(&input, str_abc, -1, &status);
764    REGEX_CHECK_STATUS;
765    matcher.reset(&input);
766    REGEX_CHECK_STATUS;
767    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
768
769    matcher.reset(matcher.inputText());
770    REGEX_CHECK_STATUS;
771    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
772
773    utext_close(&pattern);
774    utext_close(&input);
775}
776
777
778//---------------------------------------------------------------------------
779//
780//      API_Match   Test that the API for class RegexMatcher
781//                  is present and nominally working, but excluding functions
782//                  implementing replace operations.
783//
784//---------------------------------------------------------------------------
785void RegexTest::API_Match() {
786    UParseError         pe;
787    UErrorCode          status=U_ZERO_ERROR;
788    int32_t             flags = 0;
789
790    //
791    // Debug - slide failing test cases early
792    //
793#if 0
794    {
795    }
796    return;
797#endif
798
799    //
800    // Simple pattern compilation
801    //
802    {
803        UnicodeString       re("abc");
804        RegexPattern        *pat2;
805        pat2 = RegexPattern::compile(re, flags, pe, status);
806        REGEX_CHECK_STATUS;
807
808        UnicodeString inStr1 = "abcdef this is a test";
809        UnicodeString instr2 = "not abc";
810        UnicodeString empty  = "";
811
812
813        //
814        // Matcher creation and reset.
815        //
816        RegexMatcher *m1 = pat2->matcher(inStr1, status);
817        REGEX_CHECK_STATUS;
818        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
819        REGEX_ASSERT(m1->input() == inStr1);
820        m1->reset(instr2);
821        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
822        REGEX_ASSERT(m1->input() == instr2);
823        m1->reset(inStr1);
824        REGEX_ASSERT(m1->input() == inStr1);
825        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
826        m1->reset(empty);
827        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
828        REGEX_ASSERT(m1->input() == empty);
829        REGEX_ASSERT(&m1->pattern() == pat2);
830
831        //
832        //  reset(pos, status)
833        //
834        m1->reset(inStr1);
835        m1->reset(4, status);
836        REGEX_CHECK_STATUS;
837        REGEX_ASSERT(m1->input() == inStr1);
838        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
839
840        m1->reset(-1, status);
841        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
842        status = U_ZERO_ERROR;
843
844        m1->reset(0, status);
845        REGEX_CHECK_STATUS;
846        status = U_ZERO_ERROR;
847
848        int32_t len = m1->input().length();
849        m1->reset(len-1, status);
850        REGEX_CHECK_STATUS;
851        status = U_ZERO_ERROR;
852
853        m1->reset(len, status);
854        REGEX_CHECK_STATUS;
855        status = U_ZERO_ERROR;
856
857        m1->reset(len+1, status);
858        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859        status = U_ZERO_ERROR;
860
861        //
862        // match(pos, status)
863        //
864        m1->reset(instr2);
865        REGEX_ASSERT(m1->matches(4, status) == TRUE);
866        m1->reset();
867        REGEX_ASSERT(m1->matches(3, status) == FALSE);
868        m1->reset();
869        REGEX_ASSERT(m1->matches(5, status) == FALSE);
870        REGEX_ASSERT(m1->matches(4, status) == TRUE);
871        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
872        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
873
874        // Match() at end of string should fail, but should not
875        //  be an error.
876        status = U_ZERO_ERROR;
877        len = m1->input().length();
878        REGEX_ASSERT(m1->matches(len, status) == FALSE);
879        REGEX_CHECK_STATUS;
880
881        // Match beyond end of string should fail with an error.
882        status = U_ZERO_ERROR;
883        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
884        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
885
886        // Successful match at end of string.
887        {
888            status = U_ZERO_ERROR;
889            RegexMatcher m("A?", 0, status);  // will match zero length string.
890            REGEX_CHECK_STATUS;
891            m.reset(inStr1);
892            len = inStr1.length();
893            REGEX_ASSERT(m.matches(len, status) == TRUE);
894            REGEX_CHECK_STATUS;
895            m.reset(empty);
896            REGEX_ASSERT(m.matches(0, status) == TRUE);
897            REGEX_CHECK_STATUS;
898        }
899
900
901        //
902        // lookingAt(pos, status)
903        //
904        status = U_ZERO_ERROR;
905        m1->reset(instr2);  // "not abc"
906        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
907        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
908        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
909        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
910        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
911        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
912        status = U_ZERO_ERROR;
913        len = m1->input().length();
914        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
915        REGEX_CHECK_STATUS;
916        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
917        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
918
919        delete m1;
920        delete pat2;
921    }
922
923
924    //
925    // Capture Group.
926    //     RegexMatcher::start();
927    //     RegexMatcher::end();
928    //     RegexMatcher::groupCount();
929    //
930    {
931        int32_t             flags=0;
932        UParseError         pe;
933        UErrorCode          status=U_ZERO_ERROR;
934
935        UnicodeString       re("01(23(45)67)(.*)");
936        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
937        REGEX_CHECK_STATUS;
938        UnicodeString data = "0123456789";
939
940        RegexMatcher *matcher = pat->matcher(data, status);
941        REGEX_CHECK_STATUS;
942        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
943        static const int32_t matchStarts[] = {0,  2, 4, 8};
944        static const int32_t matchEnds[]   = {10, 8, 6, 10};
945        int32_t i;
946        for (i=0; i<4; i++) {
947            int32_t actualStart = matcher->start(i, status);
948            REGEX_CHECK_STATUS;
949            if (actualStart != matchStarts[i]) {
950                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
951                    __LINE__, i, matchStarts[i], actualStart);
952            }
953            int32_t actualEnd = matcher->end(i, status);
954            REGEX_CHECK_STATUS;
955            if (actualEnd != matchEnds[i]) {
956                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
957                    __LINE__, i, matchEnds[i], actualEnd);
958            }
959        }
960
961        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
962        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
963
964        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
965        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
966        matcher->reset();
967        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
968
969        matcher->lookingAt(status);
970        REGEX_ASSERT(matcher->group(status)    == "0123456789");
971        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
972        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
973        REGEX_ASSERT(matcher->group(2, status) == "45"        );
974        REGEX_ASSERT(matcher->group(3, status) == "89"        );
975        REGEX_CHECK_STATUS;
976        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
977        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
978        matcher->reset();
979        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
980
981        delete matcher;
982        delete pat;
983
984    }
985
986    //
987    //  find
988    //
989    {
990        int32_t             flags=0;
991        UParseError         pe;
992        UErrorCode          status=U_ZERO_ERROR;
993
994        UnicodeString       re("abc");
995        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
996        REGEX_CHECK_STATUS;
997        UnicodeString data = ".abc..abc...abc..";
998        //                    012345678901234567
999
1000        RegexMatcher *matcher = pat->matcher(data, status);
1001        REGEX_CHECK_STATUS;
1002        REGEX_ASSERT(matcher->find());
1003        REGEX_ASSERT(matcher->start(status) == 1);
1004        REGEX_ASSERT(matcher->find());
1005        REGEX_ASSERT(matcher->start(status) == 6);
1006        REGEX_ASSERT(matcher->find());
1007        REGEX_ASSERT(matcher->start(status) == 12);
1008        REGEX_ASSERT(matcher->find() == FALSE);
1009        REGEX_ASSERT(matcher->find() == FALSE);
1010
1011        matcher->reset();
1012        REGEX_ASSERT(matcher->find());
1013        REGEX_ASSERT(matcher->start(status) == 1);
1014
1015        REGEX_ASSERT(matcher->find(0, status));
1016        REGEX_ASSERT(matcher->start(status) == 1);
1017        REGEX_ASSERT(matcher->find(1, status));
1018        REGEX_ASSERT(matcher->start(status) == 1);
1019        REGEX_ASSERT(matcher->find(2, status));
1020        REGEX_ASSERT(matcher->start(status) == 6);
1021        REGEX_ASSERT(matcher->find(12, status));
1022        REGEX_ASSERT(matcher->start(status) == 12);
1023        REGEX_ASSERT(matcher->find(13, status) == FALSE);
1024        REGEX_ASSERT(matcher->find(16, status) == FALSE);
1025        REGEX_ASSERT(matcher->find(17, status) == FALSE);
1026        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1027
1028        status = U_ZERO_ERROR;
1029        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1030        status = U_ZERO_ERROR;
1031        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1032
1033        REGEX_ASSERT(matcher->groupCount() == 0);
1034
1035        delete matcher;
1036        delete pat;
1037    }
1038
1039
1040    //
1041    //  find, with \G in pattern (true if at the end of a previous match).
1042    //
1043    {
1044        int32_t             flags=0;
1045        UParseError         pe;
1046        UErrorCode          status=U_ZERO_ERROR;
1047
1048        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1049        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1050        REGEX_CHECK_STATUS;
1051        UnicodeString data = ".abcabc.abc..";
1052        //                    012345678901234567
1053
1054        RegexMatcher *matcher = pat->matcher(data, status);
1055        REGEX_CHECK_STATUS;
1056        REGEX_ASSERT(matcher->find());
1057        REGEX_ASSERT(matcher->start(status) == 0);
1058        REGEX_ASSERT(matcher->start(1, status) == -1);
1059        REGEX_ASSERT(matcher->start(2, status) == 1);
1060
1061        REGEX_ASSERT(matcher->find());
1062        REGEX_ASSERT(matcher->start(status) == 4);
1063        REGEX_ASSERT(matcher->start(1, status) == 4);
1064        REGEX_ASSERT(matcher->start(2, status) == -1);
1065        REGEX_CHECK_STATUS;
1066
1067        delete matcher;
1068        delete pat;
1069    }
1070
1071    //
1072    //   find with zero length matches, match position should bump ahead
1073    //     to prevent loops.
1074    //
1075    {
1076        int32_t                 i;
1077        UErrorCode          status=U_ZERO_ERROR;
1078        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1079                                                      //   using an always-true look-ahead.
1080        REGEX_CHECK_STATUS;
1081        UnicodeString s("    ");
1082        m.reset(s);
1083        for (i=0; ; i++) {
1084            if (m.find() == FALSE) {
1085                break;
1086            }
1087            REGEX_ASSERT(m.start(status) == i);
1088            REGEX_ASSERT(m.end(status) == i);
1089        }
1090        REGEX_ASSERT(i==5);
1091
1092        // Check that the bump goes over surrogate pairs OK
1093        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1094        s = s.unescape();
1095        m.reset(s);
1096        for (i=0; ; i+=2) {
1097            if (m.find() == FALSE) {
1098                break;
1099            }
1100            REGEX_ASSERT(m.start(status) == i);
1101            REGEX_ASSERT(m.end(status) == i);
1102        }
1103        REGEX_ASSERT(i==10);
1104    }
1105    {
1106        // find() loop breaking test.
1107        //        with pattern of /.?/, should see a series of one char matches, then a single
1108        //        match of zero length at the end of the input string.
1109        int32_t                 i;
1110        UErrorCode          status=U_ZERO_ERROR;
1111        RegexMatcher        m(".?", 0, status);
1112        REGEX_CHECK_STATUS;
1113        UnicodeString s("    ");
1114        m.reset(s);
1115        for (i=0; ; i++) {
1116            if (m.find() == FALSE) {
1117                break;
1118            }
1119            REGEX_ASSERT(m.start(status) == i);
1120            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1121        }
1122        REGEX_ASSERT(i==5);
1123    }
1124
1125
1126    //
1127    // Matchers with no input string behave as if they had an empty input string.
1128    //
1129
1130    {
1131        UErrorCode status = U_ZERO_ERROR;
1132        RegexMatcher  m(".?", 0, status);
1133        REGEX_CHECK_STATUS;
1134        REGEX_ASSERT(m.find());
1135        REGEX_ASSERT(m.start(status) == 0);
1136        REGEX_ASSERT(m.input() == "");
1137    }
1138    {
1139        UErrorCode status = U_ZERO_ERROR;
1140        RegexPattern  *p = RegexPattern::compile(".", 0, status);
1141        RegexMatcher  *m = p->matcher(status);
1142        REGEX_CHECK_STATUS;
1143
1144        REGEX_ASSERT(m->find() == FALSE);
1145        REGEX_ASSERT(m->input() == "");
1146        delete m;
1147        delete p;
1148    }
1149
1150    //
1151    // Regions
1152    //
1153    {
1154        UErrorCode status = U_ZERO_ERROR;
1155        UnicodeString testString("This is test data");
1156        RegexMatcher m(".*", testString,  0, status);
1157        REGEX_CHECK_STATUS;
1158        REGEX_ASSERT(m.regionStart() == 0);
1159        REGEX_ASSERT(m.regionEnd() == testString.length());
1160        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1161        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1162
1163        m.region(2,4, status);
1164        REGEX_CHECK_STATUS;
1165        REGEX_ASSERT(m.matches(status));
1166        REGEX_ASSERT(m.start(status)==2);
1167        REGEX_ASSERT(m.end(status)==4);
1168        REGEX_CHECK_STATUS;
1169
1170        m.reset();
1171        REGEX_ASSERT(m.regionStart() == 0);
1172        REGEX_ASSERT(m.regionEnd() == testString.length());
1173
1174        UnicodeString shorterString("short");
1175        m.reset(shorterString);
1176        REGEX_ASSERT(m.regionStart() == 0);
1177        REGEX_ASSERT(m.regionEnd() == shorterString.length());
1178
1179        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1180        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1181        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1182        REGEX_ASSERT(&m == &m.reset());
1183        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1184
1185        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1186        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1187        REGEX_ASSERT(&m == &m.reset());
1188        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1189
1190        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1191        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1192        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1193        REGEX_ASSERT(&m == &m.reset());
1194        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1195
1196        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1197        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1198        REGEX_ASSERT(&m == &m.reset());
1199        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1200
1201    }
1202
1203    //
1204    // hitEnd() and requireEnd()
1205    //
1206    {
1207        UErrorCode status = U_ZERO_ERROR;
1208        UnicodeString testString("aabb");
1209        RegexMatcher m1(".*", testString,  0, status);
1210        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1211        REGEX_ASSERT(m1.hitEnd() == TRUE);
1212        REGEX_ASSERT(m1.requireEnd() == FALSE);
1213        REGEX_CHECK_STATUS;
1214
1215        status = U_ZERO_ERROR;
1216        RegexMatcher m2("a*", testString, 0, status);
1217        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1218        REGEX_ASSERT(m2.hitEnd() == FALSE);
1219        REGEX_ASSERT(m2.requireEnd() == FALSE);
1220        REGEX_CHECK_STATUS;
1221
1222        status = U_ZERO_ERROR;
1223        RegexMatcher m3(".*$", testString, 0, status);
1224        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1225        REGEX_ASSERT(m3.hitEnd() == TRUE);
1226        REGEX_ASSERT(m3.requireEnd() == TRUE);
1227        REGEX_CHECK_STATUS;
1228    }
1229
1230
1231    //
1232    // Compilation error on reset with UChar *
1233    //   These were a hazard that people were stumbling over with runtime errors.
1234    //   Changed them to compiler errors by adding private methods that more closely
1235    //   matched the incorrect use of the functions.
1236    //
1237#if 0
1238    {
1239        UErrorCode status = U_ZERO_ERROR;
1240        UChar ucharString[20];
1241        RegexMatcher m(".", 0, status);
1242        m.reset(ucharString);  // should not compile.
1243
1244        RegexPattern *p = RegexPattern::compile(".", 0, status);
1245        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1246
1247        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1248    }
1249#endif
1250
1251    //
1252    //  Time Outs.
1253    //       Note:  These tests will need to be changed when the regexp engine is
1254    //              able to detect and cut short the exponential time behavior on
1255    //              this type of match.
1256    //
1257    {
1258        UErrorCode status = U_ZERO_ERROR;
1259        //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1261        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1262        RegexMatcher matcher("(a+)+b", testString, 0, status);
1263        REGEX_CHECK_STATUS;
1264        REGEX_ASSERT(matcher.getTimeLimit() == 0);
1265        matcher.setTimeLimit(100, status);
1266        REGEX_ASSERT(matcher.getTimeLimit() == 100);
1267        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1268        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1269    }
1270    {
1271        UErrorCode status = U_ZERO_ERROR;
1272        //   Few enough 'a's to slip in under the time limit.
1273        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1274        RegexMatcher matcher("(a+)+b", testString, 0, status);
1275        REGEX_CHECK_STATUS;
1276        matcher.setTimeLimit(100, status);
1277        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1278        REGEX_CHECK_STATUS;
1279    }
1280
1281    //
1282    //  Stack Limits
1283    //
1284    {
1285        UErrorCode status = U_ZERO_ERROR;
1286        UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1287
1288        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1289        //   of the '+', and makes the stack frames larger.
1290        RegexMatcher matcher("(A)+A$", testString, 0, status);
1291
1292        // With the default stack, this match should fail to run
1293        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1294        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1295
1296        // With unlimited stack, it should run
1297        status = U_ZERO_ERROR;
1298        matcher.setStackLimit(0, status);
1299        REGEX_CHECK_STATUS;
1300        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1301        REGEX_CHECK_STATUS;
1302        REGEX_ASSERT(matcher.getStackLimit() == 0);
1303
        // With a limited stack, the match should fail
1305        status = U_ZERO_ERROR;
1306        matcher.setStackLimit(10000, status);
1307        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1308        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1309        REGEX_ASSERT(matcher.getStackLimit() == 10000);
1310    }
1311
1312        // A pattern that doesn't save state should work with
1313        //   a minimal sized stack
1314    {
1315        UErrorCode status = U_ZERO_ERROR;
1316        UnicodeString testString = "abc";
1317        RegexMatcher matcher("abc", testString, 0, status);
1318        REGEX_CHECK_STATUS;
1319        matcher.setStackLimit(30, status);
1320        REGEX_CHECK_STATUS;
1321        REGEX_ASSERT(matcher.matches(status) == TRUE);
1322        REGEX_CHECK_STATUS;
1323        REGEX_ASSERT(matcher.getStackLimit() == 30);
1324
1325        // Negative stack sizes should fail
1326        status = U_ZERO_ERROR;
1327        matcher.setStackLimit(1000, status);
1328        REGEX_CHECK_STATUS;
1329        matcher.setStackLimit(-1, status);
1330        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1331        REGEX_ASSERT(matcher.getStackLimit() == 1000);
1332    }
1333
1334
1335}
1336
1337
1338
1339
1340
1341
1342//---------------------------------------------------------------------------
1343//
1344//      API_Replace        API test for class RegexMatcher, testing the
1345//                         Replace family of functions.
1346//
1347//---------------------------------------------------------------------------
1348void RegexTest::API_Replace() {
1349    //
1350    //  Replace
1351    //
1352    int32_t             flags=0;
1353    UParseError         pe;
1354    UErrorCode          status=U_ZERO_ERROR;
1355
1356    UnicodeString       re("abc");
1357    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1358    REGEX_CHECK_STATUS;
1359    UnicodeString data = ".abc..abc...abc..";
1360    //                    012345678901234567
1361    RegexMatcher *matcher = pat->matcher(data, status);
1362
1363    //
1364    //  Plain vanilla matches.
1365    //
1366    UnicodeString  dest;
1367    dest = matcher->replaceFirst("yz", status);
1368    REGEX_CHECK_STATUS;
1369    REGEX_ASSERT(dest == ".yz..abc...abc..");
1370
1371    dest = matcher->replaceAll("yz", status);
1372    REGEX_CHECK_STATUS;
1373    REGEX_ASSERT(dest == ".yz..yz...yz..");
1374
1375    //
1376    //  Plain vanilla non-matches.
1377    //
1378    UnicodeString d2 = ".abx..abx...abx..";
1379    matcher->reset(d2);
1380    dest = matcher->replaceFirst("yz", status);
1381    REGEX_CHECK_STATUS;
1382    REGEX_ASSERT(dest == ".abx..abx...abx..");
1383
1384    dest = matcher->replaceAll("yz", status);
1385    REGEX_CHECK_STATUS;
1386    REGEX_ASSERT(dest == ".abx..abx...abx..");
1387
1388    //
1389    // Empty source string
1390    //
1391    UnicodeString d3 = "";
1392    matcher->reset(d3);
1393    dest = matcher->replaceFirst("yz", status);
1394    REGEX_CHECK_STATUS;
1395    REGEX_ASSERT(dest == "");
1396
1397    dest = matcher->replaceAll("yz", status);
1398    REGEX_CHECK_STATUS;
1399    REGEX_ASSERT(dest == "");
1400
1401    //
1402    // Empty substitution string
1403    //
1404    matcher->reset(data);              // ".abc..abc...abc.."
1405    dest = matcher->replaceFirst("", status);
1406    REGEX_CHECK_STATUS;
1407    REGEX_ASSERT(dest == "...abc...abc..");
1408
1409    dest = matcher->replaceAll("", status);
1410    REGEX_CHECK_STATUS;
1411    REGEX_ASSERT(dest == "........");
1412
1413    //
1414    // match whole string
1415    //
1416    UnicodeString d4 = "abc";
1417    matcher->reset(d4);
1418    dest = matcher->replaceFirst("xyz", status);
1419    REGEX_CHECK_STATUS;
1420    REGEX_ASSERT(dest == "xyz");
1421
1422    dest = matcher->replaceAll("xyz", status);
1423    REGEX_CHECK_STATUS;
1424    REGEX_ASSERT(dest == "xyz");
1425
1426    //
1427    // Capture Group, simple case
1428    //
1429    UnicodeString       re2("a(..)");
1430    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1431    REGEX_CHECK_STATUS;
1432    UnicodeString d5 = "abcdefg";
1433    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1434    REGEX_CHECK_STATUS;
1435    dest = matcher2->replaceFirst("$1$1", status);
1436    REGEX_CHECK_STATUS;
1437    REGEX_ASSERT(dest == "bcbcdefg");
1438
1439    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1440    REGEX_CHECK_STATUS;
1441    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1442
1443    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1444    REGEX_ASSERT(U_FAILURE(status));
1445    status = U_ZERO_ERROR;
1446
1447    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1448    replacement = replacement.unescape();
1449    dest = matcher2->replaceFirst(replacement, status);
1450    REGEX_CHECK_STATUS;
1451    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1452
1453    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1454
1455
1456    //
1457    // Replacement String with \u hex escapes
1458    //
1459    {
1460        UnicodeString  src = "abc 1 abc 2 abc 3";
1461        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1462        matcher->reset(src);
1463        UnicodeString  result = matcher->replaceAll(substitute, status);
1464        REGEX_CHECK_STATUS;
1465        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1466    }
1467    {
1468        UnicodeString  src = "abc !";
1469        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1470        matcher->reset(src);
1471        UnicodeString  result = matcher->replaceAll(substitute, status);
1472        REGEX_CHECK_STATUS;
1473        UnicodeString expected = UnicodeString("--");
1474        expected.append((UChar32)0x10000);
1475        expected.append("-- !");
1476        REGEX_ASSERT(result == expected);
1477    }
1478    // TODO:  need more through testing of capture substitutions.
1479
1480    // Bug 4057
1481    //
1482    {
1483        status = U_ZERO_ERROR;
1484        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1485        RegexMatcher m("ss(.*?)ee", 0, status);
1486        REGEX_CHECK_STATUS;
1487        UnicodeString result;
1488
1489        // Multiple finds do NOT bump up the previous appendReplacement postion.
1490        m.reset(s);
1491        m.find();
1492        m.find();
1493        m.appendReplacement(result, "ooh", status);
1494        REGEX_CHECK_STATUS;
1495        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1496
1497        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1498        status = U_ZERO_ERROR;
1499        result.truncate(0);
1500        m.reset(10, status);
1501        m.find();
1502        m.find();
1503        m.appendReplacement(result, "ooh", status);
1504        REGEX_CHECK_STATUS;
1505        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1506
1507        // find() at interior of string, appendReplacemnt still starts at beginning.
1508        status = U_ZERO_ERROR;
1509        result.truncate(0);
1510        m.reset();
1511        m.find(10, status);
1512        m.find();
1513        m.appendReplacement(result, "ooh", status);
1514        REGEX_CHECK_STATUS;
1515        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1516
1517        m.appendTail(result);
1518        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1519
1520    }
1521
1522    delete matcher2;
1523    delete pat2;
1524    delete matcher;
1525    delete pat;
1526}
1527
1528
1529//---------------------------------------------------------------------------
1530//
1531//      API_Pattern       Test that the API for class RegexPattern is
1532//                        present and nominally working.
1533//
1534//---------------------------------------------------------------------------
1535void RegexTest::API_Pattern() {
1536    RegexPattern        pata;    // Test default constructor to not crash.
1537    RegexPattern        patb;
1538
1539    REGEX_ASSERT(pata == patb);
1540    REGEX_ASSERT(pata == pata);
1541
1542    UnicodeString re1("abc[a-l][m-z]");
1543    UnicodeString re2("def");
1544    UErrorCode    status = U_ZERO_ERROR;
1545    UParseError   pe;
1546
1547    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1548    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1549    REGEX_CHECK_STATUS;
1550    REGEX_ASSERT(*pat1 == *pat1);
1551    REGEX_ASSERT(*pat1 != pata);
1552
1553    // Assign
1554    patb = *pat1;
1555    REGEX_ASSERT(patb == *pat1);
1556
1557    // Copy Construct
1558    RegexPattern patc(*pat1);
1559    REGEX_ASSERT(patc == *pat1);
1560    REGEX_ASSERT(patb == patc);
1561    REGEX_ASSERT(pat1 != pat2);
1562    patb = *pat2;
1563    REGEX_ASSERT(patb != patc);
1564    REGEX_ASSERT(patb == *pat2);
1565
1566    // Compile with no flags.
1567    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1568    REGEX_ASSERT(*pat1a == *pat1);
1569
1570    REGEX_ASSERT(pat1a->flags() == 0);
1571
1572    // Compile with different flags should be not equal
1573    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1574    REGEX_CHECK_STATUS;
1575
1576    REGEX_ASSERT(*pat1b != *pat1a);
1577    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1578    REGEX_ASSERT(pat1a->flags() == 0);
1579    delete pat1b;
1580
1581    // clone
1582    RegexPattern *pat1c = pat1->clone();
1583    REGEX_ASSERT(*pat1c == *pat1);
1584    REGEX_ASSERT(*pat1c != *pat2);
1585
1586    delete pat1c;
1587    delete pat1a;
1588    delete pat1;
1589    delete pat2;
1590
1591
1592    //
1593    //   Verify that a matcher created from a cloned pattern works.
1594    //     (Jitterbug 3423)
1595    //
1596    {
1597        UErrorCode     status     = U_ZERO_ERROR;
1598        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1599        RegexPattern  *pClone     = pSource->clone();
1600        delete         pSource;
1601        RegexMatcher  *mFromClone = pClone->matcher(status);
1602        REGEX_CHECK_STATUS;
1603        UnicodeString s = "Hello World";
1604        mFromClone->reset(s);
1605        REGEX_ASSERT(mFromClone->find() == TRUE);
1606        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1607        REGEX_ASSERT(mFromClone->find() == TRUE);
1608        REGEX_ASSERT(mFromClone->group(status) == "World");
1609        REGEX_ASSERT(mFromClone->find() == FALSE);
1610        delete mFromClone;
1611        delete pClone;
1612    }
1613
1614    //
1615    //   matches convenience API
1616    //
1617    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1618    REGEX_CHECK_STATUS;
1619    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1620    REGEX_CHECK_STATUS;
1621    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1622    REGEX_CHECK_STATUS;
1623    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1624    REGEX_CHECK_STATUS;
1625    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1626    REGEX_CHECK_STATUS;
1627    status = U_INDEX_OUTOFBOUNDS_ERROR;
1628    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1629    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1630
1631
1632    //
1633    // Split()
1634    //
1635    status = U_ZERO_ERROR;
1636    pat1 = RegexPattern::compile(" +",  pe, status);
1637    REGEX_CHECK_STATUS;
1638    UnicodeString  fields[10];
1639
1640    int32_t n;
1641    n = pat1->split("Now is the time", fields, 10, status);
1642    REGEX_CHECK_STATUS;
1643    REGEX_ASSERT(n==4);
1644    REGEX_ASSERT(fields[0]=="Now");
1645    REGEX_ASSERT(fields[1]=="is");
1646    REGEX_ASSERT(fields[2]=="the");
1647    REGEX_ASSERT(fields[3]=="time");
1648    REGEX_ASSERT(fields[4]=="");
1649
1650    n = pat1->split("Now is the time", fields, 2, status);
1651    REGEX_CHECK_STATUS;
1652    REGEX_ASSERT(n==2);
1653    REGEX_ASSERT(fields[0]=="Now");
1654    REGEX_ASSERT(fields[1]=="is the time");
1655    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1656
1657    fields[1] = "*";
1658    status = U_ZERO_ERROR;
1659    n = pat1->split("Now is the time", fields, 1, status);
1660    REGEX_CHECK_STATUS;
1661    REGEX_ASSERT(n==1);
1662    REGEX_ASSERT(fields[0]=="Now is the time");
1663    REGEX_ASSERT(fields[1]=="*");
1664    status = U_ZERO_ERROR;
1665
1666    n = pat1->split("    Now       is the time   ", fields, 10, status);
1667    REGEX_CHECK_STATUS;
1668    REGEX_ASSERT(n==6);
1669    REGEX_ASSERT(fields[0]=="");
1670    REGEX_ASSERT(fields[1]=="Now");
1671    REGEX_ASSERT(fields[2]=="is");
1672    REGEX_ASSERT(fields[3]=="the");
1673    REGEX_ASSERT(fields[4]=="time");
1674    REGEX_ASSERT(fields[5]=="");
1675
1676    n = pat1->split("     ", fields, 10, status);
1677    REGEX_CHECK_STATUS;
1678    REGEX_ASSERT(n==2);
1679    REGEX_ASSERT(fields[0]=="");
1680    REGEX_ASSERT(fields[1]=="");
1681
1682    fields[0] = "foo";
1683    n = pat1->split("", fields, 10, status);
1684    REGEX_CHECK_STATUS;
1685    REGEX_ASSERT(n==0);
1686    REGEX_ASSERT(fields[0]=="foo");
1687
1688    delete pat1;
1689
1690    //  split, with a pattern with (capture)
1691    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1692    REGEX_CHECK_STATUS;
1693
1694    status = U_ZERO_ERROR;
1695    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1696    REGEX_CHECK_STATUS;
1697    REGEX_ASSERT(n==7);
1698    REGEX_ASSERT(fields[0]=="");
1699    REGEX_ASSERT(fields[1]=="a");
1700    REGEX_ASSERT(fields[2]=="Now is ");
1701    REGEX_ASSERT(fields[3]=="b");
1702    REGEX_ASSERT(fields[4]=="the time");
1703    REGEX_ASSERT(fields[5]=="c");
1704    REGEX_ASSERT(fields[6]=="");
1705    REGEX_ASSERT(status==U_ZERO_ERROR);
1706
1707    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1708    REGEX_CHECK_STATUS;
1709    REGEX_ASSERT(n==7);
1710    REGEX_ASSERT(fields[0]=="  ");
1711    REGEX_ASSERT(fields[1]=="a");
1712    REGEX_ASSERT(fields[2]=="Now is ");
1713    REGEX_ASSERT(fields[3]=="b");
1714    REGEX_ASSERT(fields[4]=="the time");
1715    REGEX_ASSERT(fields[5]=="c");
1716    REGEX_ASSERT(fields[6]=="");
1717
1718    status = U_ZERO_ERROR;
1719    fields[6] = "foo";
1720    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1721    REGEX_CHECK_STATUS;
1722    REGEX_ASSERT(n==6);
1723    REGEX_ASSERT(fields[0]=="  ");
1724    REGEX_ASSERT(fields[1]=="a");
1725    REGEX_ASSERT(fields[2]=="Now is ");
1726    REGEX_ASSERT(fields[3]=="b");
1727    REGEX_ASSERT(fields[4]=="the time");
1728    REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1729    REGEX_ASSERT(fields[6]=="foo");
1730
1731    status = U_ZERO_ERROR;
1732    fields[5] = "foo";
1733    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1734    REGEX_CHECK_STATUS;
1735    REGEX_ASSERT(n==5);
1736    REGEX_ASSERT(fields[0]=="  ");
1737    REGEX_ASSERT(fields[1]=="a");
1738    REGEX_ASSERT(fields[2]=="Now is ");
1739    REGEX_ASSERT(fields[3]=="b");
1740    REGEX_ASSERT(fields[4]=="the time<c>");
1741    REGEX_ASSERT(fields[5]=="foo");
1742
1743    status = U_ZERO_ERROR;
1744    fields[5] = "foo";
1745    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1746    REGEX_CHECK_STATUS;
1747    REGEX_ASSERT(n==5);
1748    REGEX_ASSERT(fields[0]=="  ");
1749    REGEX_ASSERT(fields[1]=="a");
1750    REGEX_ASSERT(fields[2]=="Now is ");
1751    REGEX_ASSERT(fields[3]=="b");
1752    REGEX_ASSERT(fields[4]=="the time");
1753    REGEX_ASSERT(fields[5]=="foo");
1754
1755    status = U_ZERO_ERROR;
1756    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1757    REGEX_CHECK_STATUS;
1758    REGEX_ASSERT(n==4);
1759    REGEX_ASSERT(fields[0]=="  ");
1760    REGEX_ASSERT(fields[1]=="a");
1761    REGEX_ASSERT(fields[2]=="Now is ");
1762    REGEX_ASSERT(fields[3]=="the time<c>");
1763    status = U_ZERO_ERROR;
1764    delete pat1;
1765
1766    pat1 = RegexPattern::compile("([-,])",  pe, status);
1767    REGEX_CHECK_STATUS;
1768    n = pat1->split("1-10,20", fields, 10, status);
1769    REGEX_CHECK_STATUS;
1770    REGEX_ASSERT(n==5);
1771    REGEX_ASSERT(fields[0]=="1");
1772    REGEX_ASSERT(fields[1]=="-");
1773    REGEX_ASSERT(fields[2]=="10");
1774    REGEX_ASSERT(fields[3]==",");
1775    REGEX_ASSERT(fields[4]=="20");
1776    delete pat1;
1777
1778    // Test split of string with empty trailing fields
1779    pat1 = RegexPattern::compile(",", pe, status);
1780    REGEX_CHECK_STATUS;
1781    n = pat1->split("a,b,c,", fields, 10, status);
1782    REGEX_CHECK_STATUS;
1783    REGEX_ASSERT(n==4);
1784    REGEX_ASSERT(fields[0]=="a");
1785    REGEX_ASSERT(fields[1]=="b");
1786    REGEX_ASSERT(fields[2]=="c");
1787    REGEX_ASSERT(fields[3]=="");
1788
1789    n = pat1->split("a,,,", fields, 10, status);
1790    REGEX_CHECK_STATUS;
1791    REGEX_ASSERT(n==4);
1792    REGEX_ASSERT(fields[0]=="a");
1793    REGEX_ASSERT(fields[1]=="");
1794    REGEX_ASSERT(fields[2]=="");
1795    REGEX_ASSERT(fields[3]=="");
1796    delete pat1;
1797
1798    // Split Separator with zero length match.
1799    pat1 = RegexPattern::compile(":?", pe, status);
1800    REGEX_CHECK_STATUS;
1801    n = pat1->split("abc", fields, 10, status);
1802    REGEX_CHECK_STATUS;
1803    REGEX_ASSERT(n==5);
1804    REGEX_ASSERT(fields[0]=="");
1805    REGEX_ASSERT(fields[1]=="a");
1806    REGEX_ASSERT(fields[2]=="b");
1807    REGEX_ASSERT(fields[3]=="c");
1808    REGEX_ASSERT(fields[4]=="");
1809
1810    delete pat1;
1811
1812    //
1813    // RegexPattern::pattern()
1814    //
1815    pat1 = new RegexPattern();
1816    REGEX_ASSERT(pat1->pattern() == "");
1817    delete pat1;
1818
1819    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1820    REGEX_CHECK_STATUS;
1821    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1822    delete pat1;
1823
1824
1825    //
1826    // classID functions
1827    //
1828    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1829    REGEX_CHECK_STATUS;
1830    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1831    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1832    UnicodeString Hello("Hello, world.");
1833    RegexMatcher *m = pat1->matcher(Hello, status);
1834    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1835    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1836    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1837    delete m;
1838    delete pat1;
1839
1840}
1841
1842//---------------------------------------------------------------------------
1843//
1844//      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1845//                       is present and working, but excluding functions
1846//                       implementing replace operations.
1847//
1848//---------------------------------------------------------------------------
1849void RegexTest::API_Match_UTF8() {
1850    UParseError         pe;
1851    UErrorCode          status=U_ZERO_ERROR;
1852    int32_t             flags = 0;
1853
1854    //
1855    // Debug - slide failing test cases early
1856    //
1857#if 0
1858    {
1859    }
1860    return;
1861#endif
1862
1863    //
1864    // Simple pattern compilation
1865    //
1866    {
1867        UText               re = UTEXT_INITIALIZER;
1868        regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1869        REGEX_VERBOSE_TEXT(&re);
1870        RegexPattern        *pat2;
1871        pat2 = RegexPattern::compile(&re, flags, pe, status);
1872        REGEX_CHECK_STATUS;
1873
1874        UText input1 = UTEXT_INITIALIZER;
1875        UText input2 = UTEXT_INITIALIZER;
1876        UText empty  = UTEXT_INITIALIZER;
1877        regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1878        REGEX_VERBOSE_TEXT(&input1);
1879        regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1880        REGEX_VERBOSE_TEXT(&input2);
1881        utext_openUChars(&empty, NULL, 0, &status);
1882
1883        int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1884        int32_t input2Len = strlen("not abc");
1885
1886
1887        //
1888        // Matcher creation and reset.
1889        //
1890        RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1891        REGEX_CHECK_STATUS;
1892        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1893        const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1894        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1895        m1->reset(&input2);
1896        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1897        const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1898        REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1899        m1->reset(&input1);
1900        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1901        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1902        m1->reset(&empty);
1903        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1904        REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1905
1906        //
1907        //  reset(pos, status)
1908        //
1909        m1->reset(&input1);
1910        m1->reset(4, status);
1911        REGEX_CHECK_STATUS;
1912        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1913        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1914
1915        m1->reset(-1, status);
1916        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1917        status = U_ZERO_ERROR;
1918
1919        m1->reset(0, status);
1920        REGEX_CHECK_STATUS;
1921        status = U_ZERO_ERROR;
1922
1923        m1->reset(input1Len-1, status);
1924        REGEX_CHECK_STATUS;
1925        status = U_ZERO_ERROR;
1926
1927        m1->reset(input1Len, status);
1928        REGEX_CHECK_STATUS;
1929        status = U_ZERO_ERROR;
1930
1931        m1->reset(input1Len+1, status);
1932        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1933        status = U_ZERO_ERROR;
1934
1935        //
1936        // match(pos, status)
1937        //
1938        m1->reset(&input2);
1939        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1940        m1->reset();
1941        REGEX_ASSERT(m1->matches(3, status) == FALSE);
1942        m1->reset();
1943        REGEX_ASSERT(m1->matches(5, status) == FALSE);
1944        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1945        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1946        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1947
1948        // Match() at end of string should fail, but should not
1949        //  be an error.
1950        status = U_ZERO_ERROR;
1951        REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1952        REGEX_CHECK_STATUS;
1953
1954        // Match beyond end of string should fail with an error.
1955        status = U_ZERO_ERROR;
1956        REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1957        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1958
1959        // Successful match at end of string.
1960        {
1961            status = U_ZERO_ERROR;
1962            RegexMatcher m("A?", 0, status);  // will match zero length string.
1963            REGEX_CHECK_STATUS;
1964            m.reset(&input1);
1965            REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1966            REGEX_CHECK_STATUS;
1967            m.reset(&empty);
1968            REGEX_ASSERT(m.matches(0, status) == TRUE);
1969            REGEX_CHECK_STATUS;
1970        }
1971
1972
1973        //
1974        // lookingAt(pos, status)
1975        //
1976        status = U_ZERO_ERROR;
1977        m1->reset(&input2);  // "not abc"
1978        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1979        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1980        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1981        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1982        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1983        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1984        status = U_ZERO_ERROR;
1985        REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1986        REGEX_CHECK_STATUS;
1987        REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1988        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1989
1990        delete m1;
1991        delete pat2;
1992
1993        utext_close(&re);
1994        utext_close(&input1);
1995        utext_close(&input2);
1996        utext_close(&empty);
1997    }
1998
1999
2000    //
2001    // Capture Group.
2002    //     RegexMatcher::start();
2003    //     RegexMatcher::end();
2004    //     RegexMatcher::groupCount();
2005    //
2006    {
2007        int32_t             flags=0;
2008        UParseError         pe;
2009        UErrorCode          status=U_ZERO_ERROR;
2010        UText               re=UTEXT_INITIALIZER;
2011        const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2012        utext_openUTF8(&re, str_01234567_pat, -1, &status);
2013
2014        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2015        REGEX_CHECK_STATUS;
2016
2017        UText input = UTEXT_INITIALIZER;
2018        const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2019        utext_openUTF8(&input, str_0123456789, -1, &status);
2020
2021        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2022        REGEX_CHECK_STATUS;
2023        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2024        static const int32_t matchStarts[] = {0,  2, 4, 8};
2025        static const int32_t matchEnds[]   = {10, 8, 6, 10};
2026        int32_t i;
2027        for (i=0; i<4; i++) {
2028            int32_t actualStart = matcher->start(i, status);
2029            REGEX_CHECK_STATUS;
2030            if (actualStart != matchStarts[i]) {
2031                errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2032                      __FILE__, __LINE__, i, matchStarts[i], actualStart);
2033            }
2034            int32_t actualEnd = matcher->end(i, status);
2035            REGEX_CHECK_STATUS;
2036            if (actualEnd != matchEnds[i]) {
2037                errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2038                      __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2039            }
2040        }
2041
2042        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2043        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2044
2045        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2046        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2047        matcher->reset();
2048        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2049
2050        matcher->lookingAt(status);
2051
2052        UnicodeString dest;
2053        UText destText = UTEXT_INITIALIZER;
2054        utext_openUnicodeString(&destText, &dest, &status);
2055        UText *result;
2056        //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2057        //  Test shallow-clone API
2058        int64_t   group_len;
2059        result = matcher->group((UText *)NULL, group_len, status);
2060        REGEX_CHECK_STATUS;
2061        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2062        utext_close(result);
2063        result = matcher->group(0, &destText, group_len, status);
2064        REGEX_CHECK_STATUS;
2065        REGEX_ASSERT(result == &destText);
2066        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2067        //  destText is now immutable, reopen it
2068        utext_close(&destText);
2069        utext_openUnicodeString(&destText, &dest, &status);
2070
2071        int64_t length;
2072        result = matcher->group(0, NULL, length, status);
2073        REGEX_CHECK_STATUS;
2074        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2075        utext_close(result);
2076        result = matcher->group(0, &destText, length, status);
2077        REGEX_CHECK_STATUS;
2078        REGEX_ASSERT(result == &destText);
2079        REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2080        REGEX_ASSERT(length == 10);
2081        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2082
2083        // Capture Group 1 == "234567"
2084        result = matcher->group(1, NULL, length, status);
2085        REGEX_CHECK_STATUS;
2086        REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2087        REGEX_ASSERT(length == 6);
2088        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2089        utext_close(result);
2090
2091        result = matcher->group(1, &destText, length, status);
2092        REGEX_CHECK_STATUS;
2093        REGEX_ASSERT(result == &destText);
2094        REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2095        REGEX_ASSERT(length == 6);
2096        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2097        utext_close(result);
2098
2099        // Capture Group 2 == "45"
2100        result = matcher->group(2, NULL, length, status);
2101        REGEX_CHECK_STATUS;
2102        REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2103        REGEX_ASSERT(length == 2);
2104        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2105        utext_close(result);
2106
2107        result = matcher->group(2, &destText, length, status);
2108        REGEX_CHECK_STATUS;
2109        REGEX_ASSERT(result == &destText);
2110        REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2111        REGEX_ASSERT(length == 2);
2112        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2113        utext_close(result);
2114
2115        // Capture Group 3 == "89"
2116        result = matcher->group(3, NULL, length, status);
2117        REGEX_CHECK_STATUS;
2118        REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2119        REGEX_ASSERT(length == 2);
2120        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2121        utext_close(result);
2122
2123        result = matcher->group(3, &destText, length, status);
2124        REGEX_CHECK_STATUS;
2125        REGEX_ASSERT(result == &destText);
2126        REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2127        REGEX_ASSERT(length == 2);
2128        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2129        utext_close(result);
2130
2131        // Capture Group number out of range.
2132        status = U_ZERO_ERROR;
2133        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2134        status = U_ZERO_ERROR;
2135        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136        status = U_ZERO_ERROR;
2137        matcher->reset();
2138        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2139
2140        delete matcher;
2141        delete pat;
2142
2143        utext_close(&destText);
2144        utext_close(&input);
2145        utext_close(&re);
2146    }
2147
2148    //
2149    //  find
2150    //
2151    {
2152        int32_t             flags=0;
2153        UParseError         pe;
2154        UErrorCode          status=U_ZERO_ERROR;
2155        UText               re=UTEXT_INITIALIZER;
2156        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2157        utext_openUTF8(&re, str_abc, -1, &status);
2158
2159        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2160        REGEX_CHECK_STATUS;
2161        UText input = UTEXT_INITIALIZER;
2162        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2163        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2164        //                      012345678901234567
2165
2166        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2167        REGEX_CHECK_STATUS;
2168        REGEX_ASSERT(matcher->find());
2169        REGEX_ASSERT(matcher->start(status) == 1);
2170        REGEX_ASSERT(matcher->find());
2171        REGEX_ASSERT(matcher->start(status) == 6);
2172        REGEX_ASSERT(matcher->find());
2173        REGEX_ASSERT(matcher->start(status) == 12);
2174        REGEX_ASSERT(matcher->find() == FALSE);
2175        REGEX_ASSERT(matcher->find() == FALSE);
2176
2177        matcher->reset();
2178        REGEX_ASSERT(matcher->find());
2179        REGEX_ASSERT(matcher->start(status) == 1);
2180
2181        REGEX_ASSERT(matcher->find(0, status));
2182        REGEX_ASSERT(matcher->start(status) == 1);
2183        REGEX_ASSERT(matcher->find(1, status));
2184        REGEX_ASSERT(matcher->start(status) == 1);
2185        REGEX_ASSERT(matcher->find(2, status));
2186        REGEX_ASSERT(matcher->start(status) == 6);
2187        REGEX_ASSERT(matcher->find(12, status));
2188        REGEX_ASSERT(matcher->start(status) == 12);
2189        REGEX_ASSERT(matcher->find(13, status) == FALSE);
2190        REGEX_ASSERT(matcher->find(16, status) == FALSE);
2191        REGEX_ASSERT(matcher->find(17, status) == FALSE);
2192        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2193
2194        status = U_ZERO_ERROR;
2195        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2196        status = U_ZERO_ERROR;
2197        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2198
2199        REGEX_ASSERT(matcher->groupCount() == 0);
2200
2201        delete matcher;
2202        delete pat;
2203
2204        utext_close(&input);
2205        utext_close(&re);
2206    }
2207
2208
2209    //
2210    //  find, with \G in pattern (true if at the end of a previous match).
2211    //
2212    {
2213        int32_t             flags=0;
2214        UParseError         pe;
2215        UErrorCode          status=U_ZERO_ERROR;
2216        UText               re=UTEXT_INITIALIZER;
2217        const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2218        utext_openUTF8(&re, str_Gabcabc, -1, &status);
2219
2220        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2221
2222        REGEX_CHECK_STATUS;
2223        UText input = UTEXT_INITIALIZER;
2224        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2225        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2226        //                      012345678901234567
2227
2228        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2229        REGEX_CHECK_STATUS;
2230        REGEX_ASSERT(matcher->find());
2231        REGEX_ASSERT(matcher->start(status) == 0);
2232        REGEX_ASSERT(matcher->start(1, status) == -1);
2233        REGEX_ASSERT(matcher->start(2, status) == 1);
2234
2235        REGEX_ASSERT(matcher->find());
2236        REGEX_ASSERT(matcher->start(status) == 4);
2237        REGEX_ASSERT(matcher->start(1, status) == 4);
2238        REGEX_ASSERT(matcher->start(2, status) == -1);
2239        REGEX_CHECK_STATUS;
2240
2241        delete matcher;
2242        delete pat;
2243
2244        utext_close(&input);
2245        utext_close(&re);
2246    }
2247
2248    //
2249    //   find with zero length matches, match position should bump ahead
2250    //     to prevent loops.
2251    //
2252    {
2253        int32_t                 i;
2254        UErrorCode          status=U_ZERO_ERROR;
2255        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2256                                                      //   using an always-true look-ahead.
2257        REGEX_CHECK_STATUS;
2258        UText s = UTEXT_INITIALIZER;
2259        utext_openUTF8(&s, "    ", -1, &status);
2260        m.reset(&s);
2261        for (i=0; ; i++) {
2262            if (m.find() == FALSE) {
2263                break;
2264            }
2265            REGEX_ASSERT(m.start(status) == i);
2266            REGEX_ASSERT(m.end(status) == i);
2267        }
2268        REGEX_ASSERT(i==5);
2269
2270        // Check that the bump goes over characters outside the BMP OK
2271        // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2272        unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2273        utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2274        m.reset(&s);
2275        for (i=0; ; i+=4) {
2276            if (m.find() == FALSE) {
2277                break;
2278            }
2279            REGEX_ASSERT(m.start(status) == i);
2280            REGEX_ASSERT(m.end(status) == i);
2281        }
2282        REGEX_ASSERT(i==20);
2283
2284        utext_close(&s);
2285    }
2286    {
2287        // find() loop breaking test.
2288        //        with pattern of /.?/, should see a series of one char matches, then a single
2289        //        match of zero length at the end of the input string.
2290        int32_t                 i;
2291        UErrorCode          status=U_ZERO_ERROR;
2292        RegexMatcher        m(".?", 0, status);
2293        REGEX_CHECK_STATUS;
2294        UText s = UTEXT_INITIALIZER;
2295        utext_openUTF8(&s, "    ", -1, &status);
2296        m.reset(&s);
2297        for (i=0; ; i++) {
2298            if (m.find() == FALSE) {
2299                break;
2300            }
2301            REGEX_ASSERT(m.start(status) == i);
2302            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2303        }
2304        REGEX_ASSERT(i==5);
2305
2306        utext_close(&s);
2307    }
2308
2309
2310    //
2311    // Matchers with no input string behave as if they had an empty input string.
2312    //
2313
2314    {
2315        UErrorCode status = U_ZERO_ERROR;
2316        RegexMatcher  m(".?", 0, status);
2317        REGEX_CHECK_STATUS;
2318        REGEX_ASSERT(m.find());
2319        REGEX_ASSERT(m.start(status) == 0);
2320        REGEX_ASSERT(m.input() == "");
2321    }
2322    {
2323        UErrorCode status = U_ZERO_ERROR;
2324        RegexPattern  *p = RegexPattern::compile(".", 0, status);
2325        RegexMatcher  *m = p->matcher(status);
2326        REGEX_CHECK_STATUS;
2327
2328        REGEX_ASSERT(m->find() == FALSE);
2329        REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2330        delete m;
2331        delete p;
2332    }
2333
2334    //
2335    // Regions
2336    //
2337    {
2338        UErrorCode status = U_ZERO_ERROR;
2339        UText testPattern = UTEXT_INITIALIZER;
2340        UText testText    = UTEXT_INITIALIZER;
2341        regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2342        REGEX_VERBOSE_TEXT(&testPattern);
2343        regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2344        REGEX_VERBOSE_TEXT(&testText);
2345
2346        RegexMatcher m(&testPattern, &testText, 0, status);
2347        REGEX_CHECK_STATUS;
2348        REGEX_ASSERT(m.regionStart() == 0);
2349        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2350        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2351        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2352
2353        m.region(2,4, status);
2354        REGEX_CHECK_STATUS;
2355        REGEX_ASSERT(m.matches(status));
2356        REGEX_ASSERT(m.start(status)==2);
2357        REGEX_ASSERT(m.end(status)==4);
2358        REGEX_CHECK_STATUS;
2359
2360        m.reset();
2361        REGEX_ASSERT(m.regionStart() == 0);
2362        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2363
2364        regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2365        REGEX_VERBOSE_TEXT(&testText);
2366        m.reset(&testText);
2367        REGEX_ASSERT(m.regionStart() == 0);
2368        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2369
2370        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2371        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2372        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2373        REGEX_ASSERT(&m == &m.reset());
2374        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2375
2376        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2377        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2378        REGEX_ASSERT(&m == &m.reset());
2379        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2380
2381        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2382        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2383        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2384        REGEX_ASSERT(&m == &m.reset());
2385        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2386
2387        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2388        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2389        REGEX_ASSERT(&m == &m.reset());
2390        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2391
2392        utext_close(&testText);
2393        utext_close(&testPattern);
2394    }
2395
2396    //
2397    // hitEnd() and requireEnd()
2398    //
2399    {
2400        UErrorCode status = U_ZERO_ERROR;
2401        UText testPattern = UTEXT_INITIALIZER;
2402        UText testText    = UTEXT_INITIALIZER;
2403        const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2404        const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2405        utext_openUTF8(&testPattern, str_, -1, &status);
2406        utext_openUTF8(&testText, str_aabb, -1, &status);
2407
2408        RegexMatcher m1(&testPattern, &testText,  0, status);
2409        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2410        REGEX_ASSERT(m1.hitEnd() == TRUE);
2411        REGEX_ASSERT(m1.requireEnd() == FALSE);
2412        REGEX_CHECK_STATUS;
2413
2414        status = U_ZERO_ERROR;
2415        const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2416        utext_openUTF8(&testPattern, str_a, -1, &status);
2417        RegexMatcher m2(&testPattern, &testText, 0, status);
2418        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2419        REGEX_ASSERT(m2.hitEnd() == FALSE);
2420        REGEX_ASSERT(m2.requireEnd() == FALSE);
2421        REGEX_CHECK_STATUS;
2422
2423        status = U_ZERO_ERROR;
2424        const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2425        utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2426        RegexMatcher m3(&testPattern, &testText, 0, status);
2427        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2428        REGEX_ASSERT(m3.hitEnd() == TRUE);
2429        REGEX_ASSERT(m3.requireEnd() == TRUE);
2430        REGEX_CHECK_STATUS;
2431
2432        utext_close(&testText);
2433        utext_close(&testPattern);
2434    }
2435}
2436
2437
2438//---------------------------------------------------------------------------
2439//
2440//      API_Replace_UTF8   API test for class RegexMatcher, testing the
2441//                         Replace family of functions.
2442//
2443//---------------------------------------------------------------------------
2444void RegexTest::API_Replace_UTF8() {
2445    //
2446    //  Replace
2447    //
2448    int32_t             flags=0;
2449    UParseError         pe;
2450    UErrorCode          status=U_ZERO_ERROR;
2451
2452    UText               re=UTEXT_INITIALIZER;
2453    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2454    REGEX_VERBOSE_TEXT(&re);
2455    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2456    REGEX_CHECK_STATUS;
2457
2458    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2459    //             012345678901234567
2460    UText dataText = UTEXT_INITIALIZER;
2461    utext_openUTF8(&dataText, data, -1, &status);
2462    REGEX_CHECK_STATUS;
2463    REGEX_VERBOSE_TEXT(&dataText);
2464    RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2465
2466    //
2467    //  Plain vanilla matches.
2468    //
2469    UnicodeString  dest;
2470    UText destText = UTEXT_INITIALIZER;
2471    utext_openUnicodeString(&destText, &dest, &status);
2472    UText *result;
2473
2474    UText replText = UTEXT_INITIALIZER;
2475
2476    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2477    utext_openUTF8(&replText, str_yz, -1, &status);
2478    REGEX_VERBOSE_TEXT(&replText);
2479    result = matcher->replaceFirst(&replText, NULL, status);
2480    REGEX_CHECK_STATUS;
2481    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2482    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2483    utext_close(result);
2484    result = matcher->replaceFirst(&replText, &destText, status);
2485    REGEX_CHECK_STATUS;
2486    REGEX_ASSERT(result == &destText);
2487    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2488
2489    result = matcher->replaceAll(&replText, NULL, status);
2490    REGEX_CHECK_STATUS;
2491    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2492    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2493    utext_close(result);
2494
2495    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2496    result = matcher->replaceAll(&replText, &destText, status);
2497    REGEX_CHECK_STATUS;
2498    REGEX_ASSERT(result == &destText);
2499    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2500
2501    //
2502    //  Plain vanilla non-matches.
2503    //
2504    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2505    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2506    matcher->reset(&dataText);
2507
2508    result = matcher->replaceFirst(&replText, NULL, status);
2509    REGEX_CHECK_STATUS;
2510    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2511    utext_close(result);
2512    result = matcher->replaceFirst(&replText, &destText, status);
2513    REGEX_CHECK_STATUS;
2514    REGEX_ASSERT(result == &destText);
2515    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2516
2517    result = matcher->replaceAll(&replText, NULL, status);
2518    REGEX_CHECK_STATUS;
2519    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2520    utext_close(result);
2521    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2522    result = matcher->replaceAll(&replText, &destText, status);
2523    REGEX_CHECK_STATUS;
2524    REGEX_ASSERT(result == &destText);
2525    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2526
2527    //
2528    // Empty source string
2529    //
2530    utext_openUTF8(&dataText, NULL, 0, &status);
2531    matcher->reset(&dataText);
2532
2533    result = matcher->replaceFirst(&replText, NULL, status);
2534    REGEX_CHECK_STATUS;
2535    REGEX_ASSERT_UTEXT_UTF8("", result);
2536    utext_close(result);
2537    result = matcher->replaceFirst(&replText, &destText, status);
2538    REGEX_CHECK_STATUS;
2539    REGEX_ASSERT(result == &destText);
2540    REGEX_ASSERT_UTEXT_UTF8("", result);
2541
2542    result = matcher->replaceAll(&replText, NULL, status);
2543    REGEX_CHECK_STATUS;
2544    REGEX_ASSERT_UTEXT_UTF8("", result);
2545    utext_close(result);
2546    result = matcher->replaceAll(&replText, &destText, status);
2547    REGEX_CHECK_STATUS;
2548    REGEX_ASSERT(result == &destText);
2549    REGEX_ASSERT_UTEXT_UTF8("", result);
2550
2551    //
2552    // Empty substitution string
2553    //
2554    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2555    matcher->reset(&dataText);
2556
2557    utext_openUTF8(&replText, NULL, 0, &status);
2558    result = matcher->replaceFirst(&replText, NULL, status);
2559    REGEX_CHECK_STATUS;
2560    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2561    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2562    utext_close(result);
2563    result = matcher->replaceFirst(&replText, &destText, status);
2564    REGEX_CHECK_STATUS;
2565    REGEX_ASSERT(result == &destText);
2566    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2567
2568    result = matcher->replaceAll(&replText, NULL, status);
2569    REGEX_CHECK_STATUS;
2570    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2571    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2572    utext_close(result);
2573    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2574    result = matcher->replaceAll(&replText, &destText, status);
2575    REGEX_CHECK_STATUS;
2576    REGEX_ASSERT(result == &destText);
2577    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2578
2579    //
2580    // match whole string
2581    //
2582    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2583    utext_openUTF8(&dataText, str_abc, -1, &status);
2584    matcher->reset(&dataText);
2585
2586    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2587    utext_openUTF8(&replText, str_xyz, -1, &status);
2588    result = matcher->replaceFirst(&replText, NULL, status);
2589    REGEX_CHECK_STATUS;
2590    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2591    utext_close(result);
2592    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2593    result = matcher->replaceFirst(&replText, &destText, status);
2594    REGEX_CHECK_STATUS;
2595    REGEX_ASSERT(result == &destText);
2596    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2597
2598    result = matcher->replaceAll(&replText, NULL, status);
2599    REGEX_CHECK_STATUS;
2600    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2601    utext_close(result);
2602    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2603    result = matcher->replaceAll(&replText, &destText, status);
2604    REGEX_CHECK_STATUS;
2605    REGEX_ASSERT(result == &destText);
2606    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2607
2608    //
2609    // Capture Group, simple case
2610    //
2611    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2612    utext_openUTF8(&re, str_add, -1, &status);
2613    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2614    REGEX_CHECK_STATUS;
2615
2616    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2617    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2618    RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2619    REGEX_CHECK_STATUS;
2620
2621    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2622    utext_openUTF8(&replText, str_11, -1, &status);
2623    result = matcher2->replaceFirst(&replText, NULL, status);
2624    REGEX_CHECK_STATUS;
2625    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2626    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2627    utext_close(result);
2628    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629    result = matcher2->replaceFirst(&replText, &destText, status);
2630    REGEX_CHECK_STATUS;
2631    REGEX_ASSERT(result == &destText);
2632    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2633
2634    const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2635    utext_openUTF8(&replText, str_v, -1, &status);
2636    REGEX_VERBOSE_TEXT(&replText);
2637    result = matcher2->replaceFirst(&replText, NULL, status);
2638    REGEX_CHECK_STATUS;
2639    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2640    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2641    utext_close(result);
2642    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2643    result = matcher2->replaceFirst(&replText, &destText, status);
2644    REGEX_CHECK_STATUS;
2645    REGEX_ASSERT(result == &destText);
2646    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2647
2648    const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2649               0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2650               0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2651    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2652    result = matcher2->replaceFirst(&replText, NULL, status);
2653    REGEX_CHECK_STATUS;
2654    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2655    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2656    utext_close(result);
2657    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2658    result = matcher2->replaceFirst(&replText, &destText, status);
2659    REGEX_CHECK_STATUS;
2660    REGEX_ASSERT(result == &destText);
2661    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2662
2663    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2664    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2665    //                                 012345678901234567890123456
2666    supplDigitChars[22] = 0xF0;
2667    supplDigitChars[23] = 0x9D;
2668    supplDigitChars[24] = 0x9F;
2669    supplDigitChars[25] = 0x8F;
2670    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2671
2672    result = matcher2->replaceFirst(&replText, NULL, status);
2673    REGEX_CHECK_STATUS;
2674    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2675    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2676    utext_close(result);
2677    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2678    result = matcher2->replaceFirst(&replText, &destText, status);
2679    REGEX_CHECK_STATUS;
2680    REGEX_ASSERT(result == &destText);
2681    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2682    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2683    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2684    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2685//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2686    utext_close(result);
2687    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2688    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2689    REGEX_ASSERT(result == &destText);
2690//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2691
2692    //
2693    // Replacement String with \u hex escapes
2694    //
2695    {
2696      const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2697      const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2698        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2699        utext_openUTF8(&replText, str_u0043, -1, &status);
2700        matcher->reset(&dataText);
2701
2702        result = matcher->replaceAll(&replText, NULL, status);
2703        REGEX_CHECK_STATUS;
2704        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2705        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2706        utext_close(result);
2707        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2708        result = matcher->replaceAll(&replText, &destText, status);
2709        REGEX_CHECK_STATUS;
2710        REGEX_ASSERT(result == &destText);
2711        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2712    }
2713    {
2714      const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2715        utext_openUTF8(&dataText, str_abc, -1, &status);
2716        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2717        utext_openUTF8(&replText, str_U00010000, -1, &status);
2718        matcher->reset(&dataText);
2719
2720        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2721        //                          0123456789
2722        expected[2] = 0xF0;
2723        expected[3] = 0x90;
2724        expected[4] = 0x80;
2725        expected[5] = 0x80;
2726
2727        result = matcher->replaceAll(&replText, NULL, status);
2728        REGEX_CHECK_STATUS;
2729        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2730        utext_close(result);
2731        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2732        result = matcher->replaceAll(&replText, &destText, status);
2733        REGEX_CHECK_STATUS;
2734        REGEX_ASSERT(result == &destText);
2735        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2736    }
    // TODO:  need more thorough testing of capture substitutions.
2738
2739    // Bug 4057
2740    //
2741    {
2742        status = U_ZERO_ERROR;
2743const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2744const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2745const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2746        utext_openUTF8(&re, str_ssee, -1, &status);
2747        utext_openUTF8(&dataText, str_blah, -1, &status);
2748        utext_openUTF8(&replText, str_ooh, -1, &status);
2749
2750        RegexMatcher m(&re, 0, status);
2751        REGEX_CHECK_STATUS;
2752
2753        UnicodeString result;
2754        UText resultText = UTEXT_INITIALIZER;
2755        utext_openUnicodeString(&resultText, &result, &status);
2756
        // Multiple finds do NOT bump up the previous appendReplacement position.
2758        m.reset(&dataText);
2759        m.find();
2760        m.find();
2761        m.appendReplacement(&resultText, &replText, status);
2762        REGEX_CHECK_STATUS;
2763        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2764        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2765
2766        // After a reset into the interior of a string, appendReplacement still starts at beginning.
2767        status = U_ZERO_ERROR;
2768        result.truncate(0);
2769        utext_openUnicodeString(&resultText, &result, &status);
2770        m.reset(10, status);
2771        m.find();
2772        m.find();
2773        m.appendReplacement(&resultText, &replText, status);
2774        REGEX_CHECK_STATUS;
2775        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2776        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2777
2778        // find() at interior of string, appendReplacement still starts at beginning.
2779        status = U_ZERO_ERROR;
2780        result.truncate(0);
2781        utext_openUnicodeString(&resultText, &result, &status);
2782        m.reset();
2783        m.find(10, status);
2784        m.find();
2785        m.appendReplacement(&resultText, &replText, status);
2786        REGEX_CHECK_STATUS;
2787        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2788        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2789
2790        m.appendTail(&resultText, status);
2791        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2792        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2793
2794        utext_close(&resultText);
2795    }
2796
2797    delete matcher2;
2798    delete pat2;
2799    delete matcher;
2800    delete pat;
2801
2802    utext_close(&dataText);
2803    utext_close(&replText);
2804    utext_close(&destText);
2805    utext_close(&re);
2806}
2807
2808
2809//---------------------------------------------------------------------------
2810//
2811//      API_Pattern_UTF8  Test that the API for class RegexPattern is
2812//                        present and nominally working.
2813//
2814//---------------------------------------------------------------------------
2815void RegexTest::API_Pattern_UTF8() {
2816    RegexPattern        pata;    // Test default constructor to not crash.
2817    RegexPattern        patb;
2818
2819    REGEX_ASSERT(pata == patb);
2820    REGEX_ASSERT(pata == pata);
2821
2822    UText         re1 = UTEXT_INITIALIZER;
2823    UText         re2 = UTEXT_INITIALIZER;
2824    UErrorCode    status = U_ZERO_ERROR;
2825    UParseError   pe;
2826
2827    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2828    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2829    utext_openUTF8(&re1, str_abcalmz, -1, &status);
2830    utext_openUTF8(&re2, str_def, -1, &status);
2831
2832    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2833    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2834    REGEX_CHECK_STATUS;
2835    REGEX_ASSERT(*pat1 == *pat1);
2836    REGEX_ASSERT(*pat1 != pata);
2837
2838    // Assign
2839    patb = *pat1;
2840    REGEX_ASSERT(patb == *pat1);
2841
2842    // Copy Construct
2843    RegexPattern patc(*pat1);
2844    REGEX_ASSERT(patc == *pat1);
2845    REGEX_ASSERT(patb == patc);
2846    REGEX_ASSERT(pat1 != pat2);
2847    patb = *pat2;
2848    REGEX_ASSERT(patb != patc);
2849    REGEX_ASSERT(patb == *pat2);
2850
2851    // Compile with no flags.
2852    RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2853    REGEX_ASSERT(*pat1a == *pat1);
2854
2855    REGEX_ASSERT(pat1a->flags() == 0);
2856
2857    // Compile with different flags should be not equal
2858    RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2859    REGEX_CHECK_STATUS;
2860
2861    REGEX_ASSERT(*pat1b != *pat1a);
2862    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2863    REGEX_ASSERT(pat1a->flags() == 0);
2864    delete pat1b;
2865
2866    // clone
2867    RegexPattern *pat1c = pat1->clone();
2868    REGEX_ASSERT(*pat1c == *pat1);
2869    REGEX_ASSERT(*pat1c != *pat2);
2870
2871    delete pat1c;
2872    delete pat1a;
2873    delete pat1;
2874    delete pat2;
2875
2876    utext_close(&re1);
2877    utext_close(&re2);
2878
2879
2880    //
2881    //   Verify that a matcher created from a cloned pattern works.
2882    //     (Jitterbug 3423)
2883    //
2884    {
2885        UErrorCode     status     = U_ZERO_ERROR;
2886        UText          pattern    = UTEXT_INITIALIZER;
2887        const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2888        utext_openUTF8(&pattern, str_pL, -1, &status);
2889
2890        RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2891        RegexPattern  *pClone     = pSource->clone();
2892        delete         pSource;
2893        RegexMatcher  *mFromClone = pClone->matcher(status);
2894        REGEX_CHECK_STATUS;
2895
2896        UText          input      = UTEXT_INITIALIZER;
2897        const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2898        utext_openUTF8(&input, str_HelloWorld, -1, &status);
2899        mFromClone->reset(&input);
2900        REGEX_ASSERT(mFromClone->find() == TRUE);
2901        REGEX_ASSERT(mFromClone->group(status) == "Hello");
2902        REGEX_ASSERT(mFromClone->find() == TRUE);
2903        REGEX_ASSERT(mFromClone->group(status) == "World");
2904        REGEX_ASSERT(mFromClone->find() == FALSE);
2905        delete mFromClone;
2906        delete pClone;
2907
2908        utext_close(&input);
2909        utext_close(&pattern);
2910    }
2911
2912    //
2913    //   matches convenience API
2914    //
2915    {
2916        UErrorCode status  = U_ZERO_ERROR;
2917        UText      pattern = UTEXT_INITIALIZER;
2918        UText      input   = UTEXT_INITIALIZER;
2919
2920        const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2921        utext_openUTF8(&input, str_randominput, -1, &status);
2922
2923        const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2924        utext_openUTF8(&pattern, str_dotstar, -1, &status);
2925        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2926        REGEX_CHECK_STATUS;
2927
2928        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2929        utext_openUTF8(&pattern, str_abc, -1, &status);
2930        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2931        REGEX_CHECK_STATUS;
2932
2933        const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2934        utext_openUTF8(&pattern, str_nput, -1, &status);
2935        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2936        REGEX_CHECK_STATUS;
2937
2938        utext_openUTF8(&pattern, str_randominput, -1, &status);
2939        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2940        REGEX_CHECK_STATUS;
2941
2942        const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2943        utext_openUTF8(&pattern, str_u, -1, &status);
2944        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2945        REGEX_CHECK_STATUS;
2946
2947        utext_openUTF8(&input, str_abc, -1, &status);
2948        utext_openUTF8(&pattern, str_abc, -1, &status);
2949        status = U_INDEX_OUTOFBOUNDS_ERROR;
2950        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2951        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2952
2953        utext_close(&input);
2954        utext_close(&pattern);
2955    }
2956
2957
2958    //
2959    // Split()
2960    //
2961    status = U_ZERO_ERROR;
2962    const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2963    utext_openUTF8(&re1, str_spaceplus, -1, &status);
2964    pat1 = RegexPattern::compile(&re1, pe, status);
2965    REGEX_CHECK_STATUS;
2966    UnicodeString  fields[10];
2967
2968    int32_t n;
2969    n = pat1->split("Now is the time", fields, 10, status);
2970    REGEX_CHECK_STATUS;
2971    REGEX_ASSERT(n==4);
2972    REGEX_ASSERT(fields[0]=="Now");
2973    REGEX_ASSERT(fields[1]=="is");
2974    REGEX_ASSERT(fields[2]=="the");
2975    REGEX_ASSERT(fields[3]=="time");
2976    REGEX_ASSERT(fields[4]=="");
2977
2978    n = pat1->split("Now is the time", fields, 2, status);
2979    REGEX_CHECK_STATUS;
2980    REGEX_ASSERT(n==2);
2981    REGEX_ASSERT(fields[0]=="Now");
2982    REGEX_ASSERT(fields[1]=="is the time");
2983    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2984
2985    fields[1] = "*";
2986    status = U_ZERO_ERROR;
2987    n = pat1->split("Now is the time", fields, 1, status);
2988    REGEX_CHECK_STATUS;
2989    REGEX_ASSERT(n==1);
2990    REGEX_ASSERT(fields[0]=="Now is the time");
2991    REGEX_ASSERT(fields[1]=="*");
2992    status = U_ZERO_ERROR;
2993
2994    n = pat1->split("    Now       is the time   ", fields, 10, status);
2995    REGEX_CHECK_STATUS;
2996    REGEX_ASSERT(n==6);
2997    REGEX_ASSERT(fields[0]=="");
2998    REGEX_ASSERT(fields[1]=="Now");
2999    REGEX_ASSERT(fields[2]=="is");
3000    REGEX_ASSERT(fields[3]=="the");
3001    REGEX_ASSERT(fields[4]=="time");
3002    REGEX_ASSERT(fields[5]=="");
3003    REGEX_ASSERT(fields[6]=="");
3004
3005    fields[2] = "*";
3006    n = pat1->split("     ", fields, 10, status);
3007    REGEX_CHECK_STATUS;
3008    REGEX_ASSERT(n==2);
3009    REGEX_ASSERT(fields[0]=="");
3010    REGEX_ASSERT(fields[1]=="");
3011    REGEX_ASSERT(fields[2]=="*");
3012
3013    fields[0] = "foo";
3014    n = pat1->split("", fields, 10, status);
3015    REGEX_CHECK_STATUS;
3016    REGEX_ASSERT(n==0);
3017    REGEX_ASSERT(fields[0]=="foo");
3018
3019    delete pat1;
3020
3021    //  split, with a pattern with (capture)
3022    regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3023    pat1 = RegexPattern::compile(&re1,  pe, status);
3024    REGEX_CHECK_STATUS;
3025
3026    status = U_ZERO_ERROR;
3027    fields[6] = fields[7] = "*";
3028    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3029    REGEX_CHECK_STATUS;
3030    REGEX_ASSERT(n==7);
3031    REGEX_ASSERT(fields[0]=="");
3032    REGEX_ASSERT(fields[1]=="a");
3033    REGEX_ASSERT(fields[2]=="Now is ");
3034    REGEX_ASSERT(fields[3]=="b");
3035    REGEX_ASSERT(fields[4]=="the time");
3036    REGEX_ASSERT(fields[5]=="c");
3037    REGEX_ASSERT(fields[6]=="");
3038    REGEX_ASSERT(fields[7]=="*");
3039    REGEX_ASSERT(status==U_ZERO_ERROR);
3040
3041    fields[6] = fields[7] = "*";
3042    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
3043    REGEX_CHECK_STATUS;
3044    REGEX_ASSERT(n==7);
3045    REGEX_ASSERT(fields[0]=="  ");
3046    REGEX_ASSERT(fields[1]=="a");
3047    REGEX_ASSERT(fields[2]=="Now is ");
3048    REGEX_ASSERT(fields[3]=="b");
3049    REGEX_ASSERT(fields[4]=="the time");
3050    REGEX_ASSERT(fields[5]=="c");
3051    REGEX_ASSERT(fields[6]=="");
3052    REGEX_ASSERT(fields[7]=="*");
3053
3054    status = U_ZERO_ERROR;
3055    fields[6] = "foo";
3056    n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3057    REGEX_CHECK_STATUS;
3058    REGEX_ASSERT(n==6);
3059    REGEX_ASSERT(fields[0]=="  ");
3060    REGEX_ASSERT(fields[1]=="a");
3061    REGEX_ASSERT(fields[2]=="Now is ");
3062    REGEX_ASSERT(fields[3]=="b");
3063    REGEX_ASSERT(fields[4]=="the time");
3064    REGEX_ASSERT(fields[5]==" ");
3065    REGEX_ASSERT(fields[6]=="foo");
3066
3067    status = U_ZERO_ERROR;
3068    fields[5] = "foo";
3069    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3070    REGEX_CHECK_STATUS;
3071    REGEX_ASSERT(n==5);
3072    REGEX_ASSERT(fields[0]=="  ");
3073    REGEX_ASSERT(fields[1]=="a");
3074    REGEX_ASSERT(fields[2]=="Now is ");
3075    REGEX_ASSERT(fields[3]=="b");
3076    REGEX_ASSERT(fields[4]=="the time<c>");
3077    REGEX_ASSERT(fields[5]=="foo");
3078
3079    status = U_ZERO_ERROR;
3080    fields[5] = "foo";
3081    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3082    REGEX_CHECK_STATUS;
3083    REGEX_ASSERT(n==5);
3084    REGEX_ASSERT(fields[0]=="  ");
3085    REGEX_ASSERT(fields[1]=="a");
3086    REGEX_ASSERT(fields[2]=="Now is ");
3087    REGEX_ASSERT(fields[3]=="b");
3088    REGEX_ASSERT(fields[4]=="the time");
3089    REGEX_ASSERT(fields[5]=="foo");
3090
3091    status = U_ZERO_ERROR;
3092    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3093    REGEX_CHECK_STATUS;
3094    REGEX_ASSERT(n==4);
3095    REGEX_ASSERT(fields[0]=="  ");
3096    REGEX_ASSERT(fields[1]=="a");
3097    REGEX_ASSERT(fields[2]=="Now is ");
3098    REGEX_ASSERT(fields[3]=="the time<c>");
3099    status = U_ZERO_ERROR;
3100    delete pat1;
3101
3102    regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3103    pat1 = RegexPattern::compile(&re1, pe, status);
3104    REGEX_CHECK_STATUS;
3105    n = pat1->split("1-10,20", fields, 10, status);
3106    REGEX_CHECK_STATUS;
3107    REGEX_ASSERT(n==5);
3108    REGEX_ASSERT(fields[0]=="1");
3109    REGEX_ASSERT(fields[1]=="-");
3110    REGEX_ASSERT(fields[2]=="10");
3111    REGEX_ASSERT(fields[3]==",");
3112    REGEX_ASSERT(fields[4]=="20");
3113    delete pat1;
3114
3115
3116    //
3117    // split of a UText based string, with library allocating output UTexts.
3118    //
3119    {
3120        status = U_ZERO_ERROR;
3121        RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3122        UnicodeString stringToSplit("first:second:third");
3123        UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3124        REGEX_CHECK_STATUS;
3125
3126        UText *splits[10] = {NULL};
3127        int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3128        REGEX_CHECK_STATUS;
3129        REGEX_ASSERT(numFields == 5);
3130        REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3131        REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3132        REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3133        REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3134        REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3135        REGEX_ASSERT(splits[5] == NULL);
3136
3137        for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3138            if (splits[i]) {
3139                utext_close(splits[i]);
3140                splits[i] = NULL;
3141            }
3142        }
3143        utext_close(textToSplit);
3144    }
3145
3146
3147    //
3148    // RegexPattern::pattern() and patternText()
3149    //
3150    pat1 = new RegexPattern();
3151    REGEX_ASSERT(pat1->pattern() == "");
3152    REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3153    delete pat1;
3154    const char *helloWorldInvariant = "(Hello, world)*";
3155    regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3156    pat1 = RegexPattern::compile(&re1, pe, status);
3157    REGEX_CHECK_STATUS;
3158    REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3159    REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3160    delete pat1;
3161
3162    utext_close(&re1);
3163}
3164
3165
3166//---------------------------------------------------------------------------
3167//
3168//      Extended       A more thorough check for features of regex patterns
3169//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3171//                     A description of the test data format is included in that file.
3172//
3173//---------------------------------------------------------------------------
3174
3175const char *
3176RegexTest::getPath(char buffer[2048], const char *filename) {
3177    UErrorCode status=U_ZERO_ERROR;
3178    const char *testDataDirectory = IntlTest::getSourceTestData(status);
3179    if (U_FAILURE(status)) {
3180        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3181        return NULL;
3182    }
3183
3184    strcpy(buffer, testDataDirectory);
3185    strcat(buffer, filename);
3186    return buffer;
3187}
3188
3189void RegexTest::Extended() {
3190    char tdd[2048];
3191    const char *srcPath;
3192    UErrorCode  status  = U_ZERO_ERROR;
3193    int32_t     lineNum = 0;
3194
3195    //
3196    //  Open and read the test data file.
3197    //
3198    srcPath=getPath(tdd, "regextst.txt");
3199    if(srcPath==NULL) {
3200        return; /* something went wrong, error already output */
3201    }
3202
3203    int32_t    len;
3204    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3205    if (U_FAILURE(status)) {
3206        return; /* something went wrong, error already output */
3207    }
3208
3209    //
3210    //  Put the test data into a UnicodeString
3211    //
3212    UnicodeString testString(FALSE, testData, len);
3213
3214    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3215    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3216    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3217
3218    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3219    UnicodeString   testPattern;   // The pattern for test from the test file.
3220    UnicodeString   testFlags;     // the flags   for a test.
3221    UnicodeString   matchString;   // The marked up string to be used as input
3222
3223    if (U_FAILURE(status)){
3224        dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3225        delete [] testData;
3226        return;
3227    }
3228
3229    //
3230    //  Loop over the test data file, once per line.
3231    //
3232    while (lineMat.find()) {
3233        lineNum++;
3234        if (U_FAILURE(status)) {
3235          errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3236        }
3237
3238        status = U_ZERO_ERROR;
3239        UnicodeString testLine = lineMat.group(1, status);
3240        if (testLine.length() == 0) {
3241            continue;
3242        }
3243
3244        //
3245        // Parse the test line.  Skip blank and comment only lines.
3246        // Separate out the three main fields - pattern, flags, target.
3247        //
3248
3249        commentMat.reset(testLine);
3250        if (commentMat.lookingAt(status)) {
3251            // This line is a comment, or blank.
3252            continue;
3253        }
3254
3255        //
3256        //  Pull out the pattern field, remove it from the test file line.
3257        //
3258        quotedStuffMat.reset(testLine);
3259        if (quotedStuffMat.lookingAt(status)) {
3260            testPattern = quotedStuffMat.group(2, status);
3261            testLine.remove(0, quotedStuffMat.end(0, status));
3262        } else {
3263            errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3264            continue;
3265        }
3266
3267
3268        //
3269        //  Pull out the flags from the test file line.
3270        //
3271        flagsMat.reset(testLine);
3272        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3273        testFlags = flagsMat.group(1, status);
3274        if (flagsMat.group(2, status).length() > 0) {
3275            errln("Bad Match flag at line %d. Scanning %c\n",
3276                lineNum, flagsMat.group(2, status).charAt(0));
3277            continue;
3278        }
3279        testLine.remove(0, flagsMat.end(0, status));
3280
3281        //
3282        //  Pull out the match string, as a whole.
3283        //    We'll process the <tags> later.
3284        //
3285        quotedStuffMat.reset(testLine);
3286        if (quotedStuffMat.lookingAt(status)) {
3287            matchString = quotedStuffMat.group(2, status);
3288            testLine.remove(0, quotedStuffMat.end(0, status));
3289        } else {
3290            errln("Bad match string at test file line %d", lineNum);
3291            continue;
3292        }
3293
3294        //
3295        //  The only thing left from the input line should be an optional trailing comment.
3296        //
3297        commentMat.reset(testLine);
3298        if (commentMat.lookingAt(status) == FALSE) {
3299            errln("Line %d: unexpected characters at end of test line.", lineNum);
3300            continue;
3301        }
3302
3303        //
3304        //  Run the test
3305        //
3306        regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3307    }
3308
3309    delete [] testData;
3310
3311}
3312
3313
3314
3315//---------------------------------------------------------------------------
3316//
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, used in diagnostic messages.
//         lineNumber is the source line in regextst.txt of the test.
3323//
3324//---------------------------------------------------------------------------
3325
3326
3327//  Set a value into a UVector at position specified by a decimal number in
3328//   a UnicodeString.   This is a utility function needed by the actual test function,
3329//   which follows.
3330static void set(UVector &vec, int32_t val, UnicodeString index) {
3331    UErrorCode  status=U_ZERO_ERROR;
3332    int32_t  idx = 0;
3333    for (int32_t i=0; i<index.length(); i++) {
3334        int32_t d=u_charDigitValue(index.charAt(i));
3335        if (d<0) {return;}
3336        idx = idx*10 + d;
3337    }
3338    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3339    vec.setElementAt(val, idx);
3340}
3341
3342static void setInt(UVector &vec, int32_t val, int32_t idx) {
3343    UErrorCode  status=U_ZERO_ERROR;
3344    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3345    vec.setElementAt(val, idx);
3346}
3347
3348static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3349{
3350    UBool couldFind = TRUE;
3351    UTEXT_SETNATIVEINDEX(utext, 0);
3352    int32_t i = 0;
3353    while (i < unistrOffset) {
3354        UChar32 c = UTEXT_NEXT32(utext);
3355        if (c != U_SENTINEL) {
3356            i += U16_LENGTH(c);
3357        } else {
3358            couldFind = FALSE;
3359            break;
3360        }
3361    }
3362    nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3363    return couldFind;
3364}
3365
3366
3367void RegexTest::regex_find(const UnicodeString &pattern,
3368                           const UnicodeString &flags,
3369                           const UnicodeString &inputString,
3370                           const char *srcPath,
3371                           int32_t line) {
3372    UnicodeString       unEscapedInput;
3373    UnicodeString       deTaggedInput;
3374
3375    int32_t             patternUTF8Length,      inputUTF8Length;
3376    char                *patternChars  = NULL, *inputChars = NULL;
3377    UText               patternText    = UTEXT_INITIALIZER;
3378    UText               inputText      = UTEXT_INITIALIZER;
3379    UConverter          *UTF8Converter = NULL;
3380
3381    UErrorCode          status         = U_ZERO_ERROR;
3382    UParseError         pe;
3383    RegexPattern        *parsePat      = NULL;
3384    RegexMatcher        *parseMatcher  = NULL;
3385    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3386    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3387    UVector             groupStarts(status);
3388    UVector             groupEnds(status);
3389    UVector             groupStartsUTF8(status);
3390    UVector             groupEndsUTF8(status);
3391    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3392    UBool               failed         = FALSE;
3393    int32_t             numFinds;
3394    int32_t             i;
3395    UBool               useMatchesFunc   = FALSE;
3396    UBool               useLookingAtFunc = FALSE;
3397    int32_t             regionStart      = -1;
3398    int32_t             regionEnd        = -1;
3399    int32_t             regionStartUTF8  = -1;
3400    int32_t             regionEndUTF8    = -1;
3401
3402
3403    //
3404    //  Compile the caller's pattern
3405    //
3406    uint32_t bflags = 0;
3407    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3408        bflags |= UREGEX_CASE_INSENSITIVE;
3409    }
3410    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3411        bflags |= UREGEX_COMMENTS;
3412    }
3413    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3414        bflags |= UREGEX_DOTALL;
3415    }
3416    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3417        bflags |= UREGEX_MULTILINE;
3418    }
3419
3420    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3421        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3422    }
3423    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3424        bflags |= UREGEX_UNIX_LINES;
3425    }
3426    if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3427        bflags |= UREGEX_LITERAL;
3428    }
3429
3430
3431    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3432    if (status != U_ZERO_ERROR) {
3433        #if UCONFIG_NO_BREAK_ITERATION==1
3434        // 'v' test flag means that the test pattern should not compile if ICU was configured
3435        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3436        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3437            goto cleanupAndReturn;
3438        }
3439        #endif
3440        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3441            // Expected pattern compilation error.
3442            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3443                logln("Pattern Compile returns \"%s\"", u_errorName(status));
3444            }
3445            goto cleanupAndReturn;
3446        } else {
3447            // Unexpected pattern compilation error.
3448            dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3449            goto cleanupAndReturn;
3450        }
3451    }
3452
3453    UTF8Converter = ucnv_open("UTF8", &status);
3454    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3455
3456    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3457    status = U_ZERO_ERROR; // buffer overflow
3458    patternChars = new char[patternUTF8Length+1];
3459    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3460    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3461
3462    if (status == U_ZERO_ERROR) {
3463        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3464
3465        if (status != U_ZERO_ERROR) {
3466#if UCONFIG_NO_BREAK_ITERATION==1
3467            // 'v' test flag means that the test pattern should not compile if ICU was configured
3468            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3469            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3470                goto cleanupAndReturn;
3471            }
3472#endif
3473            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3474                // Expected pattern compilation error.
3475                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3476                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3477                }
3478                goto cleanupAndReturn;
3479            } else {
3480                // Unexpected pattern compilation error.
3481                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3482                goto cleanupAndReturn;
3483            }
3484        }
3485    }
3486
3487    if (UTF8Pattern == NULL) {
3488        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3489        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3490        status = U_ZERO_ERROR;
3491    }
3492
3493    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3494        callerPattern->dumpPattern();
3495    }
3496
3497    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3498        errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3499        goto cleanupAndReturn;
3500    }
3501
3502
3503    //
3504    // Number of times find() should be called on the test string, default to 1
3505    //
3506    numFinds = 1;
3507    for (i=2; i<=9; i++) {
3508        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3509            if (numFinds != 1) {
3510                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3511                goto cleanupAndReturn;
3512            }
3513            numFinds = i;
3514        }
3515    }
3516
3517    // 'M' flag.  Use matches() instead of find()
3518    if (flags.indexOf((UChar)0x4d) >= 0) {
3519        useMatchesFunc = TRUE;
3520    }
3521    if (flags.indexOf((UChar)0x4c) >= 0) {
3522        useLookingAtFunc = TRUE;
3523    }
3524
3525    //
3526    //  Find the tags in the input data, remove them, and record the group boundary
3527    //    positions.
3528    //
3529    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3530    REGEX_CHECK_STATUS_L(line);
3531
3532    unEscapedInput = inputString.unescape();
3533    parseMatcher = parsePat->matcher(unEscapedInput, status);
3534    REGEX_CHECK_STATUS_L(line);
3535    while(parseMatcher->find()) {
3536        parseMatcher->appendReplacement(deTaggedInput, "", status);
3537        REGEX_CHECK_STATUS;
3538        UnicodeString groupNum = parseMatcher->group(2, status);
3539        if (groupNum == "r") {
3540            // <r> or </r>, a region specification within the string
3541            if (parseMatcher->group(1, status) == "/") {
3542                regionEnd = deTaggedInput.length();
3543            } else {
3544                regionStart = deTaggedInput.length();
3545            }
3546        } else {
3547            // <digits> or </digits>, a group match boundary tag.
3548            if (parseMatcher->group(1, status) == "/") {
3549                set(groupEnds, deTaggedInput.length(), groupNum);
3550            } else {
3551                set(groupStarts, deTaggedInput.length(), groupNum);
3552            }
3553        }
3554    }
3555    parseMatcher->appendTail(deTaggedInput);
3556    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3557    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3558      errln("mismatched <r> tags");
3559      failed = TRUE;
3560      goto cleanupAndReturn;
3561    }
3562
3563    //
3564    //  Configure the matcher according to the flags specified with this test.
3565    //
3566    matcher = callerPattern->matcher(deTaggedInput, status);
3567    REGEX_CHECK_STATUS_L(line);
3568    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3569        matcher->setTrace(TRUE);
3570    }
3571
3572    if (UTF8Pattern != NULL) {
3573        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3574        status = U_ZERO_ERROR; // buffer overflow
3575        inputChars = new char[inputUTF8Length+1];
3576        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3577        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3578
3579        if (status == U_ZERO_ERROR) {
3580            UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3581            REGEX_CHECK_STATUS_L(line);
3582        }
3583
3584        if (UTF8Matcher == NULL) {
3585            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3586          logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3587            status = U_ZERO_ERROR;
3588        }
3589    }
3590
3591    //
3592    //  Generate native indices for UTF8 versions of region and capture group info
3593    //
3594    if (UTF8Matcher != NULL) {
3595        if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3596        if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3597
3598        //  Fill out the native index UVector info.
3599        //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3600        for (i=0; i<groupStarts.size(); i++) {
3601            int32_t  start = groupStarts.elementAti(i);
3602            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3603            if (start >= 0) {
3604                int32_t  startUTF8;
3605                if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3606                    errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3607                    failed = TRUE;
3608                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3609                }
3610                setInt(groupStartsUTF8, startUTF8, i);
3611            }
3612
3613            int32_t  end = groupEnds.elementAti(i);
3614            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3615            if (end >= 0) {
3616                int32_t  endUTF8;
3617                if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3618                    errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3619                    failed = TRUE;
3620                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3621                }
3622                setInt(groupEndsUTF8, endUTF8, i);
3623            }
3624        }
3625    }
3626
3627    if (regionStart>=0) {
3628       matcher->region(regionStart, regionEnd, status);
3629       REGEX_CHECK_STATUS_L(line);
3630       if (UTF8Matcher != NULL) {
3631           UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3632           REGEX_CHECK_STATUS_L(line);
3633       }
3634    }
3635    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3636        matcher->useAnchoringBounds(FALSE);
3637        if (UTF8Matcher != NULL) {
3638            UTF8Matcher->useAnchoringBounds(FALSE);
3639        }
3640    }
3641    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3642        matcher->useTransparentBounds(TRUE);
3643        if (UTF8Matcher != NULL) {
3644            UTF8Matcher->useTransparentBounds(TRUE);
3645        }
3646    }
3647
3648
3649
3650    //
3651    // Do a find on the de-tagged input using the caller's pattern
3652    //     TODO: error on count>1 and not find().
3653    //           error on both matches() and lookingAt().
3654    //
3655    for (i=0; i<numFinds; i++) {
3656        if (useMatchesFunc) {
3657            isMatch = matcher->matches(status);
3658            if (UTF8Matcher != NULL) {
3659               isUTF8Match = UTF8Matcher->matches(status);
3660            }
3661        } else  if (useLookingAtFunc) {
3662            isMatch = matcher->lookingAt(status);
3663            if (UTF8Matcher != NULL) {
3664                isUTF8Match = UTF8Matcher->lookingAt(status);
3665            }
3666        } else {
3667            isMatch = matcher->find();
3668            if (UTF8Matcher != NULL) {
3669                isUTF8Match = UTF8Matcher->find();
3670            }
3671        }
3672    }
3673    matcher->setTrace(FALSE);
3674    if (U_FAILURE(status)) {
3675        errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3676    }
3677
3678    //
3679    // Match up the groups from the find() with the groups from the tags
3680    //
3681
3682    // number of tags should match number of groups from find operation.
3683    // matcher->groupCount does not include group 0, the entire match, hence the +1.
3684    //   G option in test means that capture group data is not available in the
3685    //     expected results, so the check needs to be suppressed.
3686    if (isMatch == FALSE && groupStarts.size() != 0) {
3687        dataerrln("Error at line %d:  Match expected, but none found.", line);
3688        failed = TRUE;
3689        goto cleanupAndReturn;
3690    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3691        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3692        failed = TRUE;
3693        goto cleanupAndReturn;
3694    }
3695
3696    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3697        // Only check for match / no match.  Don't check capture groups.
3698        if (isMatch && groupStarts.size() == 0) {
3699            errln("Error at line %d:  No match expected, but one found.", line);
3700            failed = TRUE;
3701        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3702            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3703            failed = TRUE;
3704        }
3705        goto cleanupAndReturn;
3706    }
3707
3708    REGEX_CHECK_STATUS_L(line);
3709    for (i=0; i<=matcher->groupCount(); i++) {
3710        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3711        int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3712        if (matcher->start(i, status) != expectedStart) {
3713            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3714                line, i, expectedStart, matcher->start(i, status));
3715            failed = TRUE;
3716            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3717        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3718            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3719                  line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3720            failed = TRUE;
3721            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3722        }
3723
3724        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3725        int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3726        if (matcher->end(i, status) != expectedEnd) {
3727            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3728                line, i, expectedEnd, matcher->end(i, status));
3729            failed = TRUE;
3730            // Error on end position;  keep going; real error is probably yet to come as group
3731            //   end positions work from end of the input data towards the front.
3732        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3733            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3734                  line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3735            failed = TRUE;
3736            // Error on end position;  keep going; real error is probably yet to come as group
3737            //   end positions work from end of the input data towards the front.
3738        }
3739    }
3740    if ( matcher->groupCount()+1 < groupStarts.size()) {
3741        errln("Error at line %d: Expected %d capture groups, found %d.",
3742            line, groupStarts.size()-1, matcher->groupCount());
3743        failed = TRUE;
3744        }
3745    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3746        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3747              line, groupStarts.size()-1, UTF8Matcher->groupCount());
3748        failed = TRUE;
3749    }
3750
3751    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3752        matcher->requireEnd() == TRUE) {
3753        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3754        failed = TRUE;
3755    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3756        UTF8Matcher->requireEnd() == TRUE) {
3757        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3758        failed = TRUE;
3759    }
3760
3761    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3762        matcher->requireEnd() == FALSE) {
3763        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3764        failed = TRUE;
3765    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3766        UTF8Matcher->requireEnd() == FALSE) {
3767        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3768        failed = TRUE;
3769    }
3770
3771    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3772        matcher->hitEnd() == TRUE) {
3773        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3774        failed = TRUE;
3775    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3776               UTF8Matcher->hitEnd() == TRUE) {
3777        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3778        failed = TRUE;
3779    }
3780
3781    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3782        matcher->hitEnd() == FALSE) {
3783        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3784        failed = TRUE;
3785    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3786               UTF8Matcher->hitEnd() == FALSE) {
3787        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3788        failed = TRUE;
3789    }
3790
3791
3792cleanupAndReturn:
3793    if (failed) {
3794        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3795            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3796        // callerPattern->dump();
3797    }
3798    delete parseMatcher;
3799    delete parsePat;
3800    delete UTF8Matcher;
3801    delete UTF8Pattern;
3802    delete matcher;
3803    delete callerPattern;
3804
3805    utext_close(&inputText);
3806    delete[] inputChars;
3807    utext_close(&patternText);
3808    delete[] patternChars;
3809    ucnv_close(UTF8Converter);
3810}
3811
3812
3813
3814
3815//---------------------------------------------------------------------------
3816//
3817//      Errors     Check for error handling in patterns.
3818//
3819//---------------------------------------------------------------------------
// Checks that malformed patterns are rejected with the expected error.
// Each REGEX_ERR line gives a pattern, two numbers and an error code; the macro is
// defined outside this function.
// NOTE(review): the two numbers look like the expected line and column of the error
//   (e.g. "abc\ndef(*2)" expects 2, 5) - confirm against the REGEX_ERR definition.
void RegexTest::Errors() {
    // \escape sequences that aren't implemented yet.
    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);

    // Extra close paren
    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    // Unclosed open parens
    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

    // Look-ahead, Look-behind
    //  TODO:  add tests for unbounded length look-behinds.
    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct

    // Attempt to use non-default flags.
    //   Compiling with this flag combination, which includes UREGEX_CANON_EQ,
    //   is expected to report U_REGEX_UNIMPLEMENTED.
    {
        UParseError   pe;
        UErrorCode    status = U_ZERO_ERROR;
        int32_t       flags  = UREGEX_CANON_EQ |
                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
                               UREGEX_MULTILINE;
        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
        delete pat1;
    }


    // Quantifiers are allowed only after something that can be quantified.
    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);

    // Mal-formed {min,max} quantifiers
    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
    REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

    // Ticket 5389
    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

    // Invalid Back Reference \0
    //    For ICU 3.8 and earlier, \0 was an invalid back reference.
    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
    //    The pattern below is expected to fail with U_REGEX_BAD_ESCAPE_SEQUENCE.
    //
    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);

}
3877
3878
3879//-------------------------------------------------------------------------------
3880//
//  Read a text data file, convert it to UChars, and return the data
//    in one big UChar * buffer, which the caller must free with delete [].
3883//
3884//--------------------------------------------------------------------------------
3885UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3886                                     const char *defEncoding, UErrorCode &status) {
3887    UChar       *retPtr  = NULL;
3888    char        *fileBuf = NULL;
3889    UConverter* conv     = NULL;
3890    FILE        *f       = NULL;
3891
3892    ulen = 0;
3893    if (U_FAILURE(status)) {
3894        return retPtr;
3895    }
3896
3897    //
3898    //  Open the file.
3899    //
3900    f = fopen(fileName, "rb");
3901    if (f == 0) {
3902        dataerrln("Error opening test data file %s\n", fileName);
3903        status = U_FILE_ACCESS_ERROR;
3904        return NULL;
3905    }
3906    //
3907    //  Read it in
3908    //
3909    int32_t            fileSize;
3910    int32_t            amt_read;
3911
3912    fseek( f, 0, SEEK_END);
3913    fileSize = ftell(f);
3914    fileBuf = new char[fileSize];
3915    fseek(f, 0, SEEK_SET);
3916    amt_read = fread(fileBuf, 1, fileSize, f);
3917    if (amt_read != fileSize || fileSize <= 0) {
3918        errln("Error reading test data file.");
3919        goto cleanUpAndReturn;
3920    }
3921
3922    //
3923    // Look for a Unicode Signature (BOM) on the data just read
3924    //
3925    int32_t        signatureLength;
3926    const char *   fileBufC;
3927    const char*    encoding;
3928
3929    fileBufC = fileBuf;
3930    encoding = ucnv_detectUnicodeSignature(
3931        fileBuf, fileSize, &signatureLength, &status);
3932    if(encoding!=NULL ){
3933        fileBufC  += signatureLength;
3934        fileSize  -= signatureLength;
3935    } else {
3936        encoding = defEncoding;
3937        if (strcmp(encoding, "utf-8") == 0) {
3938            errln("file %s is missing its BOM", fileName);
3939        }
3940    }
3941
3942    //
3943    // Open a converter to take the rule file to UTF-16
3944    //
3945    conv = ucnv_open(encoding, &status);
3946    if (U_FAILURE(status)) {
3947        goto cleanUpAndReturn;
3948    }
3949
3950    //
3951    // Convert the rules to UChar.
3952    //  Preflight first to determine required buffer size.
3953    //
3954    ulen = ucnv_toUChars(conv,
3955        NULL,           //  dest,
3956        0,              //  destCapacity,
3957        fileBufC,
3958        fileSize,
3959        &status);
3960    if (status == U_BUFFER_OVERFLOW_ERROR) {
3961        // Buffer Overflow is expected from the preflight operation.
3962        status = U_ZERO_ERROR;
3963
3964        retPtr = new UChar[ulen+1];
3965        ucnv_toUChars(conv,
3966            retPtr,       //  dest,
3967            ulen+1,
3968            fileBufC,
3969            fileSize,
3970            &status);
3971    }
3972
3973cleanUpAndReturn:
3974    fclose(f);
3975    delete[] fileBuf;
3976    ucnv_close(conv);
3977    if (U_FAILURE(status)) {
3978        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3979        delete []retPtr;
3980        retPtr = 0;
3981        ulen   = 0;
3982    };
3983    return retPtr;
3984}
3985
3986
3987//-------------------------------------------------------------------------------
3988//
3989//   PerlTests  - Run Perl's regular expression tests
3990//                The input file for this test is re_tests, the standard regular
3991//                expression test data distributed with the Perl source code.
3992//
3993//                Here is Perl's description of the test data file:
3994//
3995//        # The tests are in a separate file 't/op/re_tests'.
3996//        # Each line in that file is a separate test.
3997//        # There are five columns, separated by tabs.
3998//        #
3999//        # Column 1 contains the pattern, optionally enclosed in C<''>.
4000//        # Modifiers can be put after the closing C<'>.
4001//        #
4002//        # Column 2 contains the string to be matched.
4003//        #
4004//        # Column 3 contains the expected result:
4005//        #     y   expect a match
4006//        #     n   expect no match
4007//        #     c   expect an error
4008//        # B   test exposes a known bug in Perl, should be skipped
4009//        # b   test exposes a known bug in Perl, should be skipped if noamp
4010//        #
4011//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4012//        #
4013//        # Column 4 contains a string, usually C<$&>.
4014//        #
4015//        # Column 5 contains the expected result of double-quote
4016//        # interpolating that string after the match, or start of error message.
4017//        #
4018//        # Column 6, if present, contains a reason why the test is skipped.
4019//        # This is printed with "skipped", for harness to pick up.
4020//        #
4021//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
4022//        #
4023//        # If you want to add a regular expression test that can't be expressed
4024//        # in this format, don't add it here: put it in op/pat.t instead.
4025//
4026//        For ICU, if field 3 contains an 'i', the test will be skipped.
//        Such a test exposes some known incompatibility between ICU and Perl regexps.
4028//        (The i is in addition to whatever was there before.)
4029//
4030//-------------------------------------------------------------------------------
4031void RegexTest::PerlTests() {
4032    char tdd[2048];
4033    const char *srcPath;
4034    UErrorCode  status = U_ZERO_ERROR;
4035    UParseError pe;
4036
4037    //
4038    //  Open and read the test data file.
4039    //
4040    srcPath=getPath(tdd, "re_tests.txt");
4041    if(srcPath==NULL) {
4042        return; /* something went wrong, error already output */
4043    }
4044
4045    int32_t    len;
4046    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4047    if (U_FAILURE(status)) {
4048        return; /* something went wrong, error already output */
4049    }
4050
4051    //
4052    //  Put the test data into a UnicodeString
4053    //
4054    UnicodeString testDataString(FALSE, testData, len);
4055
4056    //
4057    //  Regex to break the input file into lines, and strip the new lines.
4058    //     One line per match, capture group one is the desired data.
4059    //
4060    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4061    if (U_FAILURE(status)) {
4062        dataerrln("RegexPattern::compile() error");
4063        return;
4064    }
4065    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4066
4067    //
4068    //  Regex to split a test file line into fields.
4069    //    There are six fields, separated by tabs.
4070    //
4071    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4072
4073    //
4074    //  Regex to identify test patterns with flag settings, and to separate them.
4075    //    Test patterns with flags look like 'pattern'i
4076    //    Test patterns without flags are not quoted:   pattern
4077    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4078    //
4079    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4080    RegexMatcher* flagMat = flagPat->matcher(status);
4081
4082    //
4083    // The Perl tests reference several perl-isms, which are evaluated/substituted
4084    //   in the test data.  Not being perl, this must be done explicitly.  Here
4085    //   are string constants and REs for these constructs.
4086    //
4087    UnicodeString nulnulSrc("${nulnul}");
4088    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4089    nulnul = nulnul.unescape();
4090
4091    UnicodeString ffffSrc("${ffff}");
4092    UnicodeString ffff("\\uffff", -1, US_INV);
4093    ffff = ffff.unescape();
4094
4095    //  regexp for $-[0], $+[2], etc.
4096    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4097    RegexMatcher *groupsMat = groupsPat->matcher(status);
4098
4099    //  regexp for $0, $1, $2, etc.
4100    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4101    RegexMatcher *cgMat = cgPat->matcher(status);
4102
4103
4104    //
4105    // Main Loop for the Perl Tests, runs once per line from the
4106    //   test data file.
4107    //
4108    int32_t  lineNum = 0;
4109    int32_t  skippedUnimplementedCount = 0;
4110    while (lineMat->find()) {
4111        lineNum++;
4112
4113        //
4114        //  Get a line, break it into its fields, do the Perl
4115        //    variable substitutions.
4116        //
4117        UnicodeString line = lineMat->group(1, status);
4118        UnicodeString fields[7];
4119        fieldPat->split(line, fields, 7, status);
4120
4121        flagMat->reset(fields[0]);
4122        flagMat->matches(status);
4123        UnicodeString pattern  = flagMat->group(2, status);
4124        pattern.findAndReplace("${bang}", "!");
4125        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4126        pattern.findAndReplace(ffffSrc, ffff);
4127
4128        //
4129        //  Identify patterns that include match flag settings,
4130        //    split off the flags, remove the extra quotes.
4131        //
4132        UnicodeString flagStr = flagMat->group(3, status);
4133        if (U_FAILURE(status)) {
4134            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4135            return;
4136        }
4137        int32_t flags = 0;
4138        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4139        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4140        const UChar UChar_m = 0x6d;
4141        const UChar UChar_x = 0x78;
4142        const UChar UChar_y = 0x79;
4143        if (flagStr.indexOf(UChar_i) != -1) {
4144            flags |= UREGEX_CASE_INSENSITIVE;
4145        }
4146        if (flagStr.indexOf(UChar_m) != -1) {
4147            flags |= UREGEX_MULTILINE;
4148        }
4149        if (flagStr.indexOf(UChar_x) != -1) {
4150            flags |= UREGEX_COMMENTS;
4151        }
4152
4153        //
4154        // Compile the test pattern.
4155        //
4156        status = U_ZERO_ERROR;
4157        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4158        if (status == U_REGEX_UNIMPLEMENTED) {
4159            //
4160            // Test of a feature that is planned for ICU, but not yet implemented.
4161            //   skip the test.
4162            skippedUnimplementedCount++;
4163            delete testPat;
4164            status = U_ZERO_ERROR;
4165            continue;
4166        }
4167
4168        if (U_FAILURE(status)) {
4169            // Some tests are supposed to generate errors.
4170            //   Only report an error for tests that are supposed to succeed.
4171            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4172                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4173            {
4174                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4175            }
4176            status = U_ZERO_ERROR;
4177            delete testPat;
4178            continue;
4179        }
4180
4181        if (fields[2].indexOf(UChar_i) >= 0) {
4182            // ICU should skip this test.
4183            delete testPat;
4184            continue;
4185        }
4186
4187        if (fields[2].indexOf(UChar_c) >= 0) {
4188            // This pattern should have caused a compilation error, but didn't/
4189            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4190            delete testPat;
4191            continue;
4192        }
4193
4194        //
4195        // replace the Perl variables that appear in some of the
4196        //   match data strings.
4197        //
4198        UnicodeString matchString = fields[1];
4199        matchString.findAndReplace(nulnulSrc, nulnul);
4200        matchString.findAndReplace(ffffSrc,   ffff);
4201
4202        // Replace any \n in the match string with an actual new-line char.
4203        //  Don't do full unescape, as this unescapes more than Perl does, which
4204        //  causes other spurious failures in the tests.
4205        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4206
4207
4208
4209        //
4210        // Run the test, check for expected match/don't match result.
4211        //
4212        RegexMatcher *testMat = testPat->matcher(matchString, status);
4213        UBool found = testMat->find();
4214        UBool expected = FALSE;
4215        if (fields[2].indexOf(UChar_y) >=0) {
4216            expected = TRUE;
4217        }
4218        if (expected != found) {
4219            errln("line %d: Expected %smatch, got %smatch",
4220                lineNum, expected?"":"no ", found?"":"no " );
4221            continue;
4222        }
4223
4224        // Don't try to check expected results if there is no match.
4225        //   (Some have stuff in the expected fields)
4226        if (!found) {
4227            delete testMat;
4228            delete testPat;
4229            continue;
4230        }
4231
4232        //
4233        // Interpret the Perl expression from the fourth field of the data file,
4234        // building up an ICU string from the results of the ICU match.
4235        //   The Perl expression will contain references to the results of
4236        //     a regex match, including the matched string, capture group strings,
4237        //     group starting and ending indicies, etc.
4238        //
4239        UnicodeString resultString;
4240        UnicodeString perlExpr = fields[3];
4241#if SUPPORT_MUTATING_INPUT_STRING
4242        groupsMat->reset(perlExpr);
4243        cgMat->reset(perlExpr);
4244#endif
4245
4246        while (perlExpr.length() > 0) {
4247#if !SUPPORT_MUTATING_INPUT_STRING
4248            //  Perferred usage.  Reset after any modification to input string.
4249            groupsMat->reset(perlExpr);
4250            cgMat->reset(perlExpr);
4251#endif
4252
4253            if (perlExpr.startsWith("$&")) {
4254                resultString.append(testMat->group(status));
4255                perlExpr.remove(0, 2);
4256            }
4257
4258            else if (groupsMat->lookingAt(status)) {
4259                // $-[0]   $+[2]  etc.
4260                UnicodeString digitString = groupsMat->group(2, status);
4261                int32_t t = 0;
4262                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4263                UnicodeString plusOrMinus = groupsMat->group(1, status);
4264                int32_t matchPosition;
4265                if (plusOrMinus.compare("+") == 0) {
4266                    matchPosition = testMat->end(groupNum, status);
4267                } else {
4268                    matchPosition = testMat->start(groupNum, status);
4269                }
4270                if (matchPosition != -1) {
4271                    ICU_Utility::appendNumber(resultString, matchPosition);
4272                }
4273                perlExpr.remove(0, groupsMat->end(status));
4274            }
4275
4276            else if (cgMat->lookingAt(status)) {
4277                // $1, $2, $3, etc.
4278                UnicodeString digitString = cgMat->group(1, status);
4279                int32_t t = 0;
4280                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4281                if (U_SUCCESS(status)) {
4282                    resultString.append(testMat->group(groupNum, status));
4283                    status = U_ZERO_ERROR;
4284                }
4285                perlExpr.remove(0, cgMat->end(status));
4286            }
4287
4288            else if (perlExpr.startsWith("@-")) {
4289                int32_t i;
4290                for (i=0; i<=testMat->groupCount(); i++) {
4291                    if (i>0) {
4292                        resultString.append(" ");
4293                    }
4294                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4295                }
4296                perlExpr.remove(0, 2);
4297            }
4298
4299            else if (perlExpr.startsWith("@+")) {
4300                int32_t i;
4301                for (i=0; i<=testMat->groupCount(); i++) {
4302                    if (i>0) {
4303                        resultString.append(" ");
4304                    }
4305                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4306                }
4307                perlExpr.remove(0, 2);
4308            }
4309
4310            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4311                                                     //           or as an escaped sequence (e.g. \n)
4312                if (perlExpr.length() > 1) {
4313                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4314                }
4315                UChar c = perlExpr.charAt(0);
4316                switch (c) {
4317                case 'n':   c = '\n'; break;
4318                // add any other escape sequences that show up in the test expected results.
4319                }
4320                resultString.append(c);
4321                perlExpr.remove(0, 1);
4322            }
4323
4324            else  {
4325                // Any characters from the perl expression that we don't explicitly
4326                //  recognize before here are assumed to be literals and copied
4327                //  as-is to the expected results.
4328                resultString.append(perlExpr.charAt(0));
4329                perlExpr.remove(0, 1);
4330            }
4331
4332            if (U_FAILURE(status)) {
4333                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4334                break;
4335            }
4336        }
4337
4338        //
4339        // Expected Results Compare
4340        //
4341        UnicodeString expectedS(fields[4]);
4342        expectedS.findAndReplace(nulnulSrc, nulnul);
4343        expectedS.findAndReplace(ffffSrc,   ffff);
4344        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4345
4346
4347        if (expectedS.compare(resultString) != 0) {
4348            err("Line %d: Incorrect perl expression results.", lineNum);
4349            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4350        }
4351
4352        delete testMat;
4353        delete testPat;
4354    }
4355
4356    //
4357    // All done.  Clean up allocated stuff.
4358    //
4359    delete cgMat;
4360    delete cgPat;
4361
4362    delete groupsMat;
4363    delete groupsPat;
4364
4365    delete flagMat;
4366    delete flagPat;
4367
4368    delete lineMat;
4369    delete linePat;
4370
4371    delete fieldPat;
4372    delete [] testData;
4373
4374
4375    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4376
4377}
4378
4379
4380//-------------------------------------------------------------------------------
4381//
4382//   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4383//                  (instead of using UnicodeStrings) to test the alternate engine.
4384//                  The input file for this test is re_tests, the standard regular
4385//                  expression test data distributed with the Perl source code.
4386//                  See PerlTests() for more information.
4387//
4388//-------------------------------------------------------------------------------
4389void RegexTest::PerlTestsUTF8() {
4390    char tdd[2048];
4391    const char *srcPath;
4392    UErrorCode  status = U_ZERO_ERROR;
4393    UParseError pe;
4394    LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4395    UText       patternText = UTEXT_INITIALIZER;
4396    char       *patternChars = NULL;
4397    int32_t     patternLength;
4398    int32_t     patternCapacity = 0;
4399    UText       inputText = UTEXT_INITIALIZER;
4400    char       *inputChars = NULL;
4401    int32_t     inputLength;
4402    int32_t     inputCapacity = 0;
4403
4404    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4405
4406    //
4407    //  Open and read the test data file.
4408    //
4409    srcPath=getPath(tdd, "re_tests.txt");
4410    if(srcPath==NULL) {
4411        return; /* something went wrong, error already output */
4412    }
4413
4414    int32_t    len;
4415    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4416    if (U_FAILURE(status)) {
4417        return; /* something went wrong, error already output */
4418    }
4419
4420    //
4421    //  Put the test data into a UnicodeString
4422    //
4423    UnicodeString testDataString(FALSE, testData, len);
4424
4425    //
4426    //  Regex to break the input file into lines, and strip the new lines.
4427    //     One line per match, capture group one is the desired data.
4428    //
4429    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4430    if (U_FAILURE(status)) {
4431        dataerrln("RegexPattern::compile() error");
4432        return;
4433    }
4434    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4435
4436    //
4437    //  Regex to split a test file line into fields.
4438    //    There are six fields, separated by tabs.
4439    //
4440    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4441
4442    //
4443    //  Regex to identify test patterns with flag settings, and to separate them.
4444    //    Test patterns with flags look like 'pattern'i
4445    //    Test patterns without flags are not quoted:   pattern
4446    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4447    //
4448    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4449    RegexMatcher* flagMat = flagPat->matcher(status);
4450
4451    //
4452    // The Perl tests reference several perl-isms, which are evaluated/substituted
4453    //   in the test data.  Not being perl, this must be done explicitly.  Here
4454    //   are string constants and REs for these constructs.
4455    //
4456    UnicodeString nulnulSrc("${nulnul}");
4457    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4458    nulnul = nulnul.unescape();
4459
4460    UnicodeString ffffSrc("${ffff}");
4461    UnicodeString ffff("\\uffff", -1, US_INV);
4462    ffff = ffff.unescape();
4463
4464    //  regexp for $-[0], $+[2], etc.
4465    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4466    RegexMatcher *groupsMat = groupsPat->matcher(status);
4467
4468    //  regexp for $0, $1, $2, etc.
4469    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4470    RegexMatcher *cgMat = cgPat->matcher(status);
4471
4472
4473    //
4474    // Main Loop for the Perl Tests, runs once per line from the
4475    //   test data file.
4476    //
4477    int32_t  lineNum = 0;
4478    int32_t  skippedUnimplementedCount = 0;
4479    while (lineMat->find()) {
4480        lineNum++;
4481
4482        //
4483        //  Get a line, break it into its fields, do the Perl
4484        //    variable substitutions.
4485        //
4486        UnicodeString line = lineMat->group(1, status);
4487        UnicodeString fields[7];
4488        fieldPat->split(line, fields, 7, status);
4489
4490        flagMat->reset(fields[0]);
4491        flagMat->matches(status);
4492        UnicodeString pattern  = flagMat->group(2, status);
4493        pattern.findAndReplace("${bang}", "!");
4494        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4495        pattern.findAndReplace(ffffSrc, ffff);
4496
4497        //
4498        //  Identify patterns that include match flag settings,
4499        //    split off the flags, remove the extra quotes.
4500        //
4501        UnicodeString flagStr = flagMat->group(3, status);
4502        if (U_FAILURE(status)) {
4503            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4504            return;
4505        }
4506        int32_t flags = 0;
4507        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4508        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4509        const UChar UChar_m = 0x6d;
4510        const UChar UChar_x = 0x78;
4511        const UChar UChar_y = 0x79;
4512        if (flagStr.indexOf(UChar_i) != -1) {
4513            flags |= UREGEX_CASE_INSENSITIVE;
4514        }
4515        if (flagStr.indexOf(UChar_m) != -1) {
4516            flags |= UREGEX_MULTILINE;
4517        }
4518        if (flagStr.indexOf(UChar_x) != -1) {
4519            flags |= UREGEX_COMMENTS;
4520        }
4521
4522        //
4523        // Put the pattern in a UTF-8 UText
4524        //
4525        status = U_ZERO_ERROR;
4526        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4527        if (status == U_BUFFER_OVERFLOW_ERROR) {
4528            status = U_ZERO_ERROR;
4529            delete[] patternChars;
4530            patternCapacity = patternLength + 1;
4531            patternChars = new char[patternCapacity];
4532            pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4533        }
4534        utext_openUTF8(&patternText, patternChars, patternLength, &status);
4535
4536        //
4537        // Compile the test pattern.
4538        //
4539        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4540        if (status == U_REGEX_UNIMPLEMENTED) {
4541            //
4542            // Test of a feature that is planned for ICU, but not yet implemented.
4543            //   skip the test.
4544            skippedUnimplementedCount++;
4545            delete testPat;
4546            status = U_ZERO_ERROR;
4547            continue;
4548        }
4549
4550        if (U_FAILURE(status)) {
4551            // Some tests are supposed to generate errors.
4552            //   Only report an error for tests that are supposed to succeed.
4553            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4554                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4555            {
4556                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4557            }
4558            status = U_ZERO_ERROR;
4559            delete testPat;
4560            continue;
4561        }
4562
4563        if (fields[2].indexOf(UChar_i) >= 0) {
4564            // ICU should skip this test.
4565            delete testPat;
4566            continue;
4567        }
4568
4569        if (fields[2].indexOf(UChar_c) >= 0) {
4570            // This pattern should have caused a compilation error, but didn't/
4571            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4572            delete testPat;
4573            continue;
4574        }
4575
4576
4577        //
4578        // replace the Perl variables that appear in some of the
4579        //   match data strings.
4580        //
4581        UnicodeString matchString = fields[1];
4582        matchString.findAndReplace(nulnulSrc, nulnul);
4583        matchString.findAndReplace(ffffSrc,   ffff);
4584
4585        // Replace any \n in the match string with an actual new-line char.
4586        //  Don't do full unescape, as this unescapes more than Perl does, which
4587        //  causes other spurious failures in the tests.
4588        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4589
4590        //
4591        // Put the input in a UTF-8 UText
4592        //
4593        status = U_ZERO_ERROR;
4594        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4595        if (status == U_BUFFER_OVERFLOW_ERROR) {
4596            status = U_ZERO_ERROR;
4597            delete[] inputChars;
4598            inputCapacity = inputLength + 1;
4599            inputChars = new char[inputCapacity];
4600            matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4601        }
4602        utext_openUTF8(&inputText, inputChars, inputLength, &status);
4603
4604        //
4605        // Run the test, check for expected match/don't match result.
4606        //
4607        RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4608        UBool found = testMat->find();
4609        UBool expected = FALSE;
4610        if (fields[2].indexOf(UChar_y) >=0) {
4611            expected = TRUE;
4612        }
4613        if (expected != found) {
4614            errln("line %d: Expected %smatch, got %smatch",
4615                lineNum, expected?"":"no ", found?"":"no " );
4616            continue;
4617        }
4618
4619        // Don't try to check expected results if there is no match.
4620        //   (Some have stuff in the expected fields)
4621        if (!found) {
4622            delete testMat;
4623            delete testPat;
4624            continue;
4625        }
4626
4627        //
4628        // Interpret the Perl expression from the fourth field of the data file,
4629        // building up an ICU string from the results of the ICU match.
4630        //   The Perl expression will contain references to the results of
4631        //     a regex match, including the matched string, capture group strings,
4632        //     group starting and ending indicies, etc.
4633        //
4634        UnicodeString resultString;
4635        UnicodeString perlExpr = fields[3];
4636
4637        while (perlExpr.length() > 0) {
4638            groupsMat->reset(perlExpr);
4639            cgMat->reset(perlExpr);
4640
4641            if (perlExpr.startsWith("$&")) {
4642                resultString.append(testMat->group(status));
4643                perlExpr.remove(0, 2);
4644            }
4645
4646            else if (groupsMat->lookingAt(status)) {
4647                // $-[0]   $+[2]  etc.
4648                UnicodeString digitString = groupsMat->group(2, status);
4649                int32_t t = 0;
4650                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4651                UnicodeString plusOrMinus = groupsMat->group(1, status);
4652                int32_t matchPosition;
4653                if (plusOrMinus.compare("+") == 0) {
4654                    matchPosition = testMat->end(groupNum, status);
4655                } else {
4656                    matchPosition = testMat->start(groupNum, status);
4657                }
4658                if (matchPosition != -1) {
4659                    ICU_Utility::appendNumber(resultString, matchPosition);
4660                }
4661                perlExpr.remove(0, groupsMat->end(status));
4662            }
4663
4664            else if (cgMat->lookingAt(status)) {
4665                // $1, $2, $3, etc.
4666                UnicodeString digitString = cgMat->group(1, status);
4667                int32_t t = 0;
4668                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4669                if (U_SUCCESS(status)) {
4670                    resultString.append(testMat->group(groupNum, status));
4671                    status = U_ZERO_ERROR;
4672                }
4673                perlExpr.remove(0, cgMat->end(status));
4674            }
4675
4676            else if (perlExpr.startsWith("@-")) {
4677                int32_t i;
4678                for (i=0; i<=testMat->groupCount(); i++) {
4679                    if (i>0) {
4680                        resultString.append(" ");
4681                    }
4682                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4683                }
4684                perlExpr.remove(0, 2);
4685            }
4686
4687            else if (perlExpr.startsWith("@+")) {
4688                int32_t i;
4689                for (i=0; i<=testMat->groupCount(); i++) {
4690                    if (i>0) {
4691                        resultString.append(" ");
4692                    }
4693                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4694                }
4695                perlExpr.remove(0, 2);
4696            }
4697
4698            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4699                                                     //           or as an escaped sequence (e.g. \n)
4700                if (perlExpr.length() > 1) {
4701                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4702                }
4703                UChar c = perlExpr.charAt(0);
4704                switch (c) {
4705                case 'n':   c = '\n'; break;
4706                // add any other escape sequences that show up in the test expected results.
4707                }
4708                resultString.append(c);
4709                perlExpr.remove(0, 1);
4710            }
4711
4712            else  {
4713                // Any characters from the perl expression that we don't explicitly
4714                //  recognize before here are assumed to be literals and copied
4715                //  as-is to the expected results.
4716                resultString.append(perlExpr.charAt(0));
4717                perlExpr.remove(0, 1);
4718            }
4719
4720            if (U_FAILURE(status)) {
4721                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4722                break;
4723            }
4724        }
4725
4726        //
4727        // Expected Results Compare
4728        //
4729        UnicodeString expectedS(fields[4]);
4730        expectedS.findAndReplace(nulnulSrc, nulnul);
4731        expectedS.findAndReplace(ffffSrc,   ffff);
4732        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4733
4734
4735        if (expectedS.compare(resultString) != 0) {
4736            err("Line %d: Incorrect perl expression results.", lineNum);
4737            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4738        }
4739
4740        delete testMat;
4741        delete testPat;
4742    }
4743
4744    //
4745    // All done.  Clean up allocated stuff.
4746    //
4747    delete cgMat;
4748    delete cgPat;
4749
4750    delete groupsMat;
4751    delete groupsPat;
4752
4753    delete flagMat;
4754    delete flagPat;
4755
4756    delete lineMat;
4757    delete linePat;
4758
4759    delete fieldPat;
4760    delete [] testData;
4761
4762    utext_close(&patternText);
4763    utext_close(&inputText);
4764
4765    delete [] patternChars;
4766    delete [] inputChars;
4767
4768
4769    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4770
4771}
4772
4773
4774//--------------------------------------------------------------
4775//
4776//  Bug6149   Verify limits to heap expansion for backtrack stack.
4777//             Use this pattern,
4778//                 "(a?){1,8000000}"
//             Note: the upper bound was originally unbounded, but that case now has loop-breaking enabled.
4780//                   This test is likely to be fragile, as further optimizations stop
4781//                   more cases of pointless looping in the match engine.
4782//
4783//---------------------------------------------------------------
4784void RegexTest::Bug6149() {
4785    UnicodeString pattern("(a?){1,8000000}");
4786    UnicodeString s("xyz");
4787    uint32_t flags = 0;
4788    UErrorCode status = U_ZERO_ERROR;
4789
4790    RegexMatcher  matcher(pattern, s, flags, status);
4791    UBool result = false;
4792    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4793    REGEX_ASSERT(result == FALSE);
4794 }
4795
4796
4797//
4798//   Callbacks()    Test the callback function.
4799//                  When set, callbacks occur periodically during matching operations,
4800//                  giving the application code the ability to abort the operation
//                  before its normal completion.
4802//
4803
4804struct callBackContext {
4805    RegexTest        *test;
4806    int32_t          maxCalls;
4807    int32_t          numCalls;
4808    int32_t          lastSteps;
4809    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4810};
4811
4812U_CDECL_BEGIN
4813static UBool U_CALLCONV
4814testCallBackFn(const void *context, int32_t steps) {
4815    callBackContext  *info = (callBackContext *)context;
4816    if (info->lastSteps+1 != steps) {
4817        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4818    }
4819    info->lastSteps = steps;
4820    info->numCalls++;
4821    return (info->numCalls < info->maxCalls);
4822}
4823U_CDECL_END
4824
void RegexTest::Callbacks() {
   // Part 1: a matcher that never had a callback installed.
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void          *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   // Part 2: install testCallBackFn with cbInfo as its context, then run
   //         matches of increasing cost against the same matcher.
   {
        // Set and Get work
        callBackContext cbInfo = {this, 0, 0, 0};
        const void          *returnedContext;
        URegexMatchCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        //   callback, but not so long that the callback aborts it.
        //   (reset(4): testCallBackFn only returns FALSE on its 4th call.)
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        //   The abort is expected to surface as U_REGEX_STOPPED_BY_CALLER,
        //   with the callback having been called exactly maxCalls (4) times.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);

        // A longer running find that the callback function will abort.
        //   Same input and expectations as above, but driven through find().
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}
4898
4899
4900//
4901//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4905//
4906
4907struct progressCallBackContext {
4908    RegexTest        *test;
4909    int64_t          lastIndex;
4910    int32_t          maxCalls;
4911    int32_t          numCalls;
4912    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4913};
4914
4915// call-back function for find().
4916// Return TRUE to continue the find().
4917// Return FALSE to stop the find().
4918U_CDECL_BEGIN
4919static UBool U_CALLCONV
4920testProgressCallBackFn(const void *context, int64_t matchIndex) {
4921    progressCallBackContext  *info = (progressCallBackContext *)context;
4922    info->numCalls++;
4923    info->lastIndex = matchIndex;
4924//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4925    return (info->numCalls < info->maxCalls);
4926}
4927U_CDECL_END
4928
void RegexTest::FindProgressCallbacks() {
   // Part 1: a matcher that never had a find-progress callback installed.
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void                  *returnedContext = &returnedContext;
        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   // Part 2: install testProgressCallBackFn with cbInfo as its context, then
   //         run find() operations of increasing cost against the same matcher.
   {
        // Set and Get work
        progressCallBackContext cbInfo = {this, 0, 0, 0};
        const void                  *returnedContext;
        URegexFindProgressCallback  *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
        REGEX_CHECK_STATUS;
        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A find that matches on the initial position does NOT invoke the callback.
        status = U_ZERO_ERROR;
        cbInfo.reset(100);
        UnicodeString s = "aaxxx";
        matcher.reset(s);
#if 0
        matcher.setTrace(TRUE);
#endif
        REGEX_ASSERT(matcher.find(0, status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium running find() that causes matcher.find() to invoke our callback for each index,
        //   but not so many times that we interrupt the operation.
        // NOTE(review): the limit passed to reset() below is exactly s.length(), not a value
        //   greater than it as the trailing comment says - confirm which is intended.
        status = U_ZERO_ERROR;
        s = "aaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
        //   The callback returns FALSE on call number (s1.length() - 5), which is expected
        //   to surface as U_REGEX_STOPPED_BY_CALLER.
        status = U_ZERO_ERROR;
        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
        matcher.reset(s1);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

        // Now a match that will succeed, but after an interruption
        status = U_ZERO_ERROR;
        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
        matcher.reset(s2);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        // Now retry the match from where left off
        //   (cbInfo.lastIndex is the last matchIndex the callback was handed;
        //    numCalls is not reset here, only the limit is raised.)
        cbInfo.maxCalls = 100; //  No callback limit
        status = U_ZERO_ERROR;
        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
        REGEX_CHECK_STATUS;
    }


}
5009
5010
5011//---------------------------------------------------------------------------
5012//
5013//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
5014//                             UTexts. The pure-C implementation of UText
5015//                             has no mutable backing stores, but we can
5016//                             use UnicodeString here to test the functionality.
5017//
5018//---------------------------------------------------------------------------
5019void RegexTest::PreAllocatedUTextCAPI () {
5020    UErrorCode           status = U_ZERO_ERROR;
5021    URegularExpression  *re;
5022    UText                patternText = UTEXT_INITIALIZER;
5023    UnicodeString        buffer;
5024    UText                bufferText = UTEXT_INITIALIZER;
5025
5026    utext_openUnicodeString(&bufferText, &buffer, &status);
5027
5028    /*
5029     *  getText() and getUText()
5030     */
5031    {
5032        UText  text1 = UTEXT_INITIALIZER;
5033        UText  text2 = UTEXT_INITIALIZER;
5034        UChar  text2Chars[20];
5035        UText  *resultText;
5036
5037        status = U_ZERO_ERROR;
5038        regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5039        regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5040        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5041        utext_openUChars(&text2, text2Chars, -1, &status);
5042
5043        regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5044        re = uregex_openUText(&patternText, 0, NULL, &status);
5045
5046        /* First set a UText */
5047        uregex_setUText(re, &text1, &status);
5048        resultText = uregex_getUText(re, &bufferText, &status);
5049        REGEX_CHECK_STATUS;
5050        REGEX_ASSERT(resultText == &bufferText);
5051        utext_setNativeIndex(resultText, 0);
5052        utext_setNativeIndex(&text1, 0);
5053        REGEX_ASSERT(testUTextEqual(resultText, &text1));
5054
5055        resultText = uregex_getUText(re, &bufferText, &status);
5056        REGEX_CHECK_STATUS;
5057        REGEX_ASSERT(resultText == &bufferText);
5058        utext_setNativeIndex(resultText, 0);
5059        utext_setNativeIndex(&text1, 0);
5060        REGEX_ASSERT(testUTextEqual(resultText, &text1));
5061
5062        /* Then set a UChar * */
5063        uregex_setText(re, text2Chars, 7, &status);
5064        resultText = uregex_getUText(re, &bufferText, &status);
5065        REGEX_CHECK_STATUS;
5066        REGEX_ASSERT(resultText == &bufferText);
5067        utext_setNativeIndex(resultText, 0);
5068        utext_setNativeIndex(&text2, 0);
5069        REGEX_ASSERT(testUTextEqual(resultText, &text2));
5070
5071        uregex_close(re);
5072        utext_close(&text1);
5073        utext_close(&text2);
5074    }
5075
5076    /*
5077     *  group()
5078     */
5079    {
5080        UChar    text1[80];
5081        UText   *actual;
5082        UBool    result;
5083        int64_t  length = 0;
5084
5085        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5086        //                  012345678901234567890123456789012345678901234567
5087        //                  0         1         2         3         4
5088
5089        status = U_ZERO_ERROR;
5090        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5091        REGEX_CHECK_STATUS;
5092
5093        uregex_setText(re, text1, -1, &status);
5094        result = uregex_find(re, 0, &status);
5095        REGEX_ASSERT(result==TRUE);
5096
5097        /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5098        status = U_ZERO_ERROR;
5099        actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5100        REGEX_CHECK_STATUS;
5101        REGEX_ASSERT(actual == &bufferText);
5102        REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5103        REGEX_ASSERT(length == 16);
5104        REGEX_ASSERT(utext_nativeLength(actual) == 47);
5105
5106        /*  Capture group #1.  Should succeed, matching " interior ". */
5107        status = U_ZERO_ERROR;
5108        actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5109        REGEX_CHECK_STATUS;
5110        REGEX_ASSERT(actual == &bufferText);
5111        REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5112        REGEX_ASSERT(length == 10);
5113        REGEX_ASSERT(utext_nativeLength(actual) == 47);
5114
5115        /*  Capture group out of range.  Error. */
5116        status = U_ZERO_ERROR;
5117        actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5118        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5119        REGEX_ASSERT(actual == &bufferText);
5120        uregex_close(re);
5121
5122    }
5123
5124    /*
5125     *  replaceFirst()
5126     */
5127    {
5128        UChar    text1[80];
5129        UChar    text2[80];
5130        UText    replText = UTEXT_INITIALIZER;
5131        UText   *result;
5132        status = U_ZERO_ERROR;
5133        utext_openUnicodeString(&bufferText, &buffer, &status);
5134
5135        status = U_ZERO_ERROR;
5136        u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5137        u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5138        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5139
5140        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5141        REGEX_CHECK_STATUS;
5142
5143        /*  Normal case, with match */
5144        uregex_setText(re, text1, -1, &status);
5145        REGEX_CHECK_STATUS;
5146        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5147        REGEX_CHECK_STATUS;
5148        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5149        REGEX_CHECK_STATUS;
5150        REGEX_ASSERT(result == &bufferText);
5151        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5152
5153        /* No match.  Text should copy to output with no changes.  */
5154        uregex_setText(re, text2, -1, &status);
5155        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5156        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5157        REGEX_CHECK_STATUS;
5158        REGEX_ASSERT(result == &bufferText);
5159        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5160
5161        /* Unicode escapes */
5162        uregex_setText(re, text1, -1, &status);
5163        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5164        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5165        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5166        REGEX_CHECK_STATUS;
5167        REGEX_ASSERT(result == &bufferText);
5168        REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5169
5170        uregex_close(re);
5171        utext_close(&replText);
5172    }
5173
5174
5175    /*
5176     *  replaceAll()
5177     */
5178    {
5179        UChar    text1[80];
5180        UChar    text2[80];
5181        UText    replText = UTEXT_INITIALIZER;
5182        UText   *result;
5183
5184        status = U_ZERO_ERROR;
5185        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5186        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5187        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5188
5189        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5190        REGEX_CHECK_STATUS;
5191
5192        /*  Normal case, with match */
5193        uregex_setText(re, text1, -1, &status);
5194        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5195        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5196        REGEX_CHECK_STATUS;
5197        REGEX_ASSERT(result == &bufferText);
5198        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5199
5200        /* No match.  Text should copy to output with no changes.  */
5201        uregex_setText(re, text2, -1, &status);
5202        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5203        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5204        REGEX_CHECK_STATUS;
5205        REGEX_ASSERT(result == &bufferText);
5206        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5207
5208        uregex_close(re);
5209        utext_close(&replText);
5210    }
5211
5212
5213    /*
5214     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5215     *   so we don't need to test it here.
5216     */
5217
5218    utext_close(&bufferText);
5219    utext_close(&patternText);
5220}
5221
5222
5223//--------------------------------------------------------------
5224//
5225//  NamedCapture   Check basic named capture group functionality
5226//
5227//--------------------------------------------------------------
5228void RegexTest::NamedCapture() {
5229    UErrorCode status = U_ZERO_ERROR;
5230    RegexPattern *pat = RegexPattern::compile(UnicodeString(
5231            "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5232    REGEX_CHECK_STATUS;
5233    int32_t group = pat->groupNumberFromName("five", -1, status);
5234    REGEX_CHECK_STATUS;
5235    REGEX_ASSERT(5 == group);
5236    group = pat->groupNumberFromName("three", -1, status);
5237    REGEX_CHECK_STATUS;
5238    REGEX_ASSERT(3 == group);
5239
5240    status = U_ZERO_ERROR;
5241    group = pat->groupNumberFromName(UnicodeString("six"), status);
5242    REGEX_CHECK_STATUS;
5243    REGEX_ASSERT(6 == group);
5244
5245    status = U_ZERO_ERROR;
5246    group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5247    U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5248
5249    status = U_ZERO_ERROR;
5250
5251    // After copying a pattern, named capture should still work in the copy.
5252    RegexPattern *copiedPat = new RegexPattern(*pat);
5253    REGEX_ASSERT(*copiedPat == *pat);
5254    delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5255
5256    group = copiedPat->groupNumberFromName("five", -1, status);
5257    REGEX_CHECK_STATUS;
5258    REGEX_ASSERT(5 == group);
5259    group = copiedPat->groupNumberFromName("three", -1, status);
5260    REGEX_CHECK_STATUS;
5261    REGEX_ASSERT(3 == group);
5262    delete copiedPat;
5263
5264    // ReplaceAll with named capture group.
5265    status = U_ZERO_ERROR;
5266    UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5267    RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5268    REGEX_CHECK_STATUS;
5269    // m.pattern().dumpPattern();
5270    UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5271    REGEX_CHECK_STATUS;
5272    REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5273    delete m;
5274
5275    // ReplaceAll, allowed capture group numbers.
5276    text = UnicodeString("abcmxyz");
5277    m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5278    REGEX_CHECK_STATUS;
5279
5280    status = U_ZERO_ERROR;
5281    replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5282    REGEX_CHECK_STATUS;
5283    REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5284
5285    status = U_ZERO_ERROR;
5286    replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5287    REGEX_CHECK_STATUS;
5288    REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5289
5290    status = U_ZERO_ERROR;
5291    replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5292    REGEX_CHECK_STATUS;
5293    REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5294
5295    status = U_ZERO_ERROR;
5296    replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5297    REGEX_CHECK_STATUS;
5298    REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5299
5300    status = U_ZERO_ERROR;
5301    replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5302    REGEX_CHECK_STATUS;
5303    REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5304
5305    status = U_ZERO_ERROR;
5306    replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5307    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5308
5309    status = U_ZERO_ERROR;
5310    replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5311    REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5312    REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5313
5314    status = U_ZERO_ERROR;
5315    replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5316    REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5317    REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5318
5319    status = U_ZERO_ERROR;
5320    replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5321    REGEX_CHECK_STATUS;
5322    REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5323
5324    status = U_ZERO_ERROR;
5325    replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5326    REGEX_CHECK_STATUS;
5327    REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5328
5329    status = U_ZERO_ERROR;
5330    replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5331    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5332
5333    status = U_ZERO_ERROR;
5334    replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5335    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5336
5337    status = U_ZERO_ERROR;
5338    replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5339    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5340
5341    status = U_ZERO_ERROR;
5342    replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5343    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5344
5345    delete m;
5346
5347    // Repeat the above replaceAll() tests using the plain C API, which
5348    //  has a separate implementation internally.
5349    //  TODO: factor out the test data.
5350
5351    status = U_ZERO_ERROR;
5352    URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5353    REGEX_CHECK_STATUS;
5354    text = UnicodeString("abcmxyz");
5355    uregex_setText(re, text.getBuffer(), text.length(), &status);
5356    REGEX_CHECK_STATUS;
5357
5358    UChar resultBuf[100];
5359    int32_t resultLength;
5360    UnicodeString repl;
5361
5362    status = U_ZERO_ERROR;
5363    repl = UnicodeString("<$0>");
5364    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5365    REGEX_CHECK_STATUS;
5366    REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5367
5368    status = U_ZERO_ERROR;
5369    repl = UnicodeString("<$1>");
5370    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5371    REGEX_CHECK_STATUS;
5372    REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5373
5374    status = U_ZERO_ERROR;
5375    repl = UnicodeString("<${one}>");
5376    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5377    REGEX_CHECK_STATUS;
5378    REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5379
5380    status = U_ZERO_ERROR;
5381    repl = UnicodeString("<$2>");
5382    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5383    REGEX_CHECK_STATUS;
5384    REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5385
5386    status = U_ZERO_ERROR;
5387    repl = UnicodeString("<$3>");
5388    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5389    REGEX_CHECK_STATUS;
5390    REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5391
5392    status = U_ZERO_ERROR;
5393    repl = UnicodeString("<$4>");
5394    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5395    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5396
5397    status = U_ZERO_ERROR;
5398    repl = UnicodeString("<$04>");
5399    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5400    REGEX_CHECK_STATUS;
5401    REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5402
5403    status = U_ZERO_ERROR;
5404    repl = UnicodeString("<$000016>");
5405    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5406    REGEX_CHECK_STATUS;
5407    REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5408
5409    status = U_ZERO_ERROR;
5410    repl = UnicodeString("<$3$2$1${one}>");
5411    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5412    REGEX_CHECK_STATUS;
5413    REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5414
5415    status = U_ZERO_ERROR;
5416    repl = UnicodeString("$3$2$1${one}");
5417    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5418    REGEX_CHECK_STATUS;
5419    REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5420
5421    status = U_ZERO_ERROR;
5422    repl = UnicodeString("<${noSuchName}>");
5423    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5424    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5425
5426    status = U_ZERO_ERROR;
5427    repl = UnicodeString("<${invalid-name}>");
5428    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5429    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5430
5431    status = U_ZERO_ERROR;
5432    repl = UnicodeString("<${one");
5433    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5434    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5435
5436    status = U_ZERO_ERROR;
5437    repl = UnicodeString("$not a capture group");
5438    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5439    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5440
5441    uregex_close(re);
5442}
5443
5444//--------------------------------------------------------------
5445//
5446//  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5447//                       The point is not so much what the exact limit is,
5448//                       but that a largish number doesn't hit bad non-linear performance,
5449//                       and that exceeding the limit fails cleanly.
5450//
5451//--------------------------------------------------------------
5452void RegexTest::NamedCaptureLimits() {
5453    if (quick) {
5454        logln("Skipping test. Runs in exhuastive mode only.");
5455        return;
5456    }
5457    const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5458    const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5459    char nnbuf[100];
5460    UnicodeString pattern;
5461    int32_t nn;
5462
5463    for (nn=1; nn<goodLimit; nn++) {
5464        sprintf(nnbuf, "(?<nn%d>)", nn);
5465        pattern.append(UnicodeString(nnbuf, -1, US_INV));
5466    }
5467    UErrorCode status = U_ZERO_ERROR;
5468    RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5469    REGEX_CHECK_STATUS;
5470    for (nn=1; nn<goodLimit; nn++) {
5471        sprintf(nnbuf, "nn%d", nn);
5472        int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5473        REGEX_ASSERT(nn == groupNum);
5474        if (nn != groupNum) {
5475            break;
5476        }
5477    }
5478    delete pat;
5479
5480    pattern.remove();
5481    for (nn=1; nn<failLimit; nn++) {
5482        sprintf(nnbuf, "(?<nn%d>)", nn);
5483        pattern.append(UnicodeString(nnbuf, -1, US_INV));
5484    }
5485    status = U_ZERO_ERROR;
5486    pat = RegexPattern::compile(pattern, 0, status);
5487    REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5488    delete pat;
5489}
5490
5491
5492//--------------------------------------------------------------
5493//
5494//  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5495//
5496//---------------------------------------------------------------
5497void RegexTest::Bug7651() {
5498    UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5499    //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5500    //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5501    UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5502    UnicodeString s("#ff @abcd This is test");
5503    RegexPattern  *REPattern = NULL;
5504    RegexMatcher  *REMatcher = NULL;
5505    UErrorCode status = U_ZERO_ERROR;
5506    UParseError pe;
5507
5508    REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5509    REGEX_CHECK_STATUS;
5510    REMatcher = REPattern->matcher(s, status);
5511    REGEX_CHECK_STATUS;
5512    REGEX_ASSERT(REMatcher->find());
5513    REGEX_ASSERT(REMatcher->start(status) == 0);
5514    delete REPattern;
5515    delete REMatcher;
5516    status = U_ZERO_ERROR;
5517
5518    REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5519    REGEX_CHECK_STATUS;
5520    REMatcher = REPattern->matcher(s, status);
5521    REGEX_CHECK_STATUS;
5522    REGEX_ASSERT(REMatcher->find());
5523    REGEX_ASSERT(REMatcher->start(status) == 0);
5524    delete REPattern;
5525    delete REMatcher;
5526    status = U_ZERO_ERROR;
5527 }
5528
5529void RegexTest::Bug7740() {
5530    UErrorCode status = U_ZERO_ERROR;
5531    UnicodeString pattern = "(a)";
5532    UnicodeString text = "abcdef";
5533    RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5534    REGEX_CHECK_STATUS;
5535    REGEX_ASSERT(m->lookingAt(status));
5536    REGEX_CHECK_STATUS;
5537    status = U_ILLEGAL_ARGUMENT_ERROR;
5538    UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5539    REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5540    REGEX_ASSERT(s == "");
5541    delete m;
5542}
5543
5544// Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5545
5546void RegexTest::Bug8479() {
5547    UErrorCode status = U_ZERO_ERROR;
5548
5549    RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5550    REGEX_CHECK_STATUS;
5551    if (U_SUCCESS(status))
5552    {
5553        UnicodeString str;
5554        str.setToBogus();
5555        pMatcher->reset(str);
5556        status = U_ZERO_ERROR;
5557        pMatcher->matches(status);
5558        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5559        delete pMatcher;
5560    }
5561}
5562
5563
5564// Bug 7029
5565void RegexTest::Bug7029() {
5566    UErrorCode status = U_ZERO_ERROR;
5567
5568    RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5569    UnicodeString text = "abc.def";
5570    UnicodeString splits[10];
5571    REGEX_CHECK_STATUS;
5572    int32_t numFields = pMatcher->split(text, splits, 10, status);
5573    REGEX_CHECK_STATUS;
5574    REGEX_ASSERT(numFields == 8);
5575    delete pMatcher;
5576}
5577
5578// Bug 9283
5579//   This test is checking for the existance of any supplemental characters that case-fold
5580//   to a bmp character.
5581//
5582//   At the time of this writing there are none. If any should appear in a subsequent release
5583//   of Unicode, the code in regular expressions compilation that determines the longest
5584//   posssible match for a literal string  will need to be enhanced.
5585//
5586//   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5587//   for details on what to do in case of a failure of this test.
5588//
5589void RegexTest::Bug9283() {
5590#if !UCONFIG_NO_NORMALIZATION
5591    UErrorCode status = U_ZERO_ERROR;
5592    UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5593    REGEX_CHECK_STATUS;
5594    int32_t index;
5595    UChar32 c;
5596    for (index=0; ; index++) {
5597        c = supplementalsWithCaseFolding.charAt(index);
5598        if (c == -1) {
5599            break;
5600        }
5601        UnicodeString cf = UnicodeString(c).foldCase();
5602        REGEX_ASSERT(cf.length() >= 2);
5603    }
5604#endif /* #if !UCONFIG_NO_NORMALIZATION */
5605}
5606
5607
5608void RegexTest::CheckInvBufSize() {
5609  if(inv_next>=INV_BUFSIZ) {
5610    errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5611          __FILE__, INV_BUFSIZ, inv_next);
5612  } else {
5613    logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5614  }
5615}
5616
5617
5618void RegexTest::Bug10459() {
5619    UErrorCode status = U_ZERO_ERROR;
5620    UnicodeString patternString("(txt)");
5621    UnicodeString txtString("txt");
5622
5623    UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5624    REGEX_CHECK_STATUS;
5625    UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5626    REGEX_CHECK_STATUS;
5627
5628    URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5629    REGEX_CHECK_STATUS;
5630
5631    uregex_setUText(icu_re, utext_txt, &status);
5632    REGEX_CHECK_STATUS;
5633
5634    // The bug was that calling uregex_group() before doing a matching operation
5635    //   was causing a segfault. Only for Regular Expressions created from UText.
5636    //   It should set an U_REGEX_INVALID_STATE.
5637
5638    UChar buf[100];
5639    int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5640    REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5641    REGEX_ASSERT(len == 0);
5642
5643    uregex_close(icu_re);
5644    utext_close(utext_pat);
5645    utext_close(utext_txt);
5646}
5647
5648void RegexTest::TestCaseInsensitiveStarters() {
5649    // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5650    //  become stale because of new Unicode characters.
5651    // If it is stale, rerun the generation tool
5652    //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5653    // and replace the embedded data in i18n/regexcmp.cpp
5654
5655    for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5656        if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5657            continue;
5658        }
5659        UnicodeSet s(cp, cp);
5660        s.closeOver(USET_CASE_INSENSITIVE);
5661        UnicodeSetIterator setIter(s);
5662        while (setIter.next()) {
5663            if (!setIter.isString()) {
5664                continue;
5665            }
5666            const UnicodeString &str = setIter.getString();
5667            UChar32 firstChar = str.char32At(0);
5668            UnicodeSet starters;
5669            RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5670            if (!starters.contains(cp)) {
5671                errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5672                return;
5673            }
5674        }
5675    }
5676}
5677
5678
5679void RegexTest::TestBug11049() {
5680    // Original bug report: pattern with match start consisting of one of several individual characters,
5681    //  and the text being matched ending with a supplementary character. find() would read past the
5682    //  end of the input text when searching for potential match starting points.
5683
5684    // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5685    // detect the bad read.
5686
5687    TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5688    TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5689
5690    // Test again with a pattern starting with a single character,
5691    // which takes a different code path than starting with an OR expression,
5692    // but with similar logic.
5693    TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5694    TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5695}
5696
5697// Run a single test case from TestBug11049(). Internal function.
5698void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5699    UErrorCode status = U_ZERO_ERROR;
5700    UnicodeString patternString = UnicodeString(pattern).unescape();
5701    LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5702
5703    UnicodeString dataString = UnicodeString(data).unescape();
5704    UChar *exactBuffer = new UChar[dataString.length()];
5705    dataString.extract(exactBuffer, dataString.length(), status);
5706    UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5707
5708    LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5709    REGEX_CHECK_STATUS;
5710    matcher->reset(ut);
5711    UBool result = matcher->find();
5712    if (result != expectMatch) {
5713        errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5714              __FILE__, lineNumber, expectMatch, result, pattern, data);
5715    }
5716
5717    // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5718    //   off-by-one on find() with match at the last code point.
5719    //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5720    //   because string.unescape() will only shrink it.
5721    char * utf8Buffer = new char[uprv_strlen(data)+1];
5722    u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5723    REGEX_CHECK_STATUS;
5724    ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5725    REGEX_CHECK_STATUS;
5726    matcher->reset(ut);
5727    result = matcher->find();
5728    if (result != expectMatch) {
5729        errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5730              __FILE__, lineNumber, expectMatch, result, pattern, data);
5731    }
5732    delete [] utf8Buffer;
5733
5734    utext_close(ut);
5735    delete [] exactBuffer;
5736}
5737
5738
5739void RegexTest::TestBug11371() {
5740    if (quick) {
5741        logln("Skipping test. Runs in exhuastive mode only.");
5742        return;
5743    }
5744    UErrorCode status = U_ZERO_ERROR;
5745    UnicodeString patternString;
5746
5747    for (int i=0; i<8000000; i++) {
5748        patternString.append(UnicodeString("()"));
5749    }
5750    LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5751    if (status != U_REGEX_PATTERN_TOO_BIG) {
5752        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5753              __FILE__, __LINE__, u_errorName(status));
5754    }
5755
5756    status = U_ZERO_ERROR;
5757    patternString = "(";
5758    for (int i=0; i<20000000; i++) {
5759        patternString.append(UnicodeString("A++"));
5760    }
5761    patternString.append(UnicodeString("){0}B++"));
5762    LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5763    if (status != U_REGEX_PATTERN_TOO_BIG) {
5764        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5765              __FILE__, __LINE__, u_errorName(status));
5766    }
5767
5768    // Pattern with too much string data, such that string indexes overflow operand data field size
5769    // in compiled instruction.
5770    status = U_ZERO_ERROR;
5771    patternString = "";
5772    while (patternString.length() < 0x00ffffff) {
5773        patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5774    }
5775    patternString.append(UnicodeString("X? trailing string"));
5776    LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5777    if (status != U_REGEX_PATTERN_TOO_BIG) {
5778        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5779              __FILE__, __LINE__, u_errorName(status));
5780    }
5781}
5782
5783void RegexTest::TestBug11480() {
5784    // C API, get capture group of a group that does not participate in the match.
5785    //        (Returns a zero length string, with nul termination,
5786    //         indistinguishable from a group with a zero length match.)
5787
5788    UErrorCode status = U_ZERO_ERROR;
5789    URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5790    REGEX_CHECK_STATUS;
5791    UnicodeString text = UNICODE_STRING_SIMPLE("A");
5792    uregex_setText(re, text.getBuffer(), text.length(), &status);
5793    REGEX_CHECK_STATUS;
5794    REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5795    UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5796    int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5797    REGEX_ASSERT(length == 0);
5798    REGEX_ASSERT(buf[0] == 13);
5799    REGEX_ASSERT(buf[1] == 0);
5800    REGEX_ASSERT(buf[2] == 13);
5801    uregex_close(re);
5802
5803    // UText C++ API, length of match is 0 for non-participating matches.
5804    UText ut = UTEXT_INITIALIZER;
5805    utext_openUnicodeString(&ut, &text, &status);
5806    RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5807    REGEX_CHECK_STATUS;
5808    matcher.reset(&ut);
5809    REGEX_ASSERT(matcher.lookingAt(0, status));
5810
5811    // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5812    int64_t groupLen = -666;
5813    UText group = UTEXT_INITIALIZER;
5814    matcher.group(1, &group, groupLen, status);
5815    REGEX_CHECK_STATUS;
5816    REGEX_ASSERT(groupLen == 1);
5817    REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5818
5819    // Capture group 2, the (B), does not participate in the match.
5820    matcher.group(2, &group, groupLen, status);
5821    REGEX_CHECK_STATUS;
5822    REGEX_ASSERT(groupLen == 0);
5823    REGEX_ASSERT(matcher.start(2, status) == -1);
5824    REGEX_CHECK_STATUS;
5825}
5826
5827
5828#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5829