1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8//   regextst.cpp
9//
10//      ICU Regular Expressions test, part of intltest.
11//
12
13/*
14     NOTE!!
15
16     PLEASE be careful about ASCII assumptions in this test.
17     This test is one of the worst repeat offenders.
18     If you have questions, contact someone on the ICU PMC
19     who has access to an EBCDIC system.
20
21 */
22
23#include "intltest.h"
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26#include "unicode/localpointer.h"
27#include "unicode/regex.h"
28#include "unicode/uchar.h"
29#include "unicode/ucnv.h"
30#include "unicode/uniset.h"
31#include "unicode/uregex.h"
32#include "unicode/usetiter.h"
33#include "unicode/ustring.h"
34#include "regextst.h"
35#include "regexcmp.h"
36#include "uvector.h"
37#include "util.h"
38#include <stdlib.h>
39#include <string.h>
40#include <stdio.h>
41#include "cmemory.h"
42#include "cstring.h"
43#include "uinvchar.h"
44
45#define SUPPORT_MUTATING_INPUT_STRING   0
46
47//---------------------------------------------------------------------------
48//
49//  Test class boilerplate
50//
51//---------------------------------------------------------------------------
52RegexTest::RegexTest()
53{
54}
55
56
57RegexTest::~RegexTest()
58{
59}
60
61
62
63void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
64{
65    if (exec) logln("TestSuite RegexTest: ");
66    switch (index) {
67
68        case 0: name = "Basic";
69            if (exec) Basic();
70            break;
71        case 1: name = "API_Match";
72            if (exec) API_Match();
73            break;
74        case 2: name = "API_Replace";
75            if (exec) API_Replace();
76            break;
77        case 3: name = "API_Pattern";
78            if (exec) API_Pattern();
79            break;
80        case 4:
81#if !UCONFIG_NO_FILE_IO
82            name = "Extended";
83            if (exec) Extended();
84#else
85            name = "skip";
86#endif
87            break;
88        case 5: name = "Errors";
89            if (exec) Errors();
90            break;
91        case 6: name = "PerlTests";
92            if (exec) PerlTests();
93            break;
94        case 7: name = "Callbacks";
95            if (exec) Callbacks();
96            break;
97        case 8: name = "FindProgressCallbacks";
98            if (exec) FindProgressCallbacks();
99            break;
100        case 9: name = "Bug 6149";
101             if (exec) Bug6149();
102             break;
103        case 10: name = "UTextBasic";
104          if (exec) UTextBasic();
105          break;
106        case 11: name = "API_Match_UTF8";
107          if (exec) API_Match_UTF8();
108          break;
109        case 12: name = "API_Replace_UTF8";
110          if (exec) API_Replace_UTF8();
111          break;
112        case 13: name = "API_Pattern_UTF8";
113          if (exec) API_Pattern_UTF8();
114          break;
115        case 14: name = "PerlTestsUTF8";
116          if (exec) PerlTestsUTF8();
117          break;
118        case 15: name = "PreAllocatedUTextCAPI";
119          if (exec) PreAllocatedUTextCAPI();
120          break;
121        case 16: name = "Bug 7651";
122             if (exec) Bug7651();
123             break;
124        case 17: name = "Bug 7740";
125            if (exec) Bug7740();
126            break;
127        case 18: name = "Bug 8479";
128            if (exec) Bug8479();
129            break;
130        case 19: name = "Bug 7029";
131            if (exec) Bug7029();
132            break;
133        case 20: name = "CheckInvBufSize";
134            if (exec) CheckInvBufSize();
135            break;
136        case 21: name = "Bug 9283";
137            if (exec) Bug9283();
138            break;
139        case 22: name = "Bug10459";
140            if (exec) Bug10459();
141            break;
142        case 23: name = "TestCaseInsensitiveStarters";
143            if (exec) TestCaseInsensitiveStarters();
144            break;
145        case 24: name = "TestBug11049";
146            if (exec) TestBug11049();
147            break;
148        case 25: name = "TestBug11371";
149            if (exec) TestBug11371();
150            break;
151        case 26: name = "TestBug11480";
152            if (exec) TestBug11480();
153            break;
154        case 27: name = "NamedCapture";
155            if (exec) NamedCapture();
156            break;
157        case 28: name = "NamedCaptureLimits";
158            if (exec) NamedCaptureLimits();
159            break;
160        default: name = "";
161            break; //needed to end loop
162    }
163}
164
165
166
167/**
168 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
169 * into ASCII.
170 * @see utext_openUTF8
171 */
172static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
173
174//---------------------------------------------------------------------------
175//
176//   Error Checking / Reporting macros used in all of the tests.
177//
178//---------------------------------------------------------------------------
179
180static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
181  int64_t oldIndex = utext_getNativeIndex(text);
182  utext_setNativeIndex(text, 0);
183  char *bufPtr = buf;
184  UChar32 c = utext_next32From(text, 0);
185  while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
186    if (0x000020<=c && c<0x00007e) {
187      *bufPtr = c;
188    } else {
189#if 0
190      sprintf(bufPtr,"U+%04X", c);
191      bufPtr+= strlen(bufPtr)-1;
192#else
193      *bufPtr = '%';
194#endif
195    }
196    bufPtr++;
197    c = UTEXT_NEXT32(text);
198  }
199  *bufPtr = 0;
200#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
201  char *ebuf = (char*)malloc(bufLen);
202  uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
203  uprv_strncpy(buf, ebuf, bufLen);
204  free((void*)ebuf);
205#endif
206  utext_setNativeIndex(text, oldIndex);
207}
208
209
210static char ASSERT_BUF[1024];
211
212const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
213  if(message.length()==0) {
214    strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
215  } else {
216    UnicodeString buf;
217    IntlTest::prettify(message,buf);
218    if(buf.length()==0) {
219      strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
220    } else {
221      buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
222      if(ASSERT_BUF[0]==0) {
223        ASSERT_BUF[0]=0;
224        for(int32_t i=0;i<buf.length();i++) {
225          UChar ch = buf[i];
226          sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
227        }
228      }
229    }
230  }
231  ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
232  return ASSERT_BUF;
233}
234
// Logs the content of the UText `text` (rendered by utextToPrintable into a
// 200-byte local buffer) together with the call site's file and line.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the variable `status` in the calling scope holds a failure code, reports
// it through dataerrln() and RETURNS from the enclosing function. The bare
// `return;` means this is only usable inside functions returning void.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports a failure through errln() when `expr` compares equal to FALSE.
// Does not return; execution continues after the macro.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates `expr` with a fresh local `status` (which shadows any `status` in
// the calling scope, so `expr` must refer to `status` by name) and reports a
// failure when the resulting status is not exactly `errcode`.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also prints the caller-supplied `line`, prints
// the status numerically, and does NOT return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also prints the caller-supplied `line` and RETURNS
// from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Reports a failure when `actual` differs from `expected`.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
// Note: `expected` is expanded twice and `actual` is passed to
// extractToAssertBuf() for the message.
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
258
259
260static UBool testUTextEqual(UText *uta, UText *utb) {
261    UChar32 ca = 0;
262    UChar32 cb = 0;
263    utext_setNativeIndex(uta, 0);
264    utext_setNativeIndex(utb, 0);
265    do {
266        ca = utext_next32(uta);
267        cb = utext_next32(utb);
268        if (ca != cb) {
269            break;
270        }
271    } while (ca != U_SENTINEL);
272    return ca == cb;
273}
274
275
276/**
277 * @param expected expected text in UTF-8 (not platform) codepage
278 */
279void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
280    UErrorCode status = U_ZERO_ERROR;
281    UText expectedText = UTEXT_INITIALIZER;
282    utext_openUTF8(&expectedText, expected, -1, &status);
283    if(U_FAILURE(status)) {
284      errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
285      return;
286    }
287    if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
288      errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
289      return;
290    }
291    utext_setNativeIndex(actual, 0);
292    if (!testUTextEqual(&expectedText, actual)) {
293        char buf[201 /*21*/];
294        char expectedBuf[201];
295        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
296        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
297        errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
298    }
299    utext_close(&expectedText);
300}
301/**
302 * @param expected invariant (platform local text) input
303 */
304
305void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
306    UErrorCode status = U_ZERO_ERROR;
307    UText expectedText = UTEXT_INITIALIZER;
308    regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
309    if(U_FAILURE(status)) {
310      errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
311      return;
312    }
313    utext_setNativeIndex(actual, 0);
314    if (!testUTextEqual(&expectedText, actual)) {
315        char buf[201 /*21*/];
316        char expectedBuf[201];
317        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
318        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
319        errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
320    }
321    utext_close(&expectedText);
322}
323
/**
 * Checks the UText `actual` against `expected` by forwarding to assertUText()
 * with the call site's file and line.
 * Assumes utf-8 input
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Checks the UText `actual` against `expected` by forwarding to
 * assertUTextInvariant() with the call site's file and line.
 * Assumes Invariant input
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
332
333/**
334 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
335 * passed into utext_openUTF8. An error will be given if
336 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
337 */
338
339#define INV_BUFSIZ 2048 /* increase this if too small */
340
341static int64_t inv_next=0;
342
343#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
344static char inv_buf[INV_BUFSIZ];
345#endif
346
347static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
348  if(length==-1) length=strlen(inv);
349#if U_CHARSET_FAMILY==U_ASCII_FAMILY
350  inv_next+=length;
351  return utext_openUTF8(ut, inv, length, status);
352#else
353  if(inv_next+length+1>INV_BUFSIZ) {
354    fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
355            __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
356    *status = U_MEMORY_ALLOCATION_ERROR;
357    return NULL;
358  }
359
360  unsigned char *buf = (unsigned char*)inv_buf+inv_next;
361  uprv_aestrncpy(buf, (const uint8_t*)inv, length);
362  inv_next+=length;
363
364#if 0
365  fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
366#endif
367
368  return utext_openUTF8(ut, (const char*)buf, length, status);
369#endif
370}
371
372
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
//                       for the LookingAt() and  Match() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is passed through UnicodeString::unescape() before
//          matching, so it may contain \uXXXX style escapes.  The pattern is
//          compiled as written.
//
//          Each use runs the test twice: once through doRegexLMTest() and
//          once through doRegexLMTestUTF8().  Both are passed the __LINE__
//          of the macro invocation.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
388
389UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
390    const UnicodeString pattern(pat, -1, US_INV);
391    const UnicodeString inputText(text, -1, US_INV);
392    UErrorCode          status  = U_ZERO_ERROR;
393    UParseError         pe;
394    RegexPattern        *REPattern = NULL;
395    RegexMatcher        *REMatcher = NULL;
396    UBool               retVal     = TRUE;
397
398    UnicodeString patString(pat, -1, US_INV);
399    REPattern = RegexPattern::compile(patString, 0, pe, status);
400    if (U_FAILURE(status)) {
401        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
402            line, u_errorName(status));
403        return FALSE;
404    }
405    if (line==376) { REPattern->dumpPattern();}
406
407    UnicodeString inputString(inputText);
408    UnicodeString unEscapedInput = inputString.unescape();
409    REMatcher = REPattern->matcher(unEscapedInput, status);
410    if (U_FAILURE(status)) {
411        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
412            line, u_errorName(status));
413        return FALSE;
414    }
415
416    UBool actualmatch;
417    actualmatch = REMatcher->lookingAt(status);
418    if (U_FAILURE(status)) {
419        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
420            line, u_errorName(status));
421        retVal =  FALSE;
422    }
423    if (actualmatch != looking) {
424        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
425        retVal = FALSE;
426    }
427
428    status = U_ZERO_ERROR;
429    actualmatch = REMatcher->matches(status);
430    if (U_FAILURE(status)) {
431        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
432            line, u_errorName(status));
433        retVal = FALSE;
434    }
435    if (actualmatch != match) {
436        errln("RegexTest: wrong return from matches() at line %d.\n", line);
437        retVal = FALSE;
438    }
439
440    if (retVal == FALSE) {
441        REPattern->dumpPattern();
442    }
443
444    delete REPattern;
445    delete REMatcher;
446    return retVal;
447}
448
449
450UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
451    UText               pattern    = UTEXT_INITIALIZER;
452    int32_t             inputUTF8Length;
453    char                *textChars = NULL;
454    UText               inputText  = UTEXT_INITIALIZER;
455    UErrorCode          status     = U_ZERO_ERROR;
456    UParseError         pe;
457    RegexPattern        *REPattern = NULL;
458    RegexMatcher        *REMatcher = NULL;
459    UBool               retVal     = TRUE;
460
461    regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
462    REPattern = RegexPattern::compile(&pattern, 0, pe, status);
463    if (U_FAILURE(status)) {
464        dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
465            line, u_errorName(status));
466        return FALSE;
467    }
468
469    UnicodeString inputString(text, -1, US_INV);
470    UnicodeString unEscapedInput = inputString.unescape();
471    LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
472    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
473
474    inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
475    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
476        // UTF-8 does not allow unpaired surrogates, so this could actually happen
477        logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
478        return TRUE; // not a failure of the Regex engine
479    }
480    status = U_ZERO_ERROR; // buffer overflow
481    textChars = new char[inputUTF8Length+1];
482    unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
483    utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
484
485    REMatcher = &REPattern->matcher(status)->reset(&inputText);
486    if (U_FAILURE(status)) {
487        errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
488            line, u_errorName(status));
489        return FALSE;
490    }
491
492    UBool actualmatch;
493    actualmatch = REMatcher->lookingAt(status);
494    if (U_FAILURE(status)) {
495        errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
496            line, u_errorName(status));
497        retVal =  FALSE;
498    }
499    if (actualmatch != looking) {
500        errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
501        retVal = FALSE;
502    }
503
504    status = U_ZERO_ERROR;
505    actualmatch = REMatcher->matches(status);
506    if (U_FAILURE(status)) {
507        errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
508            line, u_errorName(status));
509        retVal = FALSE;
510    }
511    if (actualmatch != match) {
512        errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
513        retVal = FALSE;
514    }
515
516    if (retVal == FALSE) {
517        REPattern->dumpPattern();
518    }
519
520    delete REPattern;
521    delete REMatcher;
522    utext_close(&inputText);
523    utext_close(&pattern);
524    delete[] textChars;
525    return retVal;
526}
527
528
529
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing tests
//                       regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       The macro forwards to regex_err(), adding the __LINE__ of the
//       invocation.  Its expansion already ends with a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
540
541void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
542                          UErrorCode expectedStatus, int32_t line) {
543    UnicodeString       pattern(pat);
544
545    UErrorCode          status         = U_ZERO_ERROR;
546    UParseError         pe;
547    RegexPattern        *callerPattern = NULL;
548
549    //
550    //  Compile the caller's pattern
551    //
552    UnicodeString patString(pat);
553    callerPattern = RegexPattern::compile(patString, 0, pe, status);
554    if (status != expectedStatus) {
555        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
556    } else {
557        if (status != U_ZERO_ERROR) {
558            if (pe.line != errLine || pe.offset != errCol) {
559                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
560                    line, errLine, errCol, pe.line, pe.offset);
561            }
562        }
563    }
564
565    delete callerPattern;
566
567    //
568    //  Compile again, using a UTF-8-based UText
569    //
570    UText patternText = UTEXT_INITIALIZER;
571    regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
572    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
573    if (status != expectedStatus) {
574        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
575    } else {
576        if (status != U_ZERO_ERROR) {
577            if (pe.line != errLine || pe.offset != errCol) {
578                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
579                    line, errLine, errCol, pe.line, pe.offset);
580            }
581        }
582    }
583
584    delete callerPattern;
585    utext_close(&patternText);
586}
587
588
589
//---------------------------------------------------------------------------
//
//      Basic      Check for basic functionality of regex pattern matching.
//                 Avoid the use of REGEX_FIND test macro, which has
//                 substantial dependencies on basic Regex functionality.
//
//                 Every check below is a REGEX_TESTLM(pattern, input,
//                 expected lookingAt() result, expected matches() result).
//
//---------------------------------------------------------------------------
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
// Disabled scratch area: change "#if 0" to "#if 1" to run a single case in
// isolation.  Note that the block ends with exit(1).
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    // NOTE(review): the next line repeats the previous case exactly.
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
743
744
745//---------------------------------------------------------------------------
746//
747//    UTextBasic   Check for quirks that are specific to the UText
748//                 implementation.
749//
750//---------------------------------------------------------------------------
751void RegexTest::UTextBasic() {
752    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
753    UErrorCode status = U_ZERO_ERROR;
754    UText pattern = UTEXT_INITIALIZER;
755    utext_openUTF8(&pattern, str_abc, -1, &status);
756    RegexMatcher matcher(&pattern, 0, status);
757    REGEX_CHECK_STATUS;
758
759    UText input = UTEXT_INITIALIZER;
760    utext_openUTF8(&input, str_abc, -1, &status);
761    REGEX_CHECK_STATUS;
762    matcher.reset(&input);
763    REGEX_CHECK_STATUS;
764    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
765
766    matcher.reset(matcher.inputText());
767    REGEX_CHECK_STATUS;
768    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
769
770    utext_close(&pattern);
771    utext_close(&input);
772}
773
774
775//---------------------------------------------------------------------------
776//
777//      API_Match   Test that the API for class RegexMatcher
778//                  is present and nominally working, but excluding functions
779//                  implementing replace operations.
780//
781//---------------------------------------------------------------------------
782void RegexTest::API_Match() {
783    UParseError         pe;
784    UErrorCode          status=U_ZERO_ERROR;
785    int32_t             flags = 0;
786
787    //
788    // Debug - slide failing test cases early
789    //
790#if 0
791    {
792    }
793    return;
794#endif
795
796    //
797    // Simple pattern compilation
798    //
799    {
800        UnicodeString       re("abc");
801        RegexPattern        *pat2;
802        pat2 = RegexPattern::compile(re, flags, pe, status);
803        REGEX_CHECK_STATUS;
804
805        UnicodeString inStr1 = "abcdef this is a test";
806        UnicodeString instr2 = "not abc";
807        UnicodeString empty  = "";
808
809
810        //
811        // Matcher creation and reset.
812        //
813        RegexMatcher *m1 = pat2->matcher(inStr1, status);
814        REGEX_CHECK_STATUS;
815        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
816        REGEX_ASSERT(m1->input() == inStr1);
817        m1->reset(instr2);
818        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
819        REGEX_ASSERT(m1->input() == instr2);
820        m1->reset(inStr1);
821        REGEX_ASSERT(m1->input() == inStr1);
822        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
823        m1->reset(empty);
824        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
825        REGEX_ASSERT(m1->input() == empty);
826        REGEX_ASSERT(&m1->pattern() == pat2);
827
828        //
829        //  reset(pos, status)
830        //
831        m1->reset(inStr1);
832        m1->reset(4, status);
833        REGEX_CHECK_STATUS;
834        REGEX_ASSERT(m1->input() == inStr1);
835        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
836
837        m1->reset(-1, status);
838        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
839        status = U_ZERO_ERROR;
840
841        m1->reset(0, status);
842        REGEX_CHECK_STATUS;
843        status = U_ZERO_ERROR;
844
845        int32_t len = m1->input().length();
846        m1->reset(len-1, status);
847        REGEX_CHECK_STATUS;
848        status = U_ZERO_ERROR;
849
850        m1->reset(len, status);
851        REGEX_CHECK_STATUS;
852        status = U_ZERO_ERROR;
853
854        m1->reset(len+1, status);
855        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
856        status = U_ZERO_ERROR;
857
858        //
859        // match(pos, status)
860        //
861        m1->reset(instr2);
862        REGEX_ASSERT(m1->matches(4, status) == TRUE);
863        m1->reset();
864        REGEX_ASSERT(m1->matches(3, status) == FALSE);
865        m1->reset();
866        REGEX_ASSERT(m1->matches(5, status) == FALSE);
867        REGEX_ASSERT(m1->matches(4, status) == TRUE);
868        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
869        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
870
871        // Match() at end of string should fail, but should not
872        //  be an error.
873        status = U_ZERO_ERROR;
874        len = m1->input().length();
875        REGEX_ASSERT(m1->matches(len, status) == FALSE);
876        REGEX_CHECK_STATUS;
877
878        // Match beyond end of string should fail with an error.
879        status = U_ZERO_ERROR;
880        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
881        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
882
883        // Successful match at end of string.
884        {
885            status = U_ZERO_ERROR;
886            RegexMatcher m("A?", 0, status);  // will match zero length string.
887            REGEX_CHECK_STATUS;
888            m.reset(inStr1);
889            len = inStr1.length();
890            REGEX_ASSERT(m.matches(len, status) == TRUE);
891            REGEX_CHECK_STATUS;
892            m.reset(empty);
893            REGEX_ASSERT(m.matches(0, status) == TRUE);
894            REGEX_CHECK_STATUS;
895        }
896
897
898        //
899        // lookingAt(pos, status)
900        //
901        status = U_ZERO_ERROR;
902        m1->reset(instr2);  // "not abc"
903        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
904        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
905        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
906        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
907        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
908        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
909        status = U_ZERO_ERROR;
910        len = m1->input().length();
911        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
912        REGEX_CHECK_STATUS;
913        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
914        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
915
916        delete m1;
917        delete pat2;
918    }
919
920
921    //
922    // Capture Group.
923    //     RegexMatcher::start();
924    //     RegexMatcher::end();
925    //     RegexMatcher::groupCount();
926    //
927    {
928        int32_t             flags=0;
929        UParseError         pe;
930        UErrorCode          status=U_ZERO_ERROR;
931
932        UnicodeString       re("01(23(45)67)(.*)");
933        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
934        REGEX_CHECK_STATUS;
935        UnicodeString data = "0123456789";
936
937        RegexMatcher *matcher = pat->matcher(data, status);
938        REGEX_CHECK_STATUS;
939        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
940        static const int32_t matchStarts[] = {0,  2, 4, 8};
941        static const int32_t matchEnds[]   = {10, 8, 6, 10};
942        int32_t i;
943        for (i=0; i<4; i++) {
944            int32_t actualStart = matcher->start(i, status);
945            REGEX_CHECK_STATUS;
946            if (actualStart != matchStarts[i]) {
947                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
948                    __LINE__, i, matchStarts[i], actualStart);
949            }
950            int32_t actualEnd = matcher->end(i, status);
951            REGEX_CHECK_STATUS;
952            if (actualEnd != matchEnds[i]) {
953                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
954                    __LINE__, i, matchEnds[i], actualEnd);
955            }
956        }
957
958        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
959        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
960
961        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
962        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
963        matcher->reset();
964        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
965
966        matcher->lookingAt(status);
967        REGEX_ASSERT(matcher->group(status)    == "0123456789");
968        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
969        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
970        REGEX_ASSERT(matcher->group(2, status) == "45"        );
971        REGEX_ASSERT(matcher->group(3, status) == "89"        );
972        REGEX_CHECK_STATUS;
973        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
974        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
975        matcher->reset();
976        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
977
978        delete matcher;
979        delete pat;
980
981    }
982
983    //
984    //  find
985    //
986    {
987        int32_t             flags=0;
988        UParseError         pe;
989        UErrorCode          status=U_ZERO_ERROR;
990
991        UnicodeString       re("abc");
992        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
993        REGEX_CHECK_STATUS;
994        UnicodeString data = ".abc..abc...abc..";
995        //                    012345678901234567
996
997        RegexMatcher *matcher = pat->matcher(data, status);
998        REGEX_CHECK_STATUS;
999        REGEX_ASSERT(matcher->find());
1000        REGEX_ASSERT(matcher->start(status) == 1);
1001        REGEX_ASSERT(matcher->find());
1002        REGEX_ASSERT(matcher->start(status) == 6);
1003        REGEX_ASSERT(matcher->find());
1004        REGEX_ASSERT(matcher->start(status) == 12);
1005        REGEX_ASSERT(matcher->find() == FALSE);
1006        REGEX_ASSERT(matcher->find() == FALSE);
1007
1008        matcher->reset();
1009        REGEX_ASSERT(matcher->find());
1010        REGEX_ASSERT(matcher->start(status) == 1);
1011
1012        REGEX_ASSERT(matcher->find(0, status));
1013        REGEX_ASSERT(matcher->start(status) == 1);
1014        REGEX_ASSERT(matcher->find(1, status));
1015        REGEX_ASSERT(matcher->start(status) == 1);
1016        REGEX_ASSERT(matcher->find(2, status));
1017        REGEX_ASSERT(matcher->start(status) == 6);
1018        REGEX_ASSERT(matcher->find(12, status));
1019        REGEX_ASSERT(matcher->start(status) == 12);
1020        REGEX_ASSERT(matcher->find(13, status) == FALSE);
1021        REGEX_ASSERT(matcher->find(16, status) == FALSE);
1022        REGEX_ASSERT(matcher->find(17, status) == FALSE);
1023        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1024
1025        status = U_ZERO_ERROR;
1026        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1027        status = U_ZERO_ERROR;
1028        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1029
1030        REGEX_ASSERT(matcher->groupCount() == 0);
1031
1032        delete matcher;
1033        delete pat;
1034    }
1035
1036
1037    //
1038    //  find, with \G in pattern (true if at the end of a previous match).
1039    //
1040    {
1041        int32_t             flags=0;
1042        UParseError         pe;
1043        UErrorCode          status=U_ZERO_ERROR;
1044
1045        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1046        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1047        REGEX_CHECK_STATUS;
1048        UnicodeString data = ".abcabc.abc..";
1049        //                    012345678901234567
1050
1051        RegexMatcher *matcher = pat->matcher(data, status);
1052        REGEX_CHECK_STATUS;
1053        REGEX_ASSERT(matcher->find());
1054        REGEX_ASSERT(matcher->start(status) == 0);
1055        REGEX_ASSERT(matcher->start(1, status) == -1);
1056        REGEX_ASSERT(matcher->start(2, status) == 1);
1057
1058        REGEX_ASSERT(matcher->find());
1059        REGEX_ASSERT(matcher->start(status) == 4);
1060        REGEX_ASSERT(matcher->start(1, status) == 4);
1061        REGEX_ASSERT(matcher->start(2, status) == -1);
1062        REGEX_CHECK_STATUS;
1063
1064        delete matcher;
1065        delete pat;
1066    }
1067
1068    //
1069    //   find with zero length matches, match position should bump ahead
1070    //     to prevent loops.
1071    //
1072    {
1073        int32_t                 i;
1074        UErrorCode          status=U_ZERO_ERROR;
1075        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1076                                                      //   using an always-true look-ahead.
1077        REGEX_CHECK_STATUS;
1078        UnicodeString s("    ");
1079        m.reset(s);
1080        for (i=0; ; i++) {
1081            if (m.find() == FALSE) {
1082                break;
1083            }
1084            REGEX_ASSERT(m.start(status) == i);
1085            REGEX_ASSERT(m.end(status) == i);
1086        }
1087        REGEX_ASSERT(i==5);
1088
1089        // Check that the bump goes over surrogate pairs OK
1090        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1091        s = s.unescape();
1092        m.reset(s);
1093        for (i=0; ; i+=2) {
1094            if (m.find() == FALSE) {
1095                break;
1096            }
1097            REGEX_ASSERT(m.start(status) == i);
1098            REGEX_ASSERT(m.end(status) == i);
1099        }
1100        REGEX_ASSERT(i==10);
1101    }
1102    {
1103        // find() loop breaking test.
1104        //        with pattern of /.?/, should see a series of one char matches, then a single
1105        //        match of zero length at the end of the input string.
1106        int32_t                 i;
1107        UErrorCode          status=U_ZERO_ERROR;
1108        RegexMatcher        m(".?", 0, status);
1109        REGEX_CHECK_STATUS;
1110        UnicodeString s("    ");
1111        m.reset(s);
1112        for (i=0; ; i++) {
1113            if (m.find() == FALSE) {
1114                break;
1115            }
1116            REGEX_ASSERT(m.start(status) == i);
1117            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1118        }
1119        REGEX_ASSERT(i==5);
1120    }
1121
1122
1123    //
1124    // Matchers with no input string behave as if they had an empty input string.
1125    //
1126
1127    {
1128        UErrorCode status = U_ZERO_ERROR;
1129        RegexMatcher  m(".?", 0, status);
1130        REGEX_CHECK_STATUS;
1131        REGEX_ASSERT(m.find());
1132        REGEX_ASSERT(m.start(status) == 0);
1133        REGEX_ASSERT(m.input() == "");
1134    }
1135    {
1136        UErrorCode status = U_ZERO_ERROR;
1137        RegexPattern  *p = RegexPattern::compile(".", 0, status);
1138        RegexMatcher  *m = p->matcher(status);
1139        REGEX_CHECK_STATUS;
1140
1141        REGEX_ASSERT(m->find() == FALSE);
1142        REGEX_ASSERT(m->input() == "");
1143        delete m;
1144        delete p;
1145    }
1146
1147    //
1148    // Regions
1149    //
1150    {
1151        UErrorCode status = U_ZERO_ERROR;
1152        UnicodeString testString("This is test data");
1153        RegexMatcher m(".*", testString,  0, status);
1154        REGEX_CHECK_STATUS;
1155        REGEX_ASSERT(m.regionStart() == 0);
1156        REGEX_ASSERT(m.regionEnd() == testString.length());
1157        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1158        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1159
1160        m.region(2,4, status);
1161        REGEX_CHECK_STATUS;
1162        REGEX_ASSERT(m.matches(status));
1163        REGEX_ASSERT(m.start(status)==2);
1164        REGEX_ASSERT(m.end(status)==4);
1165        REGEX_CHECK_STATUS;
1166
1167        m.reset();
1168        REGEX_ASSERT(m.regionStart() == 0);
1169        REGEX_ASSERT(m.regionEnd() == testString.length());
1170
1171        UnicodeString shorterString("short");
1172        m.reset(shorterString);
1173        REGEX_ASSERT(m.regionStart() == 0);
1174        REGEX_ASSERT(m.regionEnd() == shorterString.length());
1175
1176        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1177        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1178        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1179        REGEX_ASSERT(&m == &m.reset());
1180        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1181
1182        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1183        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1184        REGEX_ASSERT(&m == &m.reset());
1185        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1186
1187        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1188        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1189        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1190        REGEX_ASSERT(&m == &m.reset());
1191        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1192
1193        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1194        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1195        REGEX_ASSERT(&m == &m.reset());
1196        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1197
1198    }
1199
1200    //
1201    // hitEnd() and requireEnd()
1202    //
1203    {
1204        UErrorCode status = U_ZERO_ERROR;
1205        UnicodeString testString("aabb");
1206        RegexMatcher m1(".*", testString,  0, status);
1207        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1208        REGEX_ASSERT(m1.hitEnd() == TRUE);
1209        REGEX_ASSERT(m1.requireEnd() == FALSE);
1210        REGEX_CHECK_STATUS;
1211
1212        status = U_ZERO_ERROR;
1213        RegexMatcher m2("a*", testString, 0, status);
1214        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1215        REGEX_ASSERT(m2.hitEnd() == FALSE);
1216        REGEX_ASSERT(m2.requireEnd() == FALSE);
1217        REGEX_CHECK_STATUS;
1218
1219        status = U_ZERO_ERROR;
1220        RegexMatcher m3(".*$", testString, 0, status);
1221        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1222        REGEX_ASSERT(m3.hitEnd() == TRUE);
1223        REGEX_ASSERT(m3.requireEnd() == TRUE);
1224        REGEX_CHECK_STATUS;
1225    }
1226
1227
1228    //
1229    // Compilation error on reset with UChar *
1230    //   These were a hazard that people were stumbling over with runtime errors.
1231    //   Changed them to compiler errors by adding private methods that more closely
1232    //   matched the incorrect use of the functions.
1233    //
1234#if 0
1235    {
1236        UErrorCode status = U_ZERO_ERROR;
1237        UChar ucharString[20];
1238        RegexMatcher m(".", 0, status);
1239        m.reset(ucharString);  // should not compile.
1240
1241        RegexPattern *p = RegexPattern::compile(".", 0, status);
1242        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1243
1244        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1245    }
1246#endif
1247
1248    //
1249    //  Time Outs.
1250    //       Note:  These tests will need to be changed when the regexp engine is
1251    //              able to detect and cut short the exponential time behavior on
1252    //              this type of match.
1253    //
1254    {
1255        UErrorCode status = U_ZERO_ERROR;
1256        //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1258        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1259        RegexMatcher matcher("(a+)+b", testString, 0, status);
1260        REGEX_CHECK_STATUS;
1261        REGEX_ASSERT(matcher.getTimeLimit() == 0);
1262        matcher.setTimeLimit(100, status);
1263        REGEX_ASSERT(matcher.getTimeLimit() == 100);
1264        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1265        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1266    }
1267    {
1268        UErrorCode status = U_ZERO_ERROR;
1269        //   Few enough 'a's to slip in under the time limit.
1270        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1271        RegexMatcher matcher("(a+)+b", testString, 0, status);
1272        REGEX_CHECK_STATUS;
1273        matcher.setTimeLimit(100, status);
1274        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1275        REGEX_CHECK_STATUS;
1276    }
1277
1278    //
1279    //  Stack Limits
1280    //
1281    {
1282        UErrorCode status = U_ZERO_ERROR;
1283        UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1284
1285        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1286        //   of the '+', and makes the stack frames larger.
1287        RegexMatcher matcher("(A)+A$", testString, 0, status);
1288
1289        // With the default stack, this match should fail to run
1290        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1291        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1292
1293        // With unlimited stack, it should run
1294        status = U_ZERO_ERROR;
1295        matcher.setStackLimit(0, status);
1296        REGEX_CHECK_STATUS;
1297        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1298        REGEX_CHECK_STATUS;
1299        REGEX_ASSERT(matcher.getStackLimit() == 0);
1300
        // With a limited stack, the match should fail
1302        status = U_ZERO_ERROR;
1303        matcher.setStackLimit(10000, status);
1304        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1305        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1306        REGEX_ASSERT(matcher.getStackLimit() == 10000);
1307    }
1308
1309        // A pattern that doesn't save state should work with
1310        //   a minimal sized stack
1311    {
1312        UErrorCode status = U_ZERO_ERROR;
1313        UnicodeString testString = "abc";
1314        RegexMatcher matcher("abc", testString, 0, status);
1315        REGEX_CHECK_STATUS;
1316        matcher.setStackLimit(30, status);
1317        REGEX_CHECK_STATUS;
1318        REGEX_ASSERT(matcher.matches(status) == TRUE);
1319        REGEX_CHECK_STATUS;
1320        REGEX_ASSERT(matcher.getStackLimit() == 30);
1321
1322        // Negative stack sizes should fail
1323        status = U_ZERO_ERROR;
1324        matcher.setStackLimit(1000, status);
1325        REGEX_CHECK_STATUS;
1326        matcher.setStackLimit(-1, status);
1327        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1328        REGEX_ASSERT(matcher.getStackLimit() == 1000);
1329    }
1330
1331
1332}
1333
1334
1335
1336
1337
1338
1339//---------------------------------------------------------------------------
1340//
1341//      API_Replace        API test for class RegexMatcher, testing the
1342//                         Replace family of functions.
1343//
1344//---------------------------------------------------------------------------
1345void RegexTest::API_Replace() {
1346    //
1347    //  Replace
1348    //
1349    int32_t             flags=0;
1350    UParseError         pe;
1351    UErrorCode          status=U_ZERO_ERROR;
1352
1353    UnicodeString       re("abc");
1354    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1355    REGEX_CHECK_STATUS;
1356    UnicodeString data = ".abc..abc...abc..";
1357    //                    012345678901234567
1358    RegexMatcher *matcher = pat->matcher(data, status);
1359
1360    //
1361    //  Plain vanilla matches.
1362    //
1363    UnicodeString  dest;
1364    dest = matcher->replaceFirst("yz", status);
1365    REGEX_CHECK_STATUS;
1366    REGEX_ASSERT(dest == ".yz..abc...abc..");
1367
1368    dest = matcher->replaceAll("yz", status);
1369    REGEX_CHECK_STATUS;
1370    REGEX_ASSERT(dest == ".yz..yz...yz..");
1371
1372    //
1373    //  Plain vanilla non-matches.
1374    //
1375    UnicodeString d2 = ".abx..abx...abx..";
1376    matcher->reset(d2);
1377    dest = matcher->replaceFirst("yz", status);
1378    REGEX_CHECK_STATUS;
1379    REGEX_ASSERT(dest == ".abx..abx...abx..");
1380
1381    dest = matcher->replaceAll("yz", status);
1382    REGEX_CHECK_STATUS;
1383    REGEX_ASSERT(dest == ".abx..abx...abx..");
1384
1385    //
1386    // Empty source string
1387    //
1388    UnicodeString d3 = "";
1389    matcher->reset(d3);
1390    dest = matcher->replaceFirst("yz", status);
1391    REGEX_CHECK_STATUS;
1392    REGEX_ASSERT(dest == "");
1393
1394    dest = matcher->replaceAll("yz", status);
1395    REGEX_CHECK_STATUS;
1396    REGEX_ASSERT(dest == "");
1397
1398    //
1399    // Empty substitution string
1400    //
1401    matcher->reset(data);              // ".abc..abc...abc.."
1402    dest = matcher->replaceFirst("", status);
1403    REGEX_CHECK_STATUS;
1404    REGEX_ASSERT(dest == "...abc...abc..");
1405
1406    dest = matcher->replaceAll("", status);
1407    REGEX_CHECK_STATUS;
1408    REGEX_ASSERT(dest == "........");
1409
1410    //
1411    // match whole string
1412    //
1413    UnicodeString d4 = "abc";
1414    matcher->reset(d4);
1415    dest = matcher->replaceFirst("xyz", status);
1416    REGEX_CHECK_STATUS;
1417    REGEX_ASSERT(dest == "xyz");
1418
1419    dest = matcher->replaceAll("xyz", status);
1420    REGEX_CHECK_STATUS;
1421    REGEX_ASSERT(dest == "xyz");
1422
1423    //
1424    // Capture Group, simple case
1425    //
1426    UnicodeString       re2("a(..)");
1427    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1428    REGEX_CHECK_STATUS;
1429    UnicodeString d5 = "abcdefg";
1430    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1431    REGEX_CHECK_STATUS;
1432    dest = matcher2->replaceFirst("$1$1", status);
1433    REGEX_CHECK_STATUS;
1434    REGEX_ASSERT(dest == "bcbcdefg");
1435
1436    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1437    REGEX_CHECK_STATUS;
1438    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1439
1440    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1441    REGEX_ASSERT(U_FAILURE(status));
1442    status = U_ZERO_ERROR;
1443
1444    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1445    replacement = replacement.unescape();
1446    dest = matcher2->replaceFirst(replacement, status);
1447    REGEX_CHECK_STATUS;
1448    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1449
1450    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1451
1452
1453    //
1454    // Replacement String with \u hex escapes
1455    //
1456    {
1457        UnicodeString  src = "abc 1 abc 2 abc 3";
1458        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1459        matcher->reset(src);
1460        UnicodeString  result = matcher->replaceAll(substitute, status);
1461        REGEX_CHECK_STATUS;
1462        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1463    }
1464    {
1465        UnicodeString  src = "abc !";
1466        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1467        matcher->reset(src);
1468        UnicodeString  result = matcher->replaceAll(substitute, status);
1469        REGEX_CHECK_STATUS;
1470        UnicodeString expected = UnicodeString("--");
1471        expected.append((UChar32)0x10000);
1472        expected.append("-- !");
1473        REGEX_ASSERT(result == expected);
1474    }
1475    // TODO:  need more through testing of capture substitutions.
1476
1477    // Bug 4057
1478    //
1479    {
1480        status = U_ZERO_ERROR;
1481        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1482        RegexMatcher m("ss(.*?)ee", 0, status);
1483        REGEX_CHECK_STATUS;
1484        UnicodeString result;
1485
1486        // Multiple finds do NOT bump up the previous appendReplacement postion.
1487        m.reset(s);
1488        m.find();
1489        m.find();
1490        m.appendReplacement(result, "ooh", status);
1491        REGEX_CHECK_STATUS;
1492        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1493
1494        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1495        status = U_ZERO_ERROR;
1496        result.truncate(0);
1497        m.reset(10, status);
1498        m.find();
1499        m.find();
1500        m.appendReplacement(result, "ooh", status);
1501        REGEX_CHECK_STATUS;
1502        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1503
1504        // find() at interior of string, appendReplacemnt still starts at beginning.
1505        status = U_ZERO_ERROR;
1506        result.truncate(0);
1507        m.reset();
1508        m.find(10, status);
1509        m.find();
1510        m.appendReplacement(result, "ooh", status);
1511        REGEX_CHECK_STATUS;
1512        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1513
1514        m.appendTail(result);
1515        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1516
1517    }
1518
1519    delete matcher2;
1520    delete pat2;
1521    delete matcher;
1522    delete pat;
1523}
1524
1525
1526//---------------------------------------------------------------------------
1527//
1528//      API_Pattern       Test that the API for class RegexPattern is
1529//                        present and nominally working.
1530//
1531//---------------------------------------------------------------------------
1532void RegexTest::API_Pattern() {
1533    RegexPattern        pata;    // Test default constructor to not crash.
1534    RegexPattern        patb;
1535
1536    REGEX_ASSERT(pata == patb);
1537    REGEX_ASSERT(pata == pata);
1538
1539    UnicodeString re1("abc[a-l][m-z]");
1540    UnicodeString re2("def");
1541    UErrorCode    status = U_ZERO_ERROR;
1542    UParseError   pe;
1543
1544    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1545    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1546    REGEX_CHECK_STATUS;
1547    REGEX_ASSERT(*pat1 == *pat1);
1548    REGEX_ASSERT(*pat1 != pata);
1549
1550    // Assign
1551    patb = *pat1;
1552    REGEX_ASSERT(patb == *pat1);
1553
1554    // Copy Construct
1555    RegexPattern patc(*pat1);
1556    REGEX_ASSERT(patc == *pat1);
1557    REGEX_ASSERT(patb == patc);
1558    REGEX_ASSERT(pat1 != pat2);
1559    patb = *pat2;
1560    REGEX_ASSERT(patb != patc);
1561    REGEX_ASSERT(patb == *pat2);
1562
1563    // Compile with no flags.
1564    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1565    REGEX_ASSERT(*pat1a == *pat1);
1566
1567    REGEX_ASSERT(pat1a->flags() == 0);
1568
1569    // Compile with different flags should be not equal
1570    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1571    REGEX_CHECK_STATUS;
1572
1573    REGEX_ASSERT(*pat1b != *pat1a);
1574    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1575    REGEX_ASSERT(pat1a->flags() == 0);
1576    delete pat1b;
1577
1578    // clone
1579    RegexPattern *pat1c = pat1->clone();
1580    REGEX_ASSERT(*pat1c == *pat1);
1581    REGEX_ASSERT(*pat1c != *pat2);
1582
1583    delete pat1c;
1584    delete pat1a;
1585    delete pat1;
1586    delete pat2;
1587
1588
1589    //
1590    //   Verify that a matcher created from a cloned pattern works.
1591    //     (Jitterbug 3423)
1592    //
1593    {
1594        UErrorCode     status     = U_ZERO_ERROR;
1595        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1596        RegexPattern  *pClone     = pSource->clone();
1597        delete         pSource;
1598        RegexMatcher  *mFromClone = pClone->matcher(status);
1599        REGEX_CHECK_STATUS;
1600        UnicodeString s = "Hello World";
1601        mFromClone->reset(s);
1602        REGEX_ASSERT(mFromClone->find() == TRUE);
1603        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1604        REGEX_ASSERT(mFromClone->find() == TRUE);
1605        REGEX_ASSERT(mFromClone->group(status) == "World");
1606        REGEX_ASSERT(mFromClone->find() == FALSE);
1607        delete mFromClone;
1608        delete pClone;
1609    }
1610
1611    //
1612    //   matches convenience API
1613    //
1614    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1615    REGEX_CHECK_STATUS;
1616    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1617    REGEX_CHECK_STATUS;
1618    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1619    REGEX_CHECK_STATUS;
1620    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1621    REGEX_CHECK_STATUS;
1622    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1623    REGEX_CHECK_STATUS;
1624    status = U_INDEX_OUTOFBOUNDS_ERROR;
1625    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1626    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1627
1628
1629    //
1630    // Split()
1631    //
1632    status = U_ZERO_ERROR;
1633    pat1 = RegexPattern::compile(" +",  pe, status);
1634    REGEX_CHECK_STATUS;
1635    UnicodeString  fields[10];
1636
1637    int32_t n;
1638    n = pat1->split("Now is the time", fields, 10, status);
1639    REGEX_CHECK_STATUS;
1640    REGEX_ASSERT(n==4);
1641    REGEX_ASSERT(fields[0]=="Now");
1642    REGEX_ASSERT(fields[1]=="is");
1643    REGEX_ASSERT(fields[2]=="the");
1644    REGEX_ASSERT(fields[3]=="time");
1645    REGEX_ASSERT(fields[4]=="");
1646
1647    n = pat1->split("Now is the time", fields, 2, status);
1648    REGEX_CHECK_STATUS;
1649    REGEX_ASSERT(n==2);
1650    REGEX_ASSERT(fields[0]=="Now");
1651    REGEX_ASSERT(fields[1]=="is the time");
1652    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1653
1654    fields[1] = "*";
1655    status = U_ZERO_ERROR;
1656    n = pat1->split("Now is the time", fields, 1, status);
1657    REGEX_CHECK_STATUS;
1658    REGEX_ASSERT(n==1);
1659    REGEX_ASSERT(fields[0]=="Now is the time");
1660    REGEX_ASSERT(fields[1]=="*");
1661    status = U_ZERO_ERROR;
1662
1663    n = pat1->split("    Now       is the time   ", fields, 10, status);
1664    REGEX_CHECK_STATUS;
1665    REGEX_ASSERT(n==6);
1666    REGEX_ASSERT(fields[0]=="");
1667    REGEX_ASSERT(fields[1]=="Now");
1668    REGEX_ASSERT(fields[2]=="is");
1669    REGEX_ASSERT(fields[3]=="the");
1670    REGEX_ASSERT(fields[4]=="time");
1671    REGEX_ASSERT(fields[5]=="");
1672
1673    n = pat1->split("     ", fields, 10, status);
1674    REGEX_CHECK_STATUS;
1675    REGEX_ASSERT(n==2);
1676    REGEX_ASSERT(fields[0]=="");
1677    REGEX_ASSERT(fields[1]=="");
1678
1679    fields[0] = "foo";
1680    n = pat1->split("", fields, 10, status);
1681    REGEX_CHECK_STATUS;
1682    REGEX_ASSERT(n==0);
1683    REGEX_ASSERT(fields[0]=="foo");
1684
1685    delete pat1;
1686
1687    //  split, with a pattern with (capture)
1688    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1689    REGEX_CHECK_STATUS;
1690
1691    status = U_ZERO_ERROR;
1692    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1693    REGEX_CHECK_STATUS;
1694    REGEX_ASSERT(n==7);
1695    REGEX_ASSERT(fields[0]=="");
1696    REGEX_ASSERT(fields[1]=="a");
1697    REGEX_ASSERT(fields[2]=="Now is ");
1698    REGEX_ASSERT(fields[3]=="b");
1699    REGEX_ASSERT(fields[4]=="the time");
1700    REGEX_ASSERT(fields[5]=="c");
1701    REGEX_ASSERT(fields[6]=="");
1702    REGEX_ASSERT(status==U_ZERO_ERROR);
1703
1704    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1705    REGEX_CHECK_STATUS;
1706    REGEX_ASSERT(n==7);
1707    REGEX_ASSERT(fields[0]=="  ");
1708    REGEX_ASSERT(fields[1]=="a");
1709    REGEX_ASSERT(fields[2]=="Now is ");
1710    REGEX_ASSERT(fields[3]=="b");
1711    REGEX_ASSERT(fields[4]=="the time");
1712    REGEX_ASSERT(fields[5]=="c");
1713    REGEX_ASSERT(fields[6]=="");
1714
1715    status = U_ZERO_ERROR;
1716    fields[6] = "foo";
1717    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1718    REGEX_CHECK_STATUS;
1719    REGEX_ASSERT(n==6);
1720    REGEX_ASSERT(fields[0]=="  ");
1721    REGEX_ASSERT(fields[1]=="a");
1722    REGEX_ASSERT(fields[2]=="Now is ");
1723    REGEX_ASSERT(fields[3]=="b");
1724    REGEX_ASSERT(fields[4]=="the time");
1725    REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1726    REGEX_ASSERT(fields[6]=="foo");
1727
1728    status = U_ZERO_ERROR;
1729    fields[5] = "foo";
1730    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1731    REGEX_CHECK_STATUS;
1732    REGEX_ASSERT(n==5);
1733    REGEX_ASSERT(fields[0]=="  ");
1734    REGEX_ASSERT(fields[1]=="a");
1735    REGEX_ASSERT(fields[2]=="Now is ");
1736    REGEX_ASSERT(fields[3]=="b");
1737    REGEX_ASSERT(fields[4]=="the time<c>");
1738    REGEX_ASSERT(fields[5]=="foo");
1739
1740    status = U_ZERO_ERROR;
1741    fields[5] = "foo";
1742    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1743    REGEX_CHECK_STATUS;
1744    REGEX_ASSERT(n==5);
1745    REGEX_ASSERT(fields[0]=="  ");
1746    REGEX_ASSERT(fields[1]=="a");
1747    REGEX_ASSERT(fields[2]=="Now is ");
1748    REGEX_ASSERT(fields[3]=="b");
1749    REGEX_ASSERT(fields[4]=="the time");
1750    REGEX_ASSERT(fields[5]=="foo");
1751
1752    status = U_ZERO_ERROR;
1753    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1754    REGEX_CHECK_STATUS;
1755    REGEX_ASSERT(n==4);
1756    REGEX_ASSERT(fields[0]=="  ");
1757    REGEX_ASSERT(fields[1]=="a");
1758    REGEX_ASSERT(fields[2]=="Now is ");
1759    REGEX_ASSERT(fields[3]=="the time<c>");
1760    status = U_ZERO_ERROR;
1761    delete pat1;
1762
1763    pat1 = RegexPattern::compile("([-,])",  pe, status);
1764    REGEX_CHECK_STATUS;
1765    n = pat1->split("1-10,20", fields, 10, status);
1766    REGEX_CHECK_STATUS;
1767    REGEX_ASSERT(n==5);
1768    REGEX_ASSERT(fields[0]=="1");
1769    REGEX_ASSERT(fields[1]=="-");
1770    REGEX_ASSERT(fields[2]=="10");
1771    REGEX_ASSERT(fields[3]==",");
1772    REGEX_ASSERT(fields[4]=="20");
1773    delete pat1;
1774
1775    // Test split of string with empty trailing fields
1776    pat1 = RegexPattern::compile(",", pe, status);
1777    REGEX_CHECK_STATUS;
1778    n = pat1->split("a,b,c,", fields, 10, status);
1779    REGEX_CHECK_STATUS;
1780    REGEX_ASSERT(n==4);
1781    REGEX_ASSERT(fields[0]=="a");
1782    REGEX_ASSERT(fields[1]=="b");
1783    REGEX_ASSERT(fields[2]=="c");
1784    REGEX_ASSERT(fields[3]=="");
1785
1786    n = pat1->split("a,,,", fields, 10, status);
1787    REGEX_CHECK_STATUS;
1788    REGEX_ASSERT(n==4);
1789    REGEX_ASSERT(fields[0]=="a");
1790    REGEX_ASSERT(fields[1]=="");
1791    REGEX_ASSERT(fields[2]=="");
1792    REGEX_ASSERT(fields[3]=="");
1793    delete pat1;
1794
1795    // Split Separator with zero length match.
1796    pat1 = RegexPattern::compile(":?", pe, status);
1797    REGEX_CHECK_STATUS;
1798    n = pat1->split("abc", fields, 10, status);
1799    REGEX_CHECK_STATUS;
1800    REGEX_ASSERT(n==5);
1801    REGEX_ASSERT(fields[0]=="");
1802    REGEX_ASSERT(fields[1]=="a");
1803    REGEX_ASSERT(fields[2]=="b");
1804    REGEX_ASSERT(fields[3]=="c");
1805    REGEX_ASSERT(fields[4]=="");
1806
1807    delete pat1;
1808
1809    //
1810    // RegexPattern::pattern()
1811    //
1812    pat1 = new RegexPattern();
1813    REGEX_ASSERT(pat1->pattern() == "");
1814    delete pat1;
1815
1816    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1817    REGEX_CHECK_STATUS;
1818    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1819    delete pat1;
1820
1821
1822    //
1823    // classID functions
1824    //
1825    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1826    REGEX_CHECK_STATUS;
1827    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1828    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1829    UnicodeString Hello("Hello, world.");
1830    RegexMatcher *m = pat1->matcher(Hello, status);
1831    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1832    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1833    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1834    delete m;
1835    delete pat1;
1836
1837}
1838
1839//---------------------------------------------------------------------------
1840//
1841//      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1842//                       is present and working, but excluding functions
1843//                       implementing replace operations.
1844//
1845//---------------------------------------------------------------------------
1846void RegexTest::API_Match_UTF8() {
1847    UParseError         pe;
1848    UErrorCode          status=U_ZERO_ERROR;
1849    int32_t             flags = 0;
1850
1851    //
1852    // Debug - slide failing test cases early
1853    //
1854#if 0
1855    {
1856    }
1857    return;
1858#endif
1859
1860    //
1861    // Simple pattern compilation
1862    //
1863    {
1864        UText               re = UTEXT_INITIALIZER;
1865        regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1866        REGEX_VERBOSE_TEXT(&re);
1867        RegexPattern        *pat2;
1868        pat2 = RegexPattern::compile(&re, flags, pe, status);
1869        REGEX_CHECK_STATUS;
1870
1871        UText input1 = UTEXT_INITIALIZER;
1872        UText input2 = UTEXT_INITIALIZER;
1873        UText empty  = UTEXT_INITIALIZER;
1874        regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1875        REGEX_VERBOSE_TEXT(&input1);
1876        regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1877        REGEX_VERBOSE_TEXT(&input2);
1878        utext_openUChars(&empty, NULL, 0, &status);
1879
1880        int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1881        int32_t input2Len = strlen("not abc");
1882
1883
1884        //
1885        // Matcher creation and reset.
1886        //
1887        RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1888        REGEX_CHECK_STATUS;
1889        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1890        const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1891        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1892        m1->reset(&input2);
1893        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1894        const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1895        REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1896        m1->reset(&input1);
1897        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1898        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1899        m1->reset(&empty);
1900        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1901        REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1902
1903        //
1904        //  reset(pos, status)
1905        //
1906        m1->reset(&input1);
1907        m1->reset(4, status);
1908        REGEX_CHECK_STATUS;
1909        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1910        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1911
1912        m1->reset(-1, status);
1913        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1914        status = U_ZERO_ERROR;
1915
1916        m1->reset(0, status);
1917        REGEX_CHECK_STATUS;
1918        status = U_ZERO_ERROR;
1919
1920        m1->reset(input1Len-1, status);
1921        REGEX_CHECK_STATUS;
1922        status = U_ZERO_ERROR;
1923
1924        m1->reset(input1Len, status);
1925        REGEX_CHECK_STATUS;
1926        status = U_ZERO_ERROR;
1927
1928        m1->reset(input1Len+1, status);
1929        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1930        status = U_ZERO_ERROR;
1931
1932        //
1933        // match(pos, status)
1934        //
1935        m1->reset(&input2);
1936        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1937        m1->reset();
1938        REGEX_ASSERT(m1->matches(3, status) == FALSE);
1939        m1->reset();
1940        REGEX_ASSERT(m1->matches(5, status) == FALSE);
1941        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1942        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1943        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1944
1945        // Match() at end of string should fail, but should not
1946        //  be an error.
1947        status = U_ZERO_ERROR;
1948        REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1949        REGEX_CHECK_STATUS;
1950
1951        // Match beyond end of string should fail with an error.
1952        status = U_ZERO_ERROR;
1953        REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1954        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1955
1956        // Successful match at end of string.
1957        {
1958            status = U_ZERO_ERROR;
1959            RegexMatcher m("A?", 0, status);  // will match zero length string.
1960            REGEX_CHECK_STATUS;
1961            m.reset(&input1);
1962            REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1963            REGEX_CHECK_STATUS;
1964            m.reset(&empty);
1965            REGEX_ASSERT(m.matches(0, status) == TRUE);
1966            REGEX_CHECK_STATUS;
1967        }
1968
1969
1970        //
1971        // lookingAt(pos, status)
1972        //
1973        status = U_ZERO_ERROR;
1974        m1->reset(&input2);  // "not abc"
1975        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1976        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1977        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1978        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1979        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1980        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1981        status = U_ZERO_ERROR;
1982        REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1983        REGEX_CHECK_STATUS;
1984        REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1985        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1986
1987        delete m1;
1988        delete pat2;
1989
1990        utext_close(&re);
1991        utext_close(&input1);
1992        utext_close(&input2);
1993        utext_close(&empty);
1994    }
1995
1996
1997    //
1998    // Capture Group.
1999    //     RegexMatcher::start();
2000    //     RegexMatcher::end();
2001    //     RegexMatcher::groupCount();
2002    //
2003    {
2004        int32_t             flags=0;
2005        UParseError         pe;
2006        UErrorCode          status=U_ZERO_ERROR;
2007        UText               re=UTEXT_INITIALIZER;
2008        const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2009        utext_openUTF8(&re, str_01234567_pat, -1, &status);
2010
2011        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2012        REGEX_CHECK_STATUS;
2013
2014        UText input = UTEXT_INITIALIZER;
2015        const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2016        utext_openUTF8(&input, str_0123456789, -1, &status);
2017
2018        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2019        REGEX_CHECK_STATUS;
2020        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2021        static const int32_t matchStarts[] = {0,  2, 4, 8};
2022        static const int32_t matchEnds[]   = {10, 8, 6, 10};
2023        int32_t i;
2024        for (i=0; i<4; i++) {
2025            int32_t actualStart = matcher->start(i, status);
2026            REGEX_CHECK_STATUS;
2027            if (actualStart != matchStarts[i]) {
2028                errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2029                      __FILE__, __LINE__, i, matchStarts[i], actualStart);
2030            }
2031            int32_t actualEnd = matcher->end(i, status);
2032            REGEX_CHECK_STATUS;
2033            if (actualEnd != matchEnds[i]) {
2034                errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2035                      __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2036            }
2037        }
2038
2039        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2040        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2041
2042        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2043        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2044        matcher->reset();
2045        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2046
2047        matcher->lookingAt(status);
2048
2049        UnicodeString dest;
2050        UText destText = UTEXT_INITIALIZER;
2051        utext_openUnicodeString(&destText, &dest, &status);
2052        UText *result;
2053        //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2054        //  Test shallow-clone API
2055        int64_t   group_len;
2056        result = matcher->group((UText *)NULL, group_len, status);
2057        REGEX_CHECK_STATUS;
2058        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2059        utext_close(result);
2060        result = matcher->group(0, &destText, group_len, status);
2061        REGEX_CHECK_STATUS;
2062        REGEX_ASSERT(result == &destText);
2063        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2064        //  destText is now immutable, reopen it
2065        utext_close(&destText);
2066        utext_openUnicodeString(&destText, &dest, &status);
2067
2068        int64_t length;
2069        result = matcher->group(0, NULL, length, status);
2070        REGEX_CHECK_STATUS;
2071        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2072        utext_close(result);
2073        result = matcher->group(0, &destText, length, status);
2074        REGEX_CHECK_STATUS;
2075        REGEX_ASSERT(result == &destText);
2076        REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2077        REGEX_ASSERT(length == 10);
2078        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2079
2080        // Capture Group 1 == "234567"
2081        result = matcher->group(1, NULL, length, status);
2082        REGEX_CHECK_STATUS;
2083        REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2084        REGEX_ASSERT(length == 6);
2085        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2086        utext_close(result);
2087
2088        result = matcher->group(1, &destText, length, status);
2089        REGEX_CHECK_STATUS;
2090        REGEX_ASSERT(result == &destText);
2091        REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2092        REGEX_ASSERT(length == 6);
2093        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2094        utext_close(result);
2095
2096        // Capture Group 2 == "45"
2097        result = matcher->group(2, NULL, length, status);
2098        REGEX_CHECK_STATUS;
2099        REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2100        REGEX_ASSERT(length == 2);
2101        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2102        utext_close(result);
2103
2104        result = matcher->group(2, &destText, length, status);
2105        REGEX_CHECK_STATUS;
2106        REGEX_ASSERT(result == &destText);
2107        REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2108        REGEX_ASSERT(length == 2);
2109        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2110        utext_close(result);
2111
2112        // Capture Group 3 == "89"
2113        result = matcher->group(3, NULL, length, status);
2114        REGEX_CHECK_STATUS;
2115        REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2116        REGEX_ASSERT(length == 2);
2117        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2118        utext_close(result);
2119
2120        result = matcher->group(3, &destText, length, status);
2121        REGEX_CHECK_STATUS;
2122        REGEX_ASSERT(result == &destText);
2123        REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2124        REGEX_ASSERT(length == 2);
2125        REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2126        utext_close(result);
2127
2128        // Capture Group number out of range.
2129        status = U_ZERO_ERROR;
2130        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2131        status = U_ZERO_ERROR;
2132        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2133        status = U_ZERO_ERROR;
2134        matcher->reset();
2135        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2136
2137        delete matcher;
2138        delete pat;
2139
2140        utext_close(&destText);
2141        utext_close(&input);
2142        utext_close(&re);
2143    }
2144
2145    //
2146    //  find
2147    //
2148    {
2149        int32_t             flags=0;
2150        UParseError         pe;
2151        UErrorCode          status=U_ZERO_ERROR;
2152        UText               re=UTEXT_INITIALIZER;
2153        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2154        utext_openUTF8(&re, str_abc, -1, &status);
2155
2156        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2157        REGEX_CHECK_STATUS;
2158        UText input = UTEXT_INITIALIZER;
2159        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2160        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2161        //                      012345678901234567
2162
2163        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2164        REGEX_CHECK_STATUS;
2165        REGEX_ASSERT(matcher->find());
2166        REGEX_ASSERT(matcher->start(status) == 1);
2167        REGEX_ASSERT(matcher->find());
2168        REGEX_ASSERT(matcher->start(status) == 6);
2169        REGEX_ASSERT(matcher->find());
2170        REGEX_ASSERT(matcher->start(status) == 12);
2171        REGEX_ASSERT(matcher->find() == FALSE);
2172        REGEX_ASSERT(matcher->find() == FALSE);
2173
2174        matcher->reset();
2175        REGEX_ASSERT(matcher->find());
2176        REGEX_ASSERT(matcher->start(status) == 1);
2177
2178        REGEX_ASSERT(matcher->find(0, status));
2179        REGEX_ASSERT(matcher->start(status) == 1);
2180        REGEX_ASSERT(matcher->find(1, status));
2181        REGEX_ASSERT(matcher->start(status) == 1);
2182        REGEX_ASSERT(matcher->find(2, status));
2183        REGEX_ASSERT(matcher->start(status) == 6);
2184        REGEX_ASSERT(matcher->find(12, status));
2185        REGEX_ASSERT(matcher->start(status) == 12);
2186        REGEX_ASSERT(matcher->find(13, status) == FALSE);
2187        REGEX_ASSERT(matcher->find(16, status) == FALSE);
2188        REGEX_ASSERT(matcher->find(17, status) == FALSE);
2189        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2190
2191        status = U_ZERO_ERROR;
2192        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2193        status = U_ZERO_ERROR;
2194        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2195
2196        REGEX_ASSERT(matcher->groupCount() == 0);
2197
2198        delete matcher;
2199        delete pat;
2200
2201        utext_close(&input);
2202        utext_close(&re);
2203    }
2204
2205
2206    //
2207    //  find, with \G in pattern (true if at the end of a previous match).
2208    //
2209    {
2210        int32_t             flags=0;
2211        UParseError         pe;
2212        UErrorCode          status=U_ZERO_ERROR;
2213        UText               re=UTEXT_INITIALIZER;
2214        const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2215        utext_openUTF8(&re, str_Gabcabc, -1, &status);
2216
2217        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2218
2219        REGEX_CHECK_STATUS;
2220        UText input = UTEXT_INITIALIZER;
2221        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2222        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2223        //                      012345678901234567
2224
2225        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2226        REGEX_CHECK_STATUS;
2227        REGEX_ASSERT(matcher->find());
2228        REGEX_ASSERT(matcher->start(status) == 0);
2229        REGEX_ASSERT(matcher->start(1, status) == -1);
2230        REGEX_ASSERT(matcher->start(2, status) == 1);
2231
2232        REGEX_ASSERT(matcher->find());
2233        REGEX_ASSERT(matcher->start(status) == 4);
2234        REGEX_ASSERT(matcher->start(1, status) == 4);
2235        REGEX_ASSERT(matcher->start(2, status) == -1);
2236        REGEX_CHECK_STATUS;
2237
2238        delete matcher;
2239        delete pat;
2240
2241        utext_close(&input);
2242        utext_close(&re);
2243    }
2244
2245    //
2246    //   find with zero length matches, match position should bump ahead
2247    //     to prevent loops.
2248    //
2249    {
2250        int32_t                 i;
2251        UErrorCode          status=U_ZERO_ERROR;
2252        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2253                                                      //   using an always-true look-ahead.
2254        REGEX_CHECK_STATUS;
2255        UText s = UTEXT_INITIALIZER;
2256        utext_openUTF8(&s, "    ", -1, &status);
2257        m.reset(&s);
2258        for (i=0; ; i++) {
2259            if (m.find() == FALSE) {
2260                break;
2261            }
2262            REGEX_ASSERT(m.start(status) == i);
2263            REGEX_ASSERT(m.end(status) == i);
2264        }
2265        REGEX_ASSERT(i==5);
2266
2267        // Check that the bump goes over characters outside the BMP OK
2268        // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2269        unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2270        utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2271        m.reset(&s);
2272        for (i=0; ; i+=4) {
2273            if (m.find() == FALSE) {
2274                break;
2275            }
2276            REGEX_ASSERT(m.start(status) == i);
2277            REGEX_ASSERT(m.end(status) == i);
2278        }
2279        REGEX_ASSERT(i==20);
2280
2281        utext_close(&s);
2282    }
2283    {
2284        // find() loop breaking test.
2285        //        with pattern of /.?/, should see a series of one char matches, then a single
2286        //        match of zero length at the end of the input string.
2287        int32_t                 i;
2288        UErrorCode          status=U_ZERO_ERROR;
2289        RegexMatcher        m(".?", 0, status);
2290        REGEX_CHECK_STATUS;
2291        UText s = UTEXT_INITIALIZER;
2292        utext_openUTF8(&s, "    ", -1, &status);
2293        m.reset(&s);
2294        for (i=0; ; i++) {
2295            if (m.find() == FALSE) {
2296                break;
2297            }
2298            REGEX_ASSERT(m.start(status) == i);
2299            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2300        }
2301        REGEX_ASSERT(i==5);
2302
2303        utext_close(&s);
2304    }
2305
2306
2307    //
2308    // Matchers with no input string behave as if they had an empty input string.
2309    //
2310
2311    {
2312        UErrorCode status = U_ZERO_ERROR;
2313        RegexMatcher  m(".?", 0, status);
2314        REGEX_CHECK_STATUS;
2315        REGEX_ASSERT(m.find());
2316        REGEX_ASSERT(m.start(status) == 0);
2317        REGEX_ASSERT(m.input() == "");
2318    }
2319    {
2320        UErrorCode status = U_ZERO_ERROR;
2321        RegexPattern  *p = RegexPattern::compile(".", 0, status);
2322        RegexMatcher  *m = p->matcher(status);
2323        REGEX_CHECK_STATUS;
2324
2325        REGEX_ASSERT(m->find() == FALSE);
2326        REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2327        delete m;
2328        delete p;
2329    }
2330
2331    //
2332    // Regions
2333    //
2334    {
2335        UErrorCode status = U_ZERO_ERROR;
2336        UText testPattern = UTEXT_INITIALIZER;
2337        UText testText    = UTEXT_INITIALIZER;
2338        regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2339        REGEX_VERBOSE_TEXT(&testPattern);
2340        regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2341        REGEX_VERBOSE_TEXT(&testText);
2342
2343        RegexMatcher m(&testPattern, &testText, 0, status);
2344        REGEX_CHECK_STATUS;
2345        REGEX_ASSERT(m.regionStart() == 0);
2346        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2347        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2348        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2349
2350        m.region(2,4, status);
2351        REGEX_CHECK_STATUS;
2352        REGEX_ASSERT(m.matches(status));
2353        REGEX_ASSERT(m.start(status)==2);
2354        REGEX_ASSERT(m.end(status)==4);
2355        REGEX_CHECK_STATUS;
2356
2357        m.reset();
2358        REGEX_ASSERT(m.regionStart() == 0);
2359        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2360
2361        regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2362        REGEX_VERBOSE_TEXT(&testText);
2363        m.reset(&testText);
2364        REGEX_ASSERT(m.regionStart() == 0);
2365        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2366
2367        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2368        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2369        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2370        REGEX_ASSERT(&m == &m.reset());
2371        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2372
2373        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2374        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2375        REGEX_ASSERT(&m == &m.reset());
2376        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2377
2378        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2379        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2380        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2381        REGEX_ASSERT(&m == &m.reset());
2382        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2383
2384        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2385        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2386        REGEX_ASSERT(&m == &m.reset());
2387        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2388
2389        utext_close(&testText);
2390        utext_close(&testPattern);
2391    }
2392
2393    //
2394    // hitEnd() and requireEnd()
2395    //
2396    {
2397        UErrorCode status = U_ZERO_ERROR;
2398        UText testPattern = UTEXT_INITIALIZER;
2399        UText testText    = UTEXT_INITIALIZER;
2400        const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2401        const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2402        utext_openUTF8(&testPattern, str_, -1, &status);
2403        utext_openUTF8(&testText, str_aabb, -1, &status);
2404
2405        RegexMatcher m1(&testPattern, &testText,  0, status);
2406        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2407        REGEX_ASSERT(m1.hitEnd() == TRUE);
2408        REGEX_ASSERT(m1.requireEnd() == FALSE);
2409        REGEX_CHECK_STATUS;
2410
2411        status = U_ZERO_ERROR;
2412        const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2413        utext_openUTF8(&testPattern, str_a, -1, &status);
2414        RegexMatcher m2(&testPattern, &testText, 0, status);
2415        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2416        REGEX_ASSERT(m2.hitEnd() == FALSE);
2417        REGEX_ASSERT(m2.requireEnd() == FALSE);
2418        REGEX_CHECK_STATUS;
2419
2420        status = U_ZERO_ERROR;
2421        const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2422        utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2423        RegexMatcher m3(&testPattern, &testText, 0, status);
2424        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2425        REGEX_ASSERT(m3.hitEnd() == TRUE);
2426        REGEX_ASSERT(m3.requireEnd() == TRUE);
2427        REGEX_CHECK_STATUS;
2428
2429        utext_close(&testText);
2430        utext_close(&testPattern);
2431    }
2432}
2433
2434
2435//---------------------------------------------------------------------------
2436//
2437//      API_Replace_UTF8   API test for class RegexMatcher, testing the
2438//                         Replace family of functions.
2439//
2440//---------------------------------------------------------------------------
2441void RegexTest::API_Replace_UTF8() {
2442    //
2443    //  Replace
2444    //
2445    int32_t             flags=0;
2446    UParseError         pe;
2447    UErrorCode          status=U_ZERO_ERROR;
2448
2449    UText               re=UTEXT_INITIALIZER;
2450    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2451    REGEX_VERBOSE_TEXT(&re);
2452    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2453    REGEX_CHECK_STATUS;
2454
2455    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2456    //             012345678901234567
2457    UText dataText = UTEXT_INITIALIZER;
2458    utext_openUTF8(&dataText, data, -1, &status);
2459    REGEX_CHECK_STATUS;
2460    REGEX_VERBOSE_TEXT(&dataText);
2461    RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2462
2463    //
2464    //  Plain vanilla matches.
2465    //
2466    UnicodeString  dest;
2467    UText destText = UTEXT_INITIALIZER;
2468    utext_openUnicodeString(&destText, &dest, &status);
2469    UText *result;
2470
2471    UText replText = UTEXT_INITIALIZER;
2472
2473    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2474    utext_openUTF8(&replText, str_yz, -1, &status);
2475    REGEX_VERBOSE_TEXT(&replText);
2476    result = matcher->replaceFirst(&replText, NULL, status);
2477    REGEX_CHECK_STATUS;
2478    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2479    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2480    utext_close(result);
2481    result = matcher->replaceFirst(&replText, &destText, status);
2482    REGEX_CHECK_STATUS;
2483    REGEX_ASSERT(result == &destText);
2484    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2485
2486    result = matcher->replaceAll(&replText, NULL, status);
2487    REGEX_CHECK_STATUS;
2488    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2489    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2490    utext_close(result);
2491
2492    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2493    result = matcher->replaceAll(&replText, &destText, status);
2494    REGEX_CHECK_STATUS;
2495    REGEX_ASSERT(result == &destText);
2496    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2497
2498    //
2499    //  Plain vanilla non-matches.
2500    //
2501    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2502    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2503    matcher->reset(&dataText);
2504
2505    result = matcher->replaceFirst(&replText, NULL, status);
2506    REGEX_CHECK_STATUS;
2507    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2508    utext_close(result);
2509    result = matcher->replaceFirst(&replText, &destText, status);
2510    REGEX_CHECK_STATUS;
2511    REGEX_ASSERT(result == &destText);
2512    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2513
2514    result = matcher->replaceAll(&replText, NULL, status);
2515    REGEX_CHECK_STATUS;
2516    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2517    utext_close(result);
2518    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2519    result = matcher->replaceAll(&replText, &destText, status);
2520    REGEX_CHECK_STATUS;
2521    REGEX_ASSERT(result == &destText);
2522    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2523
2524    //
2525    // Empty source string
2526    //
2527    utext_openUTF8(&dataText, NULL, 0, &status);
2528    matcher->reset(&dataText);
2529
2530    result = matcher->replaceFirst(&replText, NULL, status);
2531    REGEX_CHECK_STATUS;
2532    REGEX_ASSERT_UTEXT_UTF8("", result);
2533    utext_close(result);
2534    result = matcher->replaceFirst(&replText, &destText, status);
2535    REGEX_CHECK_STATUS;
2536    REGEX_ASSERT(result == &destText);
2537    REGEX_ASSERT_UTEXT_UTF8("", result);
2538
2539    result = matcher->replaceAll(&replText, NULL, status);
2540    REGEX_CHECK_STATUS;
2541    REGEX_ASSERT_UTEXT_UTF8("", result);
2542    utext_close(result);
2543    result = matcher->replaceAll(&replText, &destText, status);
2544    REGEX_CHECK_STATUS;
2545    REGEX_ASSERT(result == &destText);
2546    REGEX_ASSERT_UTEXT_UTF8("", result);
2547
2548    //
2549    // Empty substitution string
2550    //
2551    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2552    matcher->reset(&dataText);
2553
2554    utext_openUTF8(&replText, NULL, 0, &status);
2555    result = matcher->replaceFirst(&replText, NULL, status);
2556    REGEX_CHECK_STATUS;
2557    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2558    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2559    utext_close(result);
2560    result = matcher->replaceFirst(&replText, &destText, status);
2561    REGEX_CHECK_STATUS;
2562    REGEX_ASSERT(result == &destText);
2563    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2564
2565    result = matcher->replaceAll(&replText, NULL, status);
2566    REGEX_CHECK_STATUS;
2567    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2568    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2569    utext_close(result);
2570    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2571    result = matcher->replaceAll(&replText, &destText, status);
2572    REGEX_CHECK_STATUS;
2573    REGEX_ASSERT(result == &destText);
2574    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2575
2576    //
2577    // match whole string
2578    //
2579    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2580    utext_openUTF8(&dataText, str_abc, -1, &status);
2581    matcher->reset(&dataText);
2582
2583    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2584    utext_openUTF8(&replText, str_xyz, -1, &status);
2585    result = matcher->replaceFirst(&replText, NULL, status);
2586    REGEX_CHECK_STATUS;
2587    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2588    utext_close(result);
2589    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2590    result = matcher->replaceFirst(&replText, &destText, status);
2591    REGEX_CHECK_STATUS;
2592    REGEX_ASSERT(result == &destText);
2593    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2594
2595    result = matcher->replaceAll(&replText, NULL, status);
2596    REGEX_CHECK_STATUS;
2597    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2598    utext_close(result);
2599    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2600    result = matcher->replaceAll(&replText, &destText, status);
2601    REGEX_CHECK_STATUS;
2602    REGEX_ASSERT(result == &destText);
2603    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2604
2605    //
2606    // Capture Group, simple case
2607    //
2608    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2609    utext_openUTF8(&re, str_add, -1, &status);
2610    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2611    REGEX_CHECK_STATUS;
2612
2613    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2614    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2615    RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2616    REGEX_CHECK_STATUS;
2617
2618    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2619    utext_openUTF8(&replText, str_11, -1, &status);
2620    result = matcher2->replaceFirst(&replText, NULL, status);
2621    REGEX_CHECK_STATUS;
2622    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2623    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2624    utext_close(result);
2625    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2626    result = matcher2->replaceFirst(&replText, &destText, status);
2627    REGEX_CHECK_STATUS;
2628    REGEX_ASSERT(result == &destText);
2629    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2630
2631    const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2632    utext_openUTF8(&replText, str_v, -1, &status);
2633    REGEX_VERBOSE_TEXT(&replText);
2634    result = matcher2->replaceFirst(&replText, NULL, status);
2635    REGEX_CHECK_STATUS;
2636    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2637    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2638    utext_close(result);
2639    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2640    result = matcher2->replaceFirst(&replText, &destText, status);
2641    REGEX_CHECK_STATUS;
2642    REGEX_ASSERT(result == &destText);
2643    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2644
2645    const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2646               0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2647               0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2648    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2649    result = matcher2->replaceFirst(&replText, NULL, status);
2650    REGEX_CHECK_STATUS;
2651    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2652    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2653    utext_close(result);
2654    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2655    result = matcher2->replaceFirst(&replText, &destText, status);
2656    REGEX_CHECK_STATUS;
2657    REGEX_ASSERT(result == &destText);
2658    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2659
2660    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2661    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2662    //                                 012345678901234567890123456
2663    supplDigitChars[22] = 0xF0;
2664    supplDigitChars[23] = 0x9D;
2665    supplDigitChars[24] = 0x9F;
2666    supplDigitChars[25] = 0x8F;
2667    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2668
2669    result = matcher2->replaceFirst(&replText, NULL, status);
2670    REGEX_CHECK_STATUS;
2671    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2672    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2673    utext_close(result);
2674    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2675    result = matcher2->replaceFirst(&replText, &destText, status);
2676    REGEX_CHECK_STATUS;
2677    REGEX_ASSERT(result == &destText);
2678    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2679    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2680    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2681    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2682//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2683    utext_close(result);
2684    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2685    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2686    REGEX_ASSERT(result == &destText);
2687//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2688
2689    //
2690    // Replacement String with \u hex escapes
2691    //
2692    {
2693      const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2694      const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2695        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2696        utext_openUTF8(&replText, str_u0043, -1, &status);
2697        matcher->reset(&dataText);
2698
2699        result = matcher->replaceAll(&replText, NULL, status);
2700        REGEX_CHECK_STATUS;
2701        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2702        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2703        utext_close(result);
2704        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2705        result = matcher->replaceAll(&replText, &destText, status);
2706        REGEX_CHECK_STATUS;
2707        REGEX_ASSERT(result == &destText);
2708        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2709    }
2710    {
2711      const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2712        utext_openUTF8(&dataText, str_abc, -1, &status);
2713        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2714        utext_openUTF8(&replText, str_U00010000, -1, &status);
2715        matcher->reset(&dataText);
2716
2717        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2718        //                          0123456789
2719        expected[2] = 0xF0;
2720        expected[3] = 0x90;
2721        expected[4] = 0x80;
2722        expected[5] = 0x80;
2723
2724        result = matcher->replaceAll(&replText, NULL, status);
2725        REGEX_CHECK_STATUS;
2726        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2727        utext_close(result);
2728        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2729        result = matcher->replaceAll(&replText, &destText, status);
2730        REGEX_CHECK_STATUS;
2731        REGEX_ASSERT(result == &destText);
2732        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2733    }
    // TODO:  need more thorough testing of capture substitutions.
2735
2736    // Bug 4057
2737    //
2738    {
2739        status = U_ZERO_ERROR;
2740const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2741const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2742const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2743        utext_openUTF8(&re, str_ssee, -1, &status);
2744        utext_openUTF8(&dataText, str_blah, -1, &status);
2745        utext_openUTF8(&replText, str_ooh, -1, &status);
2746
2747        RegexMatcher m(&re, 0, status);
2748        REGEX_CHECK_STATUS;
2749
2750        UnicodeString result;
2751        UText resultText = UTEXT_INITIALIZER;
2752        utext_openUnicodeString(&resultText, &result, &status);
2753
        // Multiple finds do NOT bump up the previous appendReplacement position.
2755        m.reset(&dataText);
2756        m.find();
2757        m.find();
2758        m.appendReplacement(&resultText, &replText, status);
2759        REGEX_CHECK_STATUS;
2760        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2761        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2762
2763        // After a reset into the interior of a string, appendReplacement still starts at beginning.
2764        status = U_ZERO_ERROR;
2765        result.truncate(0);
2766        utext_openUnicodeString(&resultText, &result, &status);
2767        m.reset(10, status);
2768        m.find();
2769        m.find();
2770        m.appendReplacement(&resultText, &replText, status);
2771        REGEX_CHECK_STATUS;
2772        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2773        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2774
2775        // find() at interior of string, appendReplacement still starts at beginning.
2776        status = U_ZERO_ERROR;
2777        result.truncate(0);
2778        utext_openUnicodeString(&resultText, &result, &status);
2779        m.reset();
2780        m.find(10, status);
2781        m.find();
2782        m.appendReplacement(&resultText, &replText, status);
2783        REGEX_CHECK_STATUS;
2784        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2785        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2786
2787        m.appendTail(&resultText, status);
2788        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2789        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2790
2791        utext_close(&resultText);
2792    }
2793
2794    delete matcher2;
2795    delete pat2;
2796    delete matcher;
2797    delete pat;
2798
2799    utext_close(&dataText);
2800    utext_close(&replText);
2801    utext_close(&destText);
2802    utext_close(&re);
2803}
2804
2805
2806//---------------------------------------------------------------------------
2807//
2808//      API_Pattern_UTF8  Test that the API for class RegexPattern is
2809//                        present and nominally working.
2810//
2811//---------------------------------------------------------------------------
2812void RegexTest::API_Pattern_UTF8() {
2813    RegexPattern        pata;    // Test default constructor to not crash.
2814    RegexPattern        patb;
2815
2816    REGEX_ASSERT(pata == patb);
2817    REGEX_ASSERT(pata == pata);
2818
2819    UText         re1 = UTEXT_INITIALIZER;
2820    UText         re2 = UTEXT_INITIALIZER;
2821    UErrorCode    status = U_ZERO_ERROR;
2822    UParseError   pe;
2823
2824    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2825    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2826    utext_openUTF8(&re1, str_abcalmz, -1, &status);
2827    utext_openUTF8(&re2, str_def, -1, &status);
2828
2829    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2830    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2831    REGEX_CHECK_STATUS;
2832    REGEX_ASSERT(*pat1 == *pat1);
2833    REGEX_ASSERT(*pat1 != pata);
2834
2835    // Assign
2836    patb = *pat1;
2837    REGEX_ASSERT(patb == *pat1);
2838
2839    // Copy Construct
2840    RegexPattern patc(*pat1);
2841    REGEX_ASSERT(patc == *pat1);
2842    REGEX_ASSERT(patb == patc);
2843    REGEX_ASSERT(pat1 != pat2);
2844    patb = *pat2;
2845    REGEX_ASSERT(patb != patc);
2846    REGEX_ASSERT(patb == *pat2);
2847
2848    // Compile with no flags.
2849    RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2850    REGEX_ASSERT(*pat1a == *pat1);
2851
2852    REGEX_ASSERT(pat1a->flags() == 0);
2853
2854    // Compile with different flags should be not equal
2855    RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2856    REGEX_CHECK_STATUS;
2857
2858    REGEX_ASSERT(*pat1b != *pat1a);
2859    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2860    REGEX_ASSERT(pat1a->flags() == 0);
2861    delete pat1b;
2862
2863    // clone
2864    RegexPattern *pat1c = pat1->clone();
2865    REGEX_ASSERT(*pat1c == *pat1);
2866    REGEX_ASSERT(*pat1c != *pat2);
2867
2868    delete pat1c;
2869    delete pat1a;
2870    delete pat1;
2871    delete pat2;
2872
2873    utext_close(&re1);
2874    utext_close(&re2);
2875
2876
2877    //
2878    //   Verify that a matcher created from a cloned pattern works.
2879    //     (Jitterbug 3423)
2880    //
2881    {
2882        UErrorCode     status     = U_ZERO_ERROR;
2883        UText          pattern    = UTEXT_INITIALIZER;
2884        const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2885        utext_openUTF8(&pattern, str_pL, -1, &status);
2886
2887        RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2888        RegexPattern  *pClone     = pSource->clone();
2889        delete         pSource;
2890        RegexMatcher  *mFromClone = pClone->matcher(status);
2891        REGEX_CHECK_STATUS;
2892
2893        UText          input      = UTEXT_INITIALIZER;
2894        const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2895        utext_openUTF8(&input, str_HelloWorld, -1, &status);
2896        mFromClone->reset(&input);
2897        REGEX_ASSERT(mFromClone->find() == TRUE);
2898        REGEX_ASSERT(mFromClone->group(status) == "Hello");
2899        REGEX_ASSERT(mFromClone->find() == TRUE);
2900        REGEX_ASSERT(mFromClone->group(status) == "World");
2901        REGEX_ASSERT(mFromClone->find() == FALSE);
2902        delete mFromClone;
2903        delete pClone;
2904
2905        utext_close(&input);
2906        utext_close(&pattern);
2907    }
2908
2909    //
2910    //   matches convenience API
2911    //
2912    {
2913        UErrorCode status  = U_ZERO_ERROR;
2914        UText      pattern = UTEXT_INITIALIZER;
2915        UText      input   = UTEXT_INITIALIZER;
2916
2917        const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2918        utext_openUTF8(&input, str_randominput, -1, &status);
2919
2920        const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2921        utext_openUTF8(&pattern, str_dotstar, -1, &status);
2922        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2923        REGEX_CHECK_STATUS;
2924
2925        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2926        utext_openUTF8(&pattern, str_abc, -1, &status);
2927        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2928        REGEX_CHECK_STATUS;
2929
2930        const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2931        utext_openUTF8(&pattern, str_nput, -1, &status);
2932        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2933        REGEX_CHECK_STATUS;
2934
2935        utext_openUTF8(&pattern, str_randominput, -1, &status);
2936        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2937        REGEX_CHECK_STATUS;
2938
2939        const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2940        utext_openUTF8(&pattern, str_u, -1, &status);
2941        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2942        REGEX_CHECK_STATUS;
2943
2944        utext_openUTF8(&input, str_abc, -1, &status);
2945        utext_openUTF8(&pattern, str_abc, -1, &status);
2946        status = U_INDEX_OUTOFBOUNDS_ERROR;
2947        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2948        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2949
2950        utext_close(&input);
2951        utext_close(&pattern);
2952    }
2953
2954
2955    //
2956    // Split()
2957    //
2958    status = U_ZERO_ERROR;
2959    const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2960    utext_openUTF8(&re1, str_spaceplus, -1, &status);
2961    pat1 = RegexPattern::compile(&re1, pe, status);
2962    REGEX_CHECK_STATUS;
2963    UnicodeString  fields[10];
2964
2965    int32_t n;
2966    n = pat1->split("Now is the time", fields, 10, status);
2967    REGEX_CHECK_STATUS;
2968    REGEX_ASSERT(n==4);
2969    REGEX_ASSERT(fields[0]=="Now");
2970    REGEX_ASSERT(fields[1]=="is");
2971    REGEX_ASSERT(fields[2]=="the");
2972    REGEX_ASSERT(fields[3]=="time");
2973    REGEX_ASSERT(fields[4]=="");
2974
2975    n = pat1->split("Now is the time", fields, 2, status);
2976    REGEX_CHECK_STATUS;
2977    REGEX_ASSERT(n==2);
2978    REGEX_ASSERT(fields[0]=="Now");
2979    REGEX_ASSERT(fields[1]=="is the time");
2980    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2981
2982    fields[1] = "*";
2983    status = U_ZERO_ERROR;
2984    n = pat1->split("Now is the time", fields, 1, status);
2985    REGEX_CHECK_STATUS;
2986    REGEX_ASSERT(n==1);
2987    REGEX_ASSERT(fields[0]=="Now is the time");
2988    REGEX_ASSERT(fields[1]=="*");
2989    status = U_ZERO_ERROR;
2990
2991    n = pat1->split("    Now       is the time   ", fields, 10, status);
2992    REGEX_CHECK_STATUS;
2993    REGEX_ASSERT(n==6);
2994    REGEX_ASSERT(fields[0]=="");
2995    REGEX_ASSERT(fields[1]=="Now");
2996    REGEX_ASSERT(fields[2]=="is");
2997    REGEX_ASSERT(fields[3]=="the");
2998    REGEX_ASSERT(fields[4]=="time");
2999    REGEX_ASSERT(fields[5]=="");
3000    REGEX_ASSERT(fields[6]=="");
3001
3002    fields[2] = "*";
3003    n = pat1->split("     ", fields, 10, status);
3004    REGEX_CHECK_STATUS;
3005    REGEX_ASSERT(n==2);
3006    REGEX_ASSERT(fields[0]=="");
3007    REGEX_ASSERT(fields[1]=="");
3008    REGEX_ASSERT(fields[2]=="*");
3009
3010    fields[0] = "foo";
3011    n = pat1->split("", fields, 10, status);
3012    REGEX_CHECK_STATUS;
3013    REGEX_ASSERT(n==0);
3014    REGEX_ASSERT(fields[0]=="foo");
3015
3016    delete pat1;
3017
3018    //  split, with a pattern with (capture)
3019    regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3020    pat1 = RegexPattern::compile(&re1,  pe, status);
3021    REGEX_CHECK_STATUS;
3022
3023    status = U_ZERO_ERROR;
3024    fields[6] = fields[7] = "*";
3025    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3026    REGEX_CHECK_STATUS;
3027    REGEX_ASSERT(n==7);
3028    REGEX_ASSERT(fields[0]=="");
3029    REGEX_ASSERT(fields[1]=="a");
3030    REGEX_ASSERT(fields[2]=="Now is ");
3031    REGEX_ASSERT(fields[3]=="b");
3032    REGEX_ASSERT(fields[4]=="the time");
3033    REGEX_ASSERT(fields[5]=="c");
3034    REGEX_ASSERT(fields[6]=="");
3035    REGEX_ASSERT(fields[7]=="*");
3036    REGEX_ASSERT(status==U_ZERO_ERROR);
3037
3038    fields[6] = fields[7] = "*";
3039    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
3040    REGEX_CHECK_STATUS;
3041    REGEX_ASSERT(n==7);
3042    REGEX_ASSERT(fields[0]=="  ");
3043    REGEX_ASSERT(fields[1]=="a");
3044    REGEX_ASSERT(fields[2]=="Now is ");
3045    REGEX_ASSERT(fields[3]=="b");
3046    REGEX_ASSERT(fields[4]=="the time");
3047    REGEX_ASSERT(fields[5]=="c");
3048    REGEX_ASSERT(fields[6]=="");
3049    REGEX_ASSERT(fields[7]=="*");
3050
3051    status = U_ZERO_ERROR;
3052    fields[6] = "foo";
3053    n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3054    REGEX_CHECK_STATUS;
3055    REGEX_ASSERT(n==6);
3056    REGEX_ASSERT(fields[0]=="  ");
3057    REGEX_ASSERT(fields[1]=="a");
3058    REGEX_ASSERT(fields[2]=="Now is ");
3059    REGEX_ASSERT(fields[3]=="b");
3060    REGEX_ASSERT(fields[4]=="the time");
3061    REGEX_ASSERT(fields[5]==" ");
3062    REGEX_ASSERT(fields[6]=="foo");
3063
3064    status = U_ZERO_ERROR;
3065    fields[5] = "foo";
3066    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3067    REGEX_CHECK_STATUS;
3068    REGEX_ASSERT(n==5);
3069    REGEX_ASSERT(fields[0]=="  ");
3070    REGEX_ASSERT(fields[1]=="a");
3071    REGEX_ASSERT(fields[2]=="Now is ");
3072    REGEX_ASSERT(fields[3]=="b");
3073    REGEX_ASSERT(fields[4]=="the time<c>");
3074    REGEX_ASSERT(fields[5]=="foo");
3075
3076    status = U_ZERO_ERROR;
3077    fields[5] = "foo";
3078    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3079    REGEX_CHECK_STATUS;
3080    REGEX_ASSERT(n==5);
3081    REGEX_ASSERT(fields[0]=="  ");
3082    REGEX_ASSERT(fields[1]=="a");
3083    REGEX_ASSERT(fields[2]=="Now is ");
3084    REGEX_ASSERT(fields[3]=="b");
3085    REGEX_ASSERT(fields[4]=="the time");
3086    REGEX_ASSERT(fields[5]=="foo");
3087
3088    status = U_ZERO_ERROR;
3089    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3090    REGEX_CHECK_STATUS;
3091    REGEX_ASSERT(n==4);
3092    REGEX_ASSERT(fields[0]=="  ");
3093    REGEX_ASSERT(fields[1]=="a");
3094    REGEX_ASSERT(fields[2]=="Now is ");
3095    REGEX_ASSERT(fields[3]=="the time<c>");
3096    status = U_ZERO_ERROR;
3097    delete pat1;
3098
3099    regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3100    pat1 = RegexPattern::compile(&re1, pe, status);
3101    REGEX_CHECK_STATUS;
3102    n = pat1->split("1-10,20", fields, 10, status);
3103    REGEX_CHECK_STATUS;
3104    REGEX_ASSERT(n==5);
3105    REGEX_ASSERT(fields[0]=="1");
3106    REGEX_ASSERT(fields[1]=="-");
3107    REGEX_ASSERT(fields[2]=="10");
3108    REGEX_ASSERT(fields[3]==",");
3109    REGEX_ASSERT(fields[4]=="20");
3110    delete pat1;
3111
3112
3113    //
3114    // split of a UText based string, with library allocating output UTexts.
3115    //
3116    {
3117        status = U_ZERO_ERROR;
3118        RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3119        UnicodeString stringToSplit("first:second:third");
3120        UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3121        REGEX_CHECK_STATUS;
3122
3123        UText *splits[10] = {NULL};
3124        int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3125        REGEX_CHECK_STATUS;
3126        REGEX_ASSERT(numFields == 5);
3127        REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3128        REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3129        REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3130        REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3131        REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3132        REGEX_ASSERT(splits[5] == NULL);
3133
3134        for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3135            if (splits[i]) {
3136                utext_close(splits[i]);
3137                splits[i] = NULL;
3138            }
3139        }
3140        utext_close(textToSplit);
3141    }
3142
3143
3144    //
3145    // RegexPattern::pattern() and patternText()
3146    //
3147    pat1 = new RegexPattern();
3148    REGEX_ASSERT(pat1->pattern() == "");
3149    REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3150    delete pat1;
3151    const char *helloWorldInvariant = "(Hello, world)*";
3152    regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3153    pat1 = RegexPattern::compile(&re1, pe, status);
3154    REGEX_CHECK_STATUS;
3155    REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3156    REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3157    delete pat1;
3158
3159    utext_close(&re1);
3160}
3161
3162
3163//---------------------------------------------------------------------------
3164//
3165//      Extended       A more thorough check for features of regex patterns
3166//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3168//                     A description of the test data format is included in that file.
3169//
3170//---------------------------------------------------------------------------
3171
3172const char *
3173RegexTest::getPath(char buffer[2048], const char *filename) {
3174    UErrorCode status=U_ZERO_ERROR;
3175    const char *testDataDirectory = IntlTest::getSourceTestData(status);
3176    if (U_FAILURE(status)) {
3177        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3178        return NULL;
3179    }
3180
3181    strcpy(buffer, testDataDirectory);
3182    strcat(buffer, filename);
3183    return buffer;
3184}
3185
3186void RegexTest::Extended() {
3187    char tdd[2048];
3188    const char *srcPath;
3189    UErrorCode  status  = U_ZERO_ERROR;
3190    int32_t     lineNum = 0;
3191
3192    //
3193    //  Open and read the test data file.
3194    //
3195    srcPath=getPath(tdd, "regextst.txt");
3196    if(srcPath==NULL) {
3197        return; /* something went wrong, error already output */
3198    }
3199
3200    int32_t    len;
3201    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3202    if (U_FAILURE(status)) {
3203        return; /* something went wrong, error already output */
3204    }
3205
3206    //
3207    //  Put the test data into a UnicodeString
3208    //
3209    UnicodeString testString(FALSE, testData, len);
3210
3211    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3212    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3213    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3214
3215    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3216    UnicodeString   testPattern;   // The pattern for test from the test file.
3217    UnicodeString   testFlags;     // the flags   for a test.
3218    UnicodeString   matchString;   // The marked up string to be used as input
3219
3220    if (U_FAILURE(status)){
3221        dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3222        delete [] testData;
3223        return;
3224    }
3225
3226    //
3227    //  Loop over the test data file, once per line.
3228    //
3229    while (lineMat.find()) {
3230        lineNum++;
3231        if (U_FAILURE(status)) {
3232          errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3233        }
3234
3235        status = U_ZERO_ERROR;
3236        UnicodeString testLine = lineMat.group(1, status);
3237        if (testLine.length() == 0) {
3238            continue;
3239        }
3240
3241        //
3242        // Parse the test line.  Skip blank and comment only lines.
3243        // Separate out the three main fields - pattern, flags, target.
3244        //
3245
3246        commentMat.reset(testLine);
3247        if (commentMat.lookingAt(status)) {
3248            // This line is a comment, or blank.
3249            continue;
3250        }
3251
3252        //
3253        //  Pull out the pattern field, remove it from the test file line.
3254        //
3255        quotedStuffMat.reset(testLine);
3256        if (quotedStuffMat.lookingAt(status)) {
3257            testPattern = quotedStuffMat.group(2, status);
3258            testLine.remove(0, quotedStuffMat.end(0, status));
3259        } else {
3260            errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3261            continue;
3262        }
3263
3264
3265        //
3266        //  Pull out the flags from the test file line.
3267        //
3268        flagsMat.reset(testLine);
3269        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3270        testFlags = flagsMat.group(1, status);
3271        if (flagsMat.group(2, status).length() > 0) {
3272            errln("Bad Match flag at line %d. Scanning %c\n",
3273                lineNum, flagsMat.group(2, status).charAt(0));
3274            continue;
3275        }
3276        testLine.remove(0, flagsMat.end(0, status));
3277
3278        //
3279        //  Pull out the match string, as a whole.
3280        //    We'll process the <tags> later.
3281        //
3282        quotedStuffMat.reset(testLine);
3283        if (quotedStuffMat.lookingAt(status)) {
3284            matchString = quotedStuffMat.group(2, status);
3285            testLine.remove(0, quotedStuffMat.end(0, status));
3286        } else {
3287            errln("Bad match string at test file line %d", lineNum);
3288            continue;
3289        }
3290
3291        //
3292        //  The only thing left from the input line should be an optional trailing comment.
3293        //
3294        commentMat.reset(testLine);
3295        if (commentMat.lookingAt(status) == FALSE) {
3296            errln("Line %d: unexpected characters at end of test line.", lineNum);
3297            continue;
3298        }
3299
3300        //
3301        //  Run the test
3302        //
3303        regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3304    }
3305
3306    delete [] testData;
3307
3308}
3309
3310
3311
3312//---------------------------------------------------------------------------
3313//
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, used in diagnostic messages.
//         lineNumber is the source line in regextst.txt of the test.
3320//
3321//---------------------------------------------------------------------------
3322
3323
3324//  Set a value into a UVector at position specified by a decimal number in
3325//   a UnicodeString.   This is a utility function needed by the actual test function,
3326//   which follows.
3327static void set(UVector &vec, int32_t val, UnicodeString index) {
3328    UErrorCode  status=U_ZERO_ERROR;
3329    int32_t  idx = 0;
3330    for (int32_t i=0; i<index.length(); i++) {
3331        int32_t d=u_charDigitValue(index.charAt(i));
3332        if (d<0) {return;}
3333        idx = idx*10 + d;
3334    }
3335    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3336    vec.setElementAt(val, idx);
3337}
3338
3339static void setInt(UVector &vec, int32_t val, int32_t idx) {
3340    UErrorCode  status=U_ZERO_ERROR;
3341    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3342    vec.setElementAt(val, idx);
3343}
3344
3345static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3346{
3347    UBool couldFind = TRUE;
3348    UTEXT_SETNATIVEINDEX(utext, 0);
3349    int32_t i = 0;
3350    while (i < unistrOffset) {
3351        UChar32 c = UTEXT_NEXT32(utext);
3352        if (c != U_SENTINEL) {
3353            i += U16_LENGTH(c);
3354        } else {
3355            couldFind = FALSE;
3356            break;
3357        }
3358    }
3359    nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3360    return couldFind;
3361}
3362
3363
3364void RegexTest::regex_find(const UnicodeString &pattern,
3365                           const UnicodeString &flags,
3366                           const UnicodeString &inputString,
3367                           const char *srcPath,
3368                           int32_t line) {
3369    UnicodeString       unEscapedInput;
3370    UnicodeString       deTaggedInput;
3371
3372    int32_t             patternUTF8Length,      inputUTF8Length;
3373    char                *patternChars  = NULL, *inputChars = NULL;
3374    UText               patternText    = UTEXT_INITIALIZER;
3375    UText               inputText      = UTEXT_INITIALIZER;
3376    UConverter          *UTF8Converter = NULL;
3377
3378    UErrorCode          status         = U_ZERO_ERROR;
3379    UParseError         pe;
3380    RegexPattern        *parsePat      = NULL;
3381    RegexMatcher        *parseMatcher  = NULL;
3382    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3383    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3384    UVector             groupStarts(status);
3385    UVector             groupEnds(status);
3386    UVector             groupStartsUTF8(status);
3387    UVector             groupEndsUTF8(status);
3388    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3389    UBool               failed         = FALSE;
3390    int32_t             numFinds;
3391    int32_t             i;
3392    UBool               useMatchesFunc   = FALSE;
3393    UBool               useLookingAtFunc = FALSE;
3394    int32_t             regionStart      = -1;
3395    int32_t             regionEnd        = -1;
3396    int32_t             regionStartUTF8  = -1;
3397    int32_t             regionEndUTF8    = -1;
3398
3399
3400    //
3401    //  Compile the caller's pattern
3402    //
3403    uint32_t bflags = 0;
3404    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3405        bflags |= UREGEX_CASE_INSENSITIVE;
3406    }
3407    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3408        bflags |= UREGEX_COMMENTS;
3409    }
3410    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3411        bflags |= UREGEX_DOTALL;
3412    }
3413    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3414        bflags |= UREGEX_MULTILINE;
3415    }
3416
3417    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3418        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3419    }
3420    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3421        bflags |= UREGEX_UNIX_LINES;
3422    }
3423    if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3424        bflags |= UREGEX_LITERAL;
3425    }
3426
3427
3428    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3429    if (status != U_ZERO_ERROR) {
3430        #if UCONFIG_NO_BREAK_ITERATION==1
3431        // 'v' test flag means that the test pattern should not compile if ICU was configured
3432        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3433        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3434            goto cleanupAndReturn;
3435        }
3436        #endif
3437        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3438            // Expected pattern compilation error.
3439            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3440                logln("Pattern Compile returns \"%s\"", u_errorName(status));
3441            }
3442            goto cleanupAndReturn;
3443        } else {
3444            // Unexpected pattern compilation error.
3445            dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3446            goto cleanupAndReturn;
3447        }
3448    }
3449
3450    UTF8Converter = ucnv_open("UTF8", &status);
3451    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3452
3453    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3454    status = U_ZERO_ERROR; // buffer overflow
3455    patternChars = new char[patternUTF8Length+1];
3456    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3457    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3458
3459    if (status == U_ZERO_ERROR) {
3460        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3461
3462        if (status != U_ZERO_ERROR) {
3463#if UCONFIG_NO_BREAK_ITERATION==1
3464            // 'v' test flag means that the test pattern should not compile if ICU was configured
3465            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3466            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3467                goto cleanupAndReturn;
3468            }
3469#endif
3470            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3471                // Expected pattern compilation error.
3472                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3473                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3474                }
3475                goto cleanupAndReturn;
3476            } else {
3477                // Unexpected pattern compilation error.
3478                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3479                goto cleanupAndReturn;
3480            }
3481        }
3482    }
3483
3484    if (UTF8Pattern == NULL) {
3485        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3486        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3487        status = U_ZERO_ERROR;
3488    }
3489
3490    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3491        callerPattern->dumpPattern();
3492    }
3493
3494    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3495        errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3496        goto cleanupAndReturn;
3497    }
3498
3499
3500    //
3501    // Number of times find() should be called on the test string, default to 1
3502    //
3503    numFinds = 1;
3504    for (i=2; i<=9; i++) {
3505        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3506            if (numFinds != 1) {
3507                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3508                goto cleanupAndReturn;
3509            }
3510            numFinds = i;
3511        }
3512    }
3513
3514    // 'M' flag.  Use matches() instead of find()
3515    if (flags.indexOf((UChar)0x4d) >= 0) {
3516        useMatchesFunc = TRUE;
3517    }
3518    if (flags.indexOf((UChar)0x4c) >= 0) {
3519        useLookingAtFunc = TRUE;
3520    }
3521
3522    //
3523    //  Find the tags in the input data, remove them, and record the group boundary
3524    //    positions.
3525    //
3526    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3527    REGEX_CHECK_STATUS_L(line);
3528
3529    unEscapedInput = inputString.unescape();
3530    parseMatcher = parsePat->matcher(unEscapedInput, status);
3531    REGEX_CHECK_STATUS_L(line);
3532    while(parseMatcher->find()) {
3533        parseMatcher->appendReplacement(deTaggedInput, "", status);
3534        REGEX_CHECK_STATUS;
3535        UnicodeString groupNum = parseMatcher->group(2, status);
3536        if (groupNum == "r") {
3537            // <r> or </r>, a region specification within the string
3538            if (parseMatcher->group(1, status) == "/") {
3539                regionEnd = deTaggedInput.length();
3540            } else {
3541                regionStart = deTaggedInput.length();
3542            }
3543        } else {
3544            // <digits> or </digits>, a group match boundary tag.
3545            if (parseMatcher->group(1, status) == "/") {
3546                set(groupEnds, deTaggedInput.length(), groupNum);
3547            } else {
3548                set(groupStarts, deTaggedInput.length(), groupNum);
3549            }
3550        }
3551    }
3552    parseMatcher->appendTail(deTaggedInput);
3553    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3554    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3555      errln("mismatched <r> tags");
3556      failed = TRUE;
3557      goto cleanupAndReturn;
3558    }
3559
3560    //
3561    //  Configure the matcher according to the flags specified with this test.
3562    //
3563    matcher = callerPattern->matcher(deTaggedInput, status);
3564    REGEX_CHECK_STATUS_L(line);
3565    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3566        matcher->setTrace(TRUE);
3567    }
3568
3569    if (UTF8Pattern != NULL) {
3570        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3571        status = U_ZERO_ERROR; // buffer overflow
3572        inputChars = new char[inputUTF8Length+1];
3573        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3574        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3575
3576        if (status == U_ZERO_ERROR) {
3577            UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3578            REGEX_CHECK_STATUS_L(line);
3579        }
3580
3581        if (UTF8Matcher == NULL) {
3582            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3583          logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3584            status = U_ZERO_ERROR;
3585        }
3586    }
3587
3588    //
3589    //  Generate native indices for UTF8 versions of region and capture group info
3590    //
3591    if (UTF8Matcher != NULL) {
3592        if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3593        if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3594
3595        //  Fill out the native index UVector info.
3596        //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3597        for (i=0; i<groupStarts.size(); i++) {
3598            int32_t  start = groupStarts.elementAti(i);
3599            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3600            if (start >= 0) {
3601                int32_t  startUTF8;
3602                if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3603                    errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3604                    failed = TRUE;
3605                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3606                }
3607                setInt(groupStartsUTF8, startUTF8, i);
3608            }
3609
3610            int32_t  end = groupEnds.elementAti(i);
3611            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3612            if (end >= 0) {
3613                int32_t  endUTF8;
3614                if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3615                    errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3616                    failed = TRUE;
3617                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3618                }
3619                setInt(groupEndsUTF8, endUTF8, i);
3620            }
3621        }
3622    }
3623
3624    if (regionStart>=0) {
3625       matcher->region(regionStart, regionEnd, status);
3626       REGEX_CHECK_STATUS_L(line);
3627       if (UTF8Matcher != NULL) {
3628           UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3629           REGEX_CHECK_STATUS_L(line);
3630       }
3631    }
3632    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3633        matcher->useAnchoringBounds(FALSE);
3634        if (UTF8Matcher != NULL) {
3635            UTF8Matcher->useAnchoringBounds(FALSE);
3636        }
3637    }
3638    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3639        matcher->useTransparentBounds(TRUE);
3640        if (UTF8Matcher != NULL) {
3641            UTF8Matcher->useTransparentBounds(TRUE);
3642        }
3643    }
3644
3645
3646
3647    //
3648    // Do a find on the de-tagged input using the caller's pattern
3649    //     TODO: error on count>1 and not find().
3650    //           error on both matches() and lookingAt().
3651    //
3652    for (i=0; i<numFinds; i++) {
3653        if (useMatchesFunc) {
3654            isMatch = matcher->matches(status);
3655            if (UTF8Matcher != NULL) {
3656               isUTF8Match = UTF8Matcher->matches(status);
3657            }
3658        } else  if (useLookingAtFunc) {
3659            isMatch = matcher->lookingAt(status);
3660            if (UTF8Matcher != NULL) {
3661                isUTF8Match = UTF8Matcher->lookingAt(status);
3662            }
3663        } else {
3664            isMatch = matcher->find();
3665            if (UTF8Matcher != NULL) {
3666                isUTF8Match = UTF8Matcher->find();
3667            }
3668        }
3669    }
3670    matcher->setTrace(FALSE);
3671    if (U_FAILURE(status)) {
3672        errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3673    }
3674
3675    //
3676    // Match up the groups from the find() with the groups from the tags
3677    //
3678
3679    // number of tags should match number of groups from find operation.
3680    // matcher->groupCount does not include group 0, the entire match, hence the +1.
3681    //   G option in test means that capture group data is not available in the
3682    //     expected results, so the check needs to be suppressed.
3683    if (isMatch == FALSE && groupStarts.size() != 0) {
3684        dataerrln("Error at line %d:  Match expected, but none found.", line);
3685        failed = TRUE;
3686        goto cleanupAndReturn;
3687    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3688        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3689        failed = TRUE;
3690        goto cleanupAndReturn;
3691    }
3692
3693    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3694        // Only check for match / no match.  Don't check capture groups.
3695        if (isMatch && groupStarts.size() == 0) {
3696            errln("Error at line %d:  No match expected, but one found.", line);
3697            failed = TRUE;
3698        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3699            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3700            failed = TRUE;
3701        }
3702        goto cleanupAndReturn;
3703    }
3704
3705    REGEX_CHECK_STATUS_L(line);
3706    for (i=0; i<=matcher->groupCount(); i++) {
3707        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3708        int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3709        if (matcher->start(i, status) != expectedStart) {
3710            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3711                line, i, expectedStart, matcher->start(i, status));
3712            failed = TRUE;
3713            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3714        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3715            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3716                  line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3717            failed = TRUE;
3718            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3719        }
3720
3721        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3722        int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3723        if (matcher->end(i, status) != expectedEnd) {
3724            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3725                line, i, expectedEnd, matcher->end(i, status));
3726            failed = TRUE;
3727            // Error on end position;  keep going; real error is probably yet to come as group
3728            //   end positions work from end of the input data towards the front.
3729        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3730            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3731                  line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3732            failed = TRUE;
3733            // Error on end position;  keep going; real error is probably yet to come as group
3734            //   end positions work from end of the input data towards the front.
3735        }
3736    }
3737    if ( matcher->groupCount()+1 < groupStarts.size()) {
3738        errln("Error at line %d: Expected %d capture groups, found %d.",
3739            line, groupStarts.size()-1, matcher->groupCount());
3740        failed = TRUE;
3741        }
3742    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3743        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3744              line, groupStarts.size()-1, UTF8Matcher->groupCount());
3745        failed = TRUE;
3746    }
3747
3748    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3749        matcher->requireEnd() == TRUE) {
3750        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3751        failed = TRUE;
3752    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3753        UTF8Matcher->requireEnd() == TRUE) {
3754        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3755        failed = TRUE;
3756    }
3757
3758    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3759        matcher->requireEnd() == FALSE) {
3760        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3761        failed = TRUE;
3762    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3763        UTF8Matcher->requireEnd() == FALSE) {
3764        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3765        failed = TRUE;
3766    }
3767
3768    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3769        matcher->hitEnd() == TRUE) {
3770        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3771        failed = TRUE;
3772    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3773               UTF8Matcher->hitEnd() == TRUE) {
3774        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3775        failed = TRUE;
3776    }
3777
3778    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3779        matcher->hitEnd() == FALSE) {
3780        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3781        failed = TRUE;
3782    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3783               UTF8Matcher->hitEnd() == FALSE) {
3784        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3785        failed = TRUE;
3786    }
3787
3788
3789cleanupAndReturn:
3790    if (failed) {
3791        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3792            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3793        // callerPattern->dump();
3794    }
3795    delete parseMatcher;
3796    delete parsePat;
3797    delete UTF8Matcher;
3798    delete UTF8Pattern;
3799    delete matcher;
3800    delete callerPattern;
3801
3802    utext_close(&inputText);
3803    delete[] inputChars;
3804    utext_close(&patternText);
3805    delete[] patternChars;
3806    ucnv_close(UTF8Converter);
3807}
3808
3809
3810
3811
3812//---------------------------------------------------------------------------
3813//
3814//      Errors     Check for error handling in patterns.
3815//
3816//---------------------------------------------------------------------------
3817void RegexTest::Errors() {
3818    // \escape sequences that aren't implemented yet.
3819    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3820
3821    // Missing close parentheses
3822    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3823    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3824    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3825
3826    // Extra close paren
3827    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3828    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3829    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3830
3831    // Look-ahead, Look-behind
3832    //  TODO:  add tests for unbounded length look-behinds.
3833    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3834
3835    // Attempt to use non-default flags
3836    {
3837        UParseError   pe;
3838        UErrorCode    status = U_ZERO_ERROR;
3839        int32_t       flags  = UREGEX_CANON_EQ |
3840                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
3841                               UREGEX_MULTILINE;
3842        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3843        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3844        delete pat1;
3845    }
3846
3847
3848    // Quantifiers are allowed only after something that can be quantified.
3849    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3850    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3851    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3852
3853    // Mal-formed {min,max} quantifiers
3854    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3855    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3856    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3857    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3858    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3859    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3860    REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3861    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3862    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3863
3864    // Ticket 5389
3865    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3866
3867    // Invalid Back Reference \0
3868    //    For ICU 3.8 and earlier
3869    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3870    //
3871    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3872
3873}
3874
3875
3876//-------------------------------------------------------------------------------
3877//
3878//  Read a text data file, convert it to UChars, and return the data
3879//    in one big UChar * buffer, which the caller must delete.
3880//
3881//--------------------------------------------------------------------------------
3882UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3883                                     const char *defEncoding, UErrorCode &status) {
3884    UChar       *retPtr  = NULL;
3885    char        *fileBuf = NULL;
3886    UConverter* conv     = NULL;
3887    FILE        *f       = NULL;
3888
3889    ulen = 0;
3890    if (U_FAILURE(status)) {
3891        return retPtr;
3892    }
3893
3894    //
3895    //  Open the file.
3896    //
3897    f = fopen(fileName, "rb");
3898    if (f == 0) {
3899        dataerrln("Error opening test data file %s\n", fileName);
3900        status = U_FILE_ACCESS_ERROR;
3901        return NULL;
3902    }
3903    //
3904    //  Read it in
3905    //
3906    int32_t            fileSize;
3907    int32_t            amt_read;
3908
3909    fseek( f, 0, SEEK_END);
3910    fileSize = ftell(f);
3911    fileBuf = new char[fileSize];
3912    fseek(f, 0, SEEK_SET);
3913    amt_read = fread(fileBuf, 1, fileSize, f);
3914    if (amt_read != fileSize || fileSize <= 0) {
3915        errln("Error reading test data file.");
3916        goto cleanUpAndReturn;
3917    }
3918
3919    //
3920    // Look for a Unicode Signature (BOM) on the data just read
3921    //
3922    int32_t        signatureLength;
3923    const char *   fileBufC;
3924    const char*    encoding;
3925
3926    fileBufC = fileBuf;
3927    encoding = ucnv_detectUnicodeSignature(
3928        fileBuf, fileSize, &signatureLength, &status);
3929    if(encoding!=NULL ){
3930        fileBufC  += signatureLength;
3931        fileSize  -= signatureLength;
3932    } else {
3933        encoding = defEncoding;
3934        if (strcmp(encoding, "utf-8") == 0) {
3935            errln("file %s is missing its BOM", fileName);
3936        }
3937    }
3938
3939    //
3940    // Open a converter to take the rule file to UTF-16
3941    //
3942    conv = ucnv_open(encoding, &status);
3943    if (U_FAILURE(status)) {
3944        goto cleanUpAndReturn;
3945    }
3946
3947    //
3948    // Convert the rules to UChar.
3949    //  Preflight first to determine required buffer size.
3950    //
3951    ulen = ucnv_toUChars(conv,
3952        NULL,           //  dest,
3953        0,              //  destCapacity,
3954        fileBufC,
3955        fileSize,
3956        &status);
3957    if (status == U_BUFFER_OVERFLOW_ERROR) {
3958        // Buffer Overflow is expected from the preflight operation.
3959        status = U_ZERO_ERROR;
3960
3961        retPtr = new UChar[ulen+1];
3962        ucnv_toUChars(conv,
3963            retPtr,       //  dest,
3964            ulen+1,
3965            fileBufC,
3966            fileSize,
3967            &status);
3968    }
3969
3970cleanUpAndReturn:
3971    fclose(f);
3972    delete[] fileBuf;
3973    ucnv_close(conv);
3974    if (U_FAILURE(status)) {
3975        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3976        delete []retPtr;
3977        retPtr = 0;
3978        ulen   = 0;
3979    };
3980    return retPtr;
3981}
3982
3983
3984//-------------------------------------------------------------------------------
3985//
3986//   PerlTests  - Run Perl's regular expression tests
3987//                The input file for this test is re_tests, the standard regular
3988//                expression test data distributed with the Perl source code.
3989//
3990//                Here is Perl's description of the test data file:
3991//
3992//        # The tests are in a separate file 't/op/re_tests'.
3993//        # Each line in that file is a separate test.
3994//        # There are five columns, separated by tabs.
3995//        #
3996//        # Column 1 contains the pattern, optionally enclosed in C<''>.
3997//        # Modifiers can be put after the closing C<'>.
3998//        #
3999//        # Column 2 contains the string to be matched.
4000//        #
4001//        # Column 3 contains the expected result:
4002//        #     y   expect a match
4003//        #     n   expect no match
4004//        #     c   expect an error
4005//        # B   test exposes a known bug in Perl, should be skipped
4006//        # b   test exposes a known bug in Perl, should be skipped if noamp
4007//        #
4008//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4009//        #
4010//        # Column 4 contains a string, usually C<$&>.
4011//        #
4012//        # Column 5 contains the expected result of double-quote
4013//        # interpolating that string after the match, or start of error message.
4014//        #
4015//        # Column 6, if present, contains a reason why the test is skipped.
4016//        # This is printed with "skipped", for harness to pick up.
4017//        #
4018//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
4019//        #
4020//        # If you want to add a regular expression test that can't be expressed
4021//        # in this format, don't add it here: put it in op/pat.t instead.
4022//
4023//        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
4025//        (The i is in addition to whatever was there before.)
4026//
4027//-------------------------------------------------------------------------------
4028void RegexTest::PerlTests() {
4029    char tdd[2048];
4030    const char *srcPath;
4031    UErrorCode  status = U_ZERO_ERROR;
4032    UParseError pe;
4033
4034    //
4035    //  Open and read the test data file.
4036    //
4037    srcPath=getPath(tdd, "re_tests.txt");
4038    if(srcPath==NULL) {
4039        return; /* something went wrong, error already output */
4040    }
4041
4042    int32_t    len;
4043    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4044    if (U_FAILURE(status)) {
4045        return; /* something went wrong, error already output */
4046    }
4047
4048    //
4049    //  Put the test data into a UnicodeString
4050    //
4051    UnicodeString testDataString(FALSE, testData, len);
4052
4053    //
4054    //  Regex to break the input file into lines, and strip the new lines.
4055    //     One line per match, capture group one is the desired data.
4056    //
4057    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4058    if (U_FAILURE(status)) {
4059        dataerrln("RegexPattern::compile() error");
4060        return;
4061    }
4062    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4063
4064    //
4065    //  Regex to split a test file line into fields.
4066    //    There are six fields, separated by tabs.
4067    //
4068    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4069
4070    //
4071    //  Regex to identify test patterns with flag settings, and to separate them.
4072    //    Test patterns with flags look like 'pattern'i
4073    //    Test patterns without flags are not quoted:   pattern
4074    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4075    //
4076    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4077    RegexMatcher* flagMat = flagPat->matcher(status);
4078
4079    //
4080    // The Perl tests reference several perl-isms, which are evaluated/substituted
4081    //   in the test data.  Not being perl, this must be done explicitly.  Here
4082    //   are string constants and REs for these constructs.
4083    //
4084    UnicodeString nulnulSrc("${nulnul}");
4085    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4086    nulnul = nulnul.unescape();
4087
4088    UnicodeString ffffSrc("${ffff}");
4089    UnicodeString ffff("\\uffff", -1, US_INV);
4090    ffff = ffff.unescape();
4091
4092    //  regexp for $-[0], $+[2], etc.
4093    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4094    RegexMatcher *groupsMat = groupsPat->matcher(status);
4095
4096    //  regexp for $0, $1, $2, etc.
4097    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4098    RegexMatcher *cgMat = cgPat->matcher(status);
4099
4100
4101    //
4102    // Main Loop for the Perl Tests, runs once per line from the
4103    //   test data file.
4104    //
4105    int32_t  lineNum = 0;
4106    int32_t  skippedUnimplementedCount = 0;
4107    while (lineMat->find()) {
4108        lineNum++;
4109
4110        //
4111        //  Get a line, break it into its fields, do the Perl
4112        //    variable substitutions.
4113        //
4114        UnicodeString line = lineMat->group(1, status);
4115        UnicodeString fields[7];
4116        fieldPat->split(line, fields, 7, status);
4117
4118        flagMat->reset(fields[0]);
4119        flagMat->matches(status);
4120        UnicodeString pattern  = flagMat->group(2, status);
4121        pattern.findAndReplace("${bang}", "!");
4122        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4123        pattern.findAndReplace(ffffSrc, ffff);
4124
4125        //
4126        //  Identify patterns that include match flag settings,
4127        //    split off the flags, remove the extra quotes.
4128        //
4129        UnicodeString flagStr = flagMat->group(3, status);
4130        if (U_FAILURE(status)) {
4131            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4132            return;
4133        }
4134        int32_t flags = 0;
4135        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4136        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4137        const UChar UChar_m = 0x6d;
4138        const UChar UChar_x = 0x78;
4139        const UChar UChar_y = 0x79;
4140        if (flagStr.indexOf(UChar_i) != -1) {
4141            flags |= UREGEX_CASE_INSENSITIVE;
4142        }
4143        if (flagStr.indexOf(UChar_m) != -1) {
4144            flags |= UREGEX_MULTILINE;
4145        }
4146        if (flagStr.indexOf(UChar_x) != -1) {
4147            flags |= UREGEX_COMMENTS;
4148        }
4149
4150        //
4151        // Compile the test pattern.
4152        //
4153        status = U_ZERO_ERROR;
4154        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4155        if (status == U_REGEX_UNIMPLEMENTED) {
4156            //
4157            // Test of a feature that is planned for ICU, but not yet implemented.
4158            //   skip the test.
4159            skippedUnimplementedCount++;
4160            delete testPat;
4161            status = U_ZERO_ERROR;
4162            continue;
4163        }
4164
4165        if (U_FAILURE(status)) {
4166            // Some tests are supposed to generate errors.
4167            //   Only report an error for tests that are supposed to succeed.
4168            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4169                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4170            {
4171                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4172            }
4173            status = U_ZERO_ERROR;
4174            delete testPat;
4175            continue;
4176        }
4177
4178        if (fields[2].indexOf(UChar_i) >= 0) {
4179            // ICU should skip this test.
4180            delete testPat;
4181            continue;
4182        }
4183
4184        if (fields[2].indexOf(UChar_c) >= 0) {
4185            // This pattern should have caused a compilation error, but didn't/
4186            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4187            delete testPat;
4188            continue;
4189        }
4190
4191        //
4192        // replace the Perl variables that appear in some of the
4193        //   match data strings.
4194        //
4195        UnicodeString matchString = fields[1];
4196        matchString.findAndReplace(nulnulSrc, nulnul);
4197        matchString.findAndReplace(ffffSrc,   ffff);
4198
4199        // Replace any \n in the match string with an actual new-line char.
4200        //  Don't do full unescape, as this unescapes more than Perl does, which
4201        //  causes other spurious failures in the tests.
4202        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4203
4204
4205
4206        //
4207        // Run the test, check for expected match/don't match result.
4208        //
4209        RegexMatcher *testMat = testPat->matcher(matchString, status);
4210        UBool found = testMat->find();
4211        UBool expected = FALSE;
4212        if (fields[2].indexOf(UChar_y) >=0) {
4213            expected = TRUE;
4214        }
4215        if (expected != found) {
4216            errln("line %d: Expected %smatch, got %smatch",
4217                lineNum, expected?"":"no ", found?"":"no " );
4218            continue;
4219        }
4220
4221        // Don't try to check expected results if there is no match.
4222        //   (Some have stuff in the expected fields)
4223        if (!found) {
4224            delete testMat;
4225            delete testPat;
4226            continue;
4227        }
4228
4229        //
4230        // Interpret the Perl expression from the fourth field of the data file,
4231        // building up an ICU string from the results of the ICU match.
4232        //   The Perl expression will contain references to the results of
4233        //     a regex match, including the matched string, capture group strings,
4234        //     group starting and ending indicies, etc.
4235        //
4236        UnicodeString resultString;
4237        UnicodeString perlExpr = fields[3];
4238#if SUPPORT_MUTATING_INPUT_STRING
4239        groupsMat->reset(perlExpr);
4240        cgMat->reset(perlExpr);
4241#endif
4242
4243        while (perlExpr.length() > 0) {
4244#if !SUPPORT_MUTATING_INPUT_STRING
4245            //  Perferred usage.  Reset after any modification to input string.
4246            groupsMat->reset(perlExpr);
4247            cgMat->reset(perlExpr);
4248#endif
4249
4250            if (perlExpr.startsWith("$&")) {
4251                resultString.append(testMat->group(status));
4252                perlExpr.remove(0, 2);
4253            }
4254
4255            else if (groupsMat->lookingAt(status)) {
4256                // $-[0]   $+[2]  etc.
4257                UnicodeString digitString = groupsMat->group(2, status);
4258                int32_t t = 0;
4259                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4260                UnicodeString plusOrMinus = groupsMat->group(1, status);
4261                int32_t matchPosition;
4262                if (plusOrMinus.compare("+") == 0) {
4263                    matchPosition = testMat->end(groupNum, status);
4264                } else {
4265                    matchPosition = testMat->start(groupNum, status);
4266                }
4267                if (matchPosition != -1) {
4268                    ICU_Utility::appendNumber(resultString, matchPosition);
4269                }
4270                perlExpr.remove(0, groupsMat->end(status));
4271            }
4272
4273            else if (cgMat->lookingAt(status)) {
4274                // $1, $2, $3, etc.
4275                UnicodeString digitString = cgMat->group(1, status);
4276                int32_t t = 0;
4277                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4278                if (U_SUCCESS(status)) {
4279                    resultString.append(testMat->group(groupNum, status));
4280                    status = U_ZERO_ERROR;
4281                }
4282                perlExpr.remove(0, cgMat->end(status));
4283            }
4284
4285            else if (perlExpr.startsWith("@-")) {
4286                int32_t i;
4287                for (i=0; i<=testMat->groupCount(); i++) {
4288                    if (i>0) {
4289                        resultString.append(" ");
4290                    }
4291                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4292                }
4293                perlExpr.remove(0, 2);
4294            }
4295
4296            else if (perlExpr.startsWith("@+")) {
4297                int32_t i;
4298                for (i=0; i<=testMat->groupCount(); i++) {
4299                    if (i>0) {
4300                        resultString.append(" ");
4301                    }
4302                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4303                }
4304                perlExpr.remove(0, 2);
4305            }
4306
4307            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4308                                                     //           or as an escaped sequence (e.g. \n)
4309                if (perlExpr.length() > 1) {
4310                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4311                }
4312                UChar c = perlExpr.charAt(0);
4313                switch (c) {
4314                case 'n':   c = '\n'; break;
4315                // add any other escape sequences that show up in the test expected results.
4316                }
4317                resultString.append(c);
4318                perlExpr.remove(0, 1);
4319            }
4320
4321            else  {
4322                // Any characters from the perl expression that we don't explicitly
4323                //  recognize before here are assumed to be literals and copied
4324                //  as-is to the expected results.
4325                resultString.append(perlExpr.charAt(0));
4326                perlExpr.remove(0, 1);
4327            }
4328
4329            if (U_FAILURE(status)) {
4330                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4331                break;
4332            }
4333        }
4334
4335        //
4336        // Expected Results Compare
4337        //
4338        UnicodeString expectedS(fields[4]);
4339        expectedS.findAndReplace(nulnulSrc, nulnul);
4340        expectedS.findAndReplace(ffffSrc,   ffff);
4341        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4342
4343
4344        if (expectedS.compare(resultString) != 0) {
4345            err("Line %d: Incorrect perl expression results.", lineNum);
4346            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4347        }
4348
4349        delete testMat;
4350        delete testPat;
4351    }
4352
4353    //
4354    // All done.  Clean up allocated stuff.
4355    //
4356    delete cgMat;
4357    delete cgPat;
4358
4359    delete groupsMat;
4360    delete groupsPat;
4361
4362    delete flagMat;
4363    delete flagPat;
4364
4365    delete lineMat;
4366    delete linePat;
4367
4368    delete fieldPat;
4369    delete [] testData;
4370
4371
4372    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4373
4374}
4375
4376
4377//-------------------------------------------------------------------------------
4378//
4379//   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4380//                  (instead of using UnicodeStrings) to test the alternate engine.
4381//                  The input file for this test is re_tests, the standard regular
4382//                  expression test data distributed with the Perl source code.
4383//                  See PerlTests() for more information.
4384//
4385//-------------------------------------------------------------------------------
4386void RegexTest::PerlTestsUTF8() {
4387    char tdd[2048];
4388    const char *srcPath;
4389    UErrorCode  status = U_ZERO_ERROR;
4390    UParseError pe;
4391    LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4392    UText       patternText = UTEXT_INITIALIZER;
4393    char       *patternChars = NULL;
4394    int32_t     patternLength;
4395    int32_t     patternCapacity = 0;
4396    UText       inputText = UTEXT_INITIALIZER;
4397    char       *inputChars = NULL;
4398    int32_t     inputLength;
4399    int32_t     inputCapacity = 0;
4400
4401    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4402
4403    //
4404    //  Open and read the test data file.
4405    //
4406    srcPath=getPath(tdd, "re_tests.txt");
4407    if(srcPath==NULL) {
4408        return; /* something went wrong, error already output */
4409    }
4410
4411    int32_t    len;
4412    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4413    if (U_FAILURE(status)) {
4414        return; /* something went wrong, error already output */
4415    }
4416
4417    //
4418    //  Put the test data into a UnicodeString
4419    //
4420    UnicodeString testDataString(FALSE, testData, len);
4421
4422    //
4423    //  Regex to break the input file into lines, and strip the new lines.
4424    //     One line per match, capture group one is the desired data.
4425    //
4426    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4427    if (U_FAILURE(status)) {
4428        dataerrln("RegexPattern::compile() error");
4429        return;
4430    }
4431    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4432
4433    //
4434    //  Regex to split a test file line into fields.
4435    //    There are six fields, separated by tabs.
4436    //
4437    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4438
4439    //
4440    //  Regex to identify test patterns with flag settings, and to separate them.
4441    //    Test patterns with flags look like 'pattern'i
4442    //    Test patterns without flags are not quoted:   pattern
4443    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4444    //
4445    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4446    RegexMatcher* flagMat = flagPat->matcher(status);
4447
4448    //
4449    // The Perl tests reference several perl-isms, which are evaluated/substituted
4450    //   in the test data.  Not being perl, this must be done explicitly.  Here
4451    //   are string constants and REs for these constructs.
4452    //
4453    UnicodeString nulnulSrc("${nulnul}");
4454    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4455    nulnul = nulnul.unescape();
4456
4457    UnicodeString ffffSrc("${ffff}");
4458    UnicodeString ffff("\\uffff", -1, US_INV);
4459    ffff = ffff.unescape();
4460
4461    //  regexp for $-[0], $+[2], etc.
4462    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4463    RegexMatcher *groupsMat = groupsPat->matcher(status);
4464
4465    //  regexp for $0, $1, $2, etc.
4466    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4467    RegexMatcher *cgMat = cgPat->matcher(status);
4468
4469
4470    //
4471    // Main Loop for the Perl Tests, runs once per line from the
4472    //   test data file.
4473    //
4474    int32_t  lineNum = 0;
4475    int32_t  skippedUnimplementedCount = 0;
4476    while (lineMat->find()) {
4477        lineNum++;
4478
4479        //
4480        //  Get a line, break it into its fields, do the Perl
4481        //    variable substitutions.
4482        //
4483        UnicodeString line = lineMat->group(1, status);
4484        UnicodeString fields[7];
4485        fieldPat->split(line, fields, 7, status);
4486
4487        flagMat->reset(fields[0]);
4488        flagMat->matches(status);
4489        UnicodeString pattern  = flagMat->group(2, status);
4490        pattern.findAndReplace("${bang}", "!");
4491        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4492        pattern.findAndReplace(ffffSrc, ffff);
4493
4494        //
4495        //  Identify patterns that include match flag settings,
4496        //    split off the flags, remove the extra quotes.
4497        //
4498        UnicodeString flagStr = flagMat->group(3, status);
4499        if (U_FAILURE(status)) {
4500            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4501            return;
4502        }
4503        int32_t flags = 0;
4504        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4505        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4506        const UChar UChar_m = 0x6d;
4507        const UChar UChar_x = 0x78;
4508        const UChar UChar_y = 0x79;
4509        if (flagStr.indexOf(UChar_i) != -1) {
4510            flags |= UREGEX_CASE_INSENSITIVE;
4511        }
4512        if (flagStr.indexOf(UChar_m) != -1) {
4513            flags |= UREGEX_MULTILINE;
4514        }
4515        if (flagStr.indexOf(UChar_x) != -1) {
4516            flags |= UREGEX_COMMENTS;
4517        }
4518
4519        //
4520        // Put the pattern in a UTF-8 UText
4521        //
4522        status = U_ZERO_ERROR;
4523        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4524        if (status == U_BUFFER_OVERFLOW_ERROR) {
4525            status = U_ZERO_ERROR;
4526            delete[] patternChars;
4527            patternCapacity = patternLength + 1;
4528            patternChars = new char[patternCapacity];
4529            pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4530        }
4531        utext_openUTF8(&patternText, patternChars, patternLength, &status);
4532
4533        //
4534        // Compile the test pattern.
4535        //
4536        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4537        if (status == U_REGEX_UNIMPLEMENTED) {
4538            //
4539            // Test of a feature that is planned for ICU, but not yet implemented.
4540            //   skip the test.
4541            skippedUnimplementedCount++;
4542            delete testPat;
4543            status = U_ZERO_ERROR;
4544            continue;
4545        }
4546
4547        if (U_FAILURE(status)) {
4548            // Some tests are supposed to generate errors.
4549            //   Only report an error for tests that are supposed to succeed.
4550            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4551                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4552            {
4553                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4554            }
4555            status = U_ZERO_ERROR;
4556            delete testPat;
4557            continue;
4558        }
4559
4560        if (fields[2].indexOf(UChar_i) >= 0) {
4561            // ICU should skip this test.
4562            delete testPat;
4563            continue;
4564        }
4565
4566        if (fields[2].indexOf(UChar_c) >= 0) {
4567            // This pattern should have caused a compilation error, but didn't/
4568            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4569            delete testPat;
4570            continue;
4571        }
4572
4573
4574        //
4575        // replace the Perl variables that appear in some of the
4576        //   match data strings.
4577        //
4578        UnicodeString matchString = fields[1];
4579        matchString.findAndReplace(nulnulSrc, nulnul);
4580        matchString.findAndReplace(ffffSrc,   ffff);
4581
4582        // Replace any \n in the match string with an actual new-line char.
4583        //  Don't do full unescape, as this unescapes more than Perl does, which
4584        //  causes other spurious failures in the tests.
4585        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4586
4587        //
4588        // Put the input in a UTF-8 UText
4589        //
4590        status = U_ZERO_ERROR;
4591        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4592        if (status == U_BUFFER_OVERFLOW_ERROR) {
4593            status = U_ZERO_ERROR;
4594            delete[] inputChars;
4595            inputCapacity = inputLength + 1;
4596            inputChars = new char[inputCapacity];
4597            matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4598        }
4599        utext_openUTF8(&inputText, inputChars, inputLength, &status);
4600
4601        //
4602        // Run the test, check for expected match/don't match result.
4603        //
4604        RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4605        UBool found = testMat->find();
4606        UBool expected = FALSE;
4607        if (fields[2].indexOf(UChar_y) >=0) {
4608            expected = TRUE;
4609        }
4610        if (expected != found) {
4611            errln("line %d: Expected %smatch, got %smatch",
4612                lineNum, expected?"":"no ", found?"":"no " );
4613            continue;
4614        }
4615
4616        // Don't try to check expected results if there is no match.
4617        //   (Some have stuff in the expected fields)
4618        if (!found) {
4619            delete testMat;
4620            delete testPat;
4621            continue;
4622        }
4623
4624        //
4625        // Interpret the Perl expression from the fourth field of the data file,
4626        // building up an ICU string from the results of the ICU match.
4627        //   The Perl expression will contain references to the results of
4628        //     a regex match, including the matched string, capture group strings,
4629        //     group starting and ending indicies, etc.
4630        //
4631        UnicodeString resultString;
4632        UnicodeString perlExpr = fields[3];
4633
4634        while (perlExpr.length() > 0) {
4635            groupsMat->reset(perlExpr);
4636            cgMat->reset(perlExpr);
4637
4638            if (perlExpr.startsWith("$&")) {
4639                resultString.append(testMat->group(status));
4640                perlExpr.remove(0, 2);
4641            }
4642
4643            else if (groupsMat->lookingAt(status)) {
4644                // $-[0]   $+[2]  etc.
4645                UnicodeString digitString = groupsMat->group(2, status);
4646                int32_t t = 0;
4647                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4648                UnicodeString plusOrMinus = groupsMat->group(1, status);
4649                int32_t matchPosition;
4650                if (plusOrMinus.compare("+") == 0) {
4651                    matchPosition = testMat->end(groupNum, status);
4652                } else {
4653                    matchPosition = testMat->start(groupNum, status);
4654                }
4655                if (matchPosition != -1) {
4656                    ICU_Utility::appendNumber(resultString, matchPosition);
4657                }
4658                perlExpr.remove(0, groupsMat->end(status));
4659            }
4660
4661            else if (cgMat->lookingAt(status)) {
4662                // $1, $2, $3, etc.
4663                UnicodeString digitString = cgMat->group(1, status);
4664                int32_t t = 0;
4665                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4666                if (U_SUCCESS(status)) {
4667                    resultString.append(testMat->group(groupNum, status));
4668                    status = U_ZERO_ERROR;
4669                }
4670                perlExpr.remove(0, cgMat->end(status));
4671            }
4672
4673            else if (perlExpr.startsWith("@-")) {
4674                int32_t i;
4675                for (i=0; i<=testMat->groupCount(); i++) {
4676                    if (i>0) {
4677                        resultString.append(" ");
4678                    }
4679                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4680                }
4681                perlExpr.remove(0, 2);
4682            }
4683
4684            else if (perlExpr.startsWith("@+")) {
4685                int32_t i;
4686                for (i=0; i<=testMat->groupCount(); i++) {
4687                    if (i>0) {
4688                        resultString.append(" ");
4689                    }
4690                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4691                }
4692                perlExpr.remove(0, 2);
4693            }
4694
4695            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4696                                                     //           or as an escaped sequence (e.g. \n)
4697                if (perlExpr.length() > 1) {
4698                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4699                }
4700                UChar c = perlExpr.charAt(0);
4701                switch (c) {
4702                case 'n':   c = '\n'; break;
4703                // add any other escape sequences that show up in the test expected results.
4704                }
4705                resultString.append(c);
4706                perlExpr.remove(0, 1);
4707            }
4708
4709            else  {
4710                // Any characters from the perl expression that we don't explicitly
4711                //  recognize before here are assumed to be literals and copied
4712                //  as-is to the expected results.
4713                resultString.append(perlExpr.charAt(0));
4714                perlExpr.remove(0, 1);
4715            }
4716
4717            if (U_FAILURE(status)) {
4718                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4719                break;
4720            }
4721        }
4722
4723        //
4724        // Expected Results Compare
4725        //
4726        UnicodeString expectedS(fields[4]);
4727        expectedS.findAndReplace(nulnulSrc, nulnul);
4728        expectedS.findAndReplace(ffffSrc,   ffff);
4729        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4730
4731
4732        if (expectedS.compare(resultString) != 0) {
4733            err("Line %d: Incorrect perl expression results.", lineNum);
4734            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4735        }
4736
4737        delete testMat;
4738        delete testPat;
4739    }
4740
4741    //
4742    // All done.  Clean up allocated stuff.
4743    //
4744    delete cgMat;
4745    delete cgPat;
4746
4747    delete groupsMat;
4748    delete groupsPat;
4749
4750    delete flagMat;
4751    delete flagPat;
4752
4753    delete lineMat;
4754    delete linePat;
4755
4756    delete fieldPat;
4757    delete [] testData;
4758
4759    utext_close(&patternText);
4760    utext_close(&inputText);
4761
4762    delete [] patternChars;
4763    delete [] inputChars;
4764
4765
4766    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4767
4768}
4769
4770
4771//--------------------------------------------------------------
4772//
4773//  Bug6149   Verify limits to heap expansion for backtrack stack.
4774//             Use this pattern,
4775//                 "(a?){1,8000000}"
//             Note: the pattern originally had an unbounded upper bound, but that form
//                   now has loop-breaking enabled.
//                   This test is likely to be fragile, as further optimizations stop
//                   more cases of pointless looping in the match engine.
4779//
4780//---------------------------------------------------------------
4781void RegexTest::Bug6149() {
4782    UnicodeString pattern("(a?){1,8000000}");
4783    UnicodeString s("xyz");
4784    uint32_t flags = 0;
4785    UErrorCode status = U_ZERO_ERROR;
4786
4787    RegexMatcher  matcher(pattern, s, flags, status);
4788    UBool result = false;
4789    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4790    REGEX_ASSERT(result == FALSE);
4791 }
4792
4793
4794//
4795//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
4799//
4800
4801struct callBackContext {
4802    RegexTest        *test;
4803    int32_t          maxCalls;
4804    int32_t          numCalls;
4805    int32_t          lastSteps;
4806    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4807};
4808
4809U_CDECL_BEGIN
4810static UBool U_CALLCONV
4811testCallBackFn(const void *context, int32_t steps) {
4812    callBackContext  *info = (callBackContext *)context;
4813    if (info->lastSteps+1 != steps) {
4814        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4815    }
4816    info->lastSteps = steps;
4817    info->numCalls++;
4818    return (info->numCalls < info->maxCalls);
4819}
4820U_CDECL_END
4821
void RegexTest::Callbacks() {
   // Two independent scenarios, each in its own scope with its own status and matcher:
   //   1. querying the match callback when none has been set;
   //   2. setting a callback, reading it back, and running matches of increasing cost.
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        //   (returnedContext is simply pointed at itself, returnedFn at the test callback.)
        const void          *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        callBackContext cbInfo = {this, 0, 0, 0};
        const void          *returnedContext;
        URegexMatchCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        // The getter must hand back exactly the function and context just installed.
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        //   (with a limit of 1, testCallBackFn would return FALSE on its very first call).
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        //   callback, but not so long that the callback aborts it.
        //   With a limit of 4, testCallBackFn returns FALSE on its fourth call.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        //   Expect the stopped-by-caller status and exactly 4 callback invocations.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);

        // A longer running find that the callback function will abort.
        //   Same input and expectations as above, but driven through find().
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}
4895
4896
4897//
4898//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4902//
4903
4904struct progressCallBackContext {
4905    RegexTest        *test;
4906    int64_t          lastIndex;
4907    int32_t          maxCalls;
4908    int32_t          numCalls;
4909    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4910};
4911
4912// call-back function for find().
4913// Return TRUE to continue the find().
4914// Return FALSE to stop the find().
4915U_CDECL_BEGIN
4916static UBool U_CALLCONV
4917testProgressCallBackFn(const void *context, int64_t matchIndex) {
4918    progressCallBackContext  *info = (progressCallBackContext *)context;
4919    info->numCalls++;
4920    info->lastIndex = matchIndex;
4921//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4922    return (info->numCalls < info->maxCalls);
4923}
4924U_CDECL_END
4925
void RegexTest::FindProgressCallbacks() {
   // Two independent scenarios, each in its own scope with its own status and matcher:
   //   1. querying the find-progress callback when none has been set;
   //   2. setting a callback, reading it back, and running find() on inputs of increasing cost.
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        //   (returnedContext is simply pointed at itself, returnedFn at the test callback.)
        const void                  *returnedContext = &returnedContext;
        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        progressCallBackContext cbInfo = {this, 0, 0, 0};
        const void                  *returnedContext;
        URegexFindProgressCallback  *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
        REGEX_CHECK_STATUS;
        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        // The getter must hand back exactly the function and context just installed.
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A find that matches on the initial position does NOT invoke the callback.
        status = U_ZERO_ERROR;
        cbInfo.reset(100);
        UnicodeString s = "aaxxx";
        matcher.reset(s);
#if 0
        matcher.setTrace(TRUE);
#endif
        REGEX_ASSERT(matcher.find(0, status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium running find() that causes matcher.find() to invoke our callback for each index,
        //   but not so many times that we interrupt the operation.
        status = U_ZERO_ERROR;
        s = "aaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
        //   The callback returns FALSE on call number (length - 5); expect exactly that many calls.
        status = U_ZERO_ERROR;
        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
        matcher.reset(s1);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

        // Now a match that will succeed, but after an interruption
        status = U_ZERO_ERROR;
        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
        matcher.reset(s2);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        // Now retry the match from where left off
        //   (cbInfo.lastIndex is the index passed to the callback invocation that stopped the find).
        cbInfo.maxCalls = 100; //  No callback limit
        status = U_ZERO_ERROR;
        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
        REGEX_CHECK_STATUS;
    }


}
5006
5007
5008//---------------------------------------------------------------------------
5009//
5010//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
5011//                             UTexts. The pure-C implementation of UText
5012//                             has no mutable backing stores, but we can
5013//                             use UnicodeString here to test the functionality.
5014//
5015//---------------------------------------------------------------------------
5016void RegexTest::PreAllocatedUTextCAPI () {
5017    UErrorCode           status = U_ZERO_ERROR;
5018    URegularExpression  *re;
5019    UText                patternText = UTEXT_INITIALIZER;
5020    UnicodeString        buffer;
5021    UText                bufferText = UTEXT_INITIALIZER;
5022
5023    utext_openUnicodeString(&bufferText, &buffer, &status);
5024
5025    /*
5026     *  getText() and getUText()
5027     */
5028    {
5029        UText  text1 = UTEXT_INITIALIZER;
5030        UText  text2 = UTEXT_INITIALIZER;
5031        UChar  text2Chars[20];
5032        UText  *resultText;
5033
5034        status = U_ZERO_ERROR;
5035        regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5036        regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5037        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5038        utext_openUChars(&text2, text2Chars, -1, &status);
5039
5040        regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5041        re = uregex_openUText(&patternText, 0, NULL, &status);
5042
5043        /* First set a UText */
5044        uregex_setUText(re, &text1, &status);
5045        resultText = uregex_getUText(re, &bufferText, &status);
5046        REGEX_CHECK_STATUS;
5047        REGEX_ASSERT(resultText == &bufferText);
5048        utext_setNativeIndex(resultText, 0);
5049        utext_setNativeIndex(&text1, 0);
5050        REGEX_ASSERT(testUTextEqual(resultText, &text1));
5051
5052        resultText = uregex_getUText(re, &bufferText, &status);
5053        REGEX_CHECK_STATUS;
5054        REGEX_ASSERT(resultText == &bufferText);
5055        utext_setNativeIndex(resultText, 0);
5056        utext_setNativeIndex(&text1, 0);
5057        REGEX_ASSERT(testUTextEqual(resultText, &text1));
5058
5059        /* Then set a UChar * */
5060        uregex_setText(re, text2Chars, 7, &status);
5061        resultText = uregex_getUText(re, &bufferText, &status);
5062        REGEX_CHECK_STATUS;
5063        REGEX_ASSERT(resultText == &bufferText);
5064        utext_setNativeIndex(resultText, 0);
5065        utext_setNativeIndex(&text2, 0);
5066        REGEX_ASSERT(testUTextEqual(resultText, &text2));
5067
5068        uregex_close(re);
5069        utext_close(&text1);
5070        utext_close(&text2);
5071    }
5072
5073    /*
5074     *  group()
5075     */
5076    {
5077        UChar    text1[80];
5078        UText   *actual;
5079        UBool    result;
5080        int64_t  length = 0;
5081
5082        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5083        //                  012345678901234567890123456789012345678901234567
5084        //                  0         1         2         3         4
5085
5086        status = U_ZERO_ERROR;
5087        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5088        REGEX_CHECK_STATUS;
5089
5090        uregex_setText(re, text1, -1, &status);
5091        result = uregex_find(re, 0, &status);
5092        REGEX_ASSERT(result==TRUE);
5093
5094        /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5095        status = U_ZERO_ERROR;
5096        actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5097        REGEX_CHECK_STATUS;
5098        REGEX_ASSERT(actual == &bufferText);
5099        REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5100        REGEX_ASSERT(length == 16);
5101        REGEX_ASSERT(utext_nativeLength(actual) == 47);
5102
5103        /*  Capture group #1.  Should succeed, matching " interior ". */
5104        status = U_ZERO_ERROR;
5105        actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5106        REGEX_CHECK_STATUS;
5107        REGEX_ASSERT(actual == &bufferText);
5108        REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5109        REGEX_ASSERT(length == 10);
5110        REGEX_ASSERT(utext_nativeLength(actual) == 47);
5111
5112        /*  Capture group out of range.  Error. */
5113        status = U_ZERO_ERROR;
5114        actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5115        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5116        REGEX_ASSERT(actual == &bufferText);
5117        uregex_close(re);
5118
5119    }
5120
5121    /*
5122     *  replaceFirst()
5123     */
5124    {
5125        UChar    text1[80];
5126        UChar    text2[80];
5127        UText    replText = UTEXT_INITIALIZER;
5128        UText   *result;
5129        status = U_ZERO_ERROR;
5130        utext_openUnicodeString(&bufferText, &buffer, &status);
5131
5132        status = U_ZERO_ERROR;
5133        u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5134        u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5135        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5136
5137        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5138        REGEX_CHECK_STATUS;
5139
5140        /*  Normal case, with match */
5141        uregex_setText(re, text1, -1, &status);
5142        REGEX_CHECK_STATUS;
5143        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5144        REGEX_CHECK_STATUS;
5145        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5146        REGEX_CHECK_STATUS;
5147        REGEX_ASSERT(result == &bufferText);
5148        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5149
5150        /* No match.  Text should copy to output with no changes.  */
5151        uregex_setText(re, text2, -1, &status);
5152        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5153        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5154        REGEX_CHECK_STATUS;
5155        REGEX_ASSERT(result == &bufferText);
5156        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5157
5158        /* Unicode escapes */
5159        uregex_setText(re, text1, -1, &status);
5160        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5161        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5162        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5163        REGEX_CHECK_STATUS;
5164        REGEX_ASSERT(result == &bufferText);
5165        REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5166
5167        uregex_close(re);
5168        utext_close(&replText);
5169    }
5170
5171
5172    /*
5173     *  replaceAll()
5174     */
5175    {
5176        UChar    text1[80];
5177        UChar    text2[80];
5178        UText    replText = UTEXT_INITIALIZER;
5179        UText   *result;
5180
5181        status = U_ZERO_ERROR;
5182        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5183        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5184        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5185
5186        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5187        REGEX_CHECK_STATUS;
5188
5189        /*  Normal case, with match */
5190        uregex_setText(re, text1, -1, &status);
5191        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5192        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5193        REGEX_CHECK_STATUS;
5194        REGEX_ASSERT(result == &bufferText);
5195        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5196
5197        /* No match.  Text should copy to output with no changes.  */
5198        uregex_setText(re, text2, -1, &status);
5199        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5200        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5201        REGEX_CHECK_STATUS;
5202        REGEX_ASSERT(result == &bufferText);
5203        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5204
5205        uregex_close(re);
5206        utext_close(&replText);
5207    }
5208
5209
5210    /*
5211     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5212     *   so we don't need to test it here.
5213     */
5214
5215    utext_close(&bufferText);
5216    utext_close(&patternText);
5217}
5218
5219
5220//--------------------------------------------------------------
5221//
5222//  NamedCapture   Check basic named capture group functionality
5223//
5224//--------------------------------------------------------------
5225void RegexTest::NamedCapture() {
5226    UErrorCode status = U_ZERO_ERROR;
5227    RegexPattern *pat = RegexPattern::compile(UnicodeString(
5228            "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5229    REGEX_CHECK_STATUS;
5230    int32_t group = pat->groupNumberFromName("five", -1, status);
5231    REGEX_CHECK_STATUS;
5232    REGEX_ASSERT(5 == group);
5233    group = pat->groupNumberFromName("three", -1, status);
5234    REGEX_CHECK_STATUS;
5235    REGEX_ASSERT(3 == group);
5236
5237    status = U_ZERO_ERROR;
5238    group = pat->groupNumberFromName(UnicodeString("six"), status);
5239    REGEX_CHECK_STATUS;
5240    REGEX_ASSERT(6 == group);
5241
5242    status = U_ZERO_ERROR;
5243    group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5244    U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5245
5246    status = U_ZERO_ERROR;
5247
5248    // After copying a pattern, named capture should still work in the copy.
5249    RegexPattern *copiedPat = new RegexPattern(*pat);
5250    REGEX_ASSERT(*copiedPat == *pat);
5251    delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5252
5253    group = copiedPat->groupNumberFromName("five", -1, status);
5254    REGEX_CHECK_STATUS;
5255    REGEX_ASSERT(5 == group);
5256    group = copiedPat->groupNumberFromName("three", -1, status);
5257    REGEX_CHECK_STATUS;
5258    REGEX_ASSERT(3 == group);
5259    delete copiedPat;
5260
5261    // ReplaceAll with named capture group.
5262    status = U_ZERO_ERROR;
5263    UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5264    RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5265    REGEX_CHECK_STATUS;
5266    // m.pattern().dumpPattern();
5267    UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5268    REGEX_CHECK_STATUS;
5269    REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5270    delete m;
5271
5272    // ReplaceAll, allowed capture group numbers.
5273    text = UnicodeString("abcmxyz");
5274    m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5275    REGEX_CHECK_STATUS;
5276
5277    status = U_ZERO_ERROR;
5278    replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5279    REGEX_CHECK_STATUS;
5280    REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5281
5282    status = U_ZERO_ERROR;
5283    replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5284    REGEX_CHECK_STATUS;
5285    REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5286
5287    status = U_ZERO_ERROR;
5288    replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5289    REGEX_CHECK_STATUS;
5290    REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5291
5292    status = U_ZERO_ERROR;
5293    replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5294    REGEX_CHECK_STATUS;
5295    REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5296
5297    status = U_ZERO_ERROR;
5298    replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5299    REGEX_CHECK_STATUS;
5300    REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5301
5302    status = U_ZERO_ERROR;
5303    replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5304    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5305
5306    status = U_ZERO_ERROR;
5307    replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5308    REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5309    REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5310
5311    status = U_ZERO_ERROR;
5312    replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5313    REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5314    REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5315
5316    status = U_ZERO_ERROR;
5317    replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5318    REGEX_CHECK_STATUS;
5319    REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5320
5321    status = U_ZERO_ERROR;
5322    replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5323    REGEX_CHECK_STATUS;
5324    REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5325
5326    status = U_ZERO_ERROR;
5327    replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5328    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5329
5330    status = U_ZERO_ERROR;
5331    replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5332    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5333
5334    status = U_ZERO_ERROR;
5335    replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5336    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5337
5338    status = U_ZERO_ERROR;
5339    replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5340    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5341
5342    delete m;
5343
5344    // Repeat the above replaceAll() tests using the plain C API, which
5345    //  has a separate implementation internally.
5346    //  TODO: factor out the test data.
5347
5348    status = U_ZERO_ERROR;
5349    URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5350    REGEX_CHECK_STATUS;
5351    text = UnicodeString("abcmxyz");
5352    uregex_setText(re, text.getBuffer(), text.length(), &status);
5353    REGEX_CHECK_STATUS;
5354
5355    UChar resultBuf[100];
5356    int32_t resultLength;
5357    UnicodeString repl;
5358
5359    status = U_ZERO_ERROR;
5360    repl = UnicodeString("<$0>");
5361    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5362    REGEX_CHECK_STATUS;
5363    REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5364
5365    status = U_ZERO_ERROR;
5366    repl = UnicodeString("<$1>");
5367    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5368    REGEX_CHECK_STATUS;
5369    REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5370
5371    status = U_ZERO_ERROR;
5372    repl = UnicodeString("<${one}>");
5373    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5374    REGEX_CHECK_STATUS;
5375    REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5376
5377    status = U_ZERO_ERROR;
5378    repl = UnicodeString("<$2>");
5379    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5380    REGEX_CHECK_STATUS;
5381    REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5382
5383    status = U_ZERO_ERROR;
5384    repl = UnicodeString("<$3>");
5385    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5386    REGEX_CHECK_STATUS;
5387    REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5388
5389    status = U_ZERO_ERROR;
5390    repl = UnicodeString("<$4>");
5391    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5392    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5393
5394    status = U_ZERO_ERROR;
5395    repl = UnicodeString("<$04>");
5396    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5397    REGEX_CHECK_STATUS;
5398    REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5399
5400    status = U_ZERO_ERROR;
5401    repl = UnicodeString("<$000016>");
5402    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5403    REGEX_CHECK_STATUS;
5404    REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5405
5406    status = U_ZERO_ERROR;
5407    repl = UnicodeString("<$3$2$1${one}>");
5408    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5409    REGEX_CHECK_STATUS;
5410    REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5411
5412    status = U_ZERO_ERROR;
5413    repl = UnicodeString("$3$2$1${one}");
5414    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5415    REGEX_CHECK_STATUS;
5416    REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5417
5418    status = U_ZERO_ERROR;
5419    repl = UnicodeString("<${noSuchName}>");
5420    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5421    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5422
5423    status = U_ZERO_ERROR;
5424    repl = UnicodeString("<${invalid-name}>");
5425    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5426    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5427
5428    status = U_ZERO_ERROR;
5429    repl = UnicodeString("<${one");
5430    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5431    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5432
5433    status = U_ZERO_ERROR;
5434    repl = UnicodeString("$not a capture group");
5435    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5436    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5437
5438    uregex_close(re);
5439}
5440
5441//--------------------------------------------------------------
5442//
5443//  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5444//                       The point is not so much what the exact limit is,
5445//                       but that a largish number doesn't hit bad non-linear performance,
5446//                       and that exceeding the limit fails cleanly.
5447//
5448//--------------------------------------------------------------
5449void RegexTest::NamedCaptureLimits() {
5450    if (quick) {
5451        logln("Skipping test. Runs in exhuastive mode only.");
5452        return;
5453    }
5454    const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5455    const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5456    char nnbuf[100];
5457    UnicodeString pattern;
5458    int32_t nn;
5459
5460    for (nn=1; nn<goodLimit; nn++) {
5461        sprintf(nnbuf, "(?<nn%d>)", nn);
5462        pattern.append(UnicodeString(nnbuf, -1, US_INV));
5463    }
5464    UErrorCode status = U_ZERO_ERROR;
5465    RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5466    REGEX_CHECK_STATUS;
5467    for (nn=1; nn<goodLimit; nn++) {
5468        sprintf(nnbuf, "nn%d", nn);
5469        int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5470        REGEX_ASSERT(nn == groupNum);
5471        if (nn != groupNum) {
5472            break;
5473        }
5474    }
5475    delete pat;
5476
5477    pattern.remove();
5478    for (nn=1; nn<failLimit; nn++) {
5479        sprintf(nnbuf, "(?<nn%d>)", nn);
5480        pattern.append(UnicodeString(nnbuf, -1, US_INV));
5481    }
5482    status = U_ZERO_ERROR;
5483    pat = RegexPattern::compile(pattern, 0, status);
5484    REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5485    delete pat;
5486}
5487
5488
5489//--------------------------------------------------------------
5490//
5491//  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5492//
5493//---------------------------------------------------------------
5494void RegexTest::Bug7651() {
5495    UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5496    //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5497    //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5498    UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5499    UnicodeString s("#ff @abcd This is test");
5500    RegexPattern  *REPattern = NULL;
5501    RegexMatcher  *REMatcher = NULL;
5502    UErrorCode status = U_ZERO_ERROR;
5503    UParseError pe;
5504
5505    REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5506    REGEX_CHECK_STATUS;
5507    REMatcher = REPattern->matcher(s, status);
5508    REGEX_CHECK_STATUS;
5509    REGEX_ASSERT(REMatcher->find());
5510    REGEX_ASSERT(REMatcher->start(status) == 0);
5511    delete REPattern;
5512    delete REMatcher;
5513    status = U_ZERO_ERROR;
5514
5515    REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5516    REGEX_CHECK_STATUS;
5517    REMatcher = REPattern->matcher(s, status);
5518    REGEX_CHECK_STATUS;
5519    REGEX_ASSERT(REMatcher->find());
5520    REGEX_ASSERT(REMatcher->start(status) == 0);
5521    delete REPattern;
5522    delete REMatcher;
5523    status = U_ZERO_ERROR;
5524 }
5525
5526void RegexTest::Bug7740() {
5527    UErrorCode status = U_ZERO_ERROR;
5528    UnicodeString pattern = "(a)";
5529    UnicodeString text = "abcdef";
5530    RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5531    REGEX_CHECK_STATUS;
5532    REGEX_ASSERT(m->lookingAt(status));
5533    REGEX_CHECK_STATUS;
5534    status = U_ILLEGAL_ARGUMENT_ERROR;
5535    UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5536    REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5537    REGEX_ASSERT(s == "");
5538    delete m;
5539}
5540
5541// Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5542
5543void RegexTest::Bug8479() {
5544    UErrorCode status = U_ZERO_ERROR;
5545
5546    RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5547    REGEX_CHECK_STATUS;
5548    if (U_SUCCESS(status))
5549    {
5550        UnicodeString str;
5551        str.setToBogus();
5552        pMatcher->reset(str);
5553        status = U_ZERO_ERROR;
5554        pMatcher->matches(status);
5555        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5556        delete pMatcher;
5557    }
5558}
5559
5560
5561// Bug 7029
5562void RegexTest::Bug7029() {
5563    UErrorCode status = U_ZERO_ERROR;
5564
5565    RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5566    UnicodeString text = "abc.def";
5567    UnicodeString splits[10];
5568    REGEX_CHECK_STATUS;
5569    int32_t numFields = pMatcher->split(text, splits, 10, status);
5570    REGEX_CHECK_STATUS;
5571    REGEX_ASSERT(numFields == 8);
5572    delete pMatcher;
5573}
5574
5575// Bug 9283
5576//   This test is checking for the existance of any supplemental characters that case-fold
5577//   to a bmp character.
5578//
5579//   At the time of this writing there are none. If any should appear in a subsequent release
5580//   of Unicode, the code in regular expressions compilation that determines the longest
5581//   posssible match for a literal string  will need to be enhanced.
5582//
5583//   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5584//   for details on what to do in case of a failure of this test.
5585//
5586void RegexTest::Bug9283() {
5587#if !UCONFIG_NO_NORMALIZATION
5588    UErrorCode status = U_ZERO_ERROR;
5589    UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5590    REGEX_CHECK_STATUS;
5591    int32_t index;
5592    UChar32 c;
5593    for (index=0; ; index++) {
5594        c = supplementalsWithCaseFolding.charAt(index);
5595        if (c == -1) {
5596            break;
5597        }
5598        UnicodeString cf = UnicodeString(c).foldCase();
5599        REGEX_ASSERT(cf.length() >= 2);
5600    }
5601#endif /* #if !UCONFIG_NO_NORMALIZATION */
5602}
5603
5604
5605void RegexTest::CheckInvBufSize() {
5606  if(inv_next>=INV_BUFSIZ) {
5607    errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5608          __FILE__, INV_BUFSIZ, inv_next);
5609  } else {
5610    logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5611  }
5612}
5613
5614
5615void RegexTest::Bug10459() {
5616    UErrorCode status = U_ZERO_ERROR;
5617    UnicodeString patternString("(txt)");
5618    UnicodeString txtString("txt");
5619
5620    UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5621    REGEX_CHECK_STATUS;
5622    UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5623    REGEX_CHECK_STATUS;
5624
5625    URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5626    REGEX_CHECK_STATUS;
5627
5628    uregex_setUText(icu_re, utext_txt, &status);
5629    REGEX_CHECK_STATUS;
5630
5631    // The bug was that calling uregex_group() before doing a matching operation
5632    //   was causing a segfault. Only for Regular Expressions created from UText.
5633    //   It should set an U_REGEX_INVALID_STATE.
5634
5635    UChar buf[100];
5636    int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5637    REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5638    REGEX_ASSERT(len == 0);
5639
5640    uregex_close(icu_re);
5641    utext_close(utext_pat);
5642    utext_close(utext_txt);
5643}
5644
5645void RegexTest::TestCaseInsensitiveStarters() {
5646    // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5647    //  become stale because of new Unicode characters.
5648    // If it is stale, rerun the generation tool
5649    //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5650    // and replace the embedded data in i18n/regexcmp.cpp
5651
5652    for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5653        if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5654            continue;
5655        }
5656        UnicodeSet s(cp, cp);
5657        s.closeOver(USET_CASE_INSENSITIVE);
5658        UnicodeSetIterator setIter(s);
5659        while (setIter.next()) {
5660            if (!setIter.isString()) {
5661                continue;
5662            }
5663            const UnicodeString &str = setIter.getString();
5664            UChar32 firstChar = str.char32At(0);
5665            UnicodeSet starters;
5666            RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5667            if (!starters.contains(cp)) {
5668                errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5669                return;
5670            }
5671        }
5672    }
5673}
5674
5675
5676void RegexTest::TestBug11049() {
5677    // Original bug report: pattern with match start consisting of one of several individual characters,
5678    //  and the text being matched ending with a supplementary character. find() would read past the
5679    //  end of the input text when searching for potential match starting points.
5680
5681    // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5682    // detect the bad read.
5683
5684    TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5685    TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5686
5687    // Test again with a pattern starting with a single character,
5688    // which takes a different code path than starting with an OR expression,
5689    // but with similar logic.
5690    TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5691    TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5692}
5693
5694// Run a single test case from TestBug11049(). Internal function.
5695void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5696    UErrorCode status = U_ZERO_ERROR;
5697    UnicodeString patternString = UnicodeString(pattern).unescape();
5698    LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5699
5700    UnicodeString dataString = UnicodeString(data).unescape();
5701    UChar *exactBuffer = new UChar[dataString.length()];
5702    dataString.extract(exactBuffer, dataString.length(), status);
5703    UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5704
5705    LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5706    REGEX_CHECK_STATUS;
5707    matcher->reset(ut);
5708    UBool result = matcher->find();
5709    if (result != expectMatch) {
5710        errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5711              __FILE__, lineNumber, expectMatch, result, pattern, data);
5712    }
5713
5714    // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5715    //   off-by-one on find() with match at the last code point.
5716    //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5717    //   because string.unescape() will only shrink it.
5718    char * utf8Buffer = new char[uprv_strlen(data)+1];
5719    u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5720    REGEX_CHECK_STATUS;
5721    ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5722    REGEX_CHECK_STATUS;
5723    matcher->reset(ut);
5724    result = matcher->find();
5725    if (result != expectMatch) {
5726        errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5727              __FILE__, lineNumber, expectMatch, result, pattern, data);
5728    }
5729    delete [] utf8Buffer;
5730
5731    utext_close(ut);
5732    delete [] exactBuffer;
5733}
5734
5735
5736void RegexTest::TestBug11371() {
5737    if (quick) {
5738        logln("Skipping test. Runs in exhuastive mode only.");
5739        return;
5740    }
5741    UErrorCode status = U_ZERO_ERROR;
5742    UnicodeString patternString;
5743
5744    for (int i=0; i<8000000; i++) {
5745        patternString.append(UnicodeString("()"));
5746    }
5747    LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5748    if (status != U_REGEX_PATTERN_TOO_BIG) {
5749        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5750              __FILE__, __LINE__, u_errorName(status));
5751    }
5752
5753    status = U_ZERO_ERROR;
5754    patternString = "(";
5755    for (int i=0; i<20000000; i++) {
5756        patternString.append(UnicodeString("A++"));
5757    }
5758    patternString.append(UnicodeString("){0}B++"));
5759    LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5760    if (status != U_REGEX_PATTERN_TOO_BIG) {
5761        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5762              __FILE__, __LINE__, u_errorName(status));
5763    }
5764
5765    // Pattern with too much string data, such that string indexes overflow operand data field size
5766    // in compiled instruction.
5767    status = U_ZERO_ERROR;
5768    patternString = "";
5769    while (patternString.length() < 0x00ffffff) {
5770        patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5771    }
5772    patternString.append(UnicodeString("X? trailing string"));
5773    LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5774    if (status != U_REGEX_PATTERN_TOO_BIG) {
5775        errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5776              __FILE__, __LINE__, u_errorName(status));
5777    }
5778}
5779
5780void RegexTest::TestBug11480() {
5781    // C API, get capture group of a group that does not participate in the match.
5782    //        (Returns a zero length string, with nul termination,
5783    //         indistinguishable from a group with a zero lenght match.)
5784
5785    UErrorCode status = U_ZERO_ERROR;
5786    URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5787    REGEX_CHECK_STATUS;
5788    UnicodeString text = UNICODE_STRING_SIMPLE("A");
5789    uregex_setText(re, text.getBuffer(), text.length(), &status);
5790    REGEX_CHECK_STATUS;
5791    REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5792    UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5793    int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5794    REGEX_ASSERT(length == 0);
5795    REGEX_ASSERT(buf[0] == 13);
5796    REGEX_ASSERT(buf[1] == 0);
5797    REGEX_ASSERT(buf[2] == 13);
5798    uregex_close(re);
5799}
5800
5801
5802#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5803