1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2013, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8//   regextst.cpp
9//
10//      ICU Regular Expressions test, part of intltest.
11//
12
13/*
14     NOTE!!
15
16     PLEASE be careful about ASCII assumptions in this test.
17     This test is one of the worst repeat offenders.
18     If you have questions, contact someone on the ICU PMC
19     who has access to an EBCDIC system.
20
21 */
22
23#include "intltest.h"
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26#include "unicode/regex.h"
27#include "unicode/uchar.h"
28#include "unicode/ucnv.h"
29#include "unicode/uniset.h"
30#include "unicode/ustring.h"
31#include "regextst.h"
32#include "uvector.h"
33#include "util.h"
34#include <stdlib.h>
35#include <string.h>
36#include <stdio.h>
37#include "cstring.h"
38#include "uinvchar.h"
39
40#define SUPPORT_MUTATING_INPUT_STRING   0
41
42//---------------------------------------------------------------------------
43//
44//  Test class boilerplate
45//
46//---------------------------------------------------------------------------
47RegexTest::RegexTest()
48{
49}
50
51
52RegexTest::~RegexTest()
53{
54}
55
56
57
58void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
59{
60    if (exec) logln("TestSuite RegexTest: ");
61    switch (index) {
62
63        case 0: name = "Basic";
64            if (exec) Basic();
65            break;
66        case 1: name = "API_Match";
67            if (exec) API_Match();
68            break;
69        case 2: name = "API_Replace";
70            if (exec) API_Replace();
71            break;
72        case 3: name = "API_Pattern";
73            if (exec) API_Pattern();
74            break;
75        case 4:
76#if !UCONFIG_NO_FILE_IO
77            name = "Extended";
78            if (exec) Extended();
79#else
80            name = "skip";
81#endif
82            break;
83        case 5: name = "Errors";
84            if (exec) Errors();
85            break;
86        case 6: name = "PerlTests";
87            if (exec) PerlTests();
88            break;
89        case 7: name = "Callbacks";
90            if (exec) Callbacks();
91            break;
92        case 8: name = "FindProgressCallbacks";
93            if (exec) FindProgressCallbacks();
94            break;
95        case 9: name = "Bug 6149";
96             if (exec) Bug6149();
97             break;
98        case 10: name = "UTextBasic";
99          if (exec) UTextBasic();
100          break;
101        case 11: name = "API_Match_UTF8";
102          if (exec) API_Match_UTF8();
103          break;
104        case 12: name = "API_Replace_UTF8";
105          if (exec) API_Replace_UTF8();
106          break;
107        case 13: name = "API_Pattern_UTF8";
108          if (exec) API_Pattern_UTF8();
109          break;
110        case 14: name = "PerlTestsUTF8";
111          if (exec) PerlTestsUTF8();
112          break;
113        case 15: name = "PreAllocatedUTextCAPI";
114          if (exec) PreAllocatedUTextCAPI();
115          break;
116        case 16: name = "Bug 7651";
117             if (exec) Bug7651();
118             break;
119        case 17: name = "Bug 7740";
120            if (exec) Bug7740();
121            break;
122        case 18: name = "Bug 8479";
123            if (exec) Bug8479();
124            break;
125        case 19: name = "Bug 7029";
126            if (exec) Bug7029();
127            break;
128        case 20: name = "CheckInvBufSize";
129            if (exec) CheckInvBufSize();
130            break;
131        case 21: name = "Bug 9283";
132            if (exec) Bug9283();
133            break;
134
135        default: name = "";
136            break; //needed to end loop
137    }
138}
139
140
141
142/**
143 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
144 * into ASCII.
145 * @see utext_openUTF8
146 */
147static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
148
149//---------------------------------------------------------------------------
150//
151//   Error Checking / Reporting macros used in all of the tests.
152//
153//---------------------------------------------------------------------------
154
155static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
156  int64_t oldIndex = utext_getNativeIndex(text);
157  utext_setNativeIndex(text, 0);
158  char *bufPtr = buf;
159  UChar32 c = utext_next32From(text, 0);
160  while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
161    if (0x000020<=c && c<0x00007e) {
162      *bufPtr = c;
163    } else {
164#if 0
165      sprintf(bufPtr,"U+%04X", c);
166      bufPtr+= strlen(bufPtr)-1;
167#else
168      *bufPtr = '%';
169#endif
170    }
171    bufPtr++;
172    c = UTEXT_NEXT32(text);
173  }
174  *bufPtr = 0;
175#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
176  char *ebuf = (char*)malloc(bufLen);
177  uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
178  uprv_strncpy(buf, ebuf, bufLen);
179  free((void*)ebuf);
180#endif
181  utext_setNativeIndex(text, oldIndex);
182}
183
184
185static char ASSERT_BUF[1024];
186
187const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
188  if(message.length()==0) {
189    strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
190  } else {
191    UnicodeString buf;
192    IntlTest::prettify(message,buf);
193    if(buf.length()==0) {
194      strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
195    } else {
196      buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
197      if(ASSERT_BUF[0]==0) {
198        ASSERT_BUF[0]=0;
199        for(int32_t i=0;i<buf.length();i++) {
200          UChar ch = buf[i];
201          sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
202        }
203      }
204    }
205  }
206  ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
207  return ASSERT_BUF;
208}
209
210
// Logs a printable rendering (at most 199 chars) of a UText, tagged with the
// source location and the text of the argument expression.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// Reports a data error and RETURNS from the enclosing function when the local
// variable 'status' holds a failure code.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Reports an error when 'expr' compares equal to FALSE.  Does not return.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
}

// Evaluates 'expr' against a fresh local 'status' (which hides any outer
// variable of that name) and reports a data error unless the resulting
// status equals 'errcode'.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS, but also reports a caller-supplied line number and
// does NOT return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT, but reports a caller-supplied line number and RETURNS
// from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}

// Reports an error unless 'ustr' compares equal to 'inv'.
#define REGEX_ASSERT_UNISTR(ustr,inv) { \
    if (!(ustr==inv)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv); \
    } \
}
229
230
231static UBool testUTextEqual(UText *uta, UText *utb) {
232    UChar32 ca = 0;
233    UChar32 cb = 0;
234    utext_setNativeIndex(uta, 0);
235    utext_setNativeIndex(utb, 0);
236    do {
237        ca = utext_next32(uta);
238        cb = utext_next32(utb);
239        if (ca != cb) {
240            break;
241        }
242    } while (ca != U_SENTINEL);
243    return ca == cb;
244}
245
246
247/**
248 * @param expected expected text in UTF-8 (not platform) codepage
249 */
250void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
251    UErrorCode status = U_ZERO_ERROR;
252    UText expectedText = UTEXT_INITIALIZER;
253    utext_openUTF8(&expectedText, expected, -1, &status);
254    if(U_FAILURE(status)) {
255      errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
256      return;
257    }
258    if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
259      errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
260      return;
261    }
262    utext_setNativeIndex(actual, 0);
263    if (!testUTextEqual(&expectedText, actual)) {
264        char buf[201 /*21*/];
265        char expectedBuf[201];
266        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
267        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
268        errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
269    }
270    utext_close(&expectedText);
271}
272/**
273 * @param expected invariant (platform local text) input
274 */
275
276void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
277    UErrorCode status = U_ZERO_ERROR;
278    UText expectedText = UTEXT_INITIALIZER;
279    regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
280    if(U_FAILURE(status)) {
281      errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
282      return;
283    }
284    utext_setNativeIndex(actual, 0);
285    if (!testUTextEqual(&expectedText, actual)) {
286        char buf[201 /*21*/];
287        char expectedBuf[201];
288        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
289        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
290        errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
291    }
292    utext_close(&expectedText);
293}
294
/**
 * Checks a UText against an expected string given as UTF-8.
 * Forwards to assertUText() with the current file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Checks a UText against an expected string given as invariant characters.
 * Forwards to assertUTextInvariant() with the current file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
303
304/**
305 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
306 * passed into utext_openUTF8. An error will be given if
307 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
308 */
309
310#define INV_BUFSIZ 2048 /* increase this if too small */
311
312static int64_t inv_next=0;
313
314#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
315static char inv_buf[INV_BUFSIZ];
316#endif
317
318static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
319  if(length==-1) length=strlen(inv);
320#if U_CHARSET_FAMILY==U_ASCII_FAMILY
321  inv_next+=length;
322  return utext_openUTF8(ut, inv, length, status);
323#else
324  if(inv_next+length+1>INV_BUFSIZ) {
325    fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
326            __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
327    *status = U_MEMORY_ALLOCATION_ERROR;
328    return NULL;
329  }
330
331  unsigned char *buf = (unsigned char*)inv_buf+inv_next;
332  uprv_aestrncpy(buf, (const uint8_t*)inv, length);
333  inv_next+=length;
334
335#if 0
336  fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
337#endif
338
339  return utext_openUTF8(ut, (const char*)buf, length, status);
340#endif
341}
342
343
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Shorthand for a quick lookingAt() / matches() check.
//                       Each use runs the check twice: once through
//                       doRegexLMTest() and once through doRegexLMTestUTF8(),
//                       both tagged with the line number of the call site.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
}
359
360UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
361    const UnicodeString pattern(pat, -1, US_INV);
362    const UnicodeString inputText(text, -1, US_INV);
363    UErrorCode          status  = U_ZERO_ERROR;
364    UParseError         pe;
365    RegexPattern        *REPattern = NULL;
366    RegexMatcher        *REMatcher = NULL;
367    UBool               retVal     = TRUE;
368
369    UnicodeString patString(pat, -1, US_INV);
370    REPattern = RegexPattern::compile(patString, 0, pe, status);
371    if (U_FAILURE(status)) {
372        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
373            line, u_errorName(status));
374        return FALSE;
375    }
376    if (line==376) { RegexPatternDump(REPattern);}
377
378    UnicodeString inputString(inputText);
379    UnicodeString unEscapedInput = inputString.unescape();
380    REMatcher = REPattern->matcher(unEscapedInput, status);
381    if (U_FAILURE(status)) {
382        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
383            line, u_errorName(status));
384        return FALSE;
385    }
386
387    UBool actualmatch;
388    actualmatch = REMatcher->lookingAt(status);
389    if (U_FAILURE(status)) {
390        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
391            line, u_errorName(status));
392        retVal =  FALSE;
393    }
394    if (actualmatch != looking) {
395        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
396        retVal = FALSE;
397    }
398
399    status = U_ZERO_ERROR;
400    actualmatch = REMatcher->matches(status);
401    if (U_FAILURE(status)) {
402        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
403            line, u_errorName(status));
404        retVal = FALSE;
405    }
406    if (actualmatch != match) {
407        errln("RegexTest: wrong return from matches() at line %d.\n", line);
408        retVal = FALSE;
409    }
410
411    if (retVal == FALSE) {
412        RegexPatternDump(REPattern);
413    }
414
415    delete REPattern;
416    delete REMatcher;
417    return retVal;
418}
419
420
421UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
422    UText               pattern    = UTEXT_INITIALIZER;
423    int32_t             inputUTF8Length;
424    char                *textChars = NULL;
425    UText               inputText  = UTEXT_INITIALIZER;
426    UErrorCode          status     = U_ZERO_ERROR;
427    UParseError         pe;
428    RegexPattern        *REPattern = NULL;
429    RegexMatcher        *REMatcher = NULL;
430    UBool               retVal     = TRUE;
431
432    regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
433    REPattern = RegexPattern::compile(&pattern, 0, pe, status);
434    if (U_FAILURE(status)) {
435        dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
436            line, u_errorName(status));
437        return FALSE;
438    }
439
440    UnicodeString inputString(text, -1, US_INV);
441    UnicodeString unEscapedInput = inputString.unescape();
442    LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
443    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
444
445    inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
446    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
447        // UTF-8 does not allow unpaired surrogates, so this could actually happen
448        logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
449        return TRUE; // not a failure of the Regex engine
450    }
451    status = U_ZERO_ERROR; // buffer overflow
452    textChars = new char[inputUTF8Length+1];
453    unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
454    utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
455
456    REMatcher = &REPattern->matcher(status)->reset(&inputText);
457    if (U_FAILURE(status)) {
458        errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
459            line, u_errorName(status));
460        return FALSE;
461    }
462
463    UBool actualmatch;
464    actualmatch = REMatcher->lookingAt(status);
465    if (U_FAILURE(status)) {
466        errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
467            line, u_errorName(status));
468        retVal =  FALSE;
469    }
470    if (actualmatch != looking) {
471        errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
472        retVal = FALSE;
473    }
474
475    status = U_ZERO_ERROR;
476    actualmatch = REMatcher->matches(status);
477    if (U_FAILURE(status)) {
478        errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
479            line, u_errorName(status));
480        retVal = FALSE;
481    }
482    if (actualmatch != match) {
483        errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
484        retVal = FALSE;
485    }
486
487    if (retVal == FALSE) {
488        RegexPatternDump(REPattern);
489    }
490
491    delete REPattern;
492    delete REMatcher;
493    utext_close(&inputText);
494    utext_close(&pattern);
495    delete[] textChars;
496    return retVal;
497}
498
499
500
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Shorthand for testing how an incorrect pattern is
//                    reported.  Forwards to regex_err(), adding the line
//                    number of the call site.
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       Note: the expansion already ends in a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) \
    regex_err(pat, line, col, status, __LINE__);
511
512void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
513                          UErrorCode expectedStatus, int32_t line) {
514    UnicodeString       pattern(pat);
515
516    UErrorCode          status         = U_ZERO_ERROR;
517    UParseError         pe;
518    RegexPattern        *callerPattern = NULL;
519
520    //
521    //  Compile the caller's pattern
522    //
523    UnicodeString patString(pat);
524    callerPattern = RegexPattern::compile(patString, 0, pe, status);
525    if (status != expectedStatus) {
526        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
527    } else {
528        if (status != U_ZERO_ERROR) {
529            if (pe.line != errLine || pe.offset != errCol) {
530                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
531                    line, errLine, errCol, pe.line, pe.offset);
532            }
533        }
534    }
535
536    delete callerPattern;
537
538    //
539    //  Compile again, using a UTF-8-based UText
540    //
541    UText patternText = UTEXT_INITIALIZER;
542    regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
543    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
544    if (status != expectedStatus) {
545        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
546    } else {
547        if (status != U_ZERO_ERROR) {
548            if (pe.line != errLine || pe.offset != errCol) {
549                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
550                    line, errLine, errCol, pe.line, pe.offset);
551            }
552        }
553    }
554
555    delete callerPattern;
556    utext_close(&patternText);
557}
558
559
560
561//---------------------------------------------------------------------------
562//
563//      Basic      Check for basic functionality of regex pattern matching.
564//                 Avoid the use of REGEX_FIND test macro, which has
565//                 substantial dependencies on basic Regex functionality.
566//
567//---------------------------------------------------------------------------
568void RegexTest::Basic() {
569
570
571//
572// Debug - slide failing test cases early
573//
574#if 0
575    {
576        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
577        UParseError pe;
578        UErrorCode  status = U_ZERO_ERROR;
579        RegexPattern *pattern;
580        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
581        RegexPatternDump(pattern);
582        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
583        UBool result = m->find();
584        printf("result = %d\n", result);
585        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
586        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
587    }
588    exit(1);
589#endif
590
591
592    //
593    // Pattern with parentheses
594    //
595    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
596    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
597    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
598
599    //
600    // Patterns with *
601    //
602    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
603    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
604    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
605    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
606    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
607
608    REGEX_TESTLM("a*", "",  TRUE, TRUE);
609    REGEX_TESTLM("a*", "b", TRUE, FALSE);
610
611
612    //
613    //  Patterns with "."
614    //
615    REGEX_TESTLM(".", "abc", TRUE, FALSE);
616    REGEX_TESTLM("...", "abc", TRUE, TRUE);
617    REGEX_TESTLM("....", "abc", FALSE, FALSE);
618    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
619    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
620    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
621    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
622    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
623
624    //
625    //  Patterns with * applied to chars at end of literal string
626    //
627    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
628    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
629
630    //
631    //  Supplemental chars match as single chars, not a pair of surrogates.
632    //
633    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
634    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
635    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
636
637
638    //
639    //  UnicodeSets in the pattern
640    //
641    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
642    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
643    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
644    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
645    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
646    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
647
648    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
649    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
650    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
651    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
652    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
653
654    //
655    //   OR operator in patterns
656    //
657    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
658    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
659    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
660    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
661
662    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
663    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
664    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
665    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
666    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
667    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
668
669    //
670    //  +
671    //
672    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
673    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
674    REGEX_TESTLM("b+", "", FALSE, FALSE);
675    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
676    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
677    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
678
679    //
680    //   ?
681    //
682    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
683    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
684    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
685    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
686    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
687    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
688    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
689    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
690    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
691
692    //
693    //  Escape sequences that become single literal chars, handled internally
694    //   by ICU's Unescape.
695    //
696
697    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
698    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
699    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
700    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
701    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
702    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
703    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
704    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
705    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
706    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
707
708    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
709    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
710
711    // Escape of special chars in patterns
712    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
713}
714
715
716//---------------------------------------------------------------------------
717//
718//    UTextBasic   Check for quirks that are specific to the UText
719//                 implementation.
720//
721//---------------------------------------------------------------------------
722void RegexTest::UTextBasic() {
723    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
724    UErrorCode status = U_ZERO_ERROR;
725    UText pattern = UTEXT_INITIALIZER;
726    utext_openUTF8(&pattern, str_abc, -1, &status);
727    RegexMatcher matcher(&pattern, 0, status);
728    REGEX_CHECK_STATUS;
729
730    UText input = UTEXT_INITIALIZER;
731    utext_openUTF8(&input, str_abc, -1, &status);
732    REGEX_CHECK_STATUS;
733    matcher.reset(&input);
734    REGEX_CHECK_STATUS;
735    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
736
737    matcher.reset(matcher.inputText());
738    REGEX_CHECK_STATUS;
739    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
740
741    utext_close(&pattern);
742    utext_close(&input);
743}
744
745
746//---------------------------------------------------------------------------
747//
748//      API_Match   Test that the API for class RegexMatcher
749//                  is present and nominally working, but excluding functions
750//                  implementing replace operations.
751//
752//---------------------------------------------------------------------------
753void RegexTest::API_Match() {
754    UParseError         pe;
755    UErrorCode          status=U_ZERO_ERROR;
756    int32_t             flags = 0;
757
758    //
759    // Debug - slide failing test cases early
760    //
761#if 0
762    {
763    }
764    return;
765#endif
766
767    //
768    // Simple pattern compilation
769    //
770    {
771        UnicodeString       re("abc");
772        RegexPattern        *pat2;
773        pat2 = RegexPattern::compile(re, flags, pe, status);
774        REGEX_CHECK_STATUS;
775
776        UnicodeString inStr1 = "abcdef this is a test";
777        UnicodeString instr2 = "not abc";
778        UnicodeString empty  = "";
779
780
781        //
782        // Matcher creation and reset.
783        //
784        RegexMatcher *m1 = pat2->matcher(inStr1, status);
785        REGEX_CHECK_STATUS;
786        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
787        REGEX_ASSERT(m1->input() == inStr1);
788        m1->reset(instr2);
789        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
790        REGEX_ASSERT(m1->input() == instr2);
791        m1->reset(inStr1);
792        REGEX_ASSERT(m1->input() == inStr1);
793        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
794        m1->reset(empty);
795        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
796        REGEX_ASSERT(m1->input() == empty);
797        REGEX_ASSERT(&m1->pattern() == pat2);
798
799        //
800        //  reset(pos, status)
801        //
802        m1->reset(inStr1);
803        m1->reset(4, status);
804        REGEX_CHECK_STATUS;
805        REGEX_ASSERT(m1->input() == inStr1);
806        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
807
808        m1->reset(-1, status);
809        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
810        status = U_ZERO_ERROR;
811
812        m1->reset(0, status);
813        REGEX_CHECK_STATUS;
814        status = U_ZERO_ERROR;
815
816        int32_t len = m1->input().length();
817        m1->reset(len-1, status);
818        REGEX_CHECK_STATUS;
819        status = U_ZERO_ERROR;
820
821        m1->reset(len, status);
822        REGEX_CHECK_STATUS;
823        status = U_ZERO_ERROR;
824
825        m1->reset(len+1, status);
826        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
827        status = U_ZERO_ERROR;
828
829        //
830        // match(pos, status)
831        //
832        m1->reset(instr2);
833        REGEX_ASSERT(m1->matches(4, status) == TRUE);
834        m1->reset();
835        REGEX_ASSERT(m1->matches(3, status) == FALSE);
836        m1->reset();
837        REGEX_ASSERT(m1->matches(5, status) == FALSE);
838        REGEX_ASSERT(m1->matches(4, status) == TRUE);
839        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
840        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
841
842        // Match() at end of string should fail, but should not
843        //  be an error.
844        status = U_ZERO_ERROR;
845        len = m1->input().length();
846        REGEX_ASSERT(m1->matches(len, status) == FALSE);
847        REGEX_CHECK_STATUS;
848
849        // Match beyond end of string should fail with an error.
850        status = U_ZERO_ERROR;
851        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
852        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
853
854        // Successful match at end of string.
855        {
856            status = U_ZERO_ERROR;
857            RegexMatcher m("A?", 0, status);  // will match zero length string.
858            REGEX_CHECK_STATUS;
859            m.reset(inStr1);
860            len = inStr1.length();
861            REGEX_ASSERT(m.matches(len, status) == TRUE);
862            REGEX_CHECK_STATUS;
863            m.reset(empty);
864            REGEX_ASSERT(m.matches(0, status) == TRUE);
865            REGEX_CHECK_STATUS;
866        }
867
868
869        //
870        // lookingAt(pos, status)
871        //
872        status = U_ZERO_ERROR;
873        m1->reset(instr2);  // "not abc"
874        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
875        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
876        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
877        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
878        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
879        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
880        status = U_ZERO_ERROR;
881        len = m1->input().length();
882        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
883        REGEX_CHECK_STATUS;
884        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
885        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
886
887        delete m1;
888        delete pat2;
889    }
890
891
892    //
893    // Capture Group.
894    //     RegexMatcher::start();
895    //     RegexMatcher::end();
896    //     RegexMatcher::groupCount();
897    //
898    {
899        int32_t             flags=0;
900        UParseError         pe;
901        UErrorCode          status=U_ZERO_ERROR;
902
903        UnicodeString       re("01(23(45)67)(.*)");
904        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
905        REGEX_CHECK_STATUS;
906        UnicodeString data = "0123456789";
907
908        RegexMatcher *matcher = pat->matcher(data, status);
909        REGEX_CHECK_STATUS;
910        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
911        static const int32_t matchStarts[] = {0,  2, 4, 8};
912        static const int32_t matchEnds[]   = {10, 8, 6, 10};
913        int32_t i;
914        for (i=0; i<4; i++) {
915            int32_t actualStart = matcher->start(i, status);
916            REGEX_CHECK_STATUS;
917            if (actualStart != matchStarts[i]) {
918                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
919                    __LINE__, i, matchStarts[i], actualStart);
920            }
921            int32_t actualEnd = matcher->end(i, status);
922            REGEX_CHECK_STATUS;
923            if (actualEnd != matchEnds[i]) {
924                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
925                    __LINE__, i, matchEnds[i], actualEnd);
926            }
927        }
928
929        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
930        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
931
932        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
933        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
934        matcher->reset();
935        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
936
937        matcher->lookingAt(status);
938        REGEX_ASSERT(matcher->group(status)    == "0123456789");
939        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
940        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
941        REGEX_ASSERT(matcher->group(2, status) == "45"        );
942        REGEX_ASSERT(matcher->group(3, status) == "89"        );
943        REGEX_CHECK_STATUS;
944        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
945        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
946        matcher->reset();
947        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
948
949        delete matcher;
950        delete pat;
951
952    }
953
954    //
955    //  find
956    //
957    {
958        int32_t             flags=0;
959        UParseError         pe;
960        UErrorCode          status=U_ZERO_ERROR;
961
962        UnicodeString       re("abc");
963        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
964        REGEX_CHECK_STATUS;
965        UnicodeString data = ".abc..abc...abc..";
966        //                    012345678901234567
967
968        RegexMatcher *matcher = pat->matcher(data, status);
969        REGEX_CHECK_STATUS;
970        REGEX_ASSERT(matcher->find());
971        REGEX_ASSERT(matcher->start(status) == 1);
972        REGEX_ASSERT(matcher->find());
973        REGEX_ASSERT(matcher->start(status) == 6);
974        REGEX_ASSERT(matcher->find());
975        REGEX_ASSERT(matcher->start(status) == 12);
976        REGEX_ASSERT(matcher->find() == FALSE);
977        REGEX_ASSERT(matcher->find() == FALSE);
978
979        matcher->reset();
980        REGEX_ASSERT(matcher->find());
981        REGEX_ASSERT(matcher->start(status) == 1);
982
983        REGEX_ASSERT(matcher->find(0, status));
984        REGEX_ASSERT(matcher->start(status) == 1);
985        REGEX_ASSERT(matcher->find(1, status));
986        REGEX_ASSERT(matcher->start(status) == 1);
987        REGEX_ASSERT(matcher->find(2, status));
988        REGEX_ASSERT(matcher->start(status) == 6);
989        REGEX_ASSERT(matcher->find(12, status));
990        REGEX_ASSERT(matcher->start(status) == 12);
991        REGEX_ASSERT(matcher->find(13, status) == FALSE);
992        REGEX_ASSERT(matcher->find(16, status) == FALSE);
993        REGEX_ASSERT(matcher->find(17, status) == FALSE);
994        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
995
996        status = U_ZERO_ERROR;
997        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
998        status = U_ZERO_ERROR;
999        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1000
1001        REGEX_ASSERT(matcher->groupCount() == 0);
1002
1003        delete matcher;
1004        delete pat;
1005    }
1006
1007
1008    //
1009    //  find, with \G in pattern (true if at the end of a previous match).
1010    //
1011    {
1012        int32_t             flags=0;
1013        UParseError         pe;
1014        UErrorCode          status=U_ZERO_ERROR;
1015
1016        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1017        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1018        REGEX_CHECK_STATUS;
1019        UnicodeString data = ".abcabc.abc..";
1020        //                    012345678901234567
1021
1022        RegexMatcher *matcher = pat->matcher(data, status);
1023        REGEX_CHECK_STATUS;
1024        REGEX_ASSERT(matcher->find());
1025        REGEX_ASSERT(matcher->start(status) == 0);
1026        REGEX_ASSERT(matcher->start(1, status) == -1);
1027        REGEX_ASSERT(matcher->start(2, status) == 1);
1028
1029        REGEX_ASSERT(matcher->find());
1030        REGEX_ASSERT(matcher->start(status) == 4);
1031        REGEX_ASSERT(matcher->start(1, status) == 4);
1032        REGEX_ASSERT(matcher->start(2, status) == -1);
1033        REGEX_CHECK_STATUS;
1034
1035        delete matcher;
1036        delete pat;
1037    }
1038
1039    //
1040    //   find with zero length matches, match position should bump ahead
1041    //     to prevent loops.
1042    //
1043    {
1044        int32_t                 i;
1045        UErrorCode          status=U_ZERO_ERROR;
1046        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1047                                                      //   using an always-true look-ahead.
1048        REGEX_CHECK_STATUS;
1049        UnicodeString s("    ");
1050        m.reset(s);
1051        for (i=0; ; i++) {
1052            if (m.find() == FALSE) {
1053                break;
1054            }
1055            REGEX_ASSERT(m.start(status) == i);
1056            REGEX_ASSERT(m.end(status) == i);
1057        }
1058        REGEX_ASSERT(i==5);
1059
1060        // Check that the bump goes over surrogate pairs OK
1061        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1062        s = s.unescape();
1063        m.reset(s);
1064        for (i=0; ; i+=2) {
1065            if (m.find() == FALSE) {
1066                break;
1067            }
1068            REGEX_ASSERT(m.start(status) == i);
1069            REGEX_ASSERT(m.end(status) == i);
1070        }
1071        REGEX_ASSERT(i==10);
1072    }
1073    {
1074        // find() loop breaking test.
1075        //        with pattern of /.?/, should see a series of one char matches, then a single
1076        //        match of zero length at the end of the input string.
1077        int32_t                 i;
1078        UErrorCode          status=U_ZERO_ERROR;
1079        RegexMatcher        m(".?", 0, status);
1080        REGEX_CHECK_STATUS;
1081        UnicodeString s("    ");
1082        m.reset(s);
1083        for (i=0; ; i++) {
1084            if (m.find() == FALSE) {
1085                break;
1086            }
1087            REGEX_ASSERT(m.start(status) == i);
1088            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1089        }
1090        REGEX_ASSERT(i==5);
1091    }
1092
1093
1094    //
1095    // Matchers with no input string behave as if they had an empty input string.
1096    //
1097
1098    {
1099        UErrorCode status = U_ZERO_ERROR;
1100        RegexMatcher  m(".?", 0, status);
1101        REGEX_CHECK_STATUS;
1102        REGEX_ASSERT(m.find());
1103        REGEX_ASSERT(m.start(status) == 0);
1104        REGEX_ASSERT(m.input() == "");
1105    }
1106    {
1107        UErrorCode status = U_ZERO_ERROR;
1108        RegexPattern  *p = RegexPattern::compile(".", 0, status);
1109        RegexMatcher  *m = p->matcher(status);
1110        REGEX_CHECK_STATUS;
1111
1112        REGEX_ASSERT(m->find() == FALSE);
1113        REGEX_ASSERT(m->input() == "");
1114        delete m;
1115        delete p;
1116    }
1117
1118    //
1119    // Regions
1120    //
1121    {
1122        UErrorCode status = U_ZERO_ERROR;
1123        UnicodeString testString("This is test data");
1124        RegexMatcher m(".*", testString,  0, status);
1125        REGEX_CHECK_STATUS;
1126        REGEX_ASSERT(m.regionStart() == 0);
1127        REGEX_ASSERT(m.regionEnd() == testString.length());
1128        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1129        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1130
1131        m.region(2,4, status);
1132        REGEX_CHECK_STATUS;
1133        REGEX_ASSERT(m.matches(status));
1134        REGEX_ASSERT(m.start(status)==2);
1135        REGEX_ASSERT(m.end(status)==4);
1136        REGEX_CHECK_STATUS;
1137
1138        m.reset();
1139        REGEX_ASSERT(m.regionStart() == 0);
1140        REGEX_ASSERT(m.regionEnd() == testString.length());
1141
1142        UnicodeString shorterString("short");
1143        m.reset(shorterString);
1144        REGEX_ASSERT(m.regionStart() == 0);
1145        REGEX_ASSERT(m.regionEnd() == shorterString.length());
1146
1147        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1148        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1149        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1150        REGEX_ASSERT(&m == &m.reset());
1151        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1152
1153        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1154        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1155        REGEX_ASSERT(&m == &m.reset());
1156        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1157
1158        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1159        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1160        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1161        REGEX_ASSERT(&m == &m.reset());
1162        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1163
1164        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1165        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1166        REGEX_ASSERT(&m == &m.reset());
1167        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1168
1169    }
1170
1171    //
1172    // hitEnd() and requireEnd()
1173    //
1174    {
1175        UErrorCode status = U_ZERO_ERROR;
1176        UnicodeString testString("aabb");
1177        RegexMatcher m1(".*", testString,  0, status);
1178        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1179        REGEX_ASSERT(m1.hitEnd() == TRUE);
1180        REGEX_ASSERT(m1.requireEnd() == FALSE);
1181        REGEX_CHECK_STATUS;
1182
1183        status = U_ZERO_ERROR;
1184        RegexMatcher m2("a*", testString, 0, status);
1185        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1186        REGEX_ASSERT(m2.hitEnd() == FALSE);
1187        REGEX_ASSERT(m2.requireEnd() == FALSE);
1188        REGEX_CHECK_STATUS;
1189
1190        status = U_ZERO_ERROR;
1191        RegexMatcher m3(".*$", testString, 0, status);
1192        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1193        REGEX_ASSERT(m3.hitEnd() == TRUE);
1194        REGEX_ASSERT(m3.requireEnd() == TRUE);
1195        REGEX_CHECK_STATUS;
1196    }
1197
1198
1199    //
1200    // Compilation error on reset with UChar *
1201    //   These were a hazard that people were stumbling over with runtime errors.
1202    //   Changed them to compiler errors by adding private methods that more closely
1203    //   matched the incorrect use of the functions.
1204    //
1205#if 0
1206    {
1207        UErrorCode status = U_ZERO_ERROR;
1208        UChar ucharString[20];
1209        RegexMatcher m(".", 0, status);
1210        m.reset(ucharString);  // should not compile.
1211
1212        RegexPattern *p = RegexPattern::compile(".", 0, status);
1213        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1214
1215        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1216    }
1217#endif
1218
1219    //
1220    //  Time Outs.
1221    //       Note:  These tests will need to be changed when the regexp engine is
1222    //              able to detect and cut short the exponential time behavior on
1223    //              this type of match.
1224    //
1225    {
1226        UErrorCode status = U_ZERO_ERROR;
1227        //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1229        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1230        RegexMatcher matcher("(a+)+b", testString, 0, status);
1231        REGEX_CHECK_STATUS;
1232        REGEX_ASSERT(matcher.getTimeLimit() == 0);
1233        matcher.setTimeLimit(100, status);
1234        REGEX_ASSERT(matcher.getTimeLimit() == 100);
1235        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1236        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1237    }
1238    {
1239        UErrorCode status = U_ZERO_ERROR;
1240        //   Few enough 'a's to slip in under the time limit.
1241        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1242        RegexMatcher matcher("(a+)+b", testString, 0, status);
1243        REGEX_CHECK_STATUS;
1244        matcher.setTimeLimit(100, status);
1245        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1246        REGEX_CHECK_STATUS;
1247    }
1248
1249    //
1250    //  Stack Limits
1251    //
1252    {
1253        UErrorCode status = U_ZERO_ERROR;
1254        UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1255
1256        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1257        //   of the '+', and makes the stack frames larger.
1258        RegexMatcher matcher("(A)+A$", testString, 0, status);
1259
1260        // With the default stack, this match should fail to run
1261        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1262        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1263
1264        // With unlimited stack, it should run
1265        status = U_ZERO_ERROR;
1266        matcher.setStackLimit(0, status);
1267        REGEX_CHECK_STATUS;
1268        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1269        REGEX_CHECK_STATUS;
1270        REGEX_ASSERT(matcher.getStackLimit() == 0);
1271
        // With a limited stack, the match should fail
1273        status = U_ZERO_ERROR;
1274        matcher.setStackLimit(10000, status);
1275        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1276        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1277        REGEX_ASSERT(matcher.getStackLimit() == 10000);
1278    }
1279
1280        // A pattern that doesn't save state should work with
1281        //   a minimal sized stack
1282    {
1283        UErrorCode status = U_ZERO_ERROR;
1284        UnicodeString testString = "abc";
1285        RegexMatcher matcher("abc", testString, 0, status);
1286        REGEX_CHECK_STATUS;
1287        matcher.setStackLimit(30, status);
1288        REGEX_CHECK_STATUS;
1289        REGEX_ASSERT(matcher.matches(status) == TRUE);
1290        REGEX_CHECK_STATUS;
1291        REGEX_ASSERT(matcher.getStackLimit() == 30);
1292
1293        // Negative stack sizes should fail
1294        status = U_ZERO_ERROR;
1295        matcher.setStackLimit(1000, status);
1296        REGEX_CHECK_STATUS;
1297        matcher.setStackLimit(-1, status);
1298        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1299        REGEX_ASSERT(matcher.getStackLimit() == 1000);
1300    }
1301
1302
1303}
1304
1305
1306
1307
1308
1309
1310//---------------------------------------------------------------------------
1311//
1312//      API_Replace        API test for class RegexMatcher, testing the
1313//                         Replace family of functions.
1314//
1315//---------------------------------------------------------------------------
1316void RegexTest::API_Replace() {
1317    //
1318    //  Replace
1319    //
1320    int32_t             flags=0;
1321    UParseError         pe;
1322    UErrorCode          status=U_ZERO_ERROR;
1323
1324    UnicodeString       re("abc");
1325    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1326    REGEX_CHECK_STATUS;
1327    UnicodeString data = ".abc..abc...abc..";
1328    //                    012345678901234567
1329    RegexMatcher *matcher = pat->matcher(data, status);
1330
1331    //
1332    //  Plain vanilla matches.
1333    //
1334    UnicodeString  dest;
1335    dest = matcher->replaceFirst("yz", status);
1336    REGEX_CHECK_STATUS;
1337    REGEX_ASSERT(dest == ".yz..abc...abc..");
1338
1339    dest = matcher->replaceAll("yz", status);
1340    REGEX_CHECK_STATUS;
1341    REGEX_ASSERT(dest == ".yz..yz...yz..");
1342
1343    //
1344    //  Plain vanilla non-matches.
1345    //
1346    UnicodeString d2 = ".abx..abx...abx..";
1347    matcher->reset(d2);
1348    dest = matcher->replaceFirst("yz", status);
1349    REGEX_CHECK_STATUS;
1350    REGEX_ASSERT(dest == ".abx..abx...abx..");
1351
1352    dest = matcher->replaceAll("yz", status);
1353    REGEX_CHECK_STATUS;
1354    REGEX_ASSERT(dest == ".abx..abx...abx..");
1355
1356    //
1357    // Empty source string
1358    //
1359    UnicodeString d3 = "";
1360    matcher->reset(d3);
1361    dest = matcher->replaceFirst("yz", status);
1362    REGEX_CHECK_STATUS;
1363    REGEX_ASSERT(dest == "");
1364
1365    dest = matcher->replaceAll("yz", status);
1366    REGEX_CHECK_STATUS;
1367    REGEX_ASSERT(dest == "");
1368
1369    //
1370    // Empty substitution string
1371    //
1372    matcher->reset(data);              // ".abc..abc...abc.."
1373    dest = matcher->replaceFirst("", status);
1374    REGEX_CHECK_STATUS;
1375    REGEX_ASSERT(dest == "...abc...abc..");
1376
1377    dest = matcher->replaceAll("", status);
1378    REGEX_CHECK_STATUS;
1379    REGEX_ASSERT(dest == "........");
1380
1381    //
1382    // match whole string
1383    //
1384    UnicodeString d4 = "abc";
1385    matcher->reset(d4);
1386    dest = matcher->replaceFirst("xyz", status);
1387    REGEX_CHECK_STATUS;
1388    REGEX_ASSERT(dest == "xyz");
1389
1390    dest = matcher->replaceAll("xyz", status);
1391    REGEX_CHECK_STATUS;
1392    REGEX_ASSERT(dest == "xyz");
1393
1394    //
1395    // Capture Group, simple case
1396    //
1397    UnicodeString       re2("a(..)");
1398    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1399    REGEX_CHECK_STATUS;
1400    UnicodeString d5 = "abcdefg";
1401    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1402    REGEX_CHECK_STATUS;
1403    dest = matcher2->replaceFirst("$1$1", status);
1404    REGEX_CHECK_STATUS;
1405    REGEX_ASSERT(dest == "bcbcdefg");
1406
1407    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1408    REGEX_CHECK_STATUS;
1409    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1410
1411    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1412    REGEX_CHECK_STATUS;
1413    REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1414
1415    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1416    replacement = replacement.unescape();
1417    dest = matcher2->replaceFirst(replacement, status);
1418    REGEX_CHECK_STATUS;
1419    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1420
1421    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1422
1423
1424    //
1425    // Replacement String with \u hex escapes
1426    //
1427    {
1428        UnicodeString  src = "abc 1 abc 2 abc 3";
1429        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1430        matcher->reset(src);
1431        UnicodeString  result = matcher->replaceAll(substitute, status);
1432        REGEX_CHECK_STATUS;
1433        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1434    }
1435    {
1436        UnicodeString  src = "abc !";
1437        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1438        matcher->reset(src);
1439        UnicodeString  result = matcher->replaceAll(substitute, status);
1440        REGEX_CHECK_STATUS;
1441        UnicodeString expected = UnicodeString("--");
1442        expected.append((UChar32)0x10000);
1443        expected.append("-- !");
1444        REGEX_ASSERT(result == expected);
1445    }
1446    // TODO:  need more through testing of capture substitutions.
1447
1448    // Bug 4057
1449    //
1450    {
1451        status = U_ZERO_ERROR;
1452        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1453        RegexMatcher m("ss(.*?)ee", 0, status);
1454        REGEX_CHECK_STATUS;
1455        UnicodeString result;
1456
1457        // Multiple finds do NOT bump up the previous appendReplacement postion.
1458        m.reset(s);
1459        m.find();
1460        m.find();
1461        m.appendReplacement(result, "ooh", status);
1462        REGEX_CHECK_STATUS;
1463        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1464
1465        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1466        status = U_ZERO_ERROR;
1467        result.truncate(0);
1468        m.reset(10, status);
1469        m.find();
1470        m.find();
1471        m.appendReplacement(result, "ooh", status);
1472        REGEX_CHECK_STATUS;
1473        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1474
1475        // find() at interior of string, appendReplacemnt still starts at beginning.
1476        status = U_ZERO_ERROR;
1477        result.truncate(0);
1478        m.reset();
1479        m.find(10, status);
1480        m.find();
1481        m.appendReplacement(result, "ooh", status);
1482        REGEX_CHECK_STATUS;
1483        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1484
1485        m.appendTail(result);
1486        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1487
1488    }
1489
1490    delete matcher2;
1491    delete pat2;
1492    delete matcher;
1493    delete pat;
1494}
1495
1496
1497//---------------------------------------------------------------------------
1498//
1499//      API_Pattern       Test that the API for class RegexPattern is
1500//                        present and nominally working.
1501//
1502//---------------------------------------------------------------------------
1503void RegexTest::API_Pattern() {
1504    RegexPattern        pata;    // Test default constructor to not crash.
1505    RegexPattern        patb;
1506
1507    REGEX_ASSERT(pata == patb);
1508    REGEX_ASSERT(pata == pata);
1509
1510    UnicodeString re1("abc[a-l][m-z]");
1511    UnicodeString re2("def");
1512    UErrorCode    status = U_ZERO_ERROR;
1513    UParseError   pe;
1514
1515    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1516    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1517    REGEX_CHECK_STATUS;
1518    REGEX_ASSERT(*pat1 == *pat1);
1519    REGEX_ASSERT(*pat1 != pata);
1520
1521    // Assign
1522    patb = *pat1;
1523    REGEX_ASSERT(patb == *pat1);
1524
1525    // Copy Construct
1526    RegexPattern patc(*pat1);
1527    REGEX_ASSERT(patc == *pat1);
1528    REGEX_ASSERT(patb == patc);
1529    REGEX_ASSERT(pat1 != pat2);
1530    patb = *pat2;
1531    REGEX_ASSERT(patb != patc);
1532    REGEX_ASSERT(patb == *pat2);
1533
1534    // Compile with no flags.
1535    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1536    REGEX_ASSERT(*pat1a == *pat1);
1537
1538    REGEX_ASSERT(pat1a->flags() == 0);
1539
1540    // Compile with different flags should be not equal
1541    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1542    REGEX_CHECK_STATUS;
1543
1544    REGEX_ASSERT(*pat1b != *pat1a);
1545    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1546    REGEX_ASSERT(pat1a->flags() == 0);
1547    delete pat1b;
1548
1549    // clone
1550    RegexPattern *pat1c = pat1->clone();
1551    REGEX_ASSERT(*pat1c == *pat1);
1552    REGEX_ASSERT(*pat1c != *pat2);
1553
1554    delete pat1c;
1555    delete pat1a;
1556    delete pat1;
1557    delete pat2;
1558
1559
1560    //
1561    //   Verify that a matcher created from a cloned pattern works.
1562    //     (Jitterbug 3423)
1563    //
1564    {
1565        UErrorCode     status     = U_ZERO_ERROR;
1566        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1567        RegexPattern  *pClone     = pSource->clone();
1568        delete         pSource;
1569        RegexMatcher  *mFromClone = pClone->matcher(status);
1570        REGEX_CHECK_STATUS;
1571        UnicodeString s = "Hello World";
1572        mFromClone->reset(s);
1573        REGEX_ASSERT(mFromClone->find() == TRUE);
1574        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1575        REGEX_ASSERT(mFromClone->find() == TRUE);
1576        REGEX_ASSERT(mFromClone->group(status) == "World");
1577        REGEX_ASSERT(mFromClone->find() == FALSE);
1578        delete mFromClone;
1579        delete pClone;
1580    }
1581
1582    //
1583    //   matches convenience API
1584    //
1585    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1586    REGEX_CHECK_STATUS;
1587    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1588    REGEX_CHECK_STATUS;
1589    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1590    REGEX_CHECK_STATUS;
1591    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1592    REGEX_CHECK_STATUS;
1593    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1594    REGEX_CHECK_STATUS;
1595    status = U_INDEX_OUTOFBOUNDS_ERROR;
1596    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1597    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1598
1599
1600    //
1601    // Split()
1602    //
1603    status = U_ZERO_ERROR;
1604    pat1 = RegexPattern::compile(" +",  pe, status);
1605    REGEX_CHECK_STATUS;
1606    UnicodeString  fields[10];
1607
1608    int32_t n;
1609    n = pat1->split("Now is the time", fields, 10, status);
1610    REGEX_CHECK_STATUS;
1611    REGEX_ASSERT(n==4);
1612    REGEX_ASSERT(fields[0]=="Now");
1613    REGEX_ASSERT(fields[1]=="is");
1614    REGEX_ASSERT(fields[2]=="the");
1615    REGEX_ASSERT(fields[3]=="time");
1616    REGEX_ASSERT(fields[4]=="");
1617
1618    n = pat1->split("Now is the time", fields, 2, status);
1619    REGEX_CHECK_STATUS;
1620    REGEX_ASSERT(n==2);
1621    REGEX_ASSERT(fields[0]=="Now");
1622    REGEX_ASSERT(fields[1]=="is the time");
1623    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1624
1625    fields[1] = "*";
1626    status = U_ZERO_ERROR;
1627    n = pat1->split("Now is the time", fields, 1, status);
1628    REGEX_CHECK_STATUS;
1629    REGEX_ASSERT(n==1);
1630    REGEX_ASSERT(fields[0]=="Now is the time");
1631    REGEX_ASSERT(fields[1]=="*");
1632    status = U_ZERO_ERROR;
1633
1634    n = pat1->split("    Now       is the time   ", fields, 10, status);
1635    REGEX_CHECK_STATUS;
1636    REGEX_ASSERT(n==6);
1637    REGEX_ASSERT(fields[0]=="");
1638    REGEX_ASSERT(fields[1]=="Now");
1639    REGEX_ASSERT(fields[2]=="is");
1640    REGEX_ASSERT(fields[3]=="the");
1641    REGEX_ASSERT(fields[4]=="time");
1642    REGEX_ASSERT(fields[5]=="");
1643
1644    n = pat1->split("     ", fields, 10, status);
1645    REGEX_CHECK_STATUS;
1646    REGEX_ASSERT(n==2);
1647    REGEX_ASSERT(fields[0]=="");
1648    REGEX_ASSERT(fields[1]=="");
1649
1650    fields[0] = "foo";
1651    n = pat1->split("", fields, 10, status);
1652    REGEX_CHECK_STATUS;
1653    REGEX_ASSERT(n==0);
1654    REGEX_ASSERT(fields[0]=="foo");
1655
1656    delete pat1;
1657
1658    //  split, with a pattern with (capture)
1659    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1660    REGEX_CHECK_STATUS;
1661
1662    status = U_ZERO_ERROR;
1663    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1664    REGEX_CHECK_STATUS;
1665    REGEX_ASSERT(n==7);
1666    REGEX_ASSERT(fields[0]=="");
1667    REGEX_ASSERT(fields[1]=="a");
1668    REGEX_ASSERT(fields[2]=="Now is ");
1669    REGEX_ASSERT(fields[3]=="b");
1670    REGEX_ASSERT(fields[4]=="the time");
1671    REGEX_ASSERT(fields[5]=="c");
1672    REGEX_ASSERT(fields[6]=="");
1673    REGEX_ASSERT(status==U_ZERO_ERROR);
1674
1675    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1676    REGEX_CHECK_STATUS;
1677    REGEX_ASSERT(n==7);
1678    REGEX_ASSERT(fields[0]=="  ");
1679    REGEX_ASSERT(fields[1]=="a");
1680    REGEX_ASSERT(fields[2]=="Now is ");
1681    REGEX_ASSERT(fields[3]=="b");
1682    REGEX_ASSERT(fields[4]=="the time");
1683    REGEX_ASSERT(fields[5]=="c");
1684    REGEX_ASSERT(fields[6]=="");
1685
1686    status = U_ZERO_ERROR;
1687    fields[6] = "foo";
1688    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1689    REGEX_CHECK_STATUS;
1690    REGEX_ASSERT(n==6);
1691    REGEX_ASSERT(fields[0]=="  ");
1692    REGEX_ASSERT(fields[1]=="a");
1693    REGEX_ASSERT(fields[2]=="Now is ");
1694    REGEX_ASSERT(fields[3]=="b");
1695    REGEX_ASSERT(fields[4]=="the time");
1696    REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1697    REGEX_ASSERT(fields[6]=="foo");
1698
1699    status = U_ZERO_ERROR;
1700    fields[5] = "foo";
1701    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1702    REGEX_CHECK_STATUS;
1703    REGEX_ASSERT(n==5);
1704    REGEX_ASSERT(fields[0]=="  ");
1705    REGEX_ASSERT(fields[1]=="a");
1706    REGEX_ASSERT(fields[2]=="Now is ");
1707    REGEX_ASSERT(fields[3]=="b");
1708    REGEX_ASSERT(fields[4]=="the time<c>");
1709    REGEX_ASSERT(fields[5]=="foo");
1710
1711    status = U_ZERO_ERROR;
1712    fields[5] = "foo";
1713    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1714    REGEX_CHECK_STATUS;
1715    REGEX_ASSERT(n==5);
1716    REGEX_ASSERT(fields[0]=="  ");
1717    REGEX_ASSERT(fields[1]=="a");
1718    REGEX_ASSERT(fields[2]=="Now is ");
1719    REGEX_ASSERT(fields[3]=="b");
1720    REGEX_ASSERT(fields[4]=="the time");
1721    REGEX_ASSERT(fields[5]=="foo");
1722
1723    status = U_ZERO_ERROR;
1724    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1725    REGEX_CHECK_STATUS;
1726    REGEX_ASSERT(n==4);
1727    REGEX_ASSERT(fields[0]=="  ");
1728    REGEX_ASSERT(fields[1]=="a");
1729    REGEX_ASSERT(fields[2]=="Now is ");
1730    REGEX_ASSERT(fields[3]=="the time<c>");
1731    status = U_ZERO_ERROR;
1732    delete pat1;
1733
1734    pat1 = RegexPattern::compile("([-,])",  pe, status);
1735    REGEX_CHECK_STATUS;
1736    n = pat1->split("1-10,20", fields, 10, status);
1737    REGEX_CHECK_STATUS;
1738    REGEX_ASSERT(n==5);
1739    REGEX_ASSERT(fields[0]=="1");
1740    REGEX_ASSERT(fields[1]=="-");
1741    REGEX_ASSERT(fields[2]=="10");
1742    REGEX_ASSERT(fields[3]==",");
1743    REGEX_ASSERT(fields[4]=="20");
1744    delete pat1;
1745
1746    // Test split of string with empty trailing fields
1747    pat1 = RegexPattern::compile(",", pe, status);
1748    REGEX_CHECK_STATUS;
1749    n = pat1->split("a,b,c,", fields, 10, status);
1750    REGEX_CHECK_STATUS;
1751    REGEX_ASSERT(n==4);
1752    REGEX_ASSERT(fields[0]=="a");
1753    REGEX_ASSERT(fields[1]=="b");
1754    REGEX_ASSERT(fields[2]=="c");
1755    REGEX_ASSERT(fields[3]=="");
1756
1757    n = pat1->split("a,,,", fields, 10, status);
1758    REGEX_CHECK_STATUS;
1759    REGEX_ASSERT(n==4);
1760    REGEX_ASSERT(fields[0]=="a");
1761    REGEX_ASSERT(fields[1]=="");
1762    REGEX_ASSERT(fields[2]=="");
1763    REGEX_ASSERT(fields[3]=="");
1764    delete pat1;
1765
1766    // Split Separator with zero length match.
1767    pat1 = RegexPattern::compile(":?", pe, status);
1768    REGEX_CHECK_STATUS;
1769    n = pat1->split("abc", fields, 10, status);
1770    REGEX_CHECK_STATUS;
1771    REGEX_ASSERT(n==5);
1772    REGEX_ASSERT(fields[0]=="");
1773    REGEX_ASSERT(fields[1]=="a");
1774    REGEX_ASSERT(fields[2]=="b");
1775    REGEX_ASSERT(fields[3]=="c");
1776    REGEX_ASSERT(fields[4]=="");
1777
1778    delete pat1;
1779
1780    //
1781    // RegexPattern::pattern()
1782    //
1783    pat1 = new RegexPattern();
1784    REGEX_ASSERT(pat1->pattern() == "");
1785    delete pat1;
1786
1787    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1788    REGEX_CHECK_STATUS;
1789    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1790    delete pat1;
1791
1792
1793    //
1794    // classID functions
1795    //
1796    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1797    REGEX_CHECK_STATUS;
1798    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1799    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1800    UnicodeString Hello("Hello, world.");
1801    RegexMatcher *m = pat1->matcher(Hello, status);
1802    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1803    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1804    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1805    delete m;
1806    delete pat1;
1807
1808}
1809
1810//---------------------------------------------------------------------------
1811//
1812//      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1813//                       is present and working, but excluding functions
1814//                       implementing replace operations.
1815//
1816//---------------------------------------------------------------------------
1817void RegexTest::API_Match_UTF8() {
1818    UParseError         pe;
1819    UErrorCode          status=U_ZERO_ERROR;
1820    int32_t             flags = 0;
1821
1822    //
1823    // Debug - slide failing test cases early
1824    //
1825#if 0
1826    {
1827    }
1828    return;
1829#endif
1830
1831    //
1832    // Simple pattern compilation
1833    //
1834    {
1835        UText               re = UTEXT_INITIALIZER;
1836        regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1837        REGEX_VERBOSE_TEXT(&re);
1838        RegexPattern        *pat2;
1839        pat2 = RegexPattern::compile(&re, flags, pe, status);
1840        REGEX_CHECK_STATUS;
1841
1842        UText input1 = UTEXT_INITIALIZER;
1843        UText input2 = UTEXT_INITIALIZER;
1844        UText empty  = UTEXT_INITIALIZER;
1845        regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1846        REGEX_VERBOSE_TEXT(&input1);
1847        regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1848        REGEX_VERBOSE_TEXT(&input2);
1849        utext_openUChars(&empty, NULL, 0, &status);
1850
1851        int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1852        int32_t input2Len = strlen("not abc");
1853
1854
1855        //
1856        // Matcher creation and reset.
1857        //
1858        RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1859        REGEX_CHECK_STATUS;
1860        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1861        const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1862        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1863        m1->reset(&input2);
1864        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1865        const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1866        REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1867        m1->reset(&input1);
1868        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1869        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1870        m1->reset(&empty);
1871        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1872        REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1873
1874        //
1875        //  reset(pos, status)
1876        //
1877        m1->reset(&input1);
1878        m1->reset(4, status);
1879        REGEX_CHECK_STATUS;
1880        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1881        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1882
1883        m1->reset(-1, status);
1884        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1885        status = U_ZERO_ERROR;
1886
1887        m1->reset(0, status);
1888        REGEX_CHECK_STATUS;
1889        status = U_ZERO_ERROR;
1890
1891        m1->reset(input1Len-1, status);
1892        REGEX_CHECK_STATUS;
1893        status = U_ZERO_ERROR;
1894
1895        m1->reset(input1Len, status);
1896        REGEX_CHECK_STATUS;
1897        status = U_ZERO_ERROR;
1898
1899        m1->reset(input1Len+1, status);
1900        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1901        status = U_ZERO_ERROR;
1902
1903        //
1904        // match(pos, status)
1905        //
1906        m1->reset(&input2);
1907        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1908        m1->reset();
1909        REGEX_ASSERT(m1->matches(3, status) == FALSE);
1910        m1->reset();
1911        REGEX_ASSERT(m1->matches(5, status) == FALSE);
1912        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1913        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1914        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1915
1916        // Match() at end of string should fail, but should not
1917        //  be an error.
1918        status = U_ZERO_ERROR;
1919        REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1920        REGEX_CHECK_STATUS;
1921
1922        // Match beyond end of string should fail with an error.
1923        status = U_ZERO_ERROR;
1924        REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1925        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1926
1927        // Successful match at end of string.
1928        {
1929            status = U_ZERO_ERROR;
1930            RegexMatcher m("A?", 0, status);  // will match zero length string.
1931            REGEX_CHECK_STATUS;
1932            m.reset(&input1);
1933            REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1934            REGEX_CHECK_STATUS;
1935            m.reset(&empty);
1936            REGEX_ASSERT(m.matches(0, status) == TRUE);
1937            REGEX_CHECK_STATUS;
1938        }
1939
1940
1941        //
1942        // lookingAt(pos, status)
1943        //
1944        status = U_ZERO_ERROR;
1945        m1->reset(&input2);  // "not abc"
1946        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1947        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1948        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1949        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1950        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1951        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1952        status = U_ZERO_ERROR;
1953        REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1954        REGEX_CHECK_STATUS;
1955        REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1956        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1957
1958        delete m1;
1959        delete pat2;
1960
1961        utext_close(&re);
1962        utext_close(&input1);
1963        utext_close(&input2);
1964        utext_close(&empty);
1965    }
1966
1967
1968    //
1969    // Capture Group.
1970    //     RegexMatcher::start();
1971    //     RegexMatcher::end();
1972    //     RegexMatcher::groupCount();
1973    //
1974    {
1975        int32_t             flags=0;
1976        UParseError         pe;
1977        UErrorCode          status=U_ZERO_ERROR;
1978        UText               re=UTEXT_INITIALIZER;
1979        const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1980        utext_openUTF8(&re, str_01234567_pat, -1, &status);
1981
1982        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1983        REGEX_CHECK_STATUS;
1984
1985        UText input = UTEXT_INITIALIZER;
1986        const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1987        utext_openUTF8(&input, str_0123456789, -1, &status);
1988
1989        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1990        REGEX_CHECK_STATUS;
1991        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1992        static const int32_t matchStarts[] = {0,  2, 4, 8};
1993        static const int32_t matchEnds[]   = {10, 8, 6, 10};
1994        int32_t i;
1995        for (i=0; i<4; i++) {
1996            int32_t actualStart = matcher->start(i, status);
1997            REGEX_CHECK_STATUS;
1998            if (actualStart != matchStarts[i]) {
1999                errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2000                      __FILE__, __LINE__, i, matchStarts[i], actualStart);
2001            }
2002            int32_t actualEnd = matcher->end(i, status);
2003            REGEX_CHECK_STATUS;
2004            if (actualEnd != matchEnds[i]) {
2005                errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2006                      __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2007            }
2008        }
2009
2010        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2011        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2012
2013        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2014        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2015        matcher->reset();
2016        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2017
2018        matcher->lookingAt(status);
2019
2020        UnicodeString dest;
2021        UText destText = UTEXT_INITIALIZER;
2022        utext_openUnicodeString(&destText, &dest, &status);
2023        UText *result;
2024        //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2025        //	Test shallow-clone API
2026        int64_t   group_len;
2027        result = matcher->group((UText *)NULL, group_len, status);
2028        REGEX_CHECK_STATUS;
2029        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2030        utext_close(result);
2031        result = matcher->group(0, &destText, group_len, status);
2032        REGEX_CHECK_STATUS;
2033        REGEX_ASSERT(result == &destText);
2034        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2035        //  destText is now immutable, reopen it
2036        utext_close(&destText);
2037        utext_openUnicodeString(&destText, &dest, &status);
2038
2039        result = matcher->group(0, NULL, status);
2040        REGEX_CHECK_STATUS;
2041        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2042        utext_close(result);
2043        result = matcher->group(0, &destText, status);
2044        REGEX_CHECK_STATUS;
2045        REGEX_ASSERT(result == &destText);
2046        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2047
2048        result = matcher->group(1, NULL, status);
2049        REGEX_CHECK_STATUS;
2050        const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2051        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2052        utext_close(result);
2053        result = matcher->group(1, &destText, status);
2054        REGEX_CHECK_STATUS;
2055        REGEX_ASSERT(result == &destText);
2056        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2057
2058        result = matcher->group(2, NULL, status);
2059        REGEX_CHECK_STATUS;
2060        const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2061        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2062        utext_close(result);
2063        result = matcher->group(2, &destText, status);
2064        REGEX_CHECK_STATUS;
2065        REGEX_ASSERT(result == &destText);
2066        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2067
2068        result = matcher->group(3, NULL, status);
2069        REGEX_CHECK_STATUS;
2070        const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2071        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2072        utext_close(result);
2073        result = matcher->group(3, &destText, status);
2074        REGEX_CHECK_STATUS;
2075        REGEX_ASSERT(result == &destText);
2076        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2077
2078        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2079        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2080        matcher->reset();
2081        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2082
2083        delete matcher;
2084        delete pat;
2085
2086        utext_close(&destText);
2087        utext_close(&input);
2088        utext_close(&re);
2089    }
2090
2091    //
2092    //  find
2093    //
2094    {
2095        int32_t             flags=0;
2096        UParseError         pe;
2097        UErrorCode          status=U_ZERO_ERROR;
2098        UText               re=UTEXT_INITIALIZER;
2099        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2100        utext_openUTF8(&re, str_abc, -1, &status);
2101
2102        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2103        REGEX_CHECK_STATUS;
2104        UText input = UTEXT_INITIALIZER;
2105        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2106        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2107        //                      012345678901234567
2108
2109        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2110        REGEX_CHECK_STATUS;
2111        REGEX_ASSERT(matcher->find());
2112        REGEX_ASSERT(matcher->start(status) == 1);
2113        REGEX_ASSERT(matcher->find());
2114        REGEX_ASSERT(matcher->start(status) == 6);
2115        REGEX_ASSERT(matcher->find());
2116        REGEX_ASSERT(matcher->start(status) == 12);
2117        REGEX_ASSERT(matcher->find() == FALSE);
2118        REGEX_ASSERT(matcher->find() == FALSE);
2119
2120        matcher->reset();
2121        REGEX_ASSERT(matcher->find());
2122        REGEX_ASSERT(matcher->start(status) == 1);
2123
2124        REGEX_ASSERT(matcher->find(0, status));
2125        REGEX_ASSERT(matcher->start(status) == 1);
2126        REGEX_ASSERT(matcher->find(1, status));
2127        REGEX_ASSERT(matcher->start(status) == 1);
2128        REGEX_ASSERT(matcher->find(2, status));
2129        REGEX_ASSERT(matcher->start(status) == 6);
2130        REGEX_ASSERT(matcher->find(12, status));
2131        REGEX_ASSERT(matcher->start(status) == 12);
2132        REGEX_ASSERT(matcher->find(13, status) == FALSE);
2133        REGEX_ASSERT(matcher->find(16, status) == FALSE);
2134        REGEX_ASSERT(matcher->find(17, status) == FALSE);
2135        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2136
2137        status = U_ZERO_ERROR;
2138        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2139        status = U_ZERO_ERROR;
2140        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2141
2142        REGEX_ASSERT(matcher->groupCount() == 0);
2143
2144        delete matcher;
2145        delete pat;
2146
2147        utext_close(&input);
2148        utext_close(&re);
2149    }
2150
2151
2152    //
2153    //  find, with \G in pattern (true if at the end of a previous match).
2154    //
2155    {
2156        int32_t             flags=0;
2157        UParseError         pe;
2158        UErrorCode          status=U_ZERO_ERROR;
2159        UText               re=UTEXT_INITIALIZER;
2160        const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2161        utext_openUTF8(&re, str_Gabcabc, -1, &status);
2162
2163        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2164
2165        REGEX_CHECK_STATUS;
2166        UText input = UTEXT_INITIALIZER;
2167        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2168        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2169        //                      012345678901234567
2170
2171        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2172        REGEX_CHECK_STATUS;
2173        REGEX_ASSERT(matcher->find());
2174        REGEX_ASSERT(matcher->start(status) == 0);
2175        REGEX_ASSERT(matcher->start(1, status) == -1);
2176        REGEX_ASSERT(matcher->start(2, status) == 1);
2177
2178        REGEX_ASSERT(matcher->find());
2179        REGEX_ASSERT(matcher->start(status) == 4);
2180        REGEX_ASSERT(matcher->start(1, status) == 4);
2181        REGEX_ASSERT(matcher->start(2, status) == -1);
2182        REGEX_CHECK_STATUS;
2183
2184        delete matcher;
2185        delete pat;
2186
2187        utext_close(&input);
2188        utext_close(&re);
2189    }
2190
2191    //
2192    //   find with zero length matches, match position should bump ahead
2193    //     to prevent loops.
2194    //
2195    {
2196        int32_t                 i;
2197        UErrorCode          status=U_ZERO_ERROR;
2198        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2199                                                      //   using an always-true look-ahead.
2200        REGEX_CHECK_STATUS;
2201        UText s = UTEXT_INITIALIZER;
2202        utext_openUTF8(&s, "    ", -1, &status);
2203        m.reset(&s);
2204        for (i=0; ; i++) {
2205            if (m.find() == FALSE) {
2206                break;
2207            }
2208            REGEX_ASSERT(m.start(status) == i);
2209            REGEX_ASSERT(m.end(status) == i);
2210        }
2211        REGEX_ASSERT(i==5);
2212
2213        // Check that the bump goes over characters outside the BMP OK
2214        // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2215        unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2216        utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2217        m.reset(&s);
2218        for (i=0; ; i+=4) {
2219            if (m.find() == FALSE) {
2220                break;
2221            }
2222            REGEX_ASSERT(m.start(status) == i);
2223            REGEX_ASSERT(m.end(status) == i);
2224        }
2225        REGEX_ASSERT(i==20);
2226
2227        utext_close(&s);
2228    }
2229    {
2230        // find() loop breaking test.
2231        //        with pattern of /.?/, should see a series of one char matches, then a single
2232        //        match of zero length at the end of the input string.
2233        int32_t                 i;
2234        UErrorCode          status=U_ZERO_ERROR;
2235        RegexMatcher        m(".?", 0, status);
2236        REGEX_CHECK_STATUS;
2237        UText s = UTEXT_INITIALIZER;
2238        utext_openUTF8(&s, "    ", -1, &status);
2239        m.reset(&s);
2240        for (i=0; ; i++) {
2241            if (m.find() == FALSE) {
2242                break;
2243            }
2244            REGEX_ASSERT(m.start(status) == i);
2245            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2246        }
2247        REGEX_ASSERT(i==5);
2248
2249        utext_close(&s);
2250    }
2251
2252
2253    //
2254    // Matchers with no input string behave as if they had an empty input string.
2255    //
2256
2257    {
2258        UErrorCode status = U_ZERO_ERROR;
2259        RegexMatcher  m(".?", 0, status);
2260        REGEX_CHECK_STATUS;
2261        REGEX_ASSERT(m.find());
2262        REGEX_ASSERT(m.start(status) == 0);
2263        REGEX_ASSERT(m.input() == "");
2264    }
2265    {
2266        UErrorCode status = U_ZERO_ERROR;
2267        RegexPattern  *p = RegexPattern::compile(".", 0, status);
2268        RegexMatcher  *m = p->matcher(status);
2269        REGEX_CHECK_STATUS;
2270
2271        REGEX_ASSERT(m->find() == FALSE);
2272        REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2273        delete m;
2274        delete p;
2275    }
2276
2277    //
2278    // Regions
2279    //
2280    {
2281        UErrorCode status = U_ZERO_ERROR;
2282        UText testPattern = UTEXT_INITIALIZER;
2283        UText testText    = UTEXT_INITIALIZER;
2284        regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2285        REGEX_VERBOSE_TEXT(&testPattern);
2286        regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2287        REGEX_VERBOSE_TEXT(&testText);
2288
2289        RegexMatcher m(&testPattern, &testText, 0, status);
2290        REGEX_CHECK_STATUS;
2291        REGEX_ASSERT(m.regionStart() == 0);
2292        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2293        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2294        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2295
2296        m.region(2,4, status);
2297        REGEX_CHECK_STATUS;
2298        REGEX_ASSERT(m.matches(status));
2299        REGEX_ASSERT(m.start(status)==2);
2300        REGEX_ASSERT(m.end(status)==4);
2301        REGEX_CHECK_STATUS;
2302
2303        m.reset();
2304        REGEX_ASSERT(m.regionStart() == 0);
2305        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2306
2307        regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2308        REGEX_VERBOSE_TEXT(&testText);
2309        m.reset(&testText);
2310        REGEX_ASSERT(m.regionStart() == 0);
2311        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2312
2313        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2314        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2315        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2316        REGEX_ASSERT(&m == &m.reset());
2317        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2318
2319        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2320        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2321        REGEX_ASSERT(&m == &m.reset());
2322        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2323
2324        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2325        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2326        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2327        REGEX_ASSERT(&m == &m.reset());
2328        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2329
2330        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2331        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2332        REGEX_ASSERT(&m == &m.reset());
2333        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2334
2335        utext_close(&testText);
2336        utext_close(&testPattern);
2337    }
2338
2339    //
2340    // hitEnd() and requireEnd()
2341    //
2342    {
2343        UErrorCode status = U_ZERO_ERROR;
2344        UText testPattern = UTEXT_INITIALIZER;
2345        UText testText    = UTEXT_INITIALIZER;
2346        const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2347        const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2348        utext_openUTF8(&testPattern, str_, -1, &status);
2349        utext_openUTF8(&testText, str_aabb, -1, &status);
2350
2351        RegexMatcher m1(&testPattern, &testText,  0, status);
2352        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2353        REGEX_ASSERT(m1.hitEnd() == TRUE);
2354        REGEX_ASSERT(m1.requireEnd() == FALSE);
2355        REGEX_CHECK_STATUS;
2356
2357        status = U_ZERO_ERROR;
2358        const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2359        utext_openUTF8(&testPattern, str_a, -1, &status);
2360        RegexMatcher m2(&testPattern, &testText, 0, status);
2361        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2362        REGEX_ASSERT(m2.hitEnd() == FALSE);
2363        REGEX_ASSERT(m2.requireEnd() == FALSE);
2364        REGEX_CHECK_STATUS;
2365
2366        status = U_ZERO_ERROR;
2367        const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2368        utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2369        RegexMatcher m3(&testPattern, &testText, 0, status);
2370        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2371        REGEX_ASSERT(m3.hitEnd() == TRUE);
2372        REGEX_ASSERT(m3.requireEnd() == TRUE);
2373        REGEX_CHECK_STATUS;
2374
2375        utext_close(&testText);
2376        utext_close(&testPattern);
2377    }
2378}
2379
2380
2381//---------------------------------------------------------------------------
2382//
2383//      API_Replace_UTF8   API test for class RegexMatcher, testing the
2384//                         Replace family of functions.
2385//
2386//---------------------------------------------------------------------------
2387void RegexTest::API_Replace_UTF8() {
2388    //
2389    //  Replace
2390    //
2391    int32_t             flags=0;
2392    UParseError         pe;
2393    UErrorCode          status=U_ZERO_ERROR;
2394
2395    UText               re=UTEXT_INITIALIZER;
2396    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2397    REGEX_VERBOSE_TEXT(&re);
2398    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2399    REGEX_CHECK_STATUS;
2400
2401    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2402    //             012345678901234567
2403    UText dataText = UTEXT_INITIALIZER;
2404    utext_openUTF8(&dataText, data, -1, &status);
2405    REGEX_CHECK_STATUS;
2406    REGEX_VERBOSE_TEXT(&dataText);
2407    RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2408
2409    //
2410    //  Plain vanilla matches.
2411    //
2412    UnicodeString  dest;
2413    UText destText = UTEXT_INITIALIZER;
2414    utext_openUnicodeString(&destText, &dest, &status);
2415    UText *result;
2416
2417    UText replText = UTEXT_INITIALIZER;
2418
2419    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2420    utext_openUTF8(&replText, str_yz, -1, &status);
2421    REGEX_VERBOSE_TEXT(&replText);
2422    result = matcher->replaceFirst(&replText, NULL, status);
2423    REGEX_CHECK_STATUS;
2424    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2425    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2426    utext_close(result);
2427    result = matcher->replaceFirst(&replText, &destText, status);
2428    REGEX_CHECK_STATUS;
2429    REGEX_ASSERT(result == &destText);
2430    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2431
2432    result = matcher->replaceAll(&replText, NULL, status);
2433    REGEX_CHECK_STATUS;
2434    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2435    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2436    utext_close(result);
2437
2438    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2439    result = matcher->replaceAll(&replText, &destText, status);
2440    REGEX_CHECK_STATUS;
2441    REGEX_ASSERT(result == &destText);
2442    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2443
2444    //
2445    //  Plain vanilla non-matches.
2446    //
2447    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2448    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2449    matcher->reset(&dataText);
2450
2451    result = matcher->replaceFirst(&replText, NULL, status);
2452    REGEX_CHECK_STATUS;
2453    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2454    utext_close(result);
2455    result = matcher->replaceFirst(&replText, &destText, status);
2456    REGEX_CHECK_STATUS;
2457    REGEX_ASSERT(result == &destText);
2458    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2459
2460    result = matcher->replaceAll(&replText, NULL, status);
2461    REGEX_CHECK_STATUS;
2462    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2463    utext_close(result);
2464    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2465    result = matcher->replaceAll(&replText, &destText, status);
2466    REGEX_CHECK_STATUS;
2467    REGEX_ASSERT(result == &destText);
2468    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2469
2470    //
2471    // Empty source string
2472    //
2473    utext_openUTF8(&dataText, NULL, 0, &status);
2474    matcher->reset(&dataText);
2475
2476    result = matcher->replaceFirst(&replText, NULL, status);
2477    REGEX_CHECK_STATUS;
2478    REGEX_ASSERT_UTEXT_UTF8("", result);
2479    utext_close(result);
2480    result = matcher->replaceFirst(&replText, &destText, status);
2481    REGEX_CHECK_STATUS;
2482    REGEX_ASSERT(result == &destText);
2483    REGEX_ASSERT_UTEXT_UTF8("", result);
2484
2485    result = matcher->replaceAll(&replText, NULL, status);
2486    REGEX_CHECK_STATUS;
2487    REGEX_ASSERT_UTEXT_UTF8("", result);
2488    utext_close(result);
2489    result = matcher->replaceAll(&replText, &destText, status);
2490    REGEX_CHECK_STATUS;
2491    REGEX_ASSERT(result == &destText);
2492    REGEX_ASSERT_UTEXT_UTF8("", result);
2493
2494    //
2495    // Empty substitution string
2496    //
2497    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2498    matcher->reset(&dataText);
2499
2500    utext_openUTF8(&replText, NULL, 0, &status);
2501    result = matcher->replaceFirst(&replText, NULL, status);
2502    REGEX_CHECK_STATUS;
2503    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2504    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2505    utext_close(result);
2506    result = matcher->replaceFirst(&replText, &destText, status);
2507    REGEX_CHECK_STATUS;
2508    REGEX_ASSERT(result == &destText);
2509    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2510
2511    result = matcher->replaceAll(&replText, NULL, status);
2512    REGEX_CHECK_STATUS;
2513    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2514    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2515    utext_close(result);
2516    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2517    result = matcher->replaceAll(&replText, &destText, status);
2518    REGEX_CHECK_STATUS;
2519    REGEX_ASSERT(result == &destText);
2520    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2521
2522    //
2523    // match whole string
2524    //
2525    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2526    utext_openUTF8(&dataText, str_abc, -1, &status);
2527    matcher->reset(&dataText);
2528
2529    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2530    utext_openUTF8(&replText, str_xyz, -1, &status);
2531    result = matcher->replaceFirst(&replText, NULL, status);
2532    REGEX_CHECK_STATUS;
2533    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2534    utext_close(result);
2535    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2536    result = matcher->replaceFirst(&replText, &destText, status);
2537    REGEX_CHECK_STATUS;
2538    REGEX_ASSERT(result == &destText);
2539    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2540
2541    result = matcher->replaceAll(&replText, NULL, status);
2542    REGEX_CHECK_STATUS;
2543    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2544    utext_close(result);
2545    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2546    result = matcher->replaceAll(&replText, &destText, status);
2547    REGEX_CHECK_STATUS;
2548    REGEX_ASSERT(result == &destText);
2549    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2550
2551    //
2552    // Capture Group, simple case
2553    //
2554    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2555    utext_openUTF8(&re, str_add, -1, &status);
2556    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2557    REGEX_CHECK_STATUS;
2558
2559    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2560    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2561    RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2562    REGEX_CHECK_STATUS;
2563
2564    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2565    utext_openUTF8(&replText, str_11, -1, &status);
2566    result = matcher2->replaceFirst(&replText, NULL, status);
2567    REGEX_CHECK_STATUS;
2568    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2569    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2570    utext_close(result);
2571    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2572    result = matcher2->replaceFirst(&replText, &destText, status);
2573    REGEX_CHECK_STATUS;
2574    REGEX_ASSERT(result == &destText);
2575    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2576
2577    const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2578    utext_openUTF8(&replText, str_v, -1, &status);
2579    REGEX_VERBOSE_TEXT(&replText);
2580    result = matcher2->replaceFirst(&replText, NULL, status);
2581    REGEX_CHECK_STATUS;
2582    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2583    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2584    utext_close(result);
2585    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2586    result = matcher2->replaceFirst(&replText, &destText, status);
2587    REGEX_CHECK_STATUS;
2588    REGEX_ASSERT(result == &destText);
2589    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2590
2591    const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2592    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2593    result = matcher2->replaceFirst(&replText, NULL, status);
2594    REGEX_CHECK_STATUS;
2595    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2596    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2597    utext_close(result);
2598    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2599    result = matcher2->replaceFirst(&replText, &destText, status);
2600    REGEX_CHECK_STATUS;
2601    REGEX_ASSERT(result == &destText);
2602    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2603
2604    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2605    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2606    //                                 012345678901234567890123456
2607    supplDigitChars[22] = 0xF0;
2608    supplDigitChars[23] = 0x9D;
2609    supplDigitChars[24] = 0x9F;
2610    supplDigitChars[25] = 0x8F;
2611    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2612
2613    result = matcher2->replaceFirst(&replText, NULL, status);
2614    REGEX_CHECK_STATUS;
2615    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2616    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2617    utext_close(result);
2618    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2619    result = matcher2->replaceFirst(&replText, &destText, status);
2620    REGEX_CHECK_STATUS;
2621    REGEX_ASSERT(result == &destText);
2622    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2623    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2624    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2625    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2626//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2627    utext_close(result);
2628    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2630    REGEX_ASSERT(result == &destText);
2631//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632
2633    //
2634    // Replacement String with \u hex escapes
2635    //
2636    {
2637      const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2638      const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2639        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2640        utext_openUTF8(&replText, str_u0043, -1, &status);
2641        matcher->reset(&dataText);
2642
2643        result = matcher->replaceAll(&replText, NULL, status);
2644        REGEX_CHECK_STATUS;
2645        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2646        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2647        utext_close(result);
2648        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2649        result = matcher->replaceAll(&replText, &destText, status);
2650        REGEX_CHECK_STATUS;
2651        REGEX_ASSERT(result == &destText);
2652        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2653    }
2654    {
2655      const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2656        utext_openUTF8(&dataText, str_abc, -1, &status);
2657        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2658        utext_openUTF8(&replText, str_U00010000, -1, &status);
2659        matcher->reset(&dataText);
2660
2661        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2662        //                          0123456789
2663        expected[2] = 0xF0;
2664        expected[3] = 0x90;
2665        expected[4] = 0x80;
2666        expected[5] = 0x80;
2667
2668        result = matcher->replaceAll(&replText, NULL, status);
2669        REGEX_CHECK_STATUS;
2670        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2671        utext_close(result);
2672        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2673        result = matcher->replaceAll(&replText, &destText, status);
2674        REGEX_CHECK_STATUS;
2675        REGEX_ASSERT(result == &destText);
2676        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2677    }
    // TODO:  need more thorough testing of capture substitutions.
2679
2680    // Bug 4057
2681    //
2682    {
2683        status = U_ZERO_ERROR;
2684const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2685const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2686const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2687        utext_openUTF8(&re, str_ssee, -1, &status);
2688        utext_openUTF8(&dataText, str_blah, -1, &status);
2689        utext_openUTF8(&replText, str_ooh, -1, &status);
2690
2691        RegexMatcher m(&re, 0, status);
2692        REGEX_CHECK_STATUS;
2693
2694        UnicodeString result;
2695        UText resultText = UTEXT_INITIALIZER;
2696        utext_openUnicodeString(&resultText, &result, &status);
2697
        // Multiple finds do NOT bump up the previous appendReplacement position.
2699        m.reset(&dataText);
2700        m.find();
2701        m.find();
2702        m.appendReplacement(&resultText, &replText, status);
2703        REGEX_CHECK_STATUS;
2704        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2705        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2706
2707        // After a reset into the interior of a string, appendReplacement still starts at beginning.
2708        status = U_ZERO_ERROR;
2709        result.truncate(0);
2710        utext_openUnicodeString(&resultText, &result, &status);
2711        m.reset(10, status);
2712        m.find();
2713        m.find();
2714        m.appendReplacement(&resultText, &replText, status);
2715        REGEX_CHECK_STATUS;
2716        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2717        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2718
2719        // find() at interior of string, appendReplacement still starts at beginning.
2720        status = U_ZERO_ERROR;
2721        result.truncate(0);
2722        utext_openUnicodeString(&resultText, &result, &status);
2723        m.reset();
2724        m.find(10, status);
2725        m.find();
2726        m.appendReplacement(&resultText, &replText, status);
2727        REGEX_CHECK_STATUS;
2728        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2729        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2730
2731        m.appendTail(&resultText, status);
2732        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2733        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2734
2735        utext_close(&resultText);
2736    }
2737
2738    delete matcher2;
2739    delete pat2;
2740    delete matcher;
2741    delete pat;
2742
2743    utext_close(&dataText);
2744    utext_close(&replText);
2745    utext_close(&destText);
2746    utext_close(&re);
2747}
2748
2749
2750//---------------------------------------------------------------------------
2751//
2752//      API_Pattern_UTF8  Test that the API for class RegexPattern is
2753//                        present and nominally working.
2754//
2755//---------------------------------------------------------------------------
2756void RegexTest::API_Pattern_UTF8() {
2757    RegexPattern        pata;    // Test default constructor to not crash.
2758    RegexPattern        patb;
2759
2760    REGEX_ASSERT(pata == patb);
2761    REGEX_ASSERT(pata == pata);
2762
2763    UText         re1 = UTEXT_INITIALIZER;
2764    UText         re2 = UTEXT_INITIALIZER;
2765    UErrorCode    status = U_ZERO_ERROR;
2766    UParseError   pe;
2767
2768    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2769    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2770    utext_openUTF8(&re1, str_abcalmz, -1, &status);
2771    utext_openUTF8(&re2, str_def, -1, &status);
2772
2773    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2774    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2775    REGEX_CHECK_STATUS;
2776    REGEX_ASSERT(*pat1 == *pat1);
2777    REGEX_ASSERT(*pat1 != pata);
2778
2779    // Assign
2780    patb = *pat1;
2781    REGEX_ASSERT(patb == *pat1);
2782
2783    // Copy Construct
2784    RegexPattern patc(*pat1);
2785    REGEX_ASSERT(patc == *pat1);
2786    REGEX_ASSERT(patb == patc);
2787    REGEX_ASSERT(pat1 != pat2);
2788    patb = *pat2;
2789    REGEX_ASSERT(patb != patc);
2790    REGEX_ASSERT(patb == *pat2);
2791
2792    // Compile with no flags.
2793    RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2794    REGEX_ASSERT(*pat1a == *pat1);
2795
2796    REGEX_ASSERT(pat1a->flags() == 0);
2797
2798    // Compile with different flags should be not equal
2799    RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2800    REGEX_CHECK_STATUS;
2801
2802    REGEX_ASSERT(*pat1b != *pat1a);
2803    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2804    REGEX_ASSERT(pat1a->flags() == 0);
2805    delete pat1b;
2806
2807    // clone
2808    RegexPattern *pat1c = pat1->clone();
2809    REGEX_ASSERT(*pat1c == *pat1);
2810    REGEX_ASSERT(*pat1c != *pat2);
2811
2812    delete pat1c;
2813    delete pat1a;
2814    delete pat1;
2815    delete pat2;
2816
2817    utext_close(&re1);
2818    utext_close(&re2);
2819
2820
2821    //
2822    //   Verify that a matcher created from a cloned pattern works.
2823    //     (Jitterbug 3423)
2824    //
2825    {
2826        UErrorCode     status     = U_ZERO_ERROR;
2827        UText          pattern    = UTEXT_INITIALIZER;
2828        const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2829        utext_openUTF8(&pattern, str_pL, -1, &status);
2830
2831        RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2832        RegexPattern  *pClone     = pSource->clone();
2833        delete         pSource;
2834        RegexMatcher  *mFromClone = pClone->matcher(status);
2835        REGEX_CHECK_STATUS;
2836
2837        UText          input      = UTEXT_INITIALIZER;
2838        const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2839        utext_openUTF8(&input, str_HelloWorld, -1, &status);
2840        mFromClone->reset(&input);
2841        REGEX_ASSERT(mFromClone->find() == TRUE);
2842        REGEX_ASSERT(mFromClone->group(status) == "Hello");
2843        REGEX_ASSERT(mFromClone->find() == TRUE);
2844        REGEX_ASSERT(mFromClone->group(status) == "World");
2845        REGEX_ASSERT(mFromClone->find() == FALSE);
2846        delete mFromClone;
2847        delete pClone;
2848
2849        utext_close(&input);
2850        utext_close(&pattern);
2851    }
2852
2853    //
2854    //   matches convenience API
2855    //
2856    {
2857        UErrorCode status  = U_ZERO_ERROR;
2858        UText      pattern = UTEXT_INITIALIZER;
2859        UText      input   = UTEXT_INITIALIZER;
2860
2861        const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2862        utext_openUTF8(&input, str_randominput, -1, &status);
2863
2864        const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2865        utext_openUTF8(&pattern, str_dotstar, -1, &status);
2866        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2867        REGEX_CHECK_STATUS;
2868
2869        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2870        utext_openUTF8(&pattern, str_abc, -1, &status);
2871        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2872        REGEX_CHECK_STATUS;
2873
2874        const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2875        utext_openUTF8(&pattern, str_nput, -1, &status);
2876        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2877        REGEX_CHECK_STATUS;
2878
2879        utext_openUTF8(&pattern, str_randominput, -1, &status);
2880        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2881        REGEX_CHECK_STATUS;
2882
2883        const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2884        utext_openUTF8(&pattern, str_u, -1, &status);
2885        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2886        REGEX_CHECK_STATUS;
2887
2888        utext_openUTF8(&input, str_abc, -1, &status);
2889        utext_openUTF8(&pattern, str_abc, -1, &status);
2890        status = U_INDEX_OUTOFBOUNDS_ERROR;
2891        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2892        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2893
2894        utext_close(&input);
2895        utext_close(&pattern);
2896    }
2897
2898
2899    //
2900    // Split()
2901    //
2902    status = U_ZERO_ERROR;
2903    const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2904    utext_openUTF8(&re1, str_spaceplus, -1, &status);
2905    pat1 = RegexPattern::compile(&re1, pe, status);
2906    REGEX_CHECK_STATUS;
2907    UnicodeString  fields[10];
2908
2909    int32_t n;
2910    n = pat1->split("Now is the time", fields, 10, status);
2911    REGEX_CHECK_STATUS;
2912    REGEX_ASSERT(n==4);
2913    REGEX_ASSERT(fields[0]=="Now");
2914    REGEX_ASSERT(fields[1]=="is");
2915    REGEX_ASSERT(fields[2]=="the");
2916    REGEX_ASSERT(fields[3]=="time");
2917    REGEX_ASSERT(fields[4]=="");
2918
2919    n = pat1->split("Now is the time", fields, 2, status);
2920    REGEX_CHECK_STATUS;
2921    REGEX_ASSERT(n==2);
2922    REGEX_ASSERT(fields[0]=="Now");
2923    REGEX_ASSERT(fields[1]=="is the time");
2924    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2925
2926    fields[1] = "*";
2927    status = U_ZERO_ERROR;
2928    n = pat1->split("Now is the time", fields, 1, status);
2929    REGEX_CHECK_STATUS;
2930    REGEX_ASSERT(n==1);
2931    REGEX_ASSERT(fields[0]=="Now is the time");
2932    REGEX_ASSERT(fields[1]=="*");
2933    status = U_ZERO_ERROR;
2934
2935    n = pat1->split("    Now       is the time   ", fields, 10, status);
2936    REGEX_CHECK_STATUS;
2937    REGEX_ASSERT(n==6);
2938    REGEX_ASSERT(fields[0]=="");
2939    REGEX_ASSERT(fields[1]=="Now");
2940    REGEX_ASSERT(fields[2]=="is");
2941    REGEX_ASSERT(fields[3]=="the");
2942    REGEX_ASSERT(fields[4]=="time");
2943    REGEX_ASSERT(fields[5]=="");
2944    REGEX_ASSERT(fields[6]=="");
2945
2946    fields[2] = "*";
2947    n = pat1->split("     ", fields, 10, status);
2948    REGEX_CHECK_STATUS;
2949    REGEX_ASSERT(n==2);
2950    REGEX_ASSERT(fields[0]=="");
2951    REGEX_ASSERT(fields[1]=="");
2952    REGEX_ASSERT(fields[2]=="*");
2953
2954    fields[0] = "foo";
2955    n = pat1->split("", fields, 10, status);
2956    REGEX_CHECK_STATUS;
2957    REGEX_ASSERT(n==0);
2958    REGEX_ASSERT(fields[0]=="foo");
2959
2960    delete pat1;
2961
2962    //  split, with a pattern with (capture)
2963    regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2964    pat1 = RegexPattern::compile(&re1,  pe, status);
2965    REGEX_CHECK_STATUS;
2966
2967    status = U_ZERO_ERROR;
2968    fields[6] = fields[7] = "*";
2969    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2970    REGEX_CHECK_STATUS;
2971    REGEX_ASSERT(n==7);
2972    REGEX_ASSERT(fields[0]=="");
2973    REGEX_ASSERT(fields[1]=="a");
2974    REGEX_ASSERT(fields[2]=="Now is ");
2975    REGEX_ASSERT(fields[3]=="b");
2976    REGEX_ASSERT(fields[4]=="the time");
2977    REGEX_ASSERT(fields[5]=="c");
2978    REGEX_ASSERT(fields[6]=="");
2979    REGEX_ASSERT(fields[7]=="*");
2980    REGEX_ASSERT(status==U_ZERO_ERROR);
2981
2982    fields[6] = fields[7] = "*";
2983    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2984    REGEX_CHECK_STATUS;
2985    REGEX_ASSERT(n==7);
2986    REGEX_ASSERT(fields[0]=="  ");
2987    REGEX_ASSERT(fields[1]=="a");
2988    REGEX_ASSERT(fields[2]=="Now is ");
2989    REGEX_ASSERT(fields[3]=="b");
2990    REGEX_ASSERT(fields[4]=="the time");
2991    REGEX_ASSERT(fields[5]=="c");
2992    REGEX_ASSERT(fields[6]=="");
2993    REGEX_ASSERT(fields[7]=="*");
2994
2995    status = U_ZERO_ERROR;
2996    fields[6] = "foo";
2997    n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2998    REGEX_CHECK_STATUS;
2999    REGEX_ASSERT(n==6);
3000    REGEX_ASSERT(fields[0]=="  ");
3001    REGEX_ASSERT(fields[1]=="a");
3002    REGEX_ASSERT(fields[2]=="Now is ");
3003    REGEX_ASSERT(fields[3]=="b");
3004    REGEX_ASSERT(fields[4]=="the time");
3005    REGEX_ASSERT(fields[5]==" ");
3006    REGEX_ASSERT(fields[6]=="foo");
3007
3008    status = U_ZERO_ERROR;
3009    fields[5] = "foo";
3010    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3011    REGEX_CHECK_STATUS;
3012    REGEX_ASSERT(n==5);
3013    REGEX_ASSERT(fields[0]=="  ");
3014    REGEX_ASSERT(fields[1]=="a");
3015    REGEX_ASSERT(fields[2]=="Now is ");
3016    REGEX_ASSERT(fields[3]=="b");
3017    REGEX_ASSERT(fields[4]=="the time<c>");
3018    REGEX_ASSERT(fields[5]=="foo");
3019
3020    status = U_ZERO_ERROR;
3021    fields[5] = "foo";
3022    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3023    REGEX_CHECK_STATUS;
3024    REGEX_ASSERT(n==5);
3025    REGEX_ASSERT(fields[0]=="  ");
3026    REGEX_ASSERT(fields[1]=="a");
3027    REGEX_ASSERT(fields[2]=="Now is ");
3028    REGEX_ASSERT(fields[3]=="b");
3029    REGEX_ASSERT(fields[4]=="the time");
3030    REGEX_ASSERT(fields[5]=="foo");
3031
3032    status = U_ZERO_ERROR;
3033    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3034    REGEX_CHECK_STATUS;
3035    REGEX_ASSERT(n==4);
3036    REGEX_ASSERT(fields[0]=="  ");
3037    REGEX_ASSERT(fields[1]=="a");
3038    REGEX_ASSERT(fields[2]=="Now is ");
3039    REGEX_ASSERT(fields[3]=="the time<c>");
3040    status = U_ZERO_ERROR;
3041    delete pat1;
3042
3043    regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3044    pat1 = RegexPattern::compile(&re1, pe, status);
3045    REGEX_CHECK_STATUS;
3046    n = pat1->split("1-10,20", fields, 10, status);
3047    REGEX_CHECK_STATUS;
3048    REGEX_ASSERT(n==5);
3049    REGEX_ASSERT(fields[0]=="1");
3050    REGEX_ASSERT(fields[1]=="-");
3051    REGEX_ASSERT(fields[2]=="10");
3052    REGEX_ASSERT(fields[3]==",");
3053    REGEX_ASSERT(fields[4]=="20");
3054    delete pat1;
3055
3056
3057    //
3058    // RegexPattern::pattern() and patternText()
3059    //
3060    pat1 = new RegexPattern();
3061    REGEX_ASSERT(pat1->pattern() == "");
3062    REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3063    delete pat1;
3064    const char *helloWorldInvariant = "(Hello, world)*";
3065    regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3066    pat1 = RegexPattern::compile(&re1, pe, status);
3067    REGEX_CHECK_STATUS;
3068    REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
3069    REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3070    delete pat1;
3071
3072    utext_close(&re1);
3073}
3074
3075
3076//---------------------------------------------------------------------------
3077//
3078//      Extended       A more thorough check for features of regex patterns
3079//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3081//                     A description of the test data format is included in that file.
3082//
3083//---------------------------------------------------------------------------
3084
3085const char *
3086RegexTest::getPath(char buffer[2048], const char *filename) {
3087    UErrorCode status=U_ZERO_ERROR;
3088    const char *testDataDirectory = IntlTest::getSourceTestData(status);
3089    if (U_FAILURE(status)) {
3090        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3091        return NULL;
3092    }
3093
3094    strcpy(buffer, testDataDirectory);
3095    strcat(buffer, filename);
3096    return buffer;
3097}
3098
3099void RegexTest::Extended() {
3100    char tdd[2048];
3101    const char *srcPath;
3102    UErrorCode  status  = U_ZERO_ERROR;
3103    int32_t     lineNum = 0;
3104
3105    //
3106    //  Open and read the test data file.
3107    //
3108    srcPath=getPath(tdd, "regextst.txt");
3109    if(srcPath==NULL) {
3110        return; /* something went wrong, error already output */
3111    }
3112
3113    int32_t    len;
3114    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3115    if (U_FAILURE(status)) {
3116        return; /* something went wrong, error already output */
3117    }
3118
3119    //
3120    //  Put the test data into a UnicodeString
3121    //
3122    UnicodeString testString(FALSE, testData, len);
3123
3124    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3125    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3126    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3127
3128    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3129    UnicodeString   testPattern;   // The pattern for test from the test file.
3130    UnicodeString   testFlags;     // the flags   for a test.
3131    UnicodeString   matchString;   // The marked up string to be used as input
3132
3133    if (U_FAILURE(status)){
3134        dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3135        delete [] testData;
3136        return;
3137    }
3138
3139    //
3140    //  Loop over the test data file, once per line.
3141    //
3142    while (lineMat.find()) {
3143        lineNum++;
3144        if (U_FAILURE(status)) {
3145          errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3146        }
3147
3148        status = U_ZERO_ERROR;
3149        UnicodeString testLine = lineMat.group(1, status);
3150        if (testLine.length() == 0) {
3151            continue;
3152        }
3153
3154        //
3155        // Parse the test line.  Skip blank and comment only lines.
3156        // Separate out the three main fields - pattern, flags, target.
3157        //
3158
3159        commentMat.reset(testLine);
3160        if (commentMat.lookingAt(status)) {
3161            // This line is a comment, or blank.
3162            continue;
3163        }
3164
3165        //
3166        //  Pull out the pattern field, remove it from the test file line.
3167        //
3168        quotedStuffMat.reset(testLine);
3169        if (quotedStuffMat.lookingAt(status)) {
3170            testPattern = quotedStuffMat.group(2, status);
3171            testLine.remove(0, quotedStuffMat.end(0, status));
3172        } else {
3173            errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3174            continue;
3175        }
3176
3177
3178        //
3179        //  Pull out the flags from the test file line.
3180        //
3181        flagsMat.reset(testLine);
3182        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3183        testFlags = flagsMat.group(1, status);
3184        if (flagsMat.group(2, status).length() > 0) {
3185            errln("Bad Match flag at line %d. Scanning %c\n",
3186                lineNum, flagsMat.group(2, status).charAt(0));
3187            continue;
3188        }
3189        testLine.remove(0, flagsMat.end(0, status));
3190
3191        //
3192        //  Pull out the match string, as a whole.
3193        //    We'll process the <tags> later.
3194        //
3195        quotedStuffMat.reset(testLine);
3196        if (quotedStuffMat.lookingAt(status)) {
3197            matchString = quotedStuffMat.group(2, status);
3198            testLine.remove(0, quotedStuffMat.end(0, status));
3199        } else {
3200            errln("Bad match string at test file line %d", lineNum);
3201            continue;
3202        }
3203
3204        //
3205        //  The only thing left from the input line should be an optional trailing comment.
3206        //
3207        commentMat.reset(testLine);
3208        if (commentMat.lookingAt(status) == FALSE) {
3209            errln("Line %d: unexpected characters at end of test line.", lineNum);
3210            continue;
3211        }
3212
3213        //
3214        //  Run the test
3215        //
3216        regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3217    }
3218
3219    delete [] testData;
3220
3221}
3222
3223
3224
3225//---------------------------------------------------------------------------
3226//
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, used in diagnostics.
//         lineNumber is the source line in regextst.txt of the test.
3233//
3234//---------------------------------------------------------------------------
3235
3236
3237//  Set a value into a UVector at position specified by a decimal number in
3238//   a UnicodeString.   This is a utility function needed by the actual test function,
3239//   which follows.
3240static void set(UVector &vec, int32_t val, UnicodeString index) {
3241    UErrorCode  status=U_ZERO_ERROR;
3242    int32_t  idx = 0;
3243    for (int32_t i=0; i<index.length(); i++) {
3244        int32_t d=u_charDigitValue(index.charAt(i));
3245        if (d<0) {return;}
3246        idx = idx*10 + d;
3247    }
3248    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3249    vec.setElementAt(val, idx);
3250}
3251
3252static void setInt(UVector &vec, int32_t val, int32_t idx) {
3253    UErrorCode  status=U_ZERO_ERROR;
3254    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3255    vec.setElementAt(val, idx);
3256}
3257
3258static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3259{
3260    UBool couldFind = TRUE;
3261    UTEXT_SETNATIVEINDEX(utext, 0);
3262    int32_t i = 0;
3263    while (i < unistrOffset) {
3264        UChar32 c = UTEXT_NEXT32(utext);
3265        if (c != U_SENTINEL) {
3266            i += U16_LENGTH(c);
3267        } else {
3268            couldFind = FALSE;
3269            break;
3270        }
3271    }
3272    nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3273    return couldFind;
3274}
3275
3276
3277void RegexTest::regex_find(const UnicodeString &pattern,
3278                           const UnicodeString &flags,
3279                           const UnicodeString &inputString,
3280                           const char *srcPath,
3281                           int32_t line) {
3282    UnicodeString       unEscapedInput;
3283    UnicodeString       deTaggedInput;
3284
3285    int32_t             patternUTF8Length,      inputUTF8Length;
3286    char                *patternChars  = NULL, *inputChars = NULL;
3287    UText               patternText    = UTEXT_INITIALIZER;
3288    UText               inputText      = UTEXT_INITIALIZER;
3289    UConverter          *UTF8Converter = NULL;
3290
3291    UErrorCode          status         = U_ZERO_ERROR;
3292    UParseError         pe;
3293    RegexPattern        *parsePat      = NULL;
3294    RegexMatcher        *parseMatcher  = NULL;
3295    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3296    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3297    UVector             groupStarts(status);
3298    UVector             groupEnds(status);
3299    UVector             groupStartsUTF8(status);
3300    UVector             groupEndsUTF8(status);
3301    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3302    UBool               failed         = FALSE;
3303    int32_t             numFinds;
3304    int32_t             i;
3305    UBool               useMatchesFunc   = FALSE;
3306    UBool               useLookingAtFunc = FALSE;
3307    int32_t             regionStart      = -1;
3308    int32_t             regionEnd        = -1;
3309    int32_t             regionStartUTF8  = -1;
3310    int32_t             regionEndUTF8    = -1;
3311
3312
3313    //
3314    //  Compile the caller's pattern
3315    //
3316    uint32_t bflags = 0;
3317    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3318        bflags |= UREGEX_CASE_INSENSITIVE;
3319    }
3320    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3321        bflags |= UREGEX_COMMENTS;
3322    }
3323    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3324        bflags |= UREGEX_DOTALL;
3325    }
3326    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3327        bflags |= UREGEX_MULTILINE;
3328    }
3329
3330    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3331        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3332    }
3333    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3334        bflags |= UREGEX_UNIX_LINES;
3335    }
3336    if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3337        bflags |= UREGEX_LITERAL;
3338    }
3339
3340
3341    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3342    if (status != U_ZERO_ERROR) {
3343        #if UCONFIG_NO_BREAK_ITERATION==1
3344        // 'v' test flag means that the test pattern should not compile if ICU was configured
3345        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3346        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3347            goto cleanupAndReturn;
3348        }
3349        #endif
3350        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3351            // Expected pattern compilation error.
3352            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3353                logln("Pattern Compile returns \"%s\"", u_errorName(status));
3354            }
3355            goto cleanupAndReturn;
3356        } else {
3357            // Unexpected pattern compilation error.
3358            dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3359            goto cleanupAndReturn;
3360        }
3361    }
3362
3363    UTF8Converter = ucnv_open("UTF8", &status);
3364    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3365
3366    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3367    status = U_ZERO_ERROR; // buffer overflow
3368    patternChars = new char[patternUTF8Length+1];
3369    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3370    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3371
3372    if (status == U_ZERO_ERROR) {
3373        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3374
3375        if (status != U_ZERO_ERROR) {
3376#if UCONFIG_NO_BREAK_ITERATION==1
3377            // 'v' test flag means that the test pattern should not compile if ICU was configured
3378            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3379            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3380                goto cleanupAndReturn;
3381            }
3382#endif
3383            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3384                // Expected pattern compilation error.
3385                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3386                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3387                }
3388                goto cleanupAndReturn;
3389            } else {
3390                // Unexpected pattern compilation error.
3391                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3392                goto cleanupAndReturn;
3393            }
3394        }
3395    }
3396
3397    if (UTF8Pattern == NULL) {
3398        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3399        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3400        status = U_ZERO_ERROR;
3401    }
3402
3403    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3404        RegexPatternDump(callerPattern);
3405    }
3406
3407    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3408        errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3409        goto cleanupAndReturn;
3410    }
3411
3412
3413    //
3414    // Number of times find() should be called on the test string, default to 1
3415    //
3416    numFinds = 1;
3417    for (i=2; i<=9; i++) {
3418        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3419            if (numFinds != 1) {
3420                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3421                goto cleanupAndReturn;
3422            }
3423            numFinds = i;
3424        }
3425    }
3426
3427    // 'M' flag.  Use matches() instead of find()
3428    if (flags.indexOf((UChar)0x4d) >= 0) {
3429        useMatchesFunc = TRUE;
3430    }
3431    if (flags.indexOf((UChar)0x4c) >= 0) {
3432        useLookingAtFunc = TRUE;
3433    }
3434
3435    //
3436    //  Find the tags in the input data, remove them, and record the group boundary
3437    //    positions.
3438    //
3439    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3440    REGEX_CHECK_STATUS_L(line);
3441
3442    unEscapedInput = inputString.unescape();
3443    parseMatcher = parsePat->matcher(unEscapedInput, status);
3444    REGEX_CHECK_STATUS_L(line);
3445    while(parseMatcher->find()) {
3446        parseMatcher->appendReplacement(deTaggedInput, "", status);
3447        REGEX_CHECK_STATUS;
3448        UnicodeString groupNum = parseMatcher->group(2, status);
3449        if (groupNum == "r") {
3450            // <r> or </r>, a region specification within the string
3451            if (parseMatcher->group(1, status) == "/") {
3452                regionEnd = deTaggedInput.length();
3453            } else {
3454                regionStart = deTaggedInput.length();
3455            }
3456        } else {
3457            // <digits> or </digits>, a group match boundary tag.
3458            if (parseMatcher->group(1, status) == "/") {
3459                set(groupEnds, deTaggedInput.length(), groupNum);
3460            } else {
3461                set(groupStarts, deTaggedInput.length(), groupNum);
3462            }
3463        }
3464    }
3465    parseMatcher->appendTail(deTaggedInput);
3466    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3467    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3468      errln("mismatched <r> tags");
3469      failed = TRUE;
3470      goto cleanupAndReturn;
3471    }
3472
3473    //
3474    //  Configure the matcher according to the flags specified with this test.
3475    //
3476    matcher = callerPattern->matcher(deTaggedInput, status);
3477    REGEX_CHECK_STATUS_L(line);
3478    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3479        matcher->setTrace(TRUE);
3480    }
3481
3482    if (UTF8Pattern != NULL) {
3483        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3484        status = U_ZERO_ERROR; // buffer overflow
3485        inputChars = new char[inputUTF8Length+1];
3486        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3487        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3488
3489        if (status == U_ZERO_ERROR) {
3490            UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3491            REGEX_CHECK_STATUS_L(line);
3492        }
3493
3494        if (UTF8Matcher == NULL) {
3495            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3496          logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3497            status = U_ZERO_ERROR;
3498        }
3499    }
3500
3501    //
3502    //  Generate native indices for UTF8 versions of region and capture group info
3503    //
3504    if (UTF8Matcher != NULL) {
3505        if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3506        if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3507
3508        //  Fill out the native index UVector info.
3509        //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3510        for (i=0; i<groupStarts.size(); i++) {
3511            int32_t  start = groupStarts.elementAti(i);
3512            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3513            if (start >= 0) {
3514                int32_t  startUTF8;
3515                if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3516                    errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3517                    failed = TRUE;
3518                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3519                }
3520                setInt(groupStartsUTF8, startUTF8, i);
3521            }
3522
3523            int32_t  end = groupEnds.elementAti(i);
3524            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3525            if (end >= 0) {
3526                int32_t  endUTF8;
3527                if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3528                    errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3529                    failed = TRUE;
3530                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3531                }
3532                setInt(groupEndsUTF8, endUTF8, i);
3533            }
3534        }
3535    }
3536
3537    if (regionStart>=0) {
3538       matcher->region(regionStart, regionEnd, status);
3539       REGEX_CHECK_STATUS_L(line);
3540       if (UTF8Matcher != NULL) {
3541           UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3542           REGEX_CHECK_STATUS_L(line);
3543       }
3544    }
3545    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3546        matcher->useAnchoringBounds(FALSE);
3547        if (UTF8Matcher != NULL) {
3548            UTF8Matcher->useAnchoringBounds(FALSE);
3549        }
3550    }
3551    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3552        matcher->useTransparentBounds(TRUE);
3553        if (UTF8Matcher != NULL) {
3554            UTF8Matcher->useTransparentBounds(TRUE);
3555        }
3556    }
3557
3558
3559
3560    //
3561    // Do a find on the de-tagged input using the caller's pattern
3562    //     TODO: error on count>1 and not find().
3563    //           error on both matches() and lookingAt().
3564    //
3565    for (i=0; i<numFinds; i++) {
3566        if (useMatchesFunc) {
3567            isMatch = matcher->matches(status);
3568            if (UTF8Matcher != NULL) {
3569               isUTF8Match = UTF8Matcher->matches(status);
3570            }
3571        } else  if (useLookingAtFunc) {
3572            isMatch = matcher->lookingAt(status);
3573            if (UTF8Matcher != NULL) {
3574                isUTF8Match = UTF8Matcher->lookingAt(status);
3575            }
3576        } else {
3577            isMatch = matcher->find();
3578            if (UTF8Matcher != NULL) {
3579                isUTF8Match = UTF8Matcher->find();
3580            }
3581        }
3582    }
3583    matcher->setTrace(FALSE);
3584    if (U_FAILURE(status)) {
3585        errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3586    }
3587
3588    //
3589    // Match up the groups from the find() with the groups from the tags
3590    //
3591
3592    // number of tags should match number of groups from find operation.
3593    // matcher->groupCount does not include group 0, the entire match, hence the +1.
3594    //   G option in test means that capture group data is not available in the
3595    //     expected results, so the check needs to be suppressed.
3596    if (isMatch == FALSE && groupStarts.size() != 0) {
3597        dataerrln("Error at line %d:  Match expected, but none found.", line);
3598        failed = TRUE;
3599        goto cleanupAndReturn;
3600    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3601        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3602        failed = TRUE;
3603        goto cleanupAndReturn;
3604    }
3605
3606    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3607        // Only check for match / no match.  Don't check capture groups.
3608        if (isMatch && groupStarts.size() == 0) {
3609            errln("Error at line %d:  No match expected, but one found.", line);
3610            failed = TRUE;
3611        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3612            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3613            failed = TRUE;
3614        }
3615        goto cleanupAndReturn;
3616    }
3617
3618    REGEX_CHECK_STATUS_L(line);
3619    for (i=0; i<=matcher->groupCount(); i++) {
3620        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3621        int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3622        if (matcher->start(i, status) != expectedStart) {
3623            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3624                line, i, expectedStart, matcher->start(i, status));
3625            failed = TRUE;
3626            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3627        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3628            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3629                  line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3630            failed = TRUE;
3631            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3632        }
3633
3634        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3635        int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3636        if (matcher->end(i, status) != expectedEnd) {
3637            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3638                line, i, expectedEnd, matcher->end(i, status));
3639            failed = TRUE;
3640            // Error on end position;  keep going; real error is probably yet to come as group
3641            //   end positions work from end of the input data towards the front.
3642        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3643            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3644                  line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3645            failed = TRUE;
3646            // Error on end position;  keep going; real error is probably yet to come as group
3647            //   end positions work from end of the input data towards the front.
3648        }
3649    }
3650    if ( matcher->groupCount()+1 < groupStarts.size()) {
3651        errln("Error at line %d: Expected %d capture groups, found %d.",
3652            line, groupStarts.size()-1, matcher->groupCount());
3653        failed = TRUE;
3654        }
3655    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3656        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3657              line, groupStarts.size()-1, UTF8Matcher->groupCount());
3658        failed = TRUE;
3659    }
3660
3661    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3662        matcher->requireEnd() == TRUE) {
3663        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3664        failed = TRUE;
3665    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3666        UTF8Matcher->requireEnd() == TRUE) {
3667        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3668        failed = TRUE;
3669    }
3670
3671    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3672        matcher->requireEnd() == FALSE) {
3673        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3674        failed = TRUE;
3675    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3676        UTF8Matcher->requireEnd() == FALSE) {
3677        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3678        failed = TRUE;
3679    }
3680
3681    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3682        matcher->hitEnd() == TRUE) {
3683        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3684        failed = TRUE;
3685    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3686               UTF8Matcher->hitEnd() == TRUE) {
3687        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3688        failed = TRUE;
3689    }
3690
3691    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3692        matcher->hitEnd() == FALSE) {
3693        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3694        failed = TRUE;
3695    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3696               UTF8Matcher->hitEnd() == FALSE) {
3697        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3698        failed = TRUE;
3699    }
3700
3701
3702cleanupAndReturn:
3703    if (failed) {
3704        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3705            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3706        // callerPattern->dump();
3707    }
3708    delete parseMatcher;
3709    delete parsePat;
3710    delete UTF8Matcher;
3711    delete UTF8Pattern;
3712    delete matcher;
3713    delete callerPattern;
3714
3715    utext_close(&inputText);
3716    delete[] inputChars;
3717    utext_close(&patternText);
3718    delete[] patternChars;
3719    ucnv_close(UTF8Converter);
3720}
3721
3722
3723
3724
3725//---------------------------------------------------------------------------
3726//
3727//      Errors     Check for error handling in patterns.
3728//
3729//---------------------------------------------------------------------------
// Feeds a series of malformed patterns to the regex compiler via REGEX_ERR.
//   NOTE(review): the REGEX_ERR arguments appear to be (pattern, expected error line,
//   expected error column, expected status); the macro is defined outside this
//   section - confirm against its definition.
void RegexTest::Errors() {
    // \escape sequences that aren't implemented yet.
    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);

    // Extra close paren
    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

    // Look-ahead, Look-behind
    //  TODO:  add tests for unbounded length look-behinds.
    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct

    // Attempt to use non-default flags
    //   Compiling with this flag combination is expected to report U_REGEX_UNIMPLEMENTED.
    {
        UParseError   pe;
        UErrorCode    status = U_ZERO_ERROR;
        int32_t       flags  = UREGEX_CANON_EQ |
                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
                               UREGEX_MULTILINE;
        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
        // Deleting is harmless whether or not compile() returned a pattern object.
        delete pat1;
    }


    // Quantifiers are allowed only after something that can be quantified.
    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);      // error located on the second line of the pattern
    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);

    // Mal-formed {min,max} quantifiers
    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
    REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

    // Ticket 5389
    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

    // Invalid Back Reference \0
    //    For ICU 3.8 and earlier
    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
    //    The pattern below ends right after the \0, and the test expects
    //    U_REGEX_BAD_ESCAPE_SEQUENCE for it.
    //
    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);

}
3787
3788
3789//-------------------------------------------------------------------------------
3790//
3791//  Read a text data file, convert it to UChars, and return the data
3792//    in one big UChar * buffer, which the caller must delete.
3793//
3794//--------------------------------------------------------------------------------
3795UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3796                                     const char *defEncoding, UErrorCode &status) {
3797    UChar       *retPtr  = NULL;
3798    char        *fileBuf = NULL;
3799    UConverter* conv     = NULL;
3800    FILE        *f       = NULL;
3801
3802    ulen = 0;
3803    if (U_FAILURE(status)) {
3804        return retPtr;
3805    }
3806
3807    //
3808    //  Open the file.
3809    //
3810    f = fopen(fileName, "rb");
3811    if (f == 0) {
3812        dataerrln("Error opening test data file %s\n", fileName);
3813        status = U_FILE_ACCESS_ERROR;
3814        return NULL;
3815    }
3816    //
3817    //  Read it in
3818    //
3819    int32_t            fileSize;
3820    int32_t            amt_read;
3821
3822    fseek( f, 0, SEEK_END);
3823    fileSize = ftell(f);
3824    fileBuf = new char[fileSize];
3825    fseek(f, 0, SEEK_SET);
3826    amt_read = fread(fileBuf, 1, fileSize, f);
3827    if (amt_read != fileSize || fileSize <= 0) {
3828        errln("Error reading test data file.");
3829        goto cleanUpAndReturn;
3830    }
3831
3832    //
3833    // Look for a Unicode Signature (BOM) on the data just read
3834    //
3835    int32_t        signatureLength;
3836    const char *   fileBufC;
3837    const char*    encoding;
3838
3839    fileBufC = fileBuf;
3840    encoding = ucnv_detectUnicodeSignature(
3841        fileBuf, fileSize, &signatureLength, &status);
3842    if(encoding!=NULL ){
3843        fileBufC  += signatureLength;
3844        fileSize  -= signatureLength;
3845    } else {
3846        encoding = defEncoding;
3847        if (strcmp(encoding, "utf-8") == 0) {
3848            errln("file %s is missing its BOM", fileName);
3849        }
3850    }
3851
3852    //
3853    // Open a converter to take the rule file to UTF-16
3854    //
3855    conv = ucnv_open(encoding, &status);
3856    if (U_FAILURE(status)) {
3857        goto cleanUpAndReturn;
3858    }
3859
3860    //
3861    // Convert the rules to UChar.
3862    //  Preflight first to determine required buffer size.
3863    //
3864    ulen = ucnv_toUChars(conv,
3865        NULL,           //  dest,
3866        0,              //  destCapacity,
3867        fileBufC,
3868        fileSize,
3869        &status);
3870    if (status == U_BUFFER_OVERFLOW_ERROR) {
3871        // Buffer Overflow is expected from the preflight operation.
3872        status = U_ZERO_ERROR;
3873
3874        retPtr = new UChar[ulen+1];
3875        ucnv_toUChars(conv,
3876            retPtr,       //  dest,
3877            ulen+1,
3878            fileBufC,
3879            fileSize,
3880            &status);
3881    }
3882
3883cleanUpAndReturn:
3884    fclose(f);
3885    delete[] fileBuf;
3886    ucnv_close(conv);
3887    if (U_FAILURE(status)) {
3888        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3889        delete []retPtr;
3890        retPtr = 0;
3891        ulen   = 0;
3892    };
3893    return retPtr;
3894}
3895
3896
3897//-------------------------------------------------------------------------------
3898//
3899//   PerlTests  - Run Perl's regular expression tests
3900//                The input file for this test is re_tests, the standard regular
3901//                expression test data distributed with the Perl source code.
3902//
3903//                Here is Perl's description of the test data file:
3904//
3905//        # The tests are in a separate file 't/op/re_tests'.
3906//        # Each line in that file is a separate test.
3907//        # There are five columns, separated by tabs.
3908//        #
3909//        # Column 1 contains the pattern, optionally enclosed in C<''>.
3910//        # Modifiers can be put after the closing C<'>.
3911//        #
3912//        # Column 2 contains the string to be matched.
3913//        #
3914//        # Column 3 contains the expected result:
3915//        #     y   expect a match
3916//        #     n   expect no match
3917//        #     c   expect an error
3918//        # B   test exposes a known bug in Perl, should be skipped
3919//        # b   test exposes a known bug in Perl, should be skipped if noamp
3920//        #
3921//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3922//        #
3923//        # Column 4 contains a string, usually C<$&>.
3924//        #
3925//        # Column 5 contains the expected result of double-quote
3926//        # interpolating that string after the match, or start of error message.
3927//        #
3928//        # Column 6, if present, contains a reason why the test is skipped.
3929//        # This is printed with "skipped", for harness to pick up.
3930//        #
3931//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3932//        #
3933//        # If you want to add a regular expression test that can't be expressed
3934//        # in this format, don't add it here: put it in op/pat.t instead.
3935//
3936//        For ICU, if field 3 contains an 'i', the test will be skipped.
//        Such a test exposes some known incompatibility between ICU and Perl regexps.
3938//        (The i is in addition to whatever was there before.)
3939//
3940//-------------------------------------------------------------------------------
3941void RegexTest::PerlTests() {
3942    char tdd[2048];
3943    const char *srcPath;
3944    UErrorCode  status = U_ZERO_ERROR;
3945    UParseError pe;
3946
3947    //
3948    //  Open and read the test data file.
3949    //
3950    srcPath=getPath(tdd, "re_tests.txt");
3951    if(srcPath==NULL) {
3952        return; /* something went wrong, error already output */
3953    }
3954
3955    int32_t    len;
3956    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3957    if (U_FAILURE(status)) {
3958        return; /* something went wrong, error already output */
3959    }
3960
3961    //
3962    //  Put the test data into a UnicodeString
3963    //
3964    UnicodeString testDataString(FALSE, testData, len);
3965
3966    //
3967    //  Regex to break the input file into lines, and strip the new lines.
3968    //     One line per match, capture group one is the desired data.
3969    //
3970    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3971    if (U_FAILURE(status)) {
3972        dataerrln("RegexPattern::compile() error");
3973        return;
3974    }
3975    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3976
3977    //
3978    //  Regex to split a test file line into fields.
3979    //    There are six fields, separated by tabs.
3980    //
3981    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3982
3983    //
3984    //  Regex to identify test patterns with flag settings, and to separate them.
3985    //    Test patterns with flags look like 'pattern'i
3986    //    Test patterns without flags are not quoted:   pattern
3987    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3988    //
3989    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3990    RegexMatcher* flagMat = flagPat->matcher(status);
3991
3992    //
3993    // The Perl tests reference several perl-isms, which are evaluated/substituted
3994    //   in the test data.  Not being perl, this must be done explicitly.  Here
3995    //   are string constants and REs for these constructs.
3996    //
3997    UnicodeString nulnulSrc("${nulnul}");
3998    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3999    nulnul = nulnul.unescape();
4000
4001    UnicodeString ffffSrc("${ffff}");
4002    UnicodeString ffff("\\uffff", -1, US_INV);
4003    ffff = ffff.unescape();
4004
4005    //  regexp for $-[0], $+[2], etc.
4006    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4007    RegexMatcher *groupsMat = groupsPat->matcher(status);
4008
4009    //  regexp for $0, $1, $2, etc.
4010    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4011    RegexMatcher *cgMat = cgPat->matcher(status);
4012
4013
4014    //
4015    // Main Loop for the Perl Tests, runs once per line from the
4016    //   test data file.
4017    //
4018    int32_t  lineNum = 0;
4019    int32_t  skippedUnimplementedCount = 0;
4020    while (lineMat->find()) {
4021        lineNum++;
4022
4023        //
4024        //  Get a line, break it into its fields, do the Perl
4025        //    variable substitutions.
4026        //
4027        UnicodeString line = lineMat->group(1, status);
4028        UnicodeString fields[7];
4029        fieldPat->split(line, fields, 7, status);
4030
4031        flagMat->reset(fields[0]);
4032        flagMat->matches(status);
4033        UnicodeString pattern  = flagMat->group(2, status);
4034        pattern.findAndReplace("${bang}", "!");
4035        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4036        pattern.findAndReplace(ffffSrc, ffff);
4037
4038        //
4039        //  Identify patterns that include match flag settings,
4040        //    split off the flags, remove the extra quotes.
4041        //
4042        UnicodeString flagStr = flagMat->group(3, status);
4043        if (U_FAILURE(status)) {
4044            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4045            return;
4046        }
4047        int32_t flags = 0;
4048        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4049        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4050        const UChar UChar_m = 0x6d;
4051        const UChar UChar_x = 0x78;
4052        const UChar UChar_y = 0x79;
4053        if (flagStr.indexOf(UChar_i) != -1) {
4054            flags |= UREGEX_CASE_INSENSITIVE;
4055        }
4056        if (flagStr.indexOf(UChar_m) != -1) {
4057            flags |= UREGEX_MULTILINE;
4058        }
4059        if (flagStr.indexOf(UChar_x) != -1) {
4060            flags |= UREGEX_COMMENTS;
4061        }
4062
4063        //
4064        // Compile the test pattern.
4065        //
4066        status = U_ZERO_ERROR;
4067        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4068        if (status == U_REGEX_UNIMPLEMENTED) {
4069            //
4070            // Test of a feature that is planned for ICU, but not yet implemented.
4071            //   skip the test.
4072            skippedUnimplementedCount++;
4073            delete testPat;
4074            status = U_ZERO_ERROR;
4075            continue;
4076        }
4077
4078        if (U_FAILURE(status)) {
4079            // Some tests are supposed to generate errors.
4080            //   Only report an error for tests that are supposed to succeed.
4081            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4082                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4083            {
4084                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4085            }
4086            status = U_ZERO_ERROR;
4087            delete testPat;
4088            continue;
4089        }
4090
4091        if (fields[2].indexOf(UChar_i) >= 0) {
4092            // ICU should skip this test.
4093            delete testPat;
4094            continue;
4095        }
4096
4097        if (fields[2].indexOf(UChar_c) >= 0) {
4098            // This pattern should have caused a compilation error, but didn't/
4099            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4100            delete testPat;
4101            continue;
4102        }
4103
4104        //
4105        // replace the Perl variables that appear in some of the
4106        //   match data strings.
4107        //
4108        UnicodeString matchString = fields[1];
4109        matchString.findAndReplace(nulnulSrc, nulnul);
4110        matchString.findAndReplace(ffffSrc,   ffff);
4111
4112        // Replace any \n in the match string with an actual new-line char.
4113        //  Don't do full unescape, as this unescapes more than Perl does, which
4114        //  causes other spurious failures in the tests.
4115        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4116
4117
4118
4119        //
4120        // Run the test, check for expected match/don't match result.
4121        //
4122        RegexMatcher *testMat = testPat->matcher(matchString, status);
4123        UBool found = testMat->find();
4124        UBool expected = FALSE;
4125        if (fields[2].indexOf(UChar_y) >=0) {
4126            expected = TRUE;
4127        }
4128        if (expected != found) {
4129            errln("line %d: Expected %smatch, got %smatch",
4130                lineNum, expected?"":"no ", found?"":"no " );
4131            continue;
4132        }
4133
4134        // Don't try to check expected results if there is no match.
4135        //   (Some have stuff in the expected fields)
4136        if (!found) {
4137            delete testMat;
4138            delete testPat;
4139            continue;
4140        }
4141
4142        //
4143        // Interpret the Perl expression from the fourth field of the data file,
4144        // building up an ICU string from the results of the ICU match.
4145        //   The Perl expression will contain references to the results of
4146        //     a regex match, including the matched string, capture group strings,
4147        //     group starting and ending indicies, etc.
4148        //
4149        UnicodeString resultString;
4150        UnicodeString perlExpr = fields[3];
4151#if SUPPORT_MUTATING_INPUT_STRING
4152        groupsMat->reset(perlExpr);
4153        cgMat->reset(perlExpr);
4154#endif
4155
4156        while (perlExpr.length() > 0) {
4157#if !SUPPORT_MUTATING_INPUT_STRING
4158            //  Perferred usage.  Reset after any modification to input string.
4159            groupsMat->reset(perlExpr);
4160            cgMat->reset(perlExpr);
4161#endif
4162
4163            if (perlExpr.startsWith("$&")) {
4164                resultString.append(testMat->group(status));
4165                perlExpr.remove(0, 2);
4166            }
4167
4168            else if (groupsMat->lookingAt(status)) {
4169                // $-[0]   $+[2]  etc.
4170                UnicodeString digitString = groupsMat->group(2, status);
4171                int32_t t = 0;
4172                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4173                UnicodeString plusOrMinus = groupsMat->group(1, status);
4174                int32_t matchPosition;
4175                if (plusOrMinus.compare("+") == 0) {
4176                    matchPosition = testMat->end(groupNum, status);
4177                } else {
4178                    matchPosition = testMat->start(groupNum, status);
4179                }
4180                if (matchPosition != -1) {
4181                    ICU_Utility::appendNumber(resultString, matchPosition);
4182                }
4183                perlExpr.remove(0, groupsMat->end(status));
4184            }
4185
4186            else if (cgMat->lookingAt(status)) {
4187                // $1, $2, $3, etc.
4188                UnicodeString digitString = cgMat->group(1, status);
4189                int32_t t = 0;
4190                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4191                if (U_SUCCESS(status)) {
4192                    resultString.append(testMat->group(groupNum, status));
4193                    status = U_ZERO_ERROR;
4194                }
4195                perlExpr.remove(0, cgMat->end(status));
4196            }
4197
4198            else if (perlExpr.startsWith("@-")) {
4199                int32_t i;
4200                for (i=0; i<=testMat->groupCount(); i++) {
4201                    if (i>0) {
4202                        resultString.append(" ");
4203                    }
4204                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4205                }
4206                perlExpr.remove(0, 2);
4207            }
4208
4209            else if (perlExpr.startsWith("@+")) {
4210                int32_t i;
4211                for (i=0; i<=testMat->groupCount(); i++) {
4212                    if (i>0) {
4213                        resultString.append(" ");
4214                    }
4215                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4216                }
4217                perlExpr.remove(0, 2);
4218            }
4219
4220            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4221                                                     //           or as an escaped sequence (e.g. \n)
4222                if (perlExpr.length() > 1) {
4223                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4224                }
4225                UChar c = perlExpr.charAt(0);
4226                switch (c) {
4227                case 'n':   c = '\n'; break;
4228                // add any other escape sequences that show up in the test expected results.
4229                }
4230                resultString.append(c);
4231                perlExpr.remove(0, 1);
4232            }
4233
4234            else  {
4235                // Any characters from the perl expression that we don't explicitly
4236                //  recognize before here are assumed to be literals and copied
4237                //  as-is to the expected results.
4238                resultString.append(perlExpr.charAt(0));
4239                perlExpr.remove(0, 1);
4240            }
4241
4242            if (U_FAILURE(status)) {
4243                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4244                break;
4245            }
4246        }
4247
4248        //
4249        // Expected Results Compare
4250        //
4251        UnicodeString expectedS(fields[4]);
4252        expectedS.findAndReplace(nulnulSrc, nulnul);
4253        expectedS.findAndReplace(ffffSrc,   ffff);
4254        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4255
4256
4257        if (expectedS.compare(resultString) != 0) {
4258            err("Line %d: Incorrect perl expression results.", lineNum);
4259            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4260        }
4261
4262        delete testMat;
4263        delete testPat;
4264    }
4265
4266    //
4267    // All done.  Clean up allocated stuff.
4268    //
4269    delete cgMat;
4270    delete cgPat;
4271
4272    delete groupsMat;
4273    delete groupsPat;
4274
4275    delete flagMat;
4276    delete flagPat;
4277
4278    delete lineMat;
4279    delete linePat;
4280
4281    delete fieldPat;
4282    delete [] testData;
4283
4284
4285    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4286
4287}
4288
4289
4290//-------------------------------------------------------------------------------
4291//
4292//   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4293//                  (instead of using UnicodeStrings) to test the alternate engine.
4294//                  The input file for this test is re_tests, the standard regular
4295//                  expression test data distributed with the Perl source code.
4296//                  See PerlTests() for more information.
4297//
4298//-------------------------------------------------------------------------------
4299void RegexTest::PerlTestsUTF8() {
4300    char tdd[2048];
4301    const char *srcPath;
4302    UErrorCode  status = U_ZERO_ERROR;
4303    UParseError pe;
4304    LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4305    UText       patternText = UTEXT_INITIALIZER;
4306    char       *patternChars = NULL;
4307    int32_t     patternLength;
4308    int32_t     patternCapacity = 0;
4309    UText       inputText = UTEXT_INITIALIZER;
4310    char       *inputChars = NULL;
4311    int32_t     inputLength;
4312    int32_t     inputCapacity = 0;
4313
4314    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4315
4316    //
4317    //  Open and read the test data file.
4318    //
4319    srcPath=getPath(tdd, "re_tests.txt");
4320    if(srcPath==NULL) {
4321        return; /* something went wrong, error already output */
4322    }
4323
4324    int32_t    len;
4325    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4326    if (U_FAILURE(status)) {
4327        return; /* something went wrong, error already output */
4328    }
4329
4330    //
4331    //  Put the test data into a UnicodeString
4332    //
4333    UnicodeString testDataString(FALSE, testData, len);
4334
4335    //
4336    //  Regex to break the input file into lines, and strip the new lines.
4337    //     One line per match, capture group one is the desired data.
4338    //
4339    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4340    if (U_FAILURE(status)) {
4341        dataerrln("RegexPattern::compile() error");
4342        return;
4343    }
4344    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4345
4346    //
4347    //  Regex to split a test file line into fields.
4348    //    There are six fields, separated by tabs.
4349    //
4350    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4351
4352    //
4353    //  Regex to identify test patterns with flag settings, and to separate them.
4354    //    Test patterns with flags look like 'pattern'i
4355    //    Test patterns without flags are not quoted:   pattern
4356    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4357    //
4358    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4359    RegexMatcher* flagMat = flagPat->matcher(status);
4360
4361    //
4362    // The Perl tests reference several perl-isms, which are evaluated/substituted
4363    //   in the test data.  Not being perl, this must be done explicitly.  Here
4364    //   are string constants and REs for these constructs.
4365    //
4366    UnicodeString nulnulSrc("${nulnul}");
4367    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4368    nulnul = nulnul.unescape();
4369
4370    UnicodeString ffffSrc("${ffff}");
4371    UnicodeString ffff("\\uffff", -1, US_INV);
4372    ffff = ffff.unescape();
4373
4374    //  regexp for $-[0], $+[2], etc.
4375    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4376    RegexMatcher *groupsMat = groupsPat->matcher(status);
4377
4378    //  regexp for $0, $1, $2, etc.
4379    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4380    RegexMatcher *cgMat = cgPat->matcher(status);
4381
4382
4383    //
4384    // Main Loop for the Perl Tests, runs once per line from the
4385    //   test data file.
4386    //
4387    int32_t  lineNum = 0;
4388    int32_t  skippedUnimplementedCount = 0;
4389    while (lineMat->find()) {
4390        lineNum++;
4391
4392        //
4393        //  Get a line, break it into its fields, do the Perl
4394        //    variable substitutions.
4395        //
4396        UnicodeString line = lineMat->group(1, status);
4397        UnicodeString fields[7];
4398        fieldPat->split(line, fields, 7, status);
4399
4400        flagMat->reset(fields[0]);
4401        flagMat->matches(status);
4402        UnicodeString pattern  = flagMat->group(2, status);
4403        pattern.findAndReplace("${bang}", "!");
4404        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4405        pattern.findAndReplace(ffffSrc, ffff);
4406
4407        //
4408        //  Identify patterns that include match flag settings,
4409        //    split off the flags, remove the extra quotes.
4410        //
4411        UnicodeString flagStr = flagMat->group(3, status);
4412        if (U_FAILURE(status)) {
4413            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4414            return;
4415        }
4416        int32_t flags = 0;
4417        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4418        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4419        const UChar UChar_m = 0x6d;
4420        const UChar UChar_x = 0x78;
4421        const UChar UChar_y = 0x79;
4422        if (flagStr.indexOf(UChar_i) != -1) {
4423            flags |= UREGEX_CASE_INSENSITIVE;
4424        }
4425        if (flagStr.indexOf(UChar_m) != -1) {
4426            flags |= UREGEX_MULTILINE;
4427        }
4428        if (flagStr.indexOf(UChar_x) != -1) {
4429            flags |= UREGEX_COMMENTS;
4430        }
4431
4432        //
4433        // Put the pattern in a UTF-8 UText
4434        //
4435        status = U_ZERO_ERROR;
4436        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4437        if (status == U_BUFFER_OVERFLOW_ERROR) {
4438            status = U_ZERO_ERROR;
4439            delete[] patternChars;
4440            patternCapacity = patternLength + 1;
4441            patternChars = new char[patternCapacity];
4442            pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4443        }
4444        utext_openUTF8(&patternText, patternChars, patternLength, &status);
4445
4446        //
4447        // Compile the test pattern.
4448        //
4449        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4450        if (status == U_REGEX_UNIMPLEMENTED) {
4451            //
4452            // Test of a feature that is planned for ICU, but not yet implemented.
4453            //   skip the test.
4454            skippedUnimplementedCount++;
4455            delete testPat;
4456            status = U_ZERO_ERROR;
4457            continue;
4458        }
4459
4460        if (U_FAILURE(status)) {
4461            // Some tests are supposed to generate errors.
4462            //   Only report an error for tests that are supposed to succeed.
4463            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4464                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4465            {
4466                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4467            }
4468            status = U_ZERO_ERROR;
4469            delete testPat;
4470            continue;
4471        }
4472
4473        if (fields[2].indexOf(UChar_i) >= 0) {
4474            // ICU should skip this test.
4475            delete testPat;
4476            continue;
4477        }
4478
4479        if (fields[2].indexOf(UChar_c) >= 0) {
4480            // This pattern should have caused a compilation error, but didn't/
4481            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4482            delete testPat;
4483            continue;
4484        }
4485
4486
4487        //
4488        // replace the Perl variables that appear in some of the
4489        //   match data strings.
4490        //
4491        UnicodeString matchString = fields[1];
4492        matchString.findAndReplace(nulnulSrc, nulnul);
4493        matchString.findAndReplace(ffffSrc,   ffff);
4494
4495        // Replace any \n in the match string with an actual new-line char.
4496        //  Don't do full unescape, as this unescapes more than Perl does, which
4497        //  causes other spurious failures in the tests.
4498        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4499
4500        //
4501        // Put the input in a UTF-8 UText
4502        //
4503        status = U_ZERO_ERROR;
4504        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4505        if (status == U_BUFFER_OVERFLOW_ERROR) {
4506            status = U_ZERO_ERROR;
4507            delete[] inputChars;
4508            inputCapacity = inputLength + 1;
4509            inputChars = new char[inputCapacity];
4510            matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4511        }
4512        utext_openUTF8(&inputText, inputChars, inputLength, &status);
4513
4514        //
4515        // Run the test, check for expected match/don't match result.
4516        //
4517        RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4518        UBool found = testMat->find();
4519        UBool expected = FALSE;
4520        if (fields[2].indexOf(UChar_y) >=0) {
4521            expected = TRUE;
4522        }
4523        if (expected != found) {
4524            errln("line %d: Expected %smatch, got %smatch",
4525                lineNum, expected?"":"no ", found?"":"no " );
4526            continue;
4527        }
4528
4529        // Don't try to check expected results if there is no match.
4530        //   (Some have stuff in the expected fields)
4531        if (!found) {
4532            delete testMat;
4533            delete testPat;
4534            continue;
4535        }
4536
4537        //
4538        // Interpret the Perl expression from the fourth field of the data file,
4539        // building up an ICU string from the results of the ICU match.
4540        //   The Perl expression will contain references to the results of
4541        //     a regex match, including the matched string, capture group strings,
4542        //     group starting and ending indicies, etc.
4543        //
4544        UnicodeString resultString;
4545        UnicodeString perlExpr = fields[3];
4546
4547        while (perlExpr.length() > 0) {
4548            groupsMat->reset(perlExpr);
4549            cgMat->reset(perlExpr);
4550
4551            if (perlExpr.startsWith("$&")) {
4552                resultString.append(testMat->group(status));
4553                perlExpr.remove(0, 2);
4554            }
4555
4556            else if (groupsMat->lookingAt(status)) {
4557                // $-[0]   $+[2]  etc.
4558                UnicodeString digitString = groupsMat->group(2, status);
4559                int32_t t = 0;
4560                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4561                UnicodeString plusOrMinus = groupsMat->group(1, status);
4562                int32_t matchPosition;
4563                if (plusOrMinus.compare("+") == 0) {
4564                    matchPosition = testMat->end(groupNum, status);
4565                } else {
4566                    matchPosition = testMat->start(groupNum, status);
4567                }
4568                if (matchPosition != -1) {
4569                    ICU_Utility::appendNumber(resultString, matchPosition);
4570                }
4571                perlExpr.remove(0, groupsMat->end(status));
4572            }
4573
4574            else if (cgMat->lookingAt(status)) {
4575                // $1, $2, $3, etc.
4576                UnicodeString digitString = cgMat->group(1, status);
4577                int32_t t = 0;
4578                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4579                if (U_SUCCESS(status)) {
4580                    resultString.append(testMat->group(groupNum, status));
4581                    status = U_ZERO_ERROR;
4582                }
4583                perlExpr.remove(0, cgMat->end(status));
4584            }
4585
4586            else if (perlExpr.startsWith("@-")) {
4587                int32_t i;
4588                for (i=0; i<=testMat->groupCount(); i++) {
4589                    if (i>0) {
4590                        resultString.append(" ");
4591                    }
4592                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4593                }
4594                perlExpr.remove(0, 2);
4595            }
4596
4597            else if (perlExpr.startsWith("@+")) {
4598                int32_t i;
4599                for (i=0; i<=testMat->groupCount(); i++) {
4600                    if (i>0) {
4601                        resultString.append(" ");
4602                    }
4603                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4604                }
4605                perlExpr.remove(0, 2);
4606            }
4607
4608            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4609                                                     //           or as an escaped sequence (e.g. \n)
4610                if (perlExpr.length() > 1) {
4611                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4612                }
4613                UChar c = perlExpr.charAt(0);
4614                switch (c) {
4615                case 'n':   c = '\n'; break;
4616                // add any other escape sequences that show up in the test expected results.
4617                }
4618                resultString.append(c);
4619                perlExpr.remove(0, 1);
4620            }
4621
4622            else  {
4623                // Any characters from the perl expression that we don't explicitly
4624                //  recognize before here are assumed to be literals and copied
4625                //  as-is to the expected results.
4626                resultString.append(perlExpr.charAt(0));
4627                perlExpr.remove(0, 1);
4628            }
4629
4630            if (U_FAILURE(status)) {
4631                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4632                break;
4633            }
4634        }
4635
4636        //
4637        // Expected Results Compare
4638        //
4639        UnicodeString expectedS(fields[4]);
4640        expectedS.findAndReplace(nulnulSrc, nulnul);
4641        expectedS.findAndReplace(ffffSrc,   ffff);
4642        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4643
4644
4645        if (expectedS.compare(resultString) != 0) {
4646            err("Line %d: Incorrect perl expression results.", lineNum);
4647            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4648        }
4649
4650        delete testMat;
4651        delete testPat;
4652    }
4653
4654    //
4655    // All done.  Clean up allocated stuff.
4656    //
4657    delete cgMat;
4658    delete cgPat;
4659
4660    delete groupsMat;
4661    delete groupsPat;
4662
4663    delete flagMat;
4664    delete flagPat;
4665
4666    delete lineMat;
4667    delete linePat;
4668
4669    delete fieldPat;
4670    delete [] testData;
4671
4672    utext_close(&patternText);
4673    utext_close(&inputText);
4674
4675    delete [] patternChars;
4676    delete [] inputChars;
4677
4678
4679    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4680
4681}
4682
4683
4684//--------------------------------------------------------------
4685//
4686//  Bug6149   Verify limits to heap expansion for backtrack stack.
4687//             Use this pattern,
4688//                 "(a?){1,8000000}"
//             Note: the upper bound was originally unbounded, but that form now has loop-breaking enabled.
4690//                   This test is likely to be fragile, as further optimizations stop
4691//                   more cases of pointless looping in the match engine.
4692//
4693//---------------------------------------------------------------
4694void RegexTest::Bug6149() {
4695    UnicodeString pattern("(a?){1,8000000}");
4696    UnicodeString s("xyz");
4697    uint32_t flags = 0;
4698    UErrorCode status = U_ZERO_ERROR;
4699
4700    RegexMatcher  matcher(pattern, s, flags, status);
4701    UBool result = false;
4702    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4703    REGEX_ASSERT(result == FALSE);
4704 }
4705
4706
4707//
4708//   Callbacks()    Test the callback function.
4709//                  When set, callbacks occur periodically during matching operations,
4710//                  giving the application code the ability to abort the operation
//                  before its normal completion.
4712//
4713
4714struct callBackContext {
4715    RegexTest        *test;
4716    int32_t          maxCalls;
4717    int32_t          numCalls;
4718    int32_t          lastSteps;
4719    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4720};
4721
4722U_CDECL_BEGIN
4723static UBool U_CALLCONV
4724testCallBackFn(const void *context, int32_t steps) {
4725    callBackContext  *info = (callBackContext *)context;
4726    if (info->lastSteps+1 != steps) {
4727        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4728    }
4729    info->lastSteps = steps;
4730    info->numCalls++;
4731    return (info->numCalls < info->maxCalls);
4732}
4733U_CDECL_END
4734
4735void RegexTest::Callbacks() {
4736   {
4737        // Getter returns NULLs if no callback has been set
4738
4739        //   The variables that the getter will fill in.
4740        //   Init to non-null values so that the action of the getter can be seen.
4741        const void          *returnedContext = &returnedContext;
4742        URegexMatchCallback *returnedFn = &testCallBackFn;
4743
4744        UErrorCode status = U_ZERO_ERROR;
4745        RegexMatcher matcher("x", 0, status);
4746        REGEX_CHECK_STATUS;
4747        matcher.getMatchCallback(returnedFn, returnedContext, status);
4748        REGEX_CHECK_STATUS;
4749        REGEX_ASSERT(returnedFn == NULL);
4750        REGEX_ASSERT(returnedContext == NULL);
4751    }
4752
4753   {
4754        // Set and Get work
4755        callBackContext cbInfo = {this, 0, 0, 0};
4756        const void          *returnedContext;
4757        URegexMatchCallback *returnedFn;
4758        UErrorCode status = U_ZERO_ERROR;
4759        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4760        REGEX_CHECK_STATUS;
4761        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4762        REGEX_CHECK_STATUS;
4763        matcher.getMatchCallback(returnedFn, returnedContext, status);
4764        REGEX_CHECK_STATUS;
4765        REGEX_ASSERT(returnedFn == testCallBackFn);
4766        REGEX_ASSERT(returnedContext == &cbInfo);
4767
4768        // A short-running match shouldn't invoke the callback
4769        status = U_ZERO_ERROR;
4770        cbInfo.reset(1);
4771        UnicodeString s = "xxx";
4772        matcher.reset(s);
4773        REGEX_ASSERT(matcher.matches(status));
4774        REGEX_CHECK_STATUS;
4775        REGEX_ASSERT(cbInfo.numCalls == 0);
4776
4777        // A medium-length match that runs long enough to invoke the
4778        //   callback, but not so long that the callback aborts it.
4779        status = U_ZERO_ERROR;
4780        cbInfo.reset(4);
4781        s = "aaaaaaaaaaaaaaaaaaab";
4782        matcher.reset(s);
4783        REGEX_ASSERT(matcher.matches(status)==FALSE);
4784        REGEX_CHECK_STATUS;
4785        REGEX_ASSERT(cbInfo.numCalls > 0);
4786
4787        // A longer running match that the callback function will abort.
4788        status = U_ZERO_ERROR;
4789        cbInfo.reset(4);
4790        s = "aaaaaaaaaaaaaaaaaaaaaaab";
4791        matcher.reset(s);
4792        REGEX_ASSERT(matcher.matches(status)==FALSE);
4793        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4794        REGEX_ASSERT(cbInfo.numCalls == 4);
4795    }
4796
4797
4798}
4799
4800
4801//
4802//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4806//
4807
4808struct progressCallBackContext {
4809    RegexTest        *test;
4810    int64_t          lastIndex;
4811    int32_t          maxCalls;
4812    int32_t          numCalls;
4813    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4814};
4815
4816U_CDECL_BEGIN
4817static UBool U_CALLCONV
4818testProgressCallBackFn(const void *context, int64_t matchIndex) {
4819    progressCallBackContext  *info = (progressCallBackContext *)context;
4820    info->numCalls++;
4821    info->lastIndex = matchIndex;
4822//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4823    return (info->numCalls < info->maxCalls);
4824}
4825U_CDECL_END
4826
4827void RegexTest::FindProgressCallbacks() {
4828   {
4829        // Getter returns NULLs if no callback has been set
4830
4831        //   The variables that the getter will fill in.
4832        //   Init to non-null values so that the action of the getter can be seen.
4833        const void                  *returnedContext = &returnedContext;
4834        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4835
4836        UErrorCode status = U_ZERO_ERROR;
4837        RegexMatcher matcher("x", 0, status);
4838        REGEX_CHECK_STATUS;
4839        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4840        REGEX_CHECK_STATUS;
4841        REGEX_ASSERT(returnedFn == NULL);
4842        REGEX_ASSERT(returnedContext == NULL);
4843    }
4844
4845   {
4846        // Set and Get work
4847        progressCallBackContext cbInfo = {this, 0, 0, 0};
4848        const void                  *returnedContext;
4849        URegexFindProgressCallback  *returnedFn;
4850        UErrorCode status = U_ZERO_ERROR;
4851        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4852        REGEX_CHECK_STATUS;
4853        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4854        REGEX_CHECK_STATUS;
4855        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4856        REGEX_CHECK_STATUS;
4857        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4858        REGEX_ASSERT(returnedContext == &cbInfo);
4859
4860        // A short-running match should NOT invoke the callback.
4861        status = U_ZERO_ERROR;
4862        cbInfo.reset(100);
4863        UnicodeString s = "abxxx";
4864        matcher.reset(s);
4865#if 0
4866        matcher.setTrace(TRUE);
4867#endif
4868        REGEX_ASSERT(matcher.find(0, status));
4869        REGEX_CHECK_STATUS;
4870        REGEX_ASSERT(cbInfo.numCalls == 0);
4871
4872        // A medium running match that causes matcher.find() to invoke our callback for each index.
4873        status = U_ZERO_ERROR;
4874        s = "aaaaaaaaaaaaaaaaaaab";
4875        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4876        matcher.reset(s);
4877        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4878        REGEX_CHECK_STATUS;
4879        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4880
4881        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4882        status = U_ZERO_ERROR;
4883        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4884        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4885        matcher.reset(s1);
4886        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4887        REGEX_CHECK_STATUS;
4888        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4889
4890#if 0
4891        // Now a match that will succeed, but after an interruption
4892        status = U_ZERO_ERROR;
4893        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4894        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4895        matcher.reset(s2);
4896        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4897        REGEX_CHECK_STATUS;
4898        // Now retry the match from where left off
4899        cbInfo.maxCalls = 100; //  No callback limit
4900        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4901        REGEX_CHECK_STATUS;
4902#endif
4903    }
4904
4905
4906}
4907
4908
4909//---------------------------------------------------------------------------
4910//
4911//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4912//                             UTexts. The pure-C implementation of UText
4913//                             has no mutable backing stores, but we can
4914//                             use UnicodeString here to test the functionality.
4915//
4916//---------------------------------------------------------------------------
4917void RegexTest::PreAllocatedUTextCAPI () {
4918    UErrorCode           status = U_ZERO_ERROR;
4919    URegularExpression  *re;
4920    UText                patternText = UTEXT_INITIALIZER;
4921    UnicodeString        buffer;
4922    UText                bufferText = UTEXT_INITIALIZER;
4923
4924    utext_openUnicodeString(&bufferText, &buffer, &status);
4925
4926    /*
4927     *  getText() and getUText()
4928     */
4929    {
4930        UText  text1 = UTEXT_INITIALIZER;
4931        UText  text2 = UTEXT_INITIALIZER;
4932        UChar  text2Chars[20];
4933        UText  *resultText;
4934
4935        status = U_ZERO_ERROR;
4936        regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4937        regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4938        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4939        utext_openUChars(&text2, text2Chars, -1, &status);
4940
4941        regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4942        re = uregex_openUText(&patternText, 0, NULL, &status);
4943
4944        /* First set a UText */
4945        uregex_setUText(re, &text1, &status);
4946        resultText = uregex_getUText(re, &bufferText, &status);
4947        REGEX_CHECK_STATUS;
4948        REGEX_ASSERT(resultText == &bufferText);
4949        utext_setNativeIndex(resultText, 0);
4950        utext_setNativeIndex(&text1, 0);
4951        REGEX_ASSERT(testUTextEqual(resultText, &text1));
4952
4953        resultText = uregex_getUText(re, &bufferText, &status);
4954        REGEX_CHECK_STATUS;
4955        REGEX_ASSERT(resultText == &bufferText);
4956        utext_setNativeIndex(resultText, 0);
4957        utext_setNativeIndex(&text1, 0);
4958        REGEX_ASSERT(testUTextEqual(resultText, &text1));
4959
4960        /* Then set a UChar * */
4961        uregex_setText(re, text2Chars, 7, &status);
4962        resultText = uregex_getUText(re, &bufferText, &status);
4963        REGEX_CHECK_STATUS;
4964        REGEX_ASSERT(resultText == &bufferText);
4965        utext_setNativeIndex(resultText, 0);
4966        utext_setNativeIndex(&text2, 0);
4967        REGEX_ASSERT(testUTextEqual(resultText, &text2));
4968
4969        uregex_close(re);
4970        utext_close(&text1);
4971        utext_close(&text2);
4972    }
4973
4974    /*
4975     *  group()
4976     */
4977    {
4978        UChar    text1[80];
4979        UText   *actual;
4980        UBool    result;
4981        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4982
4983        status = U_ZERO_ERROR;
4984        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4985        REGEX_CHECK_STATUS;
4986
4987        uregex_setText(re, text1, -1, &status);
4988        result = uregex_find(re, 0, &status);
4989        REGEX_ASSERT(result==TRUE);
4990
4991        /*  Capture Group 0, the full match.  Should succeed.  */
4992        status = U_ZERO_ERROR;
4993        actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4994        REGEX_CHECK_STATUS;
4995        REGEX_ASSERT(actual == &bufferText);
4996        REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4997
4998        /*  Capture group #1.  Should succeed. */
4999        status = U_ZERO_ERROR;
5000        actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
5001        REGEX_CHECK_STATUS;
5002        REGEX_ASSERT(actual == &bufferText);
5003        REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
5004
5005        /*  Capture group out of range.  Error. */
5006        status = U_ZERO_ERROR;
5007        actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
5008        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5009        REGEX_ASSERT(actual == &bufferText);
5010
5011        uregex_close(re);
5012
5013    }
5014
5015    /*
5016     *  replaceFirst()
5017     */
5018    {
5019        UChar    text1[80];
5020        UChar    text2[80];
5021        UText    replText = UTEXT_INITIALIZER;
5022        UText   *result;
5023
5024        status = U_ZERO_ERROR;
5025        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5026        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5027        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5028
5029        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5030        REGEX_CHECK_STATUS;
5031
5032        /*  Normal case, with match */
5033        uregex_setText(re, text1, -1, &status);
5034        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5035        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5036        REGEX_CHECK_STATUS;
5037        REGEX_ASSERT(result == &bufferText);
5038        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5039
5040        /* No match.  Text should copy to output with no changes.  */
5041        uregex_setText(re, text2, -1, &status);
5042        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5043        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5044        REGEX_CHECK_STATUS;
5045        REGEX_ASSERT(result == &bufferText);
5046        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5047
5048        /* Unicode escapes */
5049        uregex_setText(re, text1, -1, &status);
5050        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5051        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5052        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5053        REGEX_CHECK_STATUS;
5054        REGEX_ASSERT(result == &bufferText);
5055        REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5056
5057        uregex_close(re);
5058        utext_close(&replText);
5059    }
5060
5061
5062    /*
5063     *  replaceAll()
5064     */
5065    {
5066        UChar    text1[80];
5067        UChar    text2[80];
5068        UText    replText = UTEXT_INITIALIZER;
5069        UText   *result;
5070
5071        status = U_ZERO_ERROR;
5072        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5073        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5074        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5075
5076        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5077        REGEX_CHECK_STATUS;
5078
5079        /*  Normal case, with match */
5080        uregex_setText(re, text1, -1, &status);
5081        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5082        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5083        REGEX_CHECK_STATUS;
5084        REGEX_ASSERT(result == &bufferText);
5085        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5086
5087        /* No match.  Text should copy to output with no changes.  */
5088        uregex_setText(re, text2, -1, &status);
5089        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5090        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5091        REGEX_CHECK_STATUS;
5092        REGEX_ASSERT(result == &bufferText);
5093        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5094
5095        uregex_close(re);
5096        utext_close(&replText);
5097    }
5098
5099
5100    /*
5101     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5102     *   so we don't need to test it here.
5103     */
5104
5105    utext_close(&bufferText);
5106    utext_close(&patternText);
5107}
5108
5109//--------------------------------------------------------------
5110//
5111//  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5112//
5113//---------------------------------------------------------------
5114void RegexTest::Bug7651() {
5115    UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5116    //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5117    //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5118    UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5119    UnicodeString s("#ff @abcd This is test");
5120    RegexPattern  *REPattern = NULL;
5121    RegexMatcher  *REMatcher = NULL;
5122    UErrorCode status = U_ZERO_ERROR;
5123    UParseError pe;
5124
5125    REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5126    REGEX_CHECK_STATUS;
5127    REMatcher = REPattern->matcher(s, status);
5128    REGEX_CHECK_STATUS;
5129    REGEX_ASSERT(REMatcher->find());
5130    REGEX_ASSERT(REMatcher->start(status) == 0);
5131    delete REPattern;
5132    delete REMatcher;
5133    status = U_ZERO_ERROR;
5134
5135    REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5136    REGEX_CHECK_STATUS;
5137    REMatcher = REPattern->matcher(s, status);
5138    REGEX_CHECK_STATUS;
5139    REGEX_ASSERT(REMatcher->find());
5140    REGEX_ASSERT(REMatcher->start(status) == 0);
5141    delete REPattern;
5142    delete REMatcher;
5143    status = U_ZERO_ERROR;
5144 }
5145
5146void RegexTest::Bug7740() {
5147    UErrorCode status = U_ZERO_ERROR;
5148    UnicodeString pattern = "(a)";
5149    UnicodeString text = "abcdef";
5150    RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5151    REGEX_CHECK_STATUS;
5152    REGEX_ASSERT(m->lookingAt(status));
5153    REGEX_CHECK_STATUS;
5154    status = U_ILLEGAL_ARGUMENT_ERROR;
5155    UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5156    REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5157    REGEX_ASSERT(s == "");
5158    delete m;
5159}
5160
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5162
5163void RegexTest::Bug8479() {
5164    UErrorCode status = U_ZERO_ERROR;
5165
5166    RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5167    REGEX_CHECK_STATUS;
5168    if (U_SUCCESS(status))
5169    {
5170        UnicodeString str;
5171        str.setToBogus();
5172        pMatcher->reset(str);
5173        status = U_ZERO_ERROR;
5174        pMatcher->matches(status);
5175        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5176        delete pMatcher;
5177    }
5178}
5179
5180
5181// Bug 7029
5182void RegexTest::Bug7029() {
5183    UErrorCode status = U_ZERO_ERROR;
5184
5185    RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5186    UnicodeString text = "abc.def";
5187    UnicodeString splits[10];
5188    REGEX_CHECK_STATUS;
5189    int32_t numFields = pMatcher->split(text, splits, 10, status);
5190    REGEX_CHECK_STATUS;
5191    REGEX_ASSERT(numFields == 8);
5192    delete pMatcher;
5193}
5194
5195// Bug 9283
//   This test is checking for the existence of any supplemental characters that case-fold
//   to a BMP character.
//
//   At the time of this writing there are none. If any should appear in a subsequent release
//   of Unicode, the code in regular expressions compilation that determines the longest
//   possible match for a literal string will need to be enhanced.
5202//
5203//   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5204//   for details on what to do in case of a failure of this test.
5205//
5206void RegexTest::Bug9283() {
5207    UErrorCode status = U_ZERO_ERROR;
5208    UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5209    REGEX_CHECK_STATUS;
5210    int32_t index;
5211    UChar32 c;
5212    for (index=0; ; index++) {
5213        c = supplementalsWithCaseFolding.charAt(index);
5214        if (c == -1) {
5215            break;
5216        }
5217        UnicodeString cf = UnicodeString(c).foldCase();
5218        REGEX_ASSERT(cf.length() >= 2);
5219    }
5220}
5221
5222
5223void RegexTest::CheckInvBufSize() {
5224  if(inv_next>=INV_BUFSIZ) {
5225    errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5226          __FILE__, INV_BUFSIZ, inv_next);
5227  } else {
5228    logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5229  }
5230}
5231
5232#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5233
5234