1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2014, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8//   regextst.cpp
9//
10//      ICU Regular Expressions test, part of intltest.
11//
12
13/*
14     NOTE!!
15
16     PLEASE be careful about ASCII assumptions in this test.
17     This test is one of the worst repeat offenders.
18     If you have questions, contact someone on the ICU PMC
19     who has access to an EBCDIC system.
20
21 */
22
23#include "intltest.h"
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26#include "unicode/regex.h"
27#include "unicode/uchar.h"
28#include "unicode/ucnv.h"
29#include "unicode/uniset.h"
30#include "unicode/uregex.h"
31#include "unicode/ustring.h"
32#include "regextst.h"
33#include "uvector.h"
34#include "util.h"
35#include <stdlib.h>
36#include <string.h>
37#include <stdio.h>
38#include "cstring.h"
39#include "uinvchar.h"
40
41#define SUPPORT_MUTATING_INPUT_STRING   0
42
43//---------------------------------------------------------------------------
44//
45//  Test class boilerplate
46//
47//---------------------------------------------------------------------------
48RegexTest::RegexTest()
49{
50}
51
52
53RegexTest::~RegexTest()
54{
55}
56
57
58
59void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
60{
61    if (exec) logln("TestSuite RegexTest: ");
62    switch (index) {
63
64        case 0: name = "Basic";
65            if (exec) Basic();
66            break;
67        case 1: name = "API_Match";
68            if (exec) API_Match();
69            break;
70        case 2: name = "API_Replace";
71            if (exec) API_Replace();
72            break;
73        case 3: name = "API_Pattern";
74            if (exec) API_Pattern();
75            break;
76        case 4:
77#if !UCONFIG_NO_FILE_IO
78            name = "Extended";
79            if (exec) Extended();
80#else
81            name = "skip";
82#endif
83            break;
84        case 5: name = "Errors";
85            if (exec) Errors();
86            break;
87        case 6: name = "PerlTests";
88            if (exec) PerlTests();
89            break;
90        case 7: name = "Callbacks";
91            if (exec) Callbacks();
92            break;
93        case 8: name = "FindProgressCallbacks";
94            if (exec) FindProgressCallbacks();
95            break;
96        case 9: name = "Bug 6149";
97             if (exec) Bug6149();
98             break;
99        case 10: name = "UTextBasic";
100          if (exec) UTextBasic();
101          break;
102        case 11: name = "API_Match_UTF8";
103          if (exec) API_Match_UTF8();
104          break;
105        case 12: name = "API_Replace_UTF8";
106          if (exec) API_Replace_UTF8();
107          break;
108        case 13: name = "API_Pattern_UTF8";
109          if (exec) API_Pattern_UTF8();
110          break;
111        case 14: name = "PerlTestsUTF8";
112          if (exec) PerlTestsUTF8();
113          break;
114        case 15: name = "PreAllocatedUTextCAPI";
115          if (exec) PreAllocatedUTextCAPI();
116          break;
117        case 16: name = "Bug 7651";
118             if (exec) Bug7651();
119             break;
120        case 17: name = "Bug 7740";
121            if (exec) Bug7740();
122            break;
123        case 18: name = "Bug 8479";
124            if (exec) Bug8479();
125            break;
126        case 19: name = "Bug 7029";
127            if (exec) Bug7029();
128            break;
129        case 20: name = "CheckInvBufSize";
130            if (exec) CheckInvBufSize();
131            break;
132        case 21: name = "Bug 9283";
133            if (exec) Bug9283();
134            break;
135        case 22: name = "Bug10459";
136            if (exec) Bug10459();
137            break;
138
139        default: name = "";
140            break; //needed to end loop
141    }
142}
143
144
145
146/**
147 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
148 * into ASCII.
149 * @see utext_openUTF8
150 */
151static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
152
153//---------------------------------------------------------------------------
154//
155//   Error Checking / Reporting macros used in all of the tests.
156//
157//---------------------------------------------------------------------------
158
159static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
160  int64_t oldIndex = utext_getNativeIndex(text);
161  utext_setNativeIndex(text, 0);
162  char *bufPtr = buf;
163  UChar32 c = utext_next32From(text, 0);
164  while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
165    if (0x000020<=c && c<0x00007e) {
166      *bufPtr = c;
167    } else {
168#if 0
169      sprintf(bufPtr,"U+%04X", c);
170      bufPtr+= strlen(bufPtr)-1;
171#else
172      *bufPtr = '%';
173#endif
174    }
175    bufPtr++;
176    c = UTEXT_NEXT32(text);
177  }
178  *bufPtr = 0;
179#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
180  char *ebuf = (char*)malloc(bufLen);
181  uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
182  uprv_strncpy(buf, ebuf, bufLen);
183  free((void*)ebuf);
184#endif
185  utext_setNativeIndex(text, oldIndex);
186}
187
188
189static char ASSERT_BUF[1024];
190
191const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
192  if(message.length()==0) {
193    strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
194  } else {
195    UnicodeString buf;
196    IntlTest::prettify(message,buf);
197    if(buf.length()==0) {
198      strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
199    } else {
200      buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
201      if(ASSERT_BUF[0]==0) {
202        ASSERT_BUF[0]=0;
203        for(int32_t i=0;i<buf.length();i++) {
204          UChar ch = buf[i];
205          sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
206        }
207      }
208    }
209  }
210  ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
211  return ASSERT_BUF;
212}
213
// Element count of a C array, as an int32_t.
#define LENGTHOF(arr) (int32_t)(sizeof(arr)/sizeof((arr)[0]))

// Log a printable rendering of a UText together with the file, line and the
// spelling of the argument.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// If the local variable `status` holds a failure: report it and return from the
// enclosing (void) function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Report an error when the condition evaluates to FALSE; execution continues.
#define REGEX_ASSERT(condition) { \
    if ((condition)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #condition); \
    } \
}

// Evaluate `call` against a fresh local `status` and report when that status does
// not end up equal to `expectedCode`.  The call expression is expected to use `status`.
#define REGEX_ASSERT_FAIL(call, expectedCode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (call); \
    if (status!=expectedCode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(expectedCode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS but also reports a caller-supplied line, and does not return.
#define REGEX_CHECK_STATUS_L(fromLine) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (fromLine), status); \
    } \
}

// Like REGEX_ASSERT but reports a caller-supplied line and returns from the
// enclosing (void) function on failure.
#define REGEX_ASSERT_L(condition, fromLine) { \
    if ((condition)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (fromLine)); \
        return; \
    } \
}

// Report an error when `ustr==inv` does not hold; the message shows the string via
// extractToAssertBuf().
#define REGEX_ASSERT_UNISTR(ustr,inv) { \
    if (!(ustr==inv)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv); \
    } \
}
234
235
236static UBool testUTextEqual(UText *uta, UText *utb) {
237    UChar32 ca = 0;
238    UChar32 cb = 0;
239    utext_setNativeIndex(uta, 0);
240    utext_setNativeIndex(utb, 0);
241    do {
242        ca = utext_next32(uta);
243        cb = utext_next32(utb);
244        if (ca != cb) {
245            break;
246        }
247    } while (ca != U_SENTINEL);
248    return ca == cb;
249}
250
251
252/**
253 * @param expected expected text in UTF-8 (not platform) codepage
254 */
255void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
256    UErrorCode status = U_ZERO_ERROR;
257    UText expectedText = UTEXT_INITIALIZER;
258    utext_openUTF8(&expectedText, expected, -1, &status);
259    if(U_FAILURE(status)) {
260      errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
261      return;
262    }
263    if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
264      errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
265      return;
266    }
267    utext_setNativeIndex(actual, 0);
268    if (!testUTextEqual(&expectedText, actual)) {
269        char buf[201 /*21*/];
270        char expectedBuf[201];
271        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
272        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
273        errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
274    }
275    utext_close(&expectedText);
276}
277/**
278 * @param expected invariant (platform local text) input
279 */
280
281void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
282    UErrorCode status = U_ZERO_ERROR;
283    UText expectedText = UTEXT_INITIALIZER;
284    regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
285    if(U_FAILURE(status)) {
286      errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
287      return;
288    }
289    utext_setNativeIndex(actual, 0);
290    if (!testUTextEqual(&expectedText, actual)) {
291        char buf[201 /*21*/];
292        char expectedBuf[201];
293        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
294        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
295        errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
296    }
297    utext_close(&expectedText);
298}
299
/**
 * Check a UText against expected text given as UTF-8; the call site's file and
 * line are passed along for the failure message.
 */
#define REGEX_ASSERT_UTEXT_UTF8(want, got) \
    assertUText((want), (got), __FILE__, __LINE__)

/**
 * Check a UText against expected text given in invariant characters; the call
 * site's file and line are passed along for the failure message.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(want, got) \
    assertUTextInvariant((want), (got), __FILE__, __LINE__)
308
309/**
310 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
311 * passed into utext_openUTF8. An error will be given if
312 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
313 */
314
315#define INV_BUFSIZ 2048 /* increase this if too small */
316
317static int64_t inv_next=0;
318
319#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
320static char inv_buf[INV_BUFSIZ];
321#endif
322
323static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
324  if(length==-1) length=strlen(inv);
325#if U_CHARSET_FAMILY==U_ASCII_FAMILY
326  inv_next+=length;
327  return utext_openUTF8(ut, inv, length, status);
328#else
329  if(inv_next+length+1>INV_BUFSIZ) {
330    fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
331            __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
332    *status = U_MEMORY_ALLOCATION_ERROR;
333    return NULL;
334  }
335
336  unsigned char *buf = (unsigned char*)inv_buf+inv_next;
337  uprv_aestrncpy(buf, (const uint8_t*)inv, length);
338  inv_next+=length;
339
340#if 0
341  fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
342#endif
343
344  return utext_openUTF8(ut, (const char*)buf, length, status);
345#endif
346}
347
348
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Shorthand for quick lookingAt() / matches() checks.
//                       Each use runs the same case twice: once through
//                       doRegexLMTest() and once through doRegexLMTestUTF8(),
//                       passing the line number of the call site to both.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          Escape sequences in the input text are unescaped before matching;
//          the pattern is passed to the regex compiler as written.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
}
364
365UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
366    const UnicodeString pattern(pat, -1, US_INV);
367    const UnicodeString inputText(text, -1, US_INV);
368    UErrorCode          status  = U_ZERO_ERROR;
369    UParseError         pe;
370    RegexPattern        *REPattern = NULL;
371    RegexMatcher        *REMatcher = NULL;
372    UBool               retVal     = TRUE;
373
374    UnicodeString patString(pat, -1, US_INV);
375    REPattern = RegexPattern::compile(patString, 0, pe, status);
376    if (U_FAILURE(status)) {
377        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
378            line, u_errorName(status));
379        return FALSE;
380    }
381    if (line==376) { REPattern->dumpPattern();}
382
383    UnicodeString inputString(inputText);
384    UnicodeString unEscapedInput = inputString.unescape();
385    REMatcher = REPattern->matcher(unEscapedInput, status);
386    if (U_FAILURE(status)) {
387        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
388            line, u_errorName(status));
389        return FALSE;
390    }
391
392    UBool actualmatch;
393    actualmatch = REMatcher->lookingAt(status);
394    if (U_FAILURE(status)) {
395        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
396            line, u_errorName(status));
397        retVal =  FALSE;
398    }
399    if (actualmatch != looking) {
400        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
401        retVal = FALSE;
402    }
403
404    status = U_ZERO_ERROR;
405    actualmatch = REMatcher->matches(status);
406    if (U_FAILURE(status)) {
407        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
408            line, u_errorName(status));
409        retVal = FALSE;
410    }
411    if (actualmatch != match) {
412        errln("RegexTest: wrong return from matches() at line %d.\n", line);
413        retVal = FALSE;
414    }
415
416    if (retVal == FALSE) {
417        REPattern->dumpPattern();
418    }
419
420    delete REPattern;
421    delete REMatcher;
422    return retVal;
423}
424
425
426UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
427    UText               pattern    = UTEXT_INITIALIZER;
428    int32_t             inputUTF8Length;
429    char                *textChars = NULL;
430    UText               inputText  = UTEXT_INITIALIZER;
431    UErrorCode          status     = U_ZERO_ERROR;
432    UParseError         pe;
433    RegexPattern        *REPattern = NULL;
434    RegexMatcher        *REMatcher = NULL;
435    UBool               retVal     = TRUE;
436
437    regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
438    REPattern = RegexPattern::compile(&pattern, 0, pe, status);
439    if (U_FAILURE(status)) {
440        dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
441            line, u_errorName(status));
442        return FALSE;
443    }
444
445    UnicodeString inputString(text, -1, US_INV);
446    UnicodeString unEscapedInput = inputString.unescape();
447    LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
448    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
449
450    inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
451    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
452        // UTF-8 does not allow unpaired surrogates, so this could actually happen
453        logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
454        return TRUE; // not a failure of the Regex engine
455    }
456    status = U_ZERO_ERROR; // buffer overflow
457    textChars = new char[inputUTF8Length+1];
458    unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
459    utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
460
461    REMatcher = &REPattern->matcher(status)->reset(&inputText);
462    if (U_FAILURE(status)) {
463        errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
464            line, u_errorName(status));
465        return FALSE;
466    }
467
468    UBool actualmatch;
469    actualmatch = REMatcher->lookingAt(status);
470    if (U_FAILURE(status)) {
471        errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
472            line, u_errorName(status));
473        retVal =  FALSE;
474    }
475    if (actualmatch != looking) {
476        errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
477        retVal = FALSE;
478    }
479
480    status = U_ZERO_ERROR;
481    actualmatch = REMatcher->matches(status);
482    if (U_FAILURE(status)) {
483        errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
484            line, u_errorName(status));
485        retVal = FALSE;
486    }
487    if (actualmatch != match) {
488        errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
489        retVal = FALSE;
490    }
491
492    if (retVal == FALSE) {
493        REPattern->dumpPattern();
494    }
495
496    delete REPattern;
497    delete REMatcher;
498    utext_close(&inputText);
499    utext_close(&pattern);
500    delete[] textChars;
501    return retVal;
502}
503
504
505
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Shorthand for tests of incorrect patterns.  Forwards to
//                    regex_err(), adding the line number of the call site.
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       Note: the expansion already ends with a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, errLine, errCol, expectedStatus) \
    regex_err(pat, errLine, errCol, expectedStatus, __LINE__);
516
517void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
518                          UErrorCode expectedStatus, int32_t line) {
519    UnicodeString       pattern(pat);
520
521    UErrorCode          status         = U_ZERO_ERROR;
522    UParseError         pe;
523    RegexPattern        *callerPattern = NULL;
524
525    //
526    //  Compile the caller's pattern
527    //
528    UnicodeString patString(pat);
529    callerPattern = RegexPattern::compile(patString, 0, pe, status);
530    if (status != expectedStatus) {
531        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
532    } else {
533        if (status != U_ZERO_ERROR) {
534            if (pe.line != errLine || pe.offset != errCol) {
535                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
536                    line, errLine, errCol, pe.line, pe.offset);
537            }
538        }
539    }
540
541    delete callerPattern;
542
543    //
544    //  Compile again, using a UTF-8-based UText
545    //
546    UText patternText = UTEXT_INITIALIZER;
547    regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
548    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
549    if (status != expectedStatus) {
550        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
551    } else {
552        if (status != U_ZERO_ERROR) {
553            if (pe.line != errLine || pe.offset != errCol) {
554                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
555                    line, errLine, errCol, pe.line, pe.offset);
556            }
557        }
558    }
559
560    delete callerPattern;
561    utext_close(&patternText);
562}
563
564
565
566//---------------------------------------------------------------------------
567//
568//      Basic      Check for basic functionality of regex pattern matching.
569//                 Avoid the use of REGEX_FIND test macro, which has
570//                 substantial dependencies on basic Regex functionality.
571//
572//---------------------------------------------------------------------------
573void RegexTest::Basic() {
    // Every REGEX_TESTLM(pattern, input, looking, match) line below hands the same
    // case to both doRegexLMTest() and doRegexLMTestUTF8().  'looking' is the result
    // expected from lookingAt(); 'match' is the result expected from matches().
    // Failures are reported with the source line of the REGEX_TESTLM call.
574
575
576//
577// Debug - slide failing test cases early
578//
// (Disabled scratch area; note that it ends in exit(1) if ever enabled.)
579#if 0
580    {
581        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
582        UParseError pe;
583        UErrorCode  status = U_ZERO_ERROR;
584        RegexPattern *pattern;
585        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
586        pattern->dumpPattern();
587        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
588        UBool result = m->find();
589        printf("result = %d\n", result);
590        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
591        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
592    }
593    exit(1);
594#endif
595
596
597    //
598    // Pattern with parentheses
599    //
600    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
601    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
602    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
603
604    //
605    // Patterns with *
606    //
607    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
608    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
609    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
610    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
611    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
612
613    REGEX_TESTLM("a*", "",  TRUE, TRUE);
614    REGEX_TESTLM("a*", "b", TRUE, FALSE);
615
616
617    //
618    //  Patterns with "."
619    //
620    REGEX_TESTLM(".", "abc", TRUE, FALSE);
621    REGEX_TESTLM("...", "abc", TRUE, TRUE);
622    REGEX_TESTLM("....", "abc", FALSE, FALSE);
623    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
624    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
625    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
626    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
627    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
628
629    //
630    //  Patterns with * applied to chars at end of literal string
631    //
632    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
633    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
634
635    //
636    //  Supplemental chars match as single chars, not a pair of surrogates.
637    //
638    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
639    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
640    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
641
642
643    //
644    //  UnicodeSets in the pattern
645    //
646    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
647    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
648    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
649    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    // NOTE(review): the next case repeats the one above verbatim - confirm whether a
    // different input was intended.
650    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
651    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
652
653    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
654    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
655    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
656    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
657    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
658
659    //
660    //   OR operator in patterns
661    //
662    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
663    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
664    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
665    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
666
667    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
668    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
669    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
670    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
671    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
672    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
673
674    //
675    //  +
676    //
677    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
678    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
679    REGEX_TESTLM("b+", "", FALSE, FALSE);
680    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
681    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
682    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
683
684    //
685    //   ?
686    //
687    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
688    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
689    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
690    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
691    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
692    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
693    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
694    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
695    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
696
697    //
698    //  Escape sequences that become single literal chars, handled internally
699    //   by ICU's Unescape.
700    //
    //  In these cases the pattern keeps its backslash escape, while the input's
    //  \uXXXX escape is turned into the actual character by the test helpers
    //  before matching.
701
702    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
703    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
704    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
705    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
706    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
707    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
708    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
709    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
710    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
711    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
712
713    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
714    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
715
716    // Escape of special chars in patterns
717    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
718}
719
720
721//---------------------------------------------------------------------------
722//
723//    UTextBasic   Check for quirks that are specific to the UText
724//                 implementation.
725//
726//---------------------------------------------------------------------------
727void RegexTest::UTextBasic() {
728    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
729    UErrorCode status = U_ZERO_ERROR;
730    UText pattern = UTEXT_INITIALIZER;
731    utext_openUTF8(&pattern, str_abc, -1, &status);
732    RegexMatcher matcher(&pattern, 0, status);
733    REGEX_CHECK_STATUS;
734
735    UText input = UTEXT_INITIALIZER;
736    utext_openUTF8(&input, str_abc, -1, &status);
737    REGEX_CHECK_STATUS;
738    matcher.reset(&input);
739    REGEX_CHECK_STATUS;
740    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
741
742    matcher.reset(matcher.inputText());
743    REGEX_CHECK_STATUS;
744    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
745
746    utext_close(&pattern);
747    utext_close(&input);
748}
749
750
751//---------------------------------------------------------------------------
752//
753//      API_Match   Test that the API for class RegexMatcher
754//                  is present and nominally working, but excluding functions
755//                  implementing replace operations.
756//
757//---------------------------------------------------------------------------
758void RegexTest::API_Match() {
759    UParseError         pe;
760    UErrorCode          status=U_ZERO_ERROR;
761    int32_t             flags = 0;
762
763    //
764    // Debug - slide failing test cases early
765    //
766#if 0
767    {
768    }
769    return;
770#endif
771
772    //
773    // Simple pattern compilation
774    //
775    {
776        UnicodeString       re("abc");
777        RegexPattern        *pat2;
778        pat2 = RegexPattern::compile(re, flags, pe, status);
779        REGEX_CHECK_STATUS;
780
781        UnicodeString inStr1 = "abcdef this is a test";
782        UnicodeString instr2 = "not abc";
783        UnicodeString empty  = "";
784
785
786        //
787        // Matcher creation and reset.
788        //
789        RegexMatcher *m1 = pat2->matcher(inStr1, status);
790        REGEX_CHECK_STATUS;
791        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
792        REGEX_ASSERT(m1->input() == inStr1);
793        m1->reset(instr2);
794        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
795        REGEX_ASSERT(m1->input() == instr2);
796        m1->reset(inStr1);
797        REGEX_ASSERT(m1->input() == inStr1);
798        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
799        m1->reset(empty);
800        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
801        REGEX_ASSERT(m1->input() == empty);
802        REGEX_ASSERT(&m1->pattern() == pat2);
803
804        //
805        //  reset(pos, status)
806        //
807        m1->reset(inStr1);
808        m1->reset(4, status);
809        REGEX_CHECK_STATUS;
810        REGEX_ASSERT(m1->input() == inStr1);
811        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
812
813        m1->reset(-1, status);
814        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
815        status = U_ZERO_ERROR;
816
817        m1->reset(0, status);
818        REGEX_CHECK_STATUS;
819        status = U_ZERO_ERROR;
820
821        int32_t len = m1->input().length();
822        m1->reset(len-1, status);
823        REGEX_CHECK_STATUS;
824        status = U_ZERO_ERROR;
825
826        m1->reset(len, status);
827        REGEX_CHECK_STATUS;
828        status = U_ZERO_ERROR;
829
830        m1->reset(len+1, status);
831        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
832        status = U_ZERO_ERROR;
833
834        //
835        // match(pos, status)
836        //
837        m1->reset(instr2);
838        REGEX_ASSERT(m1->matches(4, status) == TRUE);
839        m1->reset();
840        REGEX_ASSERT(m1->matches(3, status) == FALSE);
841        m1->reset();
842        REGEX_ASSERT(m1->matches(5, status) == FALSE);
843        REGEX_ASSERT(m1->matches(4, status) == TRUE);
844        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
845        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
846
847        // Match() at end of string should fail, but should not
848        //  be an error.
849        status = U_ZERO_ERROR;
850        len = m1->input().length();
851        REGEX_ASSERT(m1->matches(len, status) == FALSE);
852        REGEX_CHECK_STATUS;
853
854        // Match beyond end of string should fail with an error.
855        status = U_ZERO_ERROR;
856        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
857        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
858
859        // Successful match at end of string.
860        {
861            status = U_ZERO_ERROR;
862            RegexMatcher m("A?", 0, status);  // will match zero length string.
863            REGEX_CHECK_STATUS;
864            m.reset(inStr1);
865            len = inStr1.length();
866            REGEX_ASSERT(m.matches(len, status) == TRUE);
867            REGEX_CHECK_STATUS;
868            m.reset(empty);
869            REGEX_ASSERT(m.matches(0, status) == TRUE);
870            REGEX_CHECK_STATUS;
871        }
872
873
874        //
875        // lookingAt(pos, status)
876        //
877        status = U_ZERO_ERROR;
878        m1->reset(instr2);  // "not abc"
879        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
880        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
881        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
882        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
883        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
884        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
885        status = U_ZERO_ERROR;
886        len = m1->input().length();
887        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
888        REGEX_CHECK_STATUS;
889        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
890        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
891
892        delete m1;
893        delete pat2;
894    }
895
896
897    //
898    // Capture Group.
899    //     RegexMatcher::start();
900    //     RegexMatcher::end();
901    //     RegexMatcher::groupCount();
902    //
903    {
904        int32_t             flags=0;
905        UParseError         pe;
906        UErrorCode          status=U_ZERO_ERROR;
907
908        UnicodeString       re("01(23(45)67)(.*)");
909        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
910        REGEX_CHECK_STATUS;
911        UnicodeString data = "0123456789";
912
913        RegexMatcher *matcher = pat->matcher(data, status);
914        REGEX_CHECK_STATUS;
915        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
916        static const int32_t matchStarts[] = {0,  2, 4, 8};
917        static const int32_t matchEnds[]   = {10, 8, 6, 10};
918        int32_t i;
919        for (i=0; i<4; i++) {
920            int32_t actualStart = matcher->start(i, status);
921            REGEX_CHECK_STATUS;
922            if (actualStart != matchStarts[i]) {
923                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
924                    __LINE__, i, matchStarts[i], actualStart);
925            }
926            int32_t actualEnd = matcher->end(i, status);
927            REGEX_CHECK_STATUS;
928            if (actualEnd != matchEnds[i]) {
929                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
930                    __LINE__, i, matchEnds[i], actualEnd);
931            }
932        }
933
934        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
935        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
936
937        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
938        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
939        matcher->reset();
940        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
941
942        matcher->lookingAt(status);
943        REGEX_ASSERT(matcher->group(status)    == "0123456789");
944        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
945        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
946        REGEX_ASSERT(matcher->group(2, status) == "45"        );
947        REGEX_ASSERT(matcher->group(3, status) == "89"        );
948        REGEX_CHECK_STATUS;
949        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
950        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
951        matcher->reset();
952        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
953
954        delete matcher;
955        delete pat;
956
957    }
958
959    //
960    //  find
961    //
962    {
963        int32_t             flags=0;
964        UParseError         pe;
965        UErrorCode          status=U_ZERO_ERROR;
966
967        UnicodeString       re("abc");
968        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
969        REGEX_CHECK_STATUS;
970        UnicodeString data = ".abc..abc...abc..";
971        //                    012345678901234567
972
973        RegexMatcher *matcher = pat->matcher(data, status);
974        REGEX_CHECK_STATUS;
975        REGEX_ASSERT(matcher->find());
976        REGEX_ASSERT(matcher->start(status) == 1);
977        REGEX_ASSERT(matcher->find());
978        REGEX_ASSERT(matcher->start(status) == 6);
979        REGEX_ASSERT(matcher->find());
980        REGEX_ASSERT(matcher->start(status) == 12);
981        REGEX_ASSERT(matcher->find() == FALSE);
982        REGEX_ASSERT(matcher->find() == FALSE);
983
984        matcher->reset();
985        REGEX_ASSERT(matcher->find());
986        REGEX_ASSERT(matcher->start(status) == 1);
987
988        REGEX_ASSERT(matcher->find(0, status));
989        REGEX_ASSERT(matcher->start(status) == 1);
990        REGEX_ASSERT(matcher->find(1, status));
991        REGEX_ASSERT(matcher->start(status) == 1);
992        REGEX_ASSERT(matcher->find(2, status));
993        REGEX_ASSERT(matcher->start(status) == 6);
994        REGEX_ASSERT(matcher->find(12, status));
995        REGEX_ASSERT(matcher->start(status) == 12);
996        REGEX_ASSERT(matcher->find(13, status) == FALSE);
997        REGEX_ASSERT(matcher->find(16, status) == FALSE);
998        REGEX_ASSERT(matcher->find(17, status) == FALSE);
999        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1000
1001        status = U_ZERO_ERROR;
1002        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1003        status = U_ZERO_ERROR;
1004        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1005
1006        REGEX_ASSERT(matcher->groupCount() == 0);
1007
1008        delete matcher;
1009        delete pat;
1010    }
1011
1012
1013    //
1014    //  find, with \G in pattern (true if at the end of a previous match).
1015    //
1016    {
1017        int32_t             flags=0;
1018        UParseError         pe;
1019        UErrorCode          status=U_ZERO_ERROR;
1020
1021        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1022        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1023        REGEX_CHECK_STATUS;
1024        UnicodeString data = ".abcabc.abc..";
1025        //                    012345678901234567
1026
1027        RegexMatcher *matcher = pat->matcher(data, status);
1028        REGEX_CHECK_STATUS;
1029        REGEX_ASSERT(matcher->find());
1030        REGEX_ASSERT(matcher->start(status) == 0);
1031        REGEX_ASSERT(matcher->start(1, status) == -1);
1032        REGEX_ASSERT(matcher->start(2, status) == 1);
1033
1034        REGEX_ASSERT(matcher->find());
1035        REGEX_ASSERT(matcher->start(status) == 4);
1036        REGEX_ASSERT(matcher->start(1, status) == 4);
1037        REGEX_ASSERT(matcher->start(2, status) == -1);
1038        REGEX_CHECK_STATUS;
1039
1040        delete matcher;
1041        delete pat;
1042    }
1043
1044    //
1045    //   find with zero length matches, match position should bump ahead
1046    //     to prevent loops.
1047    //
1048    {
1049        int32_t                 i;
1050        UErrorCode          status=U_ZERO_ERROR;
1051        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1052                                                      //   using an always-true look-ahead.
1053        REGEX_CHECK_STATUS;
1054        UnicodeString s("    ");
1055        m.reset(s);
1056        for (i=0; ; i++) {
1057            if (m.find() == FALSE) {
1058                break;
1059            }
1060            REGEX_ASSERT(m.start(status) == i);
1061            REGEX_ASSERT(m.end(status) == i);
1062        }
1063        REGEX_ASSERT(i==5);
1064
1065        // Check that the bump goes over surrogate pairs OK
1066        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1067        s = s.unescape();
1068        m.reset(s);
1069        for (i=0; ; i+=2) {
1070            if (m.find() == FALSE) {
1071                break;
1072            }
1073            REGEX_ASSERT(m.start(status) == i);
1074            REGEX_ASSERT(m.end(status) == i);
1075        }
1076        REGEX_ASSERT(i==10);
1077    }
1078    {
1079        // find() loop breaking test.
1080        //        with pattern of /.?/, should see a series of one char matches, then a single
1081        //        match of zero length at the end of the input string.
1082        int32_t                 i;
1083        UErrorCode          status=U_ZERO_ERROR;
1084        RegexMatcher        m(".?", 0, status);
1085        REGEX_CHECK_STATUS;
1086        UnicodeString s("    ");
1087        m.reset(s);
1088        for (i=0; ; i++) {
1089            if (m.find() == FALSE) {
1090                break;
1091            }
1092            REGEX_ASSERT(m.start(status) == i);
1093            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1094        }
1095        REGEX_ASSERT(i==5);
1096    }
1097
1098
1099    //
1100    // Matchers with no input string behave as if they had an empty input string.
1101    //
1102
1103    {
1104        UErrorCode status = U_ZERO_ERROR;
1105        RegexMatcher  m(".?", 0, status);
1106        REGEX_CHECK_STATUS;
1107        REGEX_ASSERT(m.find());
1108        REGEX_ASSERT(m.start(status) == 0);
1109        REGEX_ASSERT(m.input() == "");
1110    }
1111    {
1112        UErrorCode status = U_ZERO_ERROR;
1113        RegexPattern  *p = RegexPattern::compile(".", 0, status);
1114        RegexMatcher  *m = p->matcher(status);
1115        REGEX_CHECK_STATUS;
1116
1117        REGEX_ASSERT(m->find() == FALSE);
1118        REGEX_ASSERT(m->input() == "");
1119        delete m;
1120        delete p;
1121    }
1122
1123    //
1124    // Regions
1125    //
1126    {
1127        UErrorCode status = U_ZERO_ERROR;
1128        UnicodeString testString("This is test data");
1129        RegexMatcher m(".*", testString,  0, status);
1130        REGEX_CHECK_STATUS;
1131        REGEX_ASSERT(m.regionStart() == 0);
1132        REGEX_ASSERT(m.regionEnd() == testString.length());
1133        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1134        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1135
1136        m.region(2,4, status);
1137        REGEX_CHECK_STATUS;
1138        REGEX_ASSERT(m.matches(status));
1139        REGEX_ASSERT(m.start(status)==2);
1140        REGEX_ASSERT(m.end(status)==4);
1141        REGEX_CHECK_STATUS;
1142
1143        m.reset();
1144        REGEX_ASSERT(m.regionStart() == 0);
1145        REGEX_ASSERT(m.regionEnd() == testString.length());
1146
1147        UnicodeString shorterString("short");
1148        m.reset(shorterString);
1149        REGEX_ASSERT(m.regionStart() == 0);
1150        REGEX_ASSERT(m.regionEnd() == shorterString.length());
1151
1152        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1153        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1154        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1155        REGEX_ASSERT(&m == &m.reset());
1156        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1157
1158        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1159        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1160        REGEX_ASSERT(&m == &m.reset());
1161        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1162
1163        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1164        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1165        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1166        REGEX_ASSERT(&m == &m.reset());
1167        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1168
1169        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1170        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1171        REGEX_ASSERT(&m == &m.reset());
1172        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1173
1174    }
1175
1176    //
1177    // hitEnd() and requireEnd()
1178    //
1179    {
1180        UErrorCode status = U_ZERO_ERROR;
1181        UnicodeString testString("aabb");
1182        RegexMatcher m1(".*", testString,  0, status);
1183        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1184        REGEX_ASSERT(m1.hitEnd() == TRUE);
1185        REGEX_ASSERT(m1.requireEnd() == FALSE);
1186        REGEX_CHECK_STATUS;
1187
1188        status = U_ZERO_ERROR;
1189        RegexMatcher m2("a*", testString, 0, status);
1190        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1191        REGEX_ASSERT(m2.hitEnd() == FALSE);
1192        REGEX_ASSERT(m2.requireEnd() == FALSE);
1193        REGEX_CHECK_STATUS;
1194
1195        status = U_ZERO_ERROR;
1196        RegexMatcher m3(".*$", testString, 0, status);
1197        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1198        REGEX_ASSERT(m3.hitEnd() == TRUE);
1199        REGEX_ASSERT(m3.requireEnd() == TRUE);
1200        REGEX_CHECK_STATUS;
1201    }
1202
1203
1204    //
1205    // Compilation error on reset with UChar *
1206    //   These were a hazard that people were stumbling over with runtime errors.
1207    //   Changed them to compiler errors by adding private methods that more closely
1208    //   matched the incorrect use of the functions.
1209    //
1210#if 0
1211    {
1212        UErrorCode status = U_ZERO_ERROR;
1213        UChar ucharString[20];
1214        RegexMatcher m(".", 0, status);
1215        m.reset(ucharString);  // should not compile.
1216
1217        RegexPattern *p = RegexPattern::compile(".", 0, status);
1218        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1219
1220        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1221    }
1222#endif
1223
1224    //
1225    //  Time Outs.
1226    //       Note:  These tests will need to be changed when the regexp engine is
1227    //              able to detect and cut short the exponential time behavior on
1228    //              this type of match.
1229    //
1230    {
1231        UErrorCode status = U_ZERO_ERROR;
1232        //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1234        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1235        RegexMatcher matcher("(a+)+b", testString, 0, status);
1236        REGEX_CHECK_STATUS;
1237        REGEX_ASSERT(matcher.getTimeLimit() == 0);
1238        matcher.setTimeLimit(100, status);
1239        REGEX_ASSERT(matcher.getTimeLimit() == 100);
1240        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1241        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1242    }
1243    {
1244        UErrorCode status = U_ZERO_ERROR;
1245        //   Few enough 'a's to slip in under the time limit.
1246        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1247        RegexMatcher matcher("(a+)+b", testString, 0, status);
1248        REGEX_CHECK_STATUS;
1249        matcher.setTimeLimit(100, status);
1250        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1251        REGEX_CHECK_STATUS;
1252    }
1253
1254    //
1255    //  Stack Limits
1256    //
1257    {
1258        UErrorCode status = U_ZERO_ERROR;
1259        UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1260
1261        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1262        //   of the '+', and makes the stack frames larger.
1263        RegexMatcher matcher("(A)+A$", testString, 0, status);
1264
1265        // With the default stack, this match should fail to run
1266        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1267        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1268
1269        // With unlimited stack, it should run
1270        status = U_ZERO_ERROR;
1271        matcher.setStackLimit(0, status);
1272        REGEX_CHECK_STATUS;
1273        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1274        REGEX_CHECK_STATUS;
1275        REGEX_ASSERT(matcher.getStackLimit() == 0);
1276
        // With a limited stack, the match should fail
1278        status = U_ZERO_ERROR;
1279        matcher.setStackLimit(10000, status);
1280        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1281        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1282        REGEX_ASSERT(matcher.getStackLimit() == 10000);
1283    }
1284
1285        // A pattern that doesn't save state should work with
1286        //   a minimal sized stack
1287    {
1288        UErrorCode status = U_ZERO_ERROR;
1289        UnicodeString testString = "abc";
1290        RegexMatcher matcher("abc", testString, 0, status);
1291        REGEX_CHECK_STATUS;
1292        matcher.setStackLimit(30, status);
1293        REGEX_CHECK_STATUS;
1294        REGEX_ASSERT(matcher.matches(status) == TRUE);
1295        REGEX_CHECK_STATUS;
1296        REGEX_ASSERT(matcher.getStackLimit() == 30);
1297
1298        // Negative stack sizes should fail
1299        status = U_ZERO_ERROR;
1300        matcher.setStackLimit(1000, status);
1301        REGEX_CHECK_STATUS;
1302        matcher.setStackLimit(-1, status);
1303        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1304        REGEX_ASSERT(matcher.getStackLimit() == 1000);
1305    }
1306
1307
1308}
1309
1310
1311
1312
1313
1314
1315//---------------------------------------------------------------------------
1316//
1317//      API_Replace        API test for class RegexMatcher, testing the
1318//                         Replace family of functions.
1319//
1320//---------------------------------------------------------------------------
1321void RegexTest::API_Replace() {
1322    //
1323    //  Replace
1324    //
1325    int32_t             flags=0;
1326    UParseError         pe;
1327    UErrorCode          status=U_ZERO_ERROR;
1328
1329    UnicodeString       re("abc");
1330    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1331    REGEX_CHECK_STATUS;
1332    UnicodeString data = ".abc..abc...abc..";
1333    //                    012345678901234567
1334    RegexMatcher *matcher = pat->matcher(data, status);
1335
1336    //
1337    //  Plain vanilla matches.
1338    //
1339    UnicodeString  dest;
1340    dest = matcher->replaceFirst("yz", status);
1341    REGEX_CHECK_STATUS;
1342    REGEX_ASSERT(dest == ".yz..abc...abc..");
1343
1344    dest = matcher->replaceAll("yz", status);
1345    REGEX_CHECK_STATUS;
1346    REGEX_ASSERT(dest == ".yz..yz...yz..");
1347
1348    //
1349    //  Plain vanilla non-matches.
1350    //
1351    UnicodeString d2 = ".abx..abx...abx..";
1352    matcher->reset(d2);
1353    dest = matcher->replaceFirst("yz", status);
1354    REGEX_CHECK_STATUS;
1355    REGEX_ASSERT(dest == ".abx..abx...abx..");
1356
1357    dest = matcher->replaceAll("yz", status);
1358    REGEX_CHECK_STATUS;
1359    REGEX_ASSERT(dest == ".abx..abx...abx..");
1360
1361    //
1362    // Empty source string
1363    //
1364    UnicodeString d3 = "";
1365    matcher->reset(d3);
1366    dest = matcher->replaceFirst("yz", status);
1367    REGEX_CHECK_STATUS;
1368    REGEX_ASSERT(dest == "");
1369
1370    dest = matcher->replaceAll("yz", status);
1371    REGEX_CHECK_STATUS;
1372    REGEX_ASSERT(dest == "");
1373
1374    //
1375    // Empty substitution string
1376    //
1377    matcher->reset(data);              // ".abc..abc...abc.."
1378    dest = matcher->replaceFirst("", status);
1379    REGEX_CHECK_STATUS;
1380    REGEX_ASSERT(dest == "...abc...abc..");
1381
1382    dest = matcher->replaceAll("", status);
1383    REGEX_CHECK_STATUS;
1384    REGEX_ASSERT(dest == "........");
1385
1386    //
1387    // match whole string
1388    //
1389    UnicodeString d4 = "abc";
1390    matcher->reset(d4);
1391    dest = matcher->replaceFirst("xyz", status);
1392    REGEX_CHECK_STATUS;
1393    REGEX_ASSERT(dest == "xyz");
1394
1395    dest = matcher->replaceAll("xyz", status);
1396    REGEX_CHECK_STATUS;
1397    REGEX_ASSERT(dest == "xyz");
1398
1399    //
1400    // Capture Group, simple case
1401    //
1402    UnicodeString       re2("a(..)");
1403    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1404    REGEX_CHECK_STATUS;
1405    UnicodeString d5 = "abcdefg";
1406    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1407    REGEX_CHECK_STATUS;
1408    dest = matcher2->replaceFirst("$1$1", status);
1409    REGEX_CHECK_STATUS;
1410    REGEX_ASSERT(dest == "bcbcdefg");
1411
1412    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1413    REGEX_CHECK_STATUS;
1414    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1415
1416    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1417    REGEX_CHECK_STATUS;
1418    REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1419
1420    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1421    replacement = replacement.unescape();
1422    dest = matcher2->replaceFirst(replacement, status);
1423    REGEX_CHECK_STATUS;
1424    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1425
1426    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1427
1428
1429    //
1430    // Replacement String with \u hex escapes
1431    //
1432    {
1433        UnicodeString  src = "abc 1 abc 2 abc 3";
1434        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1435        matcher->reset(src);
1436        UnicodeString  result = matcher->replaceAll(substitute, status);
1437        REGEX_CHECK_STATUS;
1438        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1439    }
1440    {
1441        UnicodeString  src = "abc !";
1442        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1443        matcher->reset(src);
1444        UnicodeString  result = matcher->replaceAll(substitute, status);
1445        REGEX_CHECK_STATUS;
1446        UnicodeString expected = UnicodeString("--");
1447        expected.append((UChar32)0x10000);
1448        expected.append("-- !");
1449        REGEX_ASSERT(result == expected);
1450    }
1451    // TODO:  need more through testing of capture substitutions.
1452
1453    // Bug 4057
1454    //
1455    {
1456        status = U_ZERO_ERROR;
1457        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1458        RegexMatcher m("ss(.*?)ee", 0, status);
1459        REGEX_CHECK_STATUS;
1460        UnicodeString result;
1461
1462        // Multiple finds do NOT bump up the previous appendReplacement postion.
1463        m.reset(s);
1464        m.find();
1465        m.find();
1466        m.appendReplacement(result, "ooh", status);
1467        REGEX_CHECK_STATUS;
1468        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1469
1470        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1471        status = U_ZERO_ERROR;
1472        result.truncate(0);
1473        m.reset(10, status);
1474        m.find();
1475        m.find();
1476        m.appendReplacement(result, "ooh", status);
1477        REGEX_CHECK_STATUS;
1478        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1479
1480        // find() at interior of string, appendReplacemnt still starts at beginning.
1481        status = U_ZERO_ERROR;
1482        result.truncate(0);
1483        m.reset();
1484        m.find(10, status);
1485        m.find();
1486        m.appendReplacement(result, "ooh", status);
1487        REGEX_CHECK_STATUS;
1488        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1489
1490        m.appendTail(result);
1491        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1492
1493    }
1494
1495    delete matcher2;
1496    delete pat2;
1497    delete matcher;
1498    delete pat;
1499}
1500
1501
1502//---------------------------------------------------------------------------
1503//
1504//      API_Pattern       Test that the API for class RegexPattern is
1505//                        present and nominally working.
1506//
1507//---------------------------------------------------------------------------
1508void RegexTest::API_Pattern() {
1509    RegexPattern        pata;    // Test default constructor to not crash.
1510    RegexPattern        patb;
1511
1512    REGEX_ASSERT(pata == patb);
1513    REGEX_ASSERT(pata == pata);
1514
1515    UnicodeString re1("abc[a-l][m-z]");
1516    UnicodeString re2("def");
1517    UErrorCode    status = U_ZERO_ERROR;
1518    UParseError   pe;
1519
1520    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1521    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1522    REGEX_CHECK_STATUS;
1523    REGEX_ASSERT(*pat1 == *pat1);
1524    REGEX_ASSERT(*pat1 != pata);
1525
1526    // Assign
1527    patb = *pat1;
1528    REGEX_ASSERT(patb == *pat1);
1529
1530    // Copy Construct
1531    RegexPattern patc(*pat1);
1532    REGEX_ASSERT(patc == *pat1);
1533    REGEX_ASSERT(patb == patc);
1534    REGEX_ASSERT(pat1 != pat2);
1535    patb = *pat2;
1536    REGEX_ASSERT(patb != patc);
1537    REGEX_ASSERT(patb == *pat2);
1538
1539    // Compile with no flags.
1540    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1541    REGEX_ASSERT(*pat1a == *pat1);
1542
1543    REGEX_ASSERT(pat1a->flags() == 0);
1544
1545    // Compile with different flags should be not equal
1546    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1547    REGEX_CHECK_STATUS;
1548
1549    REGEX_ASSERT(*pat1b != *pat1a);
1550    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1551    REGEX_ASSERT(pat1a->flags() == 0);
1552    delete pat1b;
1553
1554    // clone
1555    RegexPattern *pat1c = pat1->clone();
1556    REGEX_ASSERT(*pat1c == *pat1);
1557    REGEX_ASSERT(*pat1c != *pat2);
1558
1559    delete pat1c;
1560    delete pat1a;
1561    delete pat1;
1562    delete pat2;
1563
1564
1565    //
1566    //   Verify that a matcher created from a cloned pattern works.
1567    //     (Jitterbug 3423)
1568    //
1569    {
1570        UErrorCode     status     = U_ZERO_ERROR;
1571        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1572        RegexPattern  *pClone     = pSource->clone();
1573        delete         pSource;
1574        RegexMatcher  *mFromClone = pClone->matcher(status);
1575        REGEX_CHECK_STATUS;
1576        UnicodeString s = "Hello World";
1577        mFromClone->reset(s);
1578        REGEX_ASSERT(mFromClone->find() == TRUE);
1579        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1580        REGEX_ASSERT(mFromClone->find() == TRUE);
1581        REGEX_ASSERT(mFromClone->group(status) == "World");
1582        REGEX_ASSERT(mFromClone->find() == FALSE);
1583        delete mFromClone;
1584        delete pClone;
1585    }
1586
1587    //
1588    //   matches convenience API
1589    //
1590    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1591    REGEX_CHECK_STATUS;
1592    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1593    REGEX_CHECK_STATUS;
1594    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1595    REGEX_CHECK_STATUS;
1596    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1597    REGEX_CHECK_STATUS;
1598    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1599    REGEX_CHECK_STATUS;
1600    status = U_INDEX_OUTOFBOUNDS_ERROR;
1601    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1602    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1603
1604
1605    //
1606    // Split()
1607    //
1608    status = U_ZERO_ERROR;
1609    pat1 = RegexPattern::compile(" +",  pe, status);
1610    REGEX_CHECK_STATUS;
1611    UnicodeString  fields[10];
1612
1613    int32_t n;
1614    n = pat1->split("Now is the time", fields, 10, status);
1615    REGEX_CHECK_STATUS;
1616    REGEX_ASSERT(n==4);
1617    REGEX_ASSERT(fields[0]=="Now");
1618    REGEX_ASSERT(fields[1]=="is");
1619    REGEX_ASSERT(fields[2]=="the");
1620    REGEX_ASSERT(fields[3]=="time");
1621    REGEX_ASSERT(fields[4]=="");
1622
1623    n = pat1->split("Now is the time", fields, 2, status);
1624    REGEX_CHECK_STATUS;
1625    REGEX_ASSERT(n==2);
1626    REGEX_ASSERT(fields[0]=="Now");
1627    REGEX_ASSERT(fields[1]=="is the time");
1628    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1629
1630    fields[1] = "*";
1631    status = U_ZERO_ERROR;
1632    n = pat1->split("Now is the time", fields, 1, status);
1633    REGEX_CHECK_STATUS;
1634    REGEX_ASSERT(n==1);
1635    REGEX_ASSERT(fields[0]=="Now is the time");
1636    REGEX_ASSERT(fields[1]=="*");
1637    status = U_ZERO_ERROR;
1638
1639    n = pat1->split("    Now       is the time   ", fields, 10, status);
1640    REGEX_CHECK_STATUS;
1641    REGEX_ASSERT(n==6);
1642    REGEX_ASSERT(fields[0]=="");
1643    REGEX_ASSERT(fields[1]=="Now");
1644    REGEX_ASSERT(fields[2]=="is");
1645    REGEX_ASSERT(fields[3]=="the");
1646    REGEX_ASSERT(fields[4]=="time");
1647    REGEX_ASSERT(fields[5]=="");
1648
1649    n = pat1->split("     ", fields, 10, status);
1650    REGEX_CHECK_STATUS;
1651    REGEX_ASSERT(n==2);
1652    REGEX_ASSERT(fields[0]=="");
1653    REGEX_ASSERT(fields[1]=="");
1654
1655    fields[0] = "foo";
1656    n = pat1->split("", fields, 10, status);
1657    REGEX_CHECK_STATUS;
1658    REGEX_ASSERT(n==0);
1659    REGEX_ASSERT(fields[0]=="foo");
1660
1661    delete pat1;
1662
1663    //  split, with a pattern with (capture)
1664    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1665    REGEX_CHECK_STATUS;
1666
1667    status = U_ZERO_ERROR;
1668    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1669    REGEX_CHECK_STATUS;
1670    REGEX_ASSERT(n==7);
1671    REGEX_ASSERT(fields[0]=="");
1672    REGEX_ASSERT(fields[1]=="a");
1673    REGEX_ASSERT(fields[2]=="Now is ");
1674    REGEX_ASSERT(fields[3]=="b");
1675    REGEX_ASSERT(fields[4]=="the time");
1676    REGEX_ASSERT(fields[5]=="c");
1677    REGEX_ASSERT(fields[6]=="");
1678    REGEX_ASSERT(status==U_ZERO_ERROR);
1679
1680    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1681    REGEX_CHECK_STATUS;
1682    REGEX_ASSERT(n==7);
1683    REGEX_ASSERT(fields[0]=="  ");
1684    REGEX_ASSERT(fields[1]=="a");
1685    REGEX_ASSERT(fields[2]=="Now is ");
1686    REGEX_ASSERT(fields[3]=="b");
1687    REGEX_ASSERT(fields[4]=="the time");
1688    REGEX_ASSERT(fields[5]=="c");
1689    REGEX_ASSERT(fields[6]=="");
1690
1691    status = U_ZERO_ERROR;
1692    fields[6] = "foo";
1693    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1694    REGEX_CHECK_STATUS;
1695    REGEX_ASSERT(n==6);
1696    REGEX_ASSERT(fields[0]=="  ");
1697    REGEX_ASSERT(fields[1]=="a");
1698    REGEX_ASSERT(fields[2]=="Now is ");
1699    REGEX_ASSERT(fields[3]=="b");
1700    REGEX_ASSERT(fields[4]=="the time");
1701    REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1702    REGEX_ASSERT(fields[6]=="foo");
1703
1704    status = U_ZERO_ERROR;
1705    fields[5] = "foo";
1706    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1707    REGEX_CHECK_STATUS;
1708    REGEX_ASSERT(n==5);
1709    REGEX_ASSERT(fields[0]=="  ");
1710    REGEX_ASSERT(fields[1]=="a");
1711    REGEX_ASSERT(fields[2]=="Now is ");
1712    REGEX_ASSERT(fields[3]=="b");
1713    REGEX_ASSERT(fields[4]=="the time<c>");
1714    REGEX_ASSERT(fields[5]=="foo");
1715
1716    status = U_ZERO_ERROR;
1717    fields[5] = "foo";
1718    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1719    REGEX_CHECK_STATUS;
1720    REGEX_ASSERT(n==5);
1721    REGEX_ASSERT(fields[0]=="  ");
1722    REGEX_ASSERT(fields[1]=="a");
1723    REGEX_ASSERT(fields[2]=="Now is ");
1724    REGEX_ASSERT(fields[3]=="b");
1725    REGEX_ASSERT(fields[4]=="the time");
1726    REGEX_ASSERT(fields[5]=="foo");
1727
1728    status = U_ZERO_ERROR;
1729    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1730    REGEX_CHECK_STATUS;
1731    REGEX_ASSERT(n==4);
1732    REGEX_ASSERT(fields[0]=="  ");
1733    REGEX_ASSERT(fields[1]=="a");
1734    REGEX_ASSERT(fields[2]=="Now is ");
1735    REGEX_ASSERT(fields[3]=="the time<c>");
1736    status = U_ZERO_ERROR;
1737    delete pat1;
1738
1739    pat1 = RegexPattern::compile("([-,])",  pe, status);
1740    REGEX_CHECK_STATUS;
1741    n = pat1->split("1-10,20", fields, 10, status);
1742    REGEX_CHECK_STATUS;
1743    REGEX_ASSERT(n==5);
1744    REGEX_ASSERT(fields[0]=="1");
1745    REGEX_ASSERT(fields[1]=="-");
1746    REGEX_ASSERT(fields[2]=="10");
1747    REGEX_ASSERT(fields[3]==",");
1748    REGEX_ASSERT(fields[4]=="20");
1749    delete pat1;
1750
1751    // Test split of string with empty trailing fields
1752    pat1 = RegexPattern::compile(",", pe, status);
1753    REGEX_CHECK_STATUS;
1754    n = pat1->split("a,b,c,", fields, 10, status);
1755    REGEX_CHECK_STATUS;
1756    REGEX_ASSERT(n==4);
1757    REGEX_ASSERT(fields[0]=="a");
1758    REGEX_ASSERT(fields[1]=="b");
1759    REGEX_ASSERT(fields[2]=="c");
1760    REGEX_ASSERT(fields[3]=="");
1761
1762    n = pat1->split("a,,,", fields, 10, status);
1763    REGEX_CHECK_STATUS;
1764    REGEX_ASSERT(n==4);
1765    REGEX_ASSERT(fields[0]=="a");
1766    REGEX_ASSERT(fields[1]=="");
1767    REGEX_ASSERT(fields[2]=="");
1768    REGEX_ASSERT(fields[3]=="");
1769    delete pat1;
1770
1771    // Split Separator with zero length match.
1772    pat1 = RegexPattern::compile(":?", pe, status);
1773    REGEX_CHECK_STATUS;
1774    n = pat1->split("abc", fields, 10, status);
1775    REGEX_CHECK_STATUS;
1776    REGEX_ASSERT(n==5);
1777    REGEX_ASSERT(fields[0]=="");
1778    REGEX_ASSERT(fields[1]=="a");
1779    REGEX_ASSERT(fields[2]=="b");
1780    REGEX_ASSERT(fields[3]=="c");
1781    REGEX_ASSERT(fields[4]=="");
1782
1783    delete pat1;
1784
1785    //
1786    // RegexPattern::pattern()
1787    //
1788    pat1 = new RegexPattern();
1789    REGEX_ASSERT(pat1->pattern() == "");
1790    delete pat1;
1791
1792    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1793    REGEX_CHECK_STATUS;
1794    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1795    delete pat1;
1796
1797
1798    //
1799    // classID functions
1800    //
1801    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1802    REGEX_CHECK_STATUS;
1803    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1804    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1805    UnicodeString Hello("Hello, world.");
1806    RegexMatcher *m = pat1->matcher(Hello, status);
1807    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1808    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1809    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1810    delete m;
1811    delete pat1;
1812
1813}
1814
1815//---------------------------------------------------------------------------
1816//
1817//      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1818//                       is present and working, but excluding functions
1819//                       implementing replace operations.
1820//
1821//---------------------------------------------------------------------------
1822void RegexTest::API_Match_UTF8() {
1823    UParseError         pe;                    // passed to RegexPattern::compile() in the first section
1824    UErrorCode          status=U_ZERO_ERROR;   // used by the first section; every later block declares its own
1825    int32_t             flags = 0;             // compile flags for RegexPattern::compile(); none are set
1826    // Sections: compile/reset/matches/lookingAt, capture groups, find, \G, zero-length find, no input, regions, hitEnd/requireEnd.
1827    //
1828    // Debug - slide failing test cases early: put a case inside the braces below and
1829    //         change "#if 0" to "#if 1"; the return then skips all the remaining sections.
1830#if 0
1831    {
1832    }
1833    return;
1834#endif
1835
1836    //
1837    // Simple pattern compilation
1838    //   (this block uses the function-level pe, status and flags declared above)
1839    {
1840        UText               re = UTEXT_INITIALIZER;
1841        regextst_openUTF8FromInvariant(&re, "abc", -1, &status);   // helper defined elsewhere; presumably opens re over a UTF-8 form of the literal
1842        REGEX_VERBOSE_TEXT(&re);                                   // macro defined elsewhere; presumably logs the text in verbose runs
1843        RegexPattern        *pat2;
1844        pat2 = RegexPattern::compile(&re, flags, pe, status);
1845        REGEX_CHECK_STATUS;                                        // macro defined elsewhere; presumably reports a failing status
1846
1847        UText input1 = UTEXT_INITIALIZER;
1848        UText input2 = UTEXT_INITIALIZER;
1849        UText empty  = UTEXT_INITIALIZER;
1850        regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1851        REGEX_VERBOSE_TEXT(&input1);
1852        regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1853        REGEX_VERBOSE_TEXT(&input2);
1854        utext_openUChars(&empty, NULL, 0, &status);                // zero-length input (NULL buffer, length 0)
1855
1856        int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1857        int32_t input2Len = strlen("not abc");
1858        // strlen() of the same literals; used below as the end-of-input index for reset/matches/lookingAt.
1859
1860        //
1861        // Matcher creation and reset.
1862        //   matcher() yields a pointer this block owns (see "delete m1" below); '&' is applied because reset() returns the matcher by reference.
1863        RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1864        REGEX_CHECK_STATUS;
1865        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1866        const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1867        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());   // inputText() is expected to be the text just set
1868        m1->reset(&input2);
1869        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1870        const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1871        REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1872        m1->reset(&input1);
1873        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1874        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1875        m1->reset(&empty);
1876        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1877        REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1878
1879        //
1880        //  reset(pos, status)
1881        //    positions 0, 4, input1Len-1 and input1Len must be accepted; -1 and input1Len+1 must give U_INDEX_OUTOFBOUNDS_ERROR
1882        m1->reset(&input1);
1883        m1->reset(4, status);
1884        REGEX_CHECK_STATUS;
1885        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1886        REGEX_ASSERT(m1->lookingAt(status) == TRUE);   // "abc" occurs only at index 0, so this is expected to ignore the reset(4) position
1887
1888        m1->reset(-1, status);                         // negative position
1889        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1890        status = U_ZERO_ERROR;
1891
1892        m1->reset(0, status);
1893        REGEX_CHECK_STATUS;
1894        status = U_ZERO_ERROR;
1895
1896        m1->reset(input1Len-1, status);
1897        REGEX_CHECK_STATUS;
1898        status = U_ZERO_ERROR;
1899
1900        m1->reset(input1Len, status);                  // exactly at the end of the input is still valid
1901        REGEX_CHECK_STATUS;
1902        status = U_ZERO_ERROR;
1903
1904        m1->reset(input1Len+1, status);                // one past the end is not
1905        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1906        status = U_ZERO_ERROR;
1907
1908        //
1909        // matches(pos, status)
1910        //   input2 is "not abc": "abc" starts at index 4, so only that start position is expected to match
1911        m1->reset(&input2);
1912        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1913        m1->reset();
1914        REGEX_ASSERT(m1->matches(3, status) == FALSE);
1915        m1->reset();
1916        REGEX_ASSERT(m1->matches(5, status) == FALSE);
1917        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1918        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1919        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);   // status stays in error until it is cleared just below
1920
1921        // Match() at end of string should fail, but should not
1922        //  be an error.
1923        status = U_ZERO_ERROR;
1924        REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1925        REGEX_CHECK_STATUS;
1926
1927        // Match beyond end of string should fail with an error.
1928        status = U_ZERO_ERROR;
1929        REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1930        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1931
1932        // Successful match at end of string.
1933        {
1934            status = U_ZERO_ERROR;
1935            RegexMatcher m("A?", 0, status);  // will match zero length string.
1936            REGEX_CHECK_STATUS;
1937            m.reset(&input1);
1938            REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1939            REGEX_CHECK_STATUS;
1940            m.reset(&empty);
1941            REGEX_ASSERT(m.matches(0, status) == TRUE);
1942            REGEX_CHECK_STATUS;
1943        }
1944
1945
1946        //
1947        // lookingAt(pos, status)
1948        //   same start positions and boundary cases as the matches(pos, status) checks above
1949        status = U_ZERO_ERROR;
1950        m1->reset(&input2);  // "not abc"
1951        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1952        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1953        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1954        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1955        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1956        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1957        status = U_ZERO_ERROR;                               // clear the expected error before the next checks
1958        REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1959        REGEX_CHECK_STATUS;
1960        REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1961        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1962        // Release the matcher and pattern, then close the UTexts opened in this block.
1963        delete m1;
1964        delete pat2;
1965
1966        utext_close(&re);
1967        utext_close(&input1);
1968        utext_close(&input2);
1969        utext_close(&empty);
1970    }
1971
1972
1973    //
1974    // Capture Group.
1975    //     RegexMatcher::start();
1976    //     RegexMatcher::end();
1977    //     RegexMatcher::groupCount();
1978    //     RegexMatcher::group();   (this block never calls groupCount(); the find section below does)
1979    {
1980        int32_t             flags=0;                 // flags, pe and status here shadow the function-level variables
1981        UParseError         pe;
1982        UErrorCode          status=U_ZERO_ERROR;
1983        UText               re=UTEXT_INITIALIZER;
1984        const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1985        utext_openUTF8(&re, str_01234567_pat, -1, &status);
1986
1987        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1988        REGEX_CHECK_STATUS;
1989
1990        UText input = UTEXT_INITIALIZER;
1991        const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1992        utext_openUTF8(&input, str_0123456789, -1, &status);
1993
1994        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1995        REGEX_CHECK_STATUS;
1996        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1997        static const int32_t matchStarts[] = {0,  2, 4, 8};    // expected start of group i; group 0 is the whole match
1998        static const int32_t matchEnds[]   = {10, 8, 6, 10};   // expected end of group i
1999        int32_t i;
2000        for (i=0; i<4; i++) {
2001            int32_t actualStart = matcher->start(i, status);
2002            REGEX_CHECK_STATUS;
2003            if (actualStart != matchStarts[i]) {
2004                errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2005                      __FILE__, __LINE__, i, matchStarts[i], actualStart);
2006            }
2007            int32_t actualEnd = matcher->end(i, status);
2008            REGEX_CHECK_STATUS;
2009            if (actualEnd != matchEnds[i]) {
2010                errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2011                      __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2012            }
2013        }
2014
2015        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2016        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2017        // Group numbers outside 0..3 are expected to be index errors; after reset() there is no match to query.
2018        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2019        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2020        matcher->reset();
2021        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2022
2023        matcher->lookingAt(status);                  // re-establish a match for the group() checks below
2024
2025        UnicodeString dest;
2026        UText destText = UTEXT_INITIALIZER;
2027        utext_openUnicodeString(&destText, &dest, &status);   // destText wraps dest and serves as the caller-supplied destination
2028        UText *result;
2029        //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2030        //  Test shallow-clone API
2031        int64_t   group_len;                         // handed to group() uninitialized (presumably an out-parameter); never checked here
2032        result = matcher->group((UText *)NULL, group_len, status);
2033        REGEX_CHECK_STATUS;
2034        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2035        utext_close(result);                         // result came back for a NULL destination, so the test closes it
2036        result = matcher->group(0, &destText, group_len, status);
2037        REGEX_CHECK_STATUS;
2038        REGEX_ASSERT(result == &destText);
2039        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2040        //  destText is now immutable, reopen it
2041        utext_close(&destText);
2042        utext_openUnicodeString(&destText, &dest, &status);
2043        // group(n, dest, status): with a NULL dest the returned UText is closed by the test; a non-NULL dest must be returned itself.
2044        result = matcher->group(0, NULL, status);
2045        REGEX_CHECK_STATUS;
2046        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2047        utext_close(result);
2048        result = matcher->group(0, &destText, status);
2049        REGEX_CHECK_STATUS;
2050        REGEX_ASSERT(result == &destText);
2051        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2052
2053        result = matcher->group(1, NULL, status);
2054        REGEX_CHECK_STATUS;
2055        const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2056        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2057        utext_close(result);
2058        result = matcher->group(1, &destText, status);
2059        REGEX_CHECK_STATUS;
2060        REGEX_ASSERT(result == &destText);
2061        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2062
2063        result = matcher->group(2, NULL, status);
2064        REGEX_CHECK_STATUS;
2065        const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2066        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2067        utext_close(result);
2068        result = matcher->group(2, &destText, status);
2069        REGEX_CHECK_STATUS;
2070        REGEX_ASSERT(result == &destText);
2071        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2072
2073        result = matcher->group(3, NULL, status);
2074        REGEX_CHECK_STATUS;
2075        const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2076        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2077        utext_close(result);
2078        result = matcher->group(3, &destText, status);
2079        REGEX_CHECK_STATUS;
2080        REGEX_ASSERT(result == &destText);
2081        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2082
2083        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2084        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2085        matcher->reset();
2086        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2087
2088        delete matcher;
2089        delete pat;
2090
2091        utext_close(&destText);
2092        utext_close(&input);
2093        utext_close(&re);
2094    }
2095
2096    //
2097    //  find
2098    //    "abc" occurs at indexes 1, 6 and 12 of the 17-byte input ".abc..abc...abc.."
2099    {
2100        int32_t             flags=0;
2101        UParseError         pe;
2102        UErrorCode          status=U_ZERO_ERROR;
2103        UText               re=UTEXT_INITIALIZER;
2104        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2105        utext_openUTF8(&re, str_abc, -1, &status);
2106
2107        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2108        REGEX_CHECK_STATUS;
2109        UText input = UTEXT_INITIALIZER;
2110        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2111        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2112        //                      012345678901234567
2113
2114        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2115        REGEX_CHECK_STATUS;
2116        REGEX_ASSERT(matcher->find());
2117        REGEX_ASSERT(matcher->start(status) == 1);
2118        REGEX_ASSERT(matcher->find());
2119        REGEX_ASSERT(matcher->start(status) == 6);
2120        REGEX_ASSERT(matcher->find());
2121        REGEX_ASSERT(matcher->start(status) == 12);
2122        REGEX_ASSERT(matcher->find() == FALSE);
2123        REGEX_ASSERT(matcher->find() == FALSE);      // a further find() after exhaustion must keep failing
2124
2125        matcher->reset();
2126        REGEX_ASSERT(matcher->find());
2127        REGEX_ASSERT(matcher->start(status) == 1);
2128        // find(pos, status): the first match starting at or after pos is expected.
2129        REGEX_ASSERT(matcher->find(0, status));
2130        REGEX_ASSERT(matcher->start(status) == 1);
2131        REGEX_ASSERT(matcher->find(1, status));
2132        REGEX_ASSERT(matcher->start(status) == 1);
2133        REGEX_ASSERT(matcher->find(2, status));
2134        REGEX_ASSERT(matcher->start(status) == 6);
2135        REGEX_ASSERT(matcher->find(12, status));
2136        REGEX_ASSERT(matcher->start(status) == 12);
2137        REGEX_ASSERT(matcher->find(13, status) == FALSE);
2138        REGEX_ASSERT(matcher->find(16, status) == FALSE);
2139        REGEX_ASSERT(matcher->find(17, status) == FALSE);
2140        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);   // no current match after a failed find
2141
2142        status = U_ZERO_ERROR;
2143        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2144        status = U_ZERO_ERROR;
2145        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);   // one past the end of the 17-byte input
2146
2147        REGEX_ASSERT(matcher->groupCount() == 0);    // the pattern "abc" has no capture groups
2148
2149        delete matcher;
2150        delete pat;
2151
2152        utext_close(&input);
2153        utext_close(&re);
2154    }
2155
2156
2157    //
2158    //  find, with \G in pattern (true if at the end of a previous match).
2159    //    group 1 is the \G alternative, group 2 the plain one; start(n) == -1 is expected for the group that did not take part
2160    {
2161        int32_t             flags=0;
2162        UParseError         pe;
2163        UErrorCode          status=U_ZERO_ERROR;
2164        UText               re=UTEXT_INITIALIZER;
2165        const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2166        utext_openUTF8(&re, str_Gabcabc, -1, &status);
2167
2168        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2169
2170        REGEX_CHECK_STATUS;
2171        UText input = UTEXT_INITIALIZER;
2172        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2173        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2174        //                      012345678901234567
2175
2176        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2177        REGEX_CHECK_STATUS;
2178        REGEX_ASSERT(matcher->find());
2179        REGEX_ASSERT(matcher->start(status) == 0);
2180        REGEX_ASSERT(matcher->start(1, status) == -1);
2181        REGEX_ASSERT(matcher->start(2, status) == 1);
2182
2183        REGEX_ASSERT(matcher->find());               // second "abc" begins at 4, right where the first match ended
2184        REGEX_ASSERT(matcher->start(status) == 4);
2185        REGEX_ASSERT(matcher->start(1, status) == 4);
2186        REGEX_ASSERT(matcher->start(2, status) == -1);
2187        REGEX_CHECK_STATUS;
2188
2189        delete matcher;
2190        delete pat;
2191
2192        utext_close(&input);
2193        utext_close(&re);
2194    }
2195
2196    //
2197    //   find with zero length matches, match position should bump ahead
2198    //     to prevent loops.
2199    //
2200    {
2201        int32_t                 i;
2202        UErrorCode          status=U_ZERO_ERROR;
2203        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2204                                                      //   using an always-true look-ahead.
2205        REGEX_CHECK_STATUS;
2206        UText s = UTEXT_INITIALIZER;
2207        utext_openUTF8(&s, "    ", -1, &status);      // four spaces: empty matches expected at 0,1,2,3,4
2208        m.reset(&s);
2209        for (i=0; ; i++) {
2210            if (m.find() == FALSE) {
2211                break;
2212            }
2213            REGEX_ASSERT(m.start(status) == i);
2214            REGEX_ASSERT(m.end(status) == i);
2215        }
2216        REGEX_ASSERT(i==5);
2217
2218        // Check that the bump goes over characters outside the BMP OK
2219        // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2220        unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2221        utext_openUTF8(&s, (char *)aboveBMP, -1, &status);   // opens the same UText struct again, now over aboveBMP
2222        m.reset(&s);
2223        for (i=0; ; i+=4) {                                  // each character in aboveBMP is a 4-byte sequence
2224            if (m.find() == FALSE) {
2225                break;
2226            }
2227            REGEX_ASSERT(m.start(status) == i);
2228            REGEX_ASSERT(m.end(status) == i);
2229        }
2230        REGEX_ASSERT(i==20);                                 // matches at 0,4,8,12,16; find() fails on the pass with i==20
2231
2232        utext_close(&s);
2233    }
2234    {
2235        // find() loop breaking test.
2236        //        with pattern of /.?/, should see a series of one char matches, then a single
2237        //        match of zero length at the end of the input string.
2238        int32_t                 i;
2239        UErrorCode          status=U_ZERO_ERROR;
2240        RegexMatcher        m(".?", 0, status);
2241        REGEX_CHECK_STATUS;
2242        UText s = UTEXT_INITIALIZER;
2243        utext_openUTF8(&s, "    ", -1, &status);
2244        m.reset(&s);
2245        for (i=0; ; i++) {
2246            if (m.find() == FALSE) {
2247                break;
2248            }
2249            REGEX_ASSERT(m.start(status) == i);
2250            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // the fifth match (i==4) is the empty one at the end
2251        }
2252        REGEX_ASSERT(i==5);
2253
2254        utext_close(&s);
2255    }
2256
2257
2258    //
2259    // Matchers with no input string behave as if they had an empty input string.
2260    //
2261
2262    {
2263        UErrorCode status = U_ZERO_ERROR;
2264        RegexMatcher  m(".?", 0, status);            // no input text is ever set on m
2265        REGEX_CHECK_STATUS;
2266        REGEX_ASSERT(m.find());
2267        REGEX_ASSERT(m.start(status) == 0);
2268        REGEX_ASSERT(m.input() == "");
2269    }
2270    {
2271        UErrorCode status = U_ZERO_ERROR;
2272        RegexPattern  *p = RegexPattern::compile(".", 0, status);
2273        RegexMatcher  *m = p->matcher(status);       // matcher created without any input
2274        REGEX_CHECK_STATUS;
2275
2276        REGEX_ASSERT(m->find() == FALSE);            // "." needs one character and there is none
2277        REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2278        delete m;
2279        delete p;
2280    }
2281
2282    //
2283    // Regions
2284    //    initially: region == whole input, anchoring bounds TRUE, transparent bounds FALSE (asserted right after construction)
2285    {
2286        UErrorCode status = U_ZERO_ERROR;
2287        UText testPattern = UTEXT_INITIALIZER;
2288        UText testText    = UTEXT_INITIALIZER;
2289        regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2290        REGEX_VERBOSE_TEXT(&testPattern);
2291        regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2292        REGEX_VERBOSE_TEXT(&testText);
2293
2294        RegexMatcher m(&testPattern, &testText, 0, status);
2295        REGEX_CHECK_STATUS;
2296        REGEX_ASSERT(m.regionStart() == 0);
2297        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2298        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2299        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2300
2301        m.region(2,4, status);                       // restrict matching to indexes 2..4
2302        REGEX_CHECK_STATUS;
2303        REGEX_ASSERT(m.matches(status));
2304        REGEX_ASSERT(m.start(status)==2);
2305        REGEX_ASSERT(m.end(status)==4);
2306        REGEX_CHECK_STATUS;
2307
2308        m.reset();                                   // reset() is expected to restore the full-input region
2309        REGEX_ASSERT(m.regionStart() == 0);
2310        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2311
2312        regextst_openUTF8FromInvariant(&testText, "short", -1, &status);   // opens testText again over a shorter string
2313        REGEX_VERBOSE_TEXT(&testText);
2314        m.reset(&testText);
2315        REGEX_ASSERT(m.regionStart() == 0);
2316        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2317        // The use*Bounds() setters must return the matcher itself, and each setting must survive a reset().
2318        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2319        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2320        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2321        REGEX_ASSERT(&m == &m.reset());
2322        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2323
2324        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2325        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2326        REGEX_ASSERT(&m == &m.reset());
2327        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2328
2329        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2330        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2331        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2332        REGEX_ASSERT(&m == &m.reset());
2333        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2334
2335        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2336        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2337        REGEX_ASSERT(&m == &m.reset());
2338        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2339
2340        utext_close(&testText);
2341        utext_close(&testPattern);
2342    }
2343
2344    //
2345    // hitEnd() and requireEnd()
2346    //    three patterns (.*  a*  .*$) are each run with lookingAt() against the same input "aabb"
2347    {
2348        UErrorCode status = U_ZERO_ERROR;
2349        UText testPattern = UTEXT_INITIALIZER;
2350        UText testText    = UTEXT_INITIALIZER;
2351        const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2352        const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2353        utext_openUTF8(&testPattern, str_, -1, &status);
2354        utext_openUTF8(&testText, str_aabb, -1, &status);
2355
2356        RegexMatcher m1(&testPattern, &testText,  0, status);
2357        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2358        REGEX_ASSERT(m1.hitEnd() == TRUE);
2359        REGEX_ASSERT(m1.requireEnd() == FALSE);
2360        REGEX_CHECK_STATUS;
2361
2362        status = U_ZERO_ERROR;
2363        const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2364        utext_openUTF8(&testPattern, str_a, -1, &status);    // testPattern is opened again over the next pattern
2365        RegexMatcher m2(&testPattern, &testText, 0, status);
2366        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2367        REGEX_ASSERT(m2.hitEnd() == FALSE);                  // a* can only cover the leading "aa" of "aabb"
2368        REGEX_ASSERT(m2.requireEnd() == FALSE);
2369        REGEX_CHECK_STATUS;
2370
2371        status = U_ZERO_ERROR;
2372        const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2373        utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2374        RegexMatcher m3(&testPattern, &testText, 0, status);
2375        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2376        REGEX_ASSERT(m3.hitEnd() == TRUE);
2377        REGEX_ASSERT(m3.requireEnd() == TRUE);               // expected TRUE only for the pattern that ends in $
2378        REGEX_CHECK_STATUS;
2379
2380        utext_close(&testText);
2381        utext_close(&testPattern);
2382    }
2383}
2384
2385
2386//---------------------------------------------------------------------------
2387//
2388//      API_Replace_UTF8   API test for class RegexMatcher, testing the
2389//                         Replace family of functions.
2390//
2391//---------------------------------------------------------------------------
2392void RegexTest::API_Replace_UTF8() {
2393    //
2394    //  Replace
2395    //
2396    int32_t             flags=0;
2397    UParseError         pe;
2398    UErrorCode          status=U_ZERO_ERROR;
2399
2400    UText               re=UTEXT_INITIALIZER;
2401    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2402    REGEX_VERBOSE_TEXT(&re);
2403    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2404    REGEX_CHECK_STATUS;
2405
2406    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2407    //             012345678901234567
2408    UText dataText = UTEXT_INITIALIZER;
2409    utext_openUTF8(&dataText, data, -1, &status);
2410    REGEX_CHECK_STATUS;
2411    REGEX_VERBOSE_TEXT(&dataText);
2412    RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2413
2414    //
2415    //  Plain vanilla matches.
2416    //
2417    UnicodeString  dest;
2418    UText destText = UTEXT_INITIALIZER;
2419    utext_openUnicodeString(&destText, &dest, &status);
2420    UText *result;
2421
2422    UText replText = UTEXT_INITIALIZER;
2423
2424    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2425    utext_openUTF8(&replText, str_yz, -1, &status);
2426    REGEX_VERBOSE_TEXT(&replText);
2427    result = matcher->replaceFirst(&replText, NULL, status);
2428    REGEX_CHECK_STATUS;
2429    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2430    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2431    utext_close(result);
2432    result = matcher->replaceFirst(&replText, &destText, status);
2433    REGEX_CHECK_STATUS;
2434    REGEX_ASSERT(result == &destText);
2435    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2436
2437    result = matcher->replaceAll(&replText, NULL, status);
2438    REGEX_CHECK_STATUS;
2439    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2440    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2441    utext_close(result);
2442
2443    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2444    result = matcher->replaceAll(&replText, &destText, status);
2445    REGEX_CHECK_STATUS;
2446    REGEX_ASSERT(result == &destText);
2447    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2448
2449    //
2450    //  Plain vanilla non-matches.
2451    //
2452    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2453    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2454    matcher->reset(&dataText);
2455
2456    result = matcher->replaceFirst(&replText, NULL, status);
2457    REGEX_CHECK_STATUS;
2458    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2459    utext_close(result);
2460    result = matcher->replaceFirst(&replText, &destText, status);
2461    REGEX_CHECK_STATUS;
2462    REGEX_ASSERT(result == &destText);
2463    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2464
2465    result = matcher->replaceAll(&replText, NULL, status);
2466    REGEX_CHECK_STATUS;
2467    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2468    utext_close(result);
2469    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2470    result = matcher->replaceAll(&replText, &destText, status);
2471    REGEX_CHECK_STATUS;
2472    REGEX_ASSERT(result == &destText);
2473    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2474
2475    //
2476    // Empty source string
2477    //
2478    utext_openUTF8(&dataText, NULL, 0, &status);
2479    matcher->reset(&dataText);
2480
2481    result = matcher->replaceFirst(&replText, NULL, status);
2482    REGEX_CHECK_STATUS;
2483    REGEX_ASSERT_UTEXT_UTF8("", result);
2484    utext_close(result);
2485    result = matcher->replaceFirst(&replText, &destText, status);
2486    REGEX_CHECK_STATUS;
2487    REGEX_ASSERT(result == &destText);
2488    REGEX_ASSERT_UTEXT_UTF8("", result);
2489
2490    result = matcher->replaceAll(&replText, NULL, status);
2491    REGEX_CHECK_STATUS;
2492    REGEX_ASSERT_UTEXT_UTF8("", result);
2493    utext_close(result);
2494    result = matcher->replaceAll(&replText, &destText, status);
2495    REGEX_CHECK_STATUS;
2496    REGEX_ASSERT(result == &destText);
2497    REGEX_ASSERT_UTEXT_UTF8("", result);
2498
2499    //
2500    // Empty substitution string
2501    //
2502    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2503    matcher->reset(&dataText);
2504
2505    utext_openUTF8(&replText, NULL, 0, &status);
2506    result = matcher->replaceFirst(&replText, NULL, status);
2507    REGEX_CHECK_STATUS;
2508    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2509    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2510    utext_close(result);
2511    result = matcher->replaceFirst(&replText, &destText, status);
2512    REGEX_CHECK_STATUS;
2513    REGEX_ASSERT(result == &destText);
2514    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2515
2516    result = matcher->replaceAll(&replText, NULL, status);
2517    REGEX_CHECK_STATUS;
2518    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2519    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2520    utext_close(result);
2521    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2522    result = matcher->replaceAll(&replText, &destText, status);
2523    REGEX_CHECK_STATUS;
2524    REGEX_ASSERT(result == &destText);
2525    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2526
2527    //
2528    // match whole string
2529    //
2530    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2531    utext_openUTF8(&dataText, str_abc, -1, &status);
2532    matcher->reset(&dataText);
2533
2534    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2535    utext_openUTF8(&replText, str_xyz, -1, &status);
2536    result = matcher->replaceFirst(&replText, NULL, status);
2537    REGEX_CHECK_STATUS;
2538    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2539    utext_close(result);
2540    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2541    result = matcher->replaceFirst(&replText, &destText, status);
2542    REGEX_CHECK_STATUS;
2543    REGEX_ASSERT(result == &destText);
2544    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2545
2546    result = matcher->replaceAll(&replText, NULL, status);
2547    REGEX_CHECK_STATUS;
2548    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2549    utext_close(result);
2550    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2551    result = matcher->replaceAll(&replText, &destText, status);
2552    REGEX_CHECK_STATUS;
2553    REGEX_ASSERT(result == &destText);
2554    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2555
2556    //
2557    // Capture Group, simple case
2558    //
2559    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2560    utext_openUTF8(&re, str_add, -1, &status);
2561    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2562    REGEX_CHECK_STATUS;
2563
2564    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2565    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2566    RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2567    REGEX_CHECK_STATUS;
2568
2569    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2570    utext_openUTF8(&replText, str_11, -1, &status);
2571    result = matcher2->replaceFirst(&replText, NULL, status);
2572    REGEX_CHECK_STATUS;
2573    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2574    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2575    utext_close(result);
2576    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2577    result = matcher2->replaceFirst(&replText, &destText, status);
2578    REGEX_CHECK_STATUS;
2579    REGEX_ASSERT(result == &destText);
2580    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2581
2582    const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2583    utext_openUTF8(&replText, str_v, -1, &status);
2584    REGEX_VERBOSE_TEXT(&replText);
2585    result = matcher2->replaceFirst(&replText, NULL, status);
2586    REGEX_CHECK_STATUS;
2587    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2588    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2589    utext_close(result);
2590    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2591    result = matcher2->replaceFirst(&replText, &destText, status);
2592    REGEX_CHECK_STATUS;
2593    REGEX_ASSERT(result == &destText);
2594    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2595
2596    const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2597    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2598    result = matcher2->replaceFirst(&replText, NULL, status);
2599    REGEX_CHECK_STATUS;
2600    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2601    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2602    utext_close(result);
2603    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2604    result = matcher2->replaceFirst(&replText, &destText, status);
2605    REGEX_CHECK_STATUS;
2606    REGEX_ASSERT(result == &destText);
2607    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2608
2609    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2610    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2611    //                                 012345678901234567890123456
2612    supplDigitChars[22] = 0xF0;
2613    supplDigitChars[23] = 0x9D;
2614    supplDigitChars[24] = 0x9F;
2615    supplDigitChars[25] = 0x8F;
2616    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2617
2618    result = matcher2->replaceFirst(&replText, NULL, status);
2619    REGEX_CHECK_STATUS;
2620    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2621    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2622    utext_close(result);
2623    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2624    result = matcher2->replaceFirst(&replText, &destText, status);
2625    REGEX_CHECK_STATUS;
2626    REGEX_ASSERT(result == &destText);
2627    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2628    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2629    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2630    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2631//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632    utext_close(result);
2633    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2634    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2635    REGEX_ASSERT(result == &destText);
2636//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2637
2638    //
2639    // Replacement String with \u hex escapes
2640    //
2641    {
2642      const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2643      const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2644        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2645        utext_openUTF8(&replText, str_u0043, -1, &status);
2646        matcher->reset(&dataText);
2647
2648        result = matcher->replaceAll(&replText, NULL, status);
2649        REGEX_CHECK_STATUS;
2650        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2651        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2652        utext_close(result);
2653        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2654        result = matcher->replaceAll(&replText, &destText, status);
2655        REGEX_CHECK_STATUS;
2656        REGEX_ASSERT(result == &destText);
2657        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2658    }
2659    {
2660      const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2661        utext_openUTF8(&dataText, str_abc, -1, &status);
2662        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2663        utext_openUTF8(&replText, str_U00010000, -1, &status);
2664        matcher->reset(&dataText);
2665
2666        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2667        //                          0123456789
2668        expected[2] = 0xF0;
2669        expected[3] = 0x90;
2670        expected[4] = 0x80;
2671        expected[5] = 0x80;
2672
2673        result = matcher->replaceAll(&replText, NULL, status);
2674        REGEX_CHECK_STATUS;
2675        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2676        utext_close(result);
2677        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2678        result = matcher->replaceAll(&replText, &destText, status);
2679        REGEX_CHECK_STATUS;
2680        REGEX_ASSERT(result == &destText);
2681        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2682    }
    // TODO:  need more thorough testing of capture substitutions.
2684
2685    // Bug 4057
2686    //
2687    {
2688        status = U_ZERO_ERROR;
2689const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2690const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2691const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2692        utext_openUTF8(&re, str_ssee, -1, &status);
2693        utext_openUTF8(&dataText, str_blah, -1, &status);
2694        utext_openUTF8(&replText, str_ooh, -1, &status);
2695
2696        RegexMatcher m(&re, 0, status);
2697        REGEX_CHECK_STATUS;
2698
2699        UnicodeString result;
2700        UText resultText = UTEXT_INITIALIZER;
2701        utext_openUnicodeString(&resultText, &result, &status);
2702
        // Multiple finds do NOT bump up the previous appendReplacement position.
2704        m.reset(&dataText);
2705        m.find();
2706        m.find();
2707        m.appendReplacement(&resultText, &replText, status);
2708        REGEX_CHECK_STATUS;
2709        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2710        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2711
2712        // After a reset into the interior of a string, appendReplacement still starts at beginning.
2713        status = U_ZERO_ERROR;
2714        result.truncate(0);
2715        utext_openUnicodeString(&resultText, &result, &status);
2716        m.reset(10, status);
2717        m.find();
2718        m.find();
2719        m.appendReplacement(&resultText, &replText, status);
2720        REGEX_CHECK_STATUS;
2721        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2722        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2723
2724        // find() at interior of string, appendReplacement still starts at beginning.
2725        status = U_ZERO_ERROR;
2726        result.truncate(0);
2727        utext_openUnicodeString(&resultText, &result, &status);
2728        m.reset();
2729        m.find(10, status);
2730        m.find();
2731        m.appendReplacement(&resultText, &replText, status);
2732        REGEX_CHECK_STATUS;
2733        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2734        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2735
2736        m.appendTail(&resultText, status);
2737        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2738        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2739
2740        utext_close(&resultText);
2741    }
2742
2743    delete matcher2;
2744    delete pat2;
2745    delete matcher;
2746    delete pat;
2747
2748    utext_close(&dataText);
2749    utext_close(&replText);
2750    utext_close(&destText);
2751    utext_close(&re);
2752}
2753
2754
2755//---------------------------------------------------------------------------
2756//
2757//      API_Pattern_UTF8  Test that the API for class RegexPattern is
2758//                        present and nominally working.
2759//
2760//---------------------------------------------------------------------------
2761void RegexTest::API_Pattern_UTF8() {
2762    RegexPattern        pata;    // Test default constructor to not crash.
2763    RegexPattern        patb;
2764
2765    REGEX_ASSERT(pata == patb);
2766    REGEX_ASSERT(pata == pata);
2767
2768    UText         re1 = UTEXT_INITIALIZER;
2769    UText         re2 = UTEXT_INITIALIZER;
2770    UErrorCode    status = U_ZERO_ERROR;
2771    UParseError   pe;
2772
2773    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2774    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2775    utext_openUTF8(&re1, str_abcalmz, -1, &status);
2776    utext_openUTF8(&re2, str_def, -1, &status);
2777
2778    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2779    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2780    REGEX_CHECK_STATUS;
2781    REGEX_ASSERT(*pat1 == *pat1);
2782    REGEX_ASSERT(*pat1 != pata);
2783
2784    // Assign
2785    patb = *pat1;
2786    REGEX_ASSERT(patb == *pat1);
2787
2788    // Copy Construct
2789    RegexPattern patc(*pat1);
2790    REGEX_ASSERT(patc == *pat1);
2791    REGEX_ASSERT(patb == patc);
2792    REGEX_ASSERT(pat1 != pat2);
2793    patb = *pat2;
2794    REGEX_ASSERT(patb != patc);
2795    REGEX_ASSERT(patb == *pat2);
2796
2797    // Compile with no flags.
2798    RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2799    REGEX_ASSERT(*pat1a == *pat1);
2800
2801    REGEX_ASSERT(pat1a->flags() == 0);
2802
2803    // Compile with different flags should be not equal
2804    RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2805    REGEX_CHECK_STATUS;
2806
2807    REGEX_ASSERT(*pat1b != *pat1a);
2808    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2809    REGEX_ASSERT(pat1a->flags() == 0);
2810    delete pat1b;
2811
2812    // clone
2813    RegexPattern *pat1c = pat1->clone();
2814    REGEX_ASSERT(*pat1c == *pat1);
2815    REGEX_ASSERT(*pat1c != *pat2);
2816
2817    delete pat1c;
2818    delete pat1a;
2819    delete pat1;
2820    delete pat2;
2821
2822    utext_close(&re1);
2823    utext_close(&re2);
2824
2825
2826    //
2827    //   Verify that a matcher created from a cloned pattern works.
2828    //     (Jitterbug 3423)
2829    //
2830    {
2831        UErrorCode     status     = U_ZERO_ERROR;
2832        UText          pattern    = UTEXT_INITIALIZER;
2833        const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2834        utext_openUTF8(&pattern, str_pL, -1, &status);
2835
2836        RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2837        RegexPattern  *pClone     = pSource->clone();
2838        delete         pSource;
2839        RegexMatcher  *mFromClone = pClone->matcher(status);
2840        REGEX_CHECK_STATUS;
2841
2842        UText          input      = UTEXT_INITIALIZER;
2843        const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2844        utext_openUTF8(&input, str_HelloWorld, -1, &status);
2845        mFromClone->reset(&input);
2846        REGEX_ASSERT(mFromClone->find() == TRUE);
2847        REGEX_ASSERT(mFromClone->group(status) == "Hello");
2848        REGEX_ASSERT(mFromClone->find() == TRUE);
2849        REGEX_ASSERT(mFromClone->group(status) == "World");
2850        REGEX_ASSERT(mFromClone->find() == FALSE);
2851        delete mFromClone;
2852        delete pClone;
2853
2854        utext_close(&input);
2855        utext_close(&pattern);
2856    }
2857
2858    //
2859    //   matches convenience API
2860    //
2861    {
2862        UErrorCode status  = U_ZERO_ERROR;
2863        UText      pattern = UTEXT_INITIALIZER;
2864        UText      input   = UTEXT_INITIALIZER;
2865
2866        const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2867        utext_openUTF8(&input, str_randominput, -1, &status);
2868
2869        const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2870        utext_openUTF8(&pattern, str_dotstar, -1, &status);
2871        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2872        REGEX_CHECK_STATUS;
2873
2874        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2875        utext_openUTF8(&pattern, str_abc, -1, &status);
2876        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2877        REGEX_CHECK_STATUS;
2878
2879        const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2880        utext_openUTF8(&pattern, str_nput, -1, &status);
2881        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2882        REGEX_CHECK_STATUS;
2883
2884        utext_openUTF8(&pattern, str_randominput, -1, &status);
2885        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2886        REGEX_CHECK_STATUS;
2887
2888        const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2889        utext_openUTF8(&pattern, str_u, -1, &status);
2890        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2891        REGEX_CHECK_STATUS;
2892
2893        utext_openUTF8(&input, str_abc, -1, &status);
2894        utext_openUTF8(&pattern, str_abc, -1, &status);
2895        status = U_INDEX_OUTOFBOUNDS_ERROR;
2896        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2897        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2898
2899        utext_close(&input);
2900        utext_close(&pattern);
2901    }
2902
2903
2904    //
2905    // Split()
2906    //
2907    status = U_ZERO_ERROR;
2908    const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2909    utext_openUTF8(&re1, str_spaceplus, -1, &status);
2910    pat1 = RegexPattern::compile(&re1, pe, status);
2911    REGEX_CHECK_STATUS;
2912    UnicodeString  fields[10];
2913
2914    int32_t n;
2915    n = pat1->split("Now is the time", fields, 10, status);
2916    REGEX_CHECK_STATUS;
2917    REGEX_ASSERT(n==4);
2918    REGEX_ASSERT(fields[0]=="Now");
2919    REGEX_ASSERT(fields[1]=="is");
2920    REGEX_ASSERT(fields[2]=="the");
2921    REGEX_ASSERT(fields[3]=="time");
2922    REGEX_ASSERT(fields[4]=="");
2923
2924    n = pat1->split("Now is the time", fields, 2, status);
2925    REGEX_CHECK_STATUS;
2926    REGEX_ASSERT(n==2);
2927    REGEX_ASSERT(fields[0]=="Now");
2928    REGEX_ASSERT(fields[1]=="is the time");
2929    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2930
2931    fields[1] = "*";
2932    status = U_ZERO_ERROR;
2933    n = pat1->split("Now is the time", fields, 1, status);
2934    REGEX_CHECK_STATUS;
2935    REGEX_ASSERT(n==1);
2936    REGEX_ASSERT(fields[0]=="Now is the time");
2937    REGEX_ASSERT(fields[1]=="*");
2938    status = U_ZERO_ERROR;
2939
2940    n = pat1->split("    Now       is the time   ", fields, 10, status);
2941    REGEX_CHECK_STATUS;
2942    REGEX_ASSERT(n==6);
2943    REGEX_ASSERT(fields[0]=="");
2944    REGEX_ASSERT(fields[1]=="Now");
2945    REGEX_ASSERT(fields[2]=="is");
2946    REGEX_ASSERT(fields[3]=="the");
2947    REGEX_ASSERT(fields[4]=="time");
2948    REGEX_ASSERT(fields[5]=="");
2949    REGEX_ASSERT(fields[6]=="");
2950
2951    fields[2] = "*";
2952    n = pat1->split("     ", fields, 10, status);
2953    REGEX_CHECK_STATUS;
2954    REGEX_ASSERT(n==2);
2955    REGEX_ASSERT(fields[0]=="");
2956    REGEX_ASSERT(fields[1]=="");
2957    REGEX_ASSERT(fields[2]=="*");
2958
2959    fields[0] = "foo";
2960    n = pat1->split("", fields, 10, status);
2961    REGEX_CHECK_STATUS;
2962    REGEX_ASSERT(n==0);
2963    REGEX_ASSERT(fields[0]=="foo");
2964
2965    delete pat1;
2966
2967    //  split, with a pattern with (capture)
2968    regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2969    pat1 = RegexPattern::compile(&re1,  pe, status);
2970    REGEX_CHECK_STATUS;
2971
2972    status = U_ZERO_ERROR;
2973    fields[6] = fields[7] = "*";
2974    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2975    REGEX_CHECK_STATUS;
2976    REGEX_ASSERT(n==7);
2977    REGEX_ASSERT(fields[0]=="");
2978    REGEX_ASSERT(fields[1]=="a");
2979    REGEX_ASSERT(fields[2]=="Now is ");
2980    REGEX_ASSERT(fields[3]=="b");
2981    REGEX_ASSERT(fields[4]=="the time");
2982    REGEX_ASSERT(fields[5]=="c");
2983    REGEX_ASSERT(fields[6]=="");
2984    REGEX_ASSERT(fields[7]=="*");
2985    REGEX_ASSERT(status==U_ZERO_ERROR);
2986
2987    fields[6] = fields[7] = "*";
2988    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2989    REGEX_CHECK_STATUS;
2990    REGEX_ASSERT(n==7);
2991    REGEX_ASSERT(fields[0]=="  ");
2992    REGEX_ASSERT(fields[1]=="a");
2993    REGEX_ASSERT(fields[2]=="Now is ");
2994    REGEX_ASSERT(fields[3]=="b");
2995    REGEX_ASSERT(fields[4]=="the time");
2996    REGEX_ASSERT(fields[5]=="c");
2997    REGEX_ASSERT(fields[6]=="");
2998    REGEX_ASSERT(fields[7]=="*");
2999
3000    status = U_ZERO_ERROR;
3001    fields[6] = "foo";
3002    n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3003    REGEX_CHECK_STATUS;
3004    REGEX_ASSERT(n==6);
3005    REGEX_ASSERT(fields[0]=="  ");
3006    REGEX_ASSERT(fields[1]=="a");
3007    REGEX_ASSERT(fields[2]=="Now is ");
3008    REGEX_ASSERT(fields[3]=="b");
3009    REGEX_ASSERT(fields[4]=="the time");
3010    REGEX_ASSERT(fields[5]==" ");
3011    REGEX_ASSERT(fields[6]=="foo");
3012
3013    status = U_ZERO_ERROR;
3014    fields[5] = "foo";
3015    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3016    REGEX_CHECK_STATUS;
3017    REGEX_ASSERT(n==5);
3018    REGEX_ASSERT(fields[0]=="  ");
3019    REGEX_ASSERT(fields[1]=="a");
3020    REGEX_ASSERT(fields[2]=="Now is ");
3021    REGEX_ASSERT(fields[3]=="b");
3022    REGEX_ASSERT(fields[4]=="the time<c>");
3023    REGEX_ASSERT(fields[5]=="foo");
3024
3025    status = U_ZERO_ERROR;
3026    fields[5] = "foo";
3027    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3028    REGEX_CHECK_STATUS;
3029    REGEX_ASSERT(n==5);
3030    REGEX_ASSERT(fields[0]=="  ");
3031    REGEX_ASSERT(fields[1]=="a");
3032    REGEX_ASSERT(fields[2]=="Now is ");
3033    REGEX_ASSERT(fields[3]=="b");
3034    REGEX_ASSERT(fields[4]=="the time");
3035    REGEX_ASSERT(fields[5]=="foo");
3036
3037    status = U_ZERO_ERROR;
3038    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3039    REGEX_CHECK_STATUS;
3040    REGEX_ASSERT(n==4);
3041    REGEX_ASSERT(fields[0]=="  ");
3042    REGEX_ASSERT(fields[1]=="a");
3043    REGEX_ASSERT(fields[2]=="Now is ");
3044    REGEX_ASSERT(fields[3]=="the time<c>");
3045    status = U_ZERO_ERROR;
3046    delete pat1;
3047
3048    regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3049    pat1 = RegexPattern::compile(&re1, pe, status);
3050    REGEX_CHECK_STATUS;
3051    n = pat1->split("1-10,20", fields, 10, status);
3052    REGEX_CHECK_STATUS;
3053    REGEX_ASSERT(n==5);
3054    REGEX_ASSERT(fields[0]=="1");
3055    REGEX_ASSERT(fields[1]=="-");
3056    REGEX_ASSERT(fields[2]=="10");
3057    REGEX_ASSERT(fields[3]==",");
3058    REGEX_ASSERT(fields[4]=="20");
3059    delete pat1;
3060
3061
3062    //
3063    // RegexPattern::pattern() and patternText()
3064    //
3065    pat1 = new RegexPattern();
3066    REGEX_ASSERT(pat1->pattern() == "");
3067    REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3068    delete pat1;
3069    const char *helloWorldInvariant = "(Hello, world)*";
3070    regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3071    pat1 = RegexPattern::compile(&re1, pe, status);
3072    REGEX_CHECK_STATUS;
3073    REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
3074    REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3075    delete pat1;
3076
3077    utext_close(&re1);
3078}
3079
3080
3081//---------------------------------------------------------------------------
3082//
3083//      Extended       A more thorough check for features of regex patterns
3084//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3086//                     A description of the test data format is included in that file.
3087//
3088//---------------------------------------------------------------------------
3089
3090const char *
3091RegexTest::getPath(char buffer[2048], const char *filename) {
3092    UErrorCode status=U_ZERO_ERROR;
3093    const char *testDataDirectory = IntlTest::getSourceTestData(status);
3094    if (U_FAILURE(status)) {
3095        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3096        return NULL;
3097    }
3098
3099    strcpy(buffer, testDataDirectory);
3100    strcat(buffer, filename);
3101    return buffer;
3102}
3103
3104void RegexTest::Extended() {
3105    char tdd[2048];
3106    const char *srcPath;
3107    UErrorCode  status  = U_ZERO_ERROR;
3108    int32_t     lineNum = 0;
3109
3110    //
3111    //  Open and read the test data file.
3112    //
3113    srcPath=getPath(tdd, "regextst.txt");
3114    if(srcPath==NULL) {
3115        return; /* something went wrong, error already output */
3116    }
3117
3118    int32_t    len;
3119    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3120    if (U_FAILURE(status)) {
3121        return; /* something went wrong, error already output */
3122    }
3123
3124    //
3125    //  Put the test data into a UnicodeString
3126    //
3127    UnicodeString testString(FALSE, testData, len);
3128
3129    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3130    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3131    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3132
3133    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3134    UnicodeString   testPattern;   // The pattern for test from the test file.
3135    UnicodeString   testFlags;     // the flags   for a test.
3136    UnicodeString   matchString;   // The marked up string to be used as input
3137
3138    if (U_FAILURE(status)){
3139        dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3140        delete [] testData;
3141        return;
3142    }
3143
3144    //
3145    //  Loop over the test data file, once per line.
3146    //
3147    while (lineMat.find()) {
3148        lineNum++;
3149        if (U_FAILURE(status)) {
3150          errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3151        }
3152
3153        status = U_ZERO_ERROR;
3154        UnicodeString testLine = lineMat.group(1, status);
3155        if (testLine.length() == 0) {
3156            continue;
3157        }
3158
3159        //
3160        // Parse the test line.  Skip blank and comment only lines.
3161        // Separate out the three main fields - pattern, flags, target.
3162        //
3163
3164        commentMat.reset(testLine);
3165        if (commentMat.lookingAt(status)) {
3166            // This line is a comment, or blank.
3167            continue;
3168        }
3169
3170        //
3171        //  Pull out the pattern field, remove it from the test file line.
3172        //
3173        quotedStuffMat.reset(testLine);
3174        if (quotedStuffMat.lookingAt(status)) {
3175            testPattern = quotedStuffMat.group(2, status);
3176            testLine.remove(0, quotedStuffMat.end(0, status));
3177        } else {
3178            errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3179            continue;
3180        }
3181
3182
3183        //
3184        //  Pull out the flags from the test file line.
3185        //
3186        flagsMat.reset(testLine);
3187        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3188        testFlags = flagsMat.group(1, status);
3189        if (flagsMat.group(2, status).length() > 0) {
3190            errln("Bad Match flag at line %d. Scanning %c\n",
3191                lineNum, flagsMat.group(2, status).charAt(0));
3192            continue;
3193        }
3194        testLine.remove(0, flagsMat.end(0, status));
3195
3196        //
3197        //  Pull out the match string, as a whole.
3198        //    We'll process the <tags> later.
3199        //
3200        quotedStuffMat.reset(testLine);
3201        if (quotedStuffMat.lookingAt(status)) {
3202            matchString = quotedStuffMat.group(2, status);
3203            testLine.remove(0, quotedStuffMat.end(0, status));
3204        } else {
3205            errln("Bad match string at test file line %d", lineNum);
3206            continue;
3207        }
3208
3209        //
3210        //  The only thing left from the input line should be an optional trailing comment.
3211        //
3212        commentMat.reset(testLine);
3213        if (commentMat.lookingAt(status) == FALSE) {
3214            errln("Line %d: unexpected characters at end of test line.", lineNum);
3215            continue;
3216        }
3217
3218        //
3219        //  Run the test
3220        //
3221        regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3222    }
3223
3224    delete [] testData;
3225
3226}
3227
3228
3229
//---------------------------------------------------------------------------
//
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file; it is used in messages.
//         lineNumber is the source line in regextst.txt of the test.
//
//---------------------------------------------------------------------------
3240
3241
3242//  Set a value into a UVector at position specified by a decimal number in
3243//   a UnicodeString.   This is a utility function needed by the actual test function,
3244//   which follows.
3245static void set(UVector &vec, int32_t val, UnicodeString index) {
3246    UErrorCode  status=U_ZERO_ERROR;
3247    int32_t  idx = 0;
3248    for (int32_t i=0; i<index.length(); i++) {
3249        int32_t d=u_charDigitValue(index.charAt(i));
3250        if (d<0) {return;}
3251        idx = idx*10 + d;
3252    }
3253    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3254    vec.setElementAt(val, idx);
3255}
3256
3257static void setInt(UVector &vec, int32_t val, int32_t idx) {
3258    UErrorCode  status=U_ZERO_ERROR;
3259    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3260    vec.setElementAt(val, idx);
3261}
3262
3263static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3264{
3265    UBool couldFind = TRUE;
3266    UTEXT_SETNATIVEINDEX(utext, 0);
3267    int32_t i = 0;
3268    while (i < unistrOffset) {
3269        UChar32 c = UTEXT_NEXT32(utext);
3270        if (c != U_SENTINEL) {
3271            i += U16_LENGTH(c);
3272        } else {
3273            couldFind = FALSE;
3274            break;
3275        }
3276    }
3277    nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3278    return couldFind;
3279}
3280
3281
3282void RegexTest::regex_find(const UnicodeString &pattern,
3283                           const UnicodeString &flags,
3284                           const UnicodeString &inputString,
3285                           const char *srcPath,
3286                           int32_t line) {
3287    UnicodeString       unEscapedInput;
3288    UnicodeString       deTaggedInput;
3289
3290    int32_t             patternUTF8Length,      inputUTF8Length;
3291    char                *patternChars  = NULL, *inputChars = NULL;
3292    UText               patternText    = UTEXT_INITIALIZER;
3293    UText               inputText      = UTEXT_INITIALIZER;
3294    UConverter          *UTF8Converter = NULL;
3295
3296    UErrorCode          status         = U_ZERO_ERROR;
3297    UParseError         pe;
3298    RegexPattern        *parsePat      = NULL;
3299    RegexMatcher        *parseMatcher  = NULL;
3300    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3301    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3302    UVector             groupStarts(status);
3303    UVector             groupEnds(status);
3304    UVector             groupStartsUTF8(status);
3305    UVector             groupEndsUTF8(status);
3306    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3307    UBool               failed         = FALSE;
3308    int32_t             numFinds;
3309    int32_t             i;
3310    UBool               useMatchesFunc   = FALSE;
3311    UBool               useLookingAtFunc = FALSE;
3312    int32_t             regionStart      = -1;
3313    int32_t             regionEnd        = -1;
3314    int32_t             regionStartUTF8  = -1;
3315    int32_t             regionEndUTF8    = -1;
3316
3317
3318    //
3319    //  Compile the caller's pattern
3320    //
3321    uint32_t bflags = 0;
3322    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3323        bflags |= UREGEX_CASE_INSENSITIVE;
3324    }
3325    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3326        bflags |= UREGEX_COMMENTS;
3327    }
3328    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3329        bflags |= UREGEX_DOTALL;
3330    }
3331    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3332        bflags |= UREGEX_MULTILINE;
3333    }
3334
3335    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3336        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3337    }
3338    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3339        bflags |= UREGEX_UNIX_LINES;
3340    }
3341    if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3342        bflags |= UREGEX_LITERAL;
3343    }
3344
3345
3346    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3347    if (status != U_ZERO_ERROR) {
3348        #if UCONFIG_NO_BREAK_ITERATION==1
3349        // 'v' test flag means that the test pattern should not compile if ICU was configured
3350        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3351        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3352            goto cleanupAndReturn;
3353        }
3354        #endif
3355        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3356            // Expected pattern compilation error.
3357            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3358                logln("Pattern Compile returns \"%s\"", u_errorName(status));
3359            }
3360            goto cleanupAndReturn;
3361        } else {
3362            // Unexpected pattern compilation error.
3363            dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3364            goto cleanupAndReturn;
3365        }
3366    }
3367
3368    UTF8Converter = ucnv_open("UTF8", &status);
3369    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3370
3371    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3372    status = U_ZERO_ERROR; // buffer overflow
3373    patternChars = new char[patternUTF8Length+1];
3374    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3375    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3376
3377    if (status == U_ZERO_ERROR) {
3378        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3379
3380        if (status != U_ZERO_ERROR) {
3381#if UCONFIG_NO_BREAK_ITERATION==1
3382            // 'v' test flag means that the test pattern should not compile if ICU was configured
3383            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3384            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3385                goto cleanupAndReturn;
3386            }
3387#endif
3388            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3389                // Expected pattern compilation error.
3390                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3391                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3392                }
3393                goto cleanupAndReturn;
3394            } else {
3395                // Unexpected pattern compilation error.
3396                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3397                goto cleanupAndReturn;
3398            }
3399        }
3400    }
3401
3402    if (UTF8Pattern == NULL) {
3403        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3404        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3405        status = U_ZERO_ERROR;
3406    }
3407
3408    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3409        callerPattern->dumpPattern();
3410    }
3411
3412    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3413        errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3414        goto cleanupAndReturn;
3415    }
3416
3417
3418    //
3419    // Number of times find() should be called on the test string, default to 1
3420    //
3421    numFinds = 1;
3422    for (i=2; i<=9; i++) {
3423        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3424            if (numFinds != 1) {
3425                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3426                goto cleanupAndReturn;
3427            }
3428            numFinds = i;
3429        }
3430    }
3431
3432    // 'M' flag.  Use matches() instead of find()
3433    if (flags.indexOf((UChar)0x4d) >= 0) {
3434        useMatchesFunc = TRUE;
3435    }
3436    if (flags.indexOf((UChar)0x4c) >= 0) {
3437        useLookingAtFunc = TRUE;
3438    }
3439
3440    //
3441    //  Find the tags in the input data, remove them, and record the group boundary
3442    //    positions.
3443    //
3444    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3445    REGEX_CHECK_STATUS_L(line);
3446
3447    unEscapedInput = inputString.unescape();
3448    parseMatcher = parsePat->matcher(unEscapedInput, status);
3449    REGEX_CHECK_STATUS_L(line);
3450    while(parseMatcher->find()) {
3451        parseMatcher->appendReplacement(deTaggedInput, "", status);
3452        REGEX_CHECK_STATUS;
3453        UnicodeString groupNum = parseMatcher->group(2, status);
3454        if (groupNum == "r") {
3455            // <r> or </r>, a region specification within the string
3456            if (parseMatcher->group(1, status) == "/") {
3457                regionEnd = deTaggedInput.length();
3458            } else {
3459                regionStart = deTaggedInput.length();
3460            }
3461        } else {
3462            // <digits> or </digits>, a group match boundary tag.
3463            if (parseMatcher->group(1, status) == "/") {
3464                set(groupEnds, deTaggedInput.length(), groupNum);
3465            } else {
3466                set(groupStarts, deTaggedInput.length(), groupNum);
3467            }
3468        }
3469    }
3470    parseMatcher->appendTail(deTaggedInput);
3471    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3472    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3473      errln("mismatched <r> tags");
3474      failed = TRUE;
3475      goto cleanupAndReturn;
3476    }
3477
3478    //
3479    //  Configure the matcher according to the flags specified with this test.
3480    //
3481    matcher = callerPattern->matcher(deTaggedInput, status);
3482    REGEX_CHECK_STATUS_L(line);
3483    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3484        matcher->setTrace(TRUE);
3485    }
3486
3487    if (UTF8Pattern != NULL) {
3488        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3489        status = U_ZERO_ERROR; // buffer overflow
3490        inputChars = new char[inputUTF8Length+1];
3491        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3492        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3493
3494        if (status == U_ZERO_ERROR) {
3495            UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3496            REGEX_CHECK_STATUS_L(line);
3497        }
3498
3499        if (UTF8Matcher == NULL) {
3500            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3501          logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3502            status = U_ZERO_ERROR;
3503        }
3504    }
3505
3506    //
3507    //  Generate native indices for UTF8 versions of region and capture group info
3508    //
3509    if (UTF8Matcher != NULL) {
3510        if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3511        if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3512
3513        //  Fill out the native index UVector info.
3514        //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3515        for (i=0; i<groupStarts.size(); i++) {
3516            int32_t  start = groupStarts.elementAti(i);
3517            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3518            if (start >= 0) {
3519                int32_t  startUTF8;
3520                if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3521                    errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3522                    failed = TRUE;
3523                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3524                }
3525                setInt(groupStartsUTF8, startUTF8, i);
3526            }
3527
3528            int32_t  end = groupEnds.elementAti(i);
3529            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3530            if (end >= 0) {
3531                int32_t  endUTF8;
3532                if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3533                    errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3534                    failed = TRUE;
3535                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3536                }
3537                setInt(groupEndsUTF8, endUTF8, i);
3538            }
3539        }
3540    }
3541
3542    if (regionStart>=0) {
3543       matcher->region(regionStart, regionEnd, status);
3544       REGEX_CHECK_STATUS_L(line);
3545       if (UTF8Matcher != NULL) {
3546           UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3547           REGEX_CHECK_STATUS_L(line);
3548       }
3549    }
3550    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3551        matcher->useAnchoringBounds(FALSE);
3552        if (UTF8Matcher != NULL) {
3553            UTF8Matcher->useAnchoringBounds(FALSE);
3554        }
3555    }
3556    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3557        matcher->useTransparentBounds(TRUE);
3558        if (UTF8Matcher != NULL) {
3559            UTF8Matcher->useTransparentBounds(TRUE);
3560        }
3561    }
3562
3563
3564
3565    //
3566    // Do a find on the de-tagged input using the caller's pattern
3567    //     TODO: error on count>1 and not find().
3568    //           error on both matches() and lookingAt().
3569    //
3570    for (i=0; i<numFinds; i++) {
3571        if (useMatchesFunc) {
3572            isMatch = matcher->matches(status);
3573            if (UTF8Matcher != NULL) {
3574               isUTF8Match = UTF8Matcher->matches(status);
3575            }
3576        } else  if (useLookingAtFunc) {
3577            isMatch = matcher->lookingAt(status);
3578            if (UTF8Matcher != NULL) {
3579                isUTF8Match = UTF8Matcher->lookingAt(status);
3580            }
3581        } else {
3582            isMatch = matcher->find();
3583            if (UTF8Matcher != NULL) {
3584                isUTF8Match = UTF8Matcher->find();
3585            }
3586        }
3587    }
3588    matcher->setTrace(FALSE);
3589    if (U_FAILURE(status)) {
3590        errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3591    }
3592
3593    //
3594    // Match up the groups from the find() with the groups from the tags
3595    //
3596
3597    // number of tags should match number of groups from find operation.
3598    // matcher->groupCount does not include group 0, the entire match, hence the +1.
3599    //   G option in test means that capture group data is not available in the
3600    //     expected results, so the check needs to be suppressed.
3601    if (isMatch == FALSE && groupStarts.size() != 0) {
3602        dataerrln("Error at line %d:  Match expected, but none found.", line);
3603        failed = TRUE;
3604        goto cleanupAndReturn;
3605    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3606        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3607        failed = TRUE;
3608        goto cleanupAndReturn;
3609    }
3610
3611    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3612        // Only check for match / no match.  Don't check capture groups.
3613        if (isMatch && groupStarts.size() == 0) {
3614            errln("Error at line %d:  No match expected, but one found.", line);
3615            failed = TRUE;
3616        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3617            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3618            failed = TRUE;
3619        }
3620        goto cleanupAndReturn;
3621    }
3622
3623    REGEX_CHECK_STATUS_L(line);
3624    for (i=0; i<=matcher->groupCount(); i++) {
3625        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3626        int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3627        if (matcher->start(i, status) != expectedStart) {
3628            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3629                line, i, expectedStart, matcher->start(i, status));
3630            failed = TRUE;
3631            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3632        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3633            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3634                  line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3635            failed = TRUE;
3636            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3637        }
3638
3639        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3640        int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3641        if (matcher->end(i, status) != expectedEnd) {
3642            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3643                line, i, expectedEnd, matcher->end(i, status));
3644            failed = TRUE;
3645            // Error on end position;  keep going; real error is probably yet to come as group
3646            //   end positions work from end of the input data towards the front.
3647        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3648            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3649                  line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3650            failed = TRUE;
3651            // Error on end position;  keep going; real error is probably yet to come as group
3652            //   end positions work from end of the input data towards the front.
3653        }
3654    }
3655    if ( matcher->groupCount()+1 < groupStarts.size()) {
3656        errln("Error at line %d: Expected %d capture groups, found %d.",
3657            line, groupStarts.size()-1, matcher->groupCount());
3658        failed = TRUE;
3659        }
3660    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3661        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3662              line, groupStarts.size()-1, UTF8Matcher->groupCount());
3663        failed = TRUE;
3664    }
3665
3666    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3667        matcher->requireEnd() == TRUE) {
3668        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3669        failed = TRUE;
3670    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3671        UTF8Matcher->requireEnd() == TRUE) {
3672        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3673        failed = TRUE;
3674    }
3675
3676    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3677        matcher->requireEnd() == FALSE) {
3678        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3679        failed = TRUE;
3680    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3681        UTF8Matcher->requireEnd() == FALSE) {
3682        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3683        failed = TRUE;
3684    }
3685
3686    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3687        matcher->hitEnd() == TRUE) {
3688        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3689        failed = TRUE;
3690    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3691               UTF8Matcher->hitEnd() == TRUE) {
3692        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3693        failed = TRUE;
3694    }
3695
3696    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3697        matcher->hitEnd() == FALSE) {
3698        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3699        failed = TRUE;
3700    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3701               UTF8Matcher->hitEnd() == FALSE) {
3702        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3703        failed = TRUE;
3704    }
3705
3706
3707cleanupAndReturn:
3708    if (failed) {
3709        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3710            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3711        // callerPattern->dump();
3712    }
3713    delete parseMatcher;
3714    delete parsePat;
3715    delete UTF8Matcher;
3716    delete UTF8Pattern;
3717    delete matcher;
3718    delete callerPattern;
3719
3720    utext_close(&inputText);
3721    delete[] inputChars;
3722    utext_close(&patternText);
3723    delete[] patternChars;
3724    ucnv_close(UTF8Converter);
3725}
3726
3727
3728
3729
3730//---------------------------------------------------------------------------
3731//
3732//      Errors     Check for error handling in patterns.
3733//
3734//---------------------------------------------------------------------------
void RegexTest::Errors() {
    // Each REGEX_ERR line below supplies a deliberately malformed pattern, two integers
    //   and an ICU error code.
    // NOTE(review): the integers look like the expected line and column of the reported
    //   error, and the code looks like the expected compile status.  Confirm against the
    //   REGEX_ERR macro, which is defined outside this section.

    // \escape sequences that aren't implemented yet.
    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);

    // Extra close paren
    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

    // Look-ahead, Look-behind
    //  TODO:  add tests for unbounded length look-behinds.
    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct

    // Attempt to use non-default flags
    //   The flag set includes UREGEX_CANON_EQ, and the compile is expected to
    //   report U_REGEX_UNIMPLEMENTED.
    {
        UParseError   pe;
        UErrorCode    status = U_ZERO_ERROR;
        int32_t       flags  = UREGEX_CANON_EQ |
                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
                               UREGEX_MULTILINE;
        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
        // Deleted unconditionally, in case the compile unexpectedly produced a pattern.
        delete pat1;
    }


    // Quantifiers are allowed only after something that can be quantified.
    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);   // pattern spans two lines
    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);

    // Mal-formed {min,max} quantifiers
    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
    REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

    // Ticket 5389
    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

    // Invalid Back Reference \0
    //    For ICU 3.8 and earlier
    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
    //
    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);

}
3792
3793
3794//-------------------------------------------------------------------------------
3795//
3796//  Read a text data file, convert it to UChars, and return the data
3797//    in one big UChar * buffer, which the caller must delete.
3798//
3799//--------------------------------------------------------------------------------
3800UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3801                                     const char *defEncoding, UErrorCode &status) {
3802    UChar       *retPtr  = NULL;
3803    char        *fileBuf = NULL;
3804    UConverter* conv     = NULL;
3805    FILE        *f       = NULL;
3806
3807    ulen = 0;
3808    if (U_FAILURE(status)) {
3809        return retPtr;
3810    }
3811
3812    //
3813    //  Open the file.
3814    //
3815    f = fopen(fileName, "rb");
3816    if (f == 0) {
3817        dataerrln("Error opening test data file %s\n", fileName);
3818        status = U_FILE_ACCESS_ERROR;
3819        return NULL;
3820    }
3821    //
3822    //  Read it in
3823    //
3824    int32_t            fileSize;
3825    int32_t            amt_read;
3826
3827    fseek( f, 0, SEEK_END);
3828    fileSize = ftell(f);
3829    fileBuf = new char[fileSize];
3830    fseek(f, 0, SEEK_SET);
3831    amt_read = fread(fileBuf, 1, fileSize, f);
3832    if (amt_read != fileSize || fileSize <= 0) {
3833        errln("Error reading test data file.");
3834        goto cleanUpAndReturn;
3835    }
3836
3837    //
3838    // Look for a Unicode Signature (BOM) on the data just read
3839    //
3840    int32_t        signatureLength;
3841    const char *   fileBufC;
3842    const char*    encoding;
3843
3844    fileBufC = fileBuf;
3845    encoding = ucnv_detectUnicodeSignature(
3846        fileBuf, fileSize, &signatureLength, &status);
3847    if(encoding!=NULL ){
3848        fileBufC  += signatureLength;
3849        fileSize  -= signatureLength;
3850    } else {
3851        encoding = defEncoding;
3852        if (strcmp(encoding, "utf-8") == 0) {
3853            errln("file %s is missing its BOM", fileName);
3854        }
3855    }
3856
3857    //
3858    // Open a converter to take the rule file to UTF-16
3859    //
3860    conv = ucnv_open(encoding, &status);
3861    if (U_FAILURE(status)) {
3862        goto cleanUpAndReturn;
3863    }
3864
3865    //
3866    // Convert the rules to UChar.
3867    //  Preflight first to determine required buffer size.
3868    //
3869    ulen = ucnv_toUChars(conv,
3870        NULL,           //  dest,
3871        0,              //  destCapacity,
3872        fileBufC,
3873        fileSize,
3874        &status);
3875    if (status == U_BUFFER_OVERFLOW_ERROR) {
3876        // Buffer Overflow is expected from the preflight operation.
3877        status = U_ZERO_ERROR;
3878
3879        retPtr = new UChar[ulen+1];
3880        ucnv_toUChars(conv,
3881            retPtr,       //  dest,
3882            ulen+1,
3883            fileBufC,
3884            fileSize,
3885            &status);
3886    }
3887
3888cleanUpAndReturn:
3889    fclose(f);
3890    delete[] fileBuf;
3891    ucnv_close(conv);
3892    if (U_FAILURE(status)) {
3893        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3894        delete []retPtr;
3895        retPtr = 0;
3896        ulen   = 0;
3897    };
3898    return retPtr;
3899}
3900
3901
3902//-------------------------------------------------------------------------------
3903//
3904//   PerlTests  - Run Perl's regular expression tests
3905//                The input file for this test is re_tests, the standard regular
3906//                expression test data distributed with the Perl source code.
3907//
3908//                Here is Perl's description of the test data file:
3909//
3910//        # The tests are in a separate file 't/op/re_tests'.
3911//        # Each line in that file is a separate test.
3912//        # There are five columns, separated by tabs.
3913//        #
3914//        # Column 1 contains the pattern, optionally enclosed in C<''>.
3915//        # Modifiers can be put after the closing C<'>.
3916//        #
3917//        # Column 2 contains the string to be matched.
3918//        #
3919//        # Column 3 contains the expected result:
3920//        #     y   expect a match
3921//        #     n   expect no match
3922//        #     c   expect an error
3923//        # B   test exposes a known bug in Perl, should be skipped
3924//        # b   test exposes a known bug in Perl, should be skipped if noamp
3925//        #
3926//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3927//        #
3928//        # Column 4 contains a string, usually C<$&>.
3929//        #
3930//        # Column 5 contains the expected result of double-quote
3931//        # interpolating that string after the match, or start of error message.
3932//        #
3933//        # Column 6, if present, contains a reason why the test is skipped.
3934//        # This is printed with "skipped", for harness to pick up.
3935//        #
3936//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3937//        #
3938//        # If you want to add a regular expression test that can't be expressed
3939//        # in this format, don't add it here: put it in op/pat.t instead.
3940//
3941//        For ICU, if field 3 contains an 'i', the test will be skipped.
3942//        The test exposes is some known incompatibility between ICU and Perl regexps.
3943//        (The i is in addition to whatever was there before.)
3944//
3945//-------------------------------------------------------------------------------
3946void RegexTest::PerlTests() {
3947    char tdd[2048];
3948    const char *srcPath;
3949    UErrorCode  status = U_ZERO_ERROR;
3950    UParseError pe;
3951
3952    //
3953    //  Open and read the test data file.
3954    //
3955    srcPath=getPath(tdd, "re_tests.txt");
3956    if(srcPath==NULL) {
3957        return; /* something went wrong, error already output */
3958    }
3959
3960    int32_t    len;
3961    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3962    if (U_FAILURE(status)) {
3963        return; /* something went wrong, error already output */
3964    }
3965
3966    //
3967    //  Put the test data into a UnicodeString
3968    //
3969    UnicodeString testDataString(FALSE, testData, len);
3970
3971    //
3972    //  Regex to break the input file into lines, and strip the new lines.
3973    //     One line per match, capture group one is the desired data.
3974    //
3975    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3976    if (U_FAILURE(status)) {
3977        dataerrln("RegexPattern::compile() error");
3978        return;
3979    }
3980    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3981
3982    //
3983    //  Regex to split a test file line into fields.
3984    //    There are six fields, separated by tabs.
3985    //
3986    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3987
3988    //
3989    //  Regex to identify test patterns with flag settings, and to separate them.
3990    //    Test patterns with flags look like 'pattern'i
3991    //    Test patterns without flags are not quoted:   pattern
3992    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3993    //
3994    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3995    RegexMatcher* flagMat = flagPat->matcher(status);
3996
3997    //
3998    // The Perl tests reference several perl-isms, which are evaluated/substituted
3999    //   in the test data.  Not being perl, this must be done explicitly.  Here
4000    //   are string constants and REs for these constructs.
4001    //
4002    UnicodeString nulnulSrc("${nulnul}");
4003    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4004    nulnul = nulnul.unescape();
4005
4006    UnicodeString ffffSrc("${ffff}");
4007    UnicodeString ffff("\\uffff", -1, US_INV);
4008    ffff = ffff.unescape();
4009
4010    //  regexp for $-[0], $+[2], etc.
4011    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4012    RegexMatcher *groupsMat = groupsPat->matcher(status);
4013
4014    //  regexp for $0, $1, $2, etc.
4015    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4016    RegexMatcher *cgMat = cgPat->matcher(status);
4017
4018
4019    //
4020    // Main Loop for the Perl Tests, runs once per line from the
4021    //   test data file.
4022    //
4023    int32_t  lineNum = 0;
4024    int32_t  skippedUnimplementedCount = 0;
4025    while (lineMat->find()) {
4026        lineNum++;
4027
4028        //
4029        //  Get a line, break it into its fields, do the Perl
4030        //    variable substitutions.
4031        //
4032        UnicodeString line = lineMat->group(1, status);
4033        UnicodeString fields[7];
4034        fieldPat->split(line, fields, 7, status);
4035
4036        flagMat->reset(fields[0]);
4037        flagMat->matches(status);
4038        UnicodeString pattern  = flagMat->group(2, status);
4039        pattern.findAndReplace("${bang}", "!");
4040        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4041        pattern.findAndReplace(ffffSrc, ffff);
4042
4043        //
4044        //  Identify patterns that include match flag settings,
4045        //    split off the flags, remove the extra quotes.
4046        //
4047        UnicodeString flagStr = flagMat->group(3, status);
4048        if (U_FAILURE(status)) {
4049            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4050            return;
4051        }
4052        int32_t flags = 0;
4053        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4054        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4055        const UChar UChar_m = 0x6d;
4056        const UChar UChar_x = 0x78;
4057        const UChar UChar_y = 0x79;
4058        if (flagStr.indexOf(UChar_i) != -1) {
4059            flags |= UREGEX_CASE_INSENSITIVE;
4060        }
4061        if (flagStr.indexOf(UChar_m) != -1) {
4062            flags |= UREGEX_MULTILINE;
4063        }
4064        if (flagStr.indexOf(UChar_x) != -1) {
4065            flags |= UREGEX_COMMENTS;
4066        }
4067
4068        //
4069        // Compile the test pattern.
4070        //
4071        status = U_ZERO_ERROR;
4072        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4073        if (status == U_REGEX_UNIMPLEMENTED) {
4074            //
4075            // Test of a feature that is planned for ICU, but not yet implemented.
4076            //   skip the test.
4077            skippedUnimplementedCount++;
4078            delete testPat;
4079            status = U_ZERO_ERROR;
4080            continue;
4081        }
4082
4083        if (U_FAILURE(status)) {
4084            // Some tests are supposed to generate errors.
4085            //   Only report an error for tests that are supposed to succeed.
4086            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4087                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4088            {
4089                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4090            }
4091            status = U_ZERO_ERROR;
4092            delete testPat;
4093            continue;
4094        }
4095
4096        if (fields[2].indexOf(UChar_i) >= 0) {
4097            // ICU should skip this test.
4098            delete testPat;
4099            continue;
4100        }
4101
4102        if (fields[2].indexOf(UChar_c) >= 0) {
4103            // This pattern should have caused a compilation error, but didn't/
4104            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4105            delete testPat;
4106            continue;
4107        }
4108
4109        //
4110        // replace the Perl variables that appear in some of the
4111        //   match data strings.
4112        //
4113        UnicodeString matchString = fields[1];
4114        matchString.findAndReplace(nulnulSrc, nulnul);
4115        matchString.findAndReplace(ffffSrc,   ffff);
4116
4117        // Replace any \n in the match string with an actual new-line char.
4118        //  Don't do full unescape, as this unescapes more than Perl does, which
4119        //  causes other spurious failures in the tests.
4120        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4121
4122
4123
4124        //
4125        // Run the test, check for expected match/don't match result.
4126        //
4127        RegexMatcher *testMat = testPat->matcher(matchString, status);
4128        UBool found = testMat->find();
4129        UBool expected = FALSE;
4130        if (fields[2].indexOf(UChar_y) >=0) {
4131            expected = TRUE;
4132        }
4133        if (expected != found) {
4134            errln("line %d: Expected %smatch, got %smatch",
4135                lineNum, expected?"":"no ", found?"":"no " );
4136            continue;
4137        }
4138
4139        // Don't try to check expected results if there is no match.
4140        //   (Some have stuff in the expected fields)
4141        if (!found) {
4142            delete testMat;
4143            delete testPat;
4144            continue;
4145        }
4146
4147        //
4148        // Interpret the Perl expression from the fourth field of the data file,
4149        // building up an ICU string from the results of the ICU match.
4150        //   The Perl expression will contain references to the results of
4151        //     a regex match, including the matched string, capture group strings,
4152        //     group starting and ending indicies, etc.
4153        //
4154        UnicodeString resultString;
4155        UnicodeString perlExpr = fields[3];
4156#if SUPPORT_MUTATING_INPUT_STRING
4157        groupsMat->reset(perlExpr);
4158        cgMat->reset(perlExpr);
4159#endif
4160
4161        while (perlExpr.length() > 0) {
4162#if !SUPPORT_MUTATING_INPUT_STRING
4163            //  Perferred usage.  Reset after any modification to input string.
4164            groupsMat->reset(perlExpr);
4165            cgMat->reset(perlExpr);
4166#endif
4167
4168            if (perlExpr.startsWith("$&")) {
4169                resultString.append(testMat->group(status));
4170                perlExpr.remove(0, 2);
4171            }
4172
4173            else if (groupsMat->lookingAt(status)) {
4174                // $-[0]   $+[2]  etc.
4175                UnicodeString digitString = groupsMat->group(2, status);
4176                int32_t t = 0;
4177                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4178                UnicodeString plusOrMinus = groupsMat->group(1, status);
4179                int32_t matchPosition;
4180                if (plusOrMinus.compare("+") == 0) {
4181                    matchPosition = testMat->end(groupNum, status);
4182                } else {
4183                    matchPosition = testMat->start(groupNum, status);
4184                }
4185                if (matchPosition != -1) {
4186                    ICU_Utility::appendNumber(resultString, matchPosition);
4187                }
4188                perlExpr.remove(0, groupsMat->end(status));
4189            }
4190
4191            else if (cgMat->lookingAt(status)) {
4192                // $1, $2, $3, etc.
4193                UnicodeString digitString = cgMat->group(1, status);
4194                int32_t t = 0;
4195                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4196                if (U_SUCCESS(status)) {
4197                    resultString.append(testMat->group(groupNum, status));
4198                    status = U_ZERO_ERROR;
4199                }
4200                perlExpr.remove(0, cgMat->end(status));
4201            }
4202
4203            else if (perlExpr.startsWith("@-")) {
4204                int32_t i;
4205                for (i=0; i<=testMat->groupCount(); i++) {
4206                    if (i>0) {
4207                        resultString.append(" ");
4208                    }
4209                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4210                }
4211                perlExpr.remove(0, 2);
4212            }
4213
4214            else if (perlExpr.startsWith("@+")) {
4215                int32_t i;
4216                for (i=0; i<=testMat->groupCount(); i++) {
4217                    if (i>0) {
4218                        resultString.append(" ");
4219                    }
4220                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4221                }
4222                perlExpr.remove(0, 2);
4223            }
4224
4225            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4226                                                     //           or as an escaped sequence (e.g. \n)
4227                if (perlExpr.length() > 1) {
4228                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4229                }
4230                UChar c = perlExpr.charAt(0);
4231                switch (c) {
4232                case 'n':   c = '\n'; break;
4233                // add any other escape sequences that show up in the test expected results.
4234                }
4235                resultString.append(c);
4236                perlExpr.remove(0, 1);
4237            }
4238
4239            else  {
4240                // Any characters from the perl expression that we don't explicitly
4241                //  recognize before here are assumed to be literals and copied
4242                //  as-is to the expected results.
4243                resultString.append(perlExpr.charAt(0));
4244                perlExpr.remove(0, 1);
4245            }
4246
4247            if (U_FAILURE(status)) {
4248                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4249                break;
4250            }
4251        }
4252
4253        //
4254        // Expected Results Compare
4255        //
4256        UnicodeString expectedS(fields[4]);
4257        expectedS.findAndReplace(nulnulSrc, nulnul);
4258        expectedS.findAndReplace(ffffSrc,   ffff);
4259        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4260
4261
4262        if (expectedS.compare(resultString) != 0) {
4263            err("Line %d: Incorrect perl expression results.", lineNum);
4264            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4265        }
4266
4267        delete testMat;
4268        delete testPat;
4269    }
4270
4271    //
4272    // All done.  Clean up allocated stuff.
4273    //
4274    delete cgMat;
4275    delete cgPat;
4276
4277    delete groupsMat;
4278    delete groupsPat;
4279
4280    delete flagMat;
4281    delete flagPat;
4282
4283    delete lineMat;
4284    delete linePat;
4285
4286    delete fieldPat;
4287    delete [] testData;
4288
4289
4290    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4291
4292}
4293
4294
4295//-------------------------------------------------------------------------------
4296//
4297//   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4298//                  (instead of using UnicodeStrings) to test the alternate engine.
4299//                  The input file for this test is re_tests, the standard regular
4300//                  expression test data distributed with the Perl source code.
4301//                  See PerlTests() for more information.
4302//
4303//-------------------------------------------------------------------------------
4304void RegexTest::PerlTestsUTF8() {
4305    char tdd[2048];
4306    const char *srcPath;
4307    UErrorCode  status = U_ZERO_ERROR;
4308    UParseError pe;
4309    LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4310    UText       patternText = UTEXT_INITIALIZER;
4311    char       *patternChars = NULL;
4312    int32_t     patternLength;
4313    int32_t     patternCapacity = 0;
4314    UText       inputText = UTEXT_INITIALIZER;
4315    char       *inputChars = NULL;
4316    int32_t     inputLength;
4317    int32_t     inputCapacity = 0;
4318
4319    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4320
4321    //
4322    //  Open and read the test data file.
4323    //
4324    srcPath=getPath(tdd, "re_tests.txt");
4325    if(srcPath==NULL) {
4326        return; /* something went wrong, error already output */
4327    }
4328
4329    int32_t    len;
4330    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4331    if (U_FAILURE(status)) {
4332        return; /* something went wrong, error already output */
4333    }
4334
4335    //
4336    //  Put the test data into a UnicodeString
4337    //
4338    UnicodeString testDataString(FALSE, testData, len);
4339
4340    //
4341    //  Regex to break the input file into lines, and strip the new lines.
4342    //     One line per match, capture group one is the desired data.
4343    //
4344    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4345    if (U_FAILURE(status)) {
4346        dataerrln("RegexPattern::compile() error");
4347        return;
4348    }
4349    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4350
4351    //
4352    //  Regex to split a test file line into fields.
4353    //    There are six fields, separated by tabs.
4354    //
4355    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4356
4357    //
4358    //  Regex to identify test patterns with flag settings, and to separate them.
4359    //    Test patterns with flags look like 'pattern'i
4360    //    Test patterns without flags are not quoted:   pattern
4361    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4362    //
4363    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4364    RegexMatcher* flagMat = flagPat->matcher(status);
4365
4366    //
4367    // The Perl tests reference several perl-isms, which are evaluated/substituted
4368    //   in the test data.  Not being perl, this must be done explicitly.  Here
4369    //   are string constants and REs for these constructs.
4370    //
4371    UnicodeString nulnulSrc("${nulnul}");
4372    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4373    nulnul = nulnul.unescape();
4374
4375    UnicodeString ffffSrc("${ffff}");
4376    UnicodeString ffff("\\uffff", -1, US_INV);
4377    ffff = ffff.unescape();
4378
4379    //  regexp for $-[0], $+[2], etc.
4380    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4381    RegexMatcher *groupsMat = groupsPat->matcher(status);
4382
4383    //  regexp for $0, $1, $2, etc.
4384    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4385    RegexMatcher *cgMat = cgPat->matcher(status);
4386
4387
4388    //
4389    // Main Loop for the Perl Tests, runs once per line from the
4390    //   test data file.
4391    //
4392    int32_t  lineNum = 0;
4393    int32_t  skippedUnimplementedCount = 0;
4394    while (lineMat->find()) {
4395        lineNum++;
4396
4397        //
4398        //  Get a line, break it into its fields, do the Perl
4399        //    variable substitutions.
4400        //
4401        UnicodeString line = lineMat->group(1, status);
4402        UnicodeString fields[7];
4403        fieldPat->split(line, fields, 7, status);
4404
4405        flagMat->reset(fields[0]);
4406        flagMat->matches(status);
4407        UnicodeString pattern  = flagMat->group(2, status);
4408        pattern.findAndReplace("${bang}", "!");
4409        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4410        pattern.findAndReplace(ffffSrc, ffff);
4411
4412        //
4413        //  Identify patterns that include match flag settings,
4414        //    split off the flags, remove the extra quotes.
4415        //
4416        UnicodeString flagStr = flagMat->group(3, status);
4417        if (U_FAILURE(status)) {
4418            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4419            return;
4420        }
4421        int32_t flags = 0;
4422        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4423        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4424        const UChar UChar_m = 0x6d;
4425        const UChar UChar_x = 0x78;
4426        const UChar UChar_y = 0x79;
4427        if (flagStr.indexOf(UChar_i) != -1) {
4428            flags |= UREGEX_CASE_INSENSITIVE;
4429        }
4430        if (flagStr.indexOf(UChar_m) != -1) {
4431            flags |= UREGEX_MULTILINE;
4432        }
4433        if (flagStr.indexOf(UChar_x) != -1) {
4434            flags |= UREGEX_COMMENTS;
4435        }
4436
4437        //
4438        // Put the pattern in a UTF-8 UText
4439        //
4440        status = U_ZERO_ERROR;
4441        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4442        if (status == U_BUFFER_OVERFLOW_ERROR) {
4443            status = U_ZERO_ERROR;
4444            delete[] patternChars;
4445            patternCapacity = patternLength + 1;
4446            patternChars = new char[patternCapacity];
4447            pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4448        }
4449        utext_openUTF8(&patternText, patternChars, patternLength, &status);
4450
4451        //
4452        // Compile the test pattern.
4453        //
4454        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4455        if (status == U_REGEX_UNIMPLEMENTED) {
4456            //
4457            // Test of a feature that is planned for ICU, but not yet implemented.
4458            //   skip the test.
4459            skippedUnimplementedCount++;
4460            delete testPat;
4461            status = U_ZERO_ERROR;
4462            continue;
4463        }
4464
4465        if (U_FAILURE(status)) {
4466            // Some tests are supposed to generate errors.
4467            //   Only report an error for tests that are supposed to succeed.
4468            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4469                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4470            {
4471                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4472            }
4473            status = U_ZERO_ERROR;
4474            delete testPat;
4475            continue;
4476        }
4477
4478        if (fields[2].indexOf(UChar_i) >= 0) {
4479            // ICU should skip this test.
4480            delete testPat;
4481            continue;
4482        }
4483
4484        if (fields[2].indexOf(UChar_c) >= 0) {
4485            // This pattern should have caused a compilation error, but didn't/
4486            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4487            delete testPat;
4488            continue;
4489        }
4490
4491
4492        //
4493        // replace the Perl variables that appear in some of the
4494        //   match data strings.
4495        //
4496        UnicodeString matchString = fields[1];
4497        matchString.findAndReplace(nulnulSrc, nulnul);
4498        matchString.findAndReplace(ffffSrc,   ffff);
4499
4500        // Replace any \n in the match string with an actual new-line char.
4501        //  Don't do full unescape, as this unescapes more than Perl does, which
4502        //  causes other spurious failures in the tests.
4503        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4504
4505        //
4506        // Put the input in a UTF-8 UText
4507        //
4508        status = U_ZERO_ERROR;
4509        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4510        if (status == U_BUFFER_OVERFLOW_ERROR) {
4511            status = U_ZERO_ERROR;
4512            delete[] inputChars;
4513            inputCapacity = inputLength + 1;
4514            inputChars = new char[inputCapacity];
4515            matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4516        }
4517        utext_openUTF8(&inputText, inputChars, inputLength, &status);
4518
4519        //
4520        // Run the test, check for expected match/don't match result.
4521        //
4522        RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4523        UBool found = testMat->find();
4524        UBool expected = FALSE;
4525        if (fields[2].indexOf(UChar_y) >=0) {
4526            expected = TRUE;
4527        }
4528        if (expected != found) {
4529            errln("line %d: Expected %smatch, got %smatch",
4530                lineNum, expected?"":"no ", found?"":"no " );
4531            continue;
4532        }
4533
4534        // Don't try to check expected results if there is no match.
4535        //   (Some have stuff in the expected fields)
4536        if (!found) {
4537            delete testMat;
4538            delete testPat;
4539            continue;
4540        }
4541
4542        //
4543        // Interpret the Perl expression from the fourth field of the data file,
4544        // building up an ICU string from the results of the ICU match.
4545        //   The Perl expression will contain references to the results of
4546        //     a regex match, including the matched string, capture group strings,
4547        //     group starting and ending indicies, etc.
4548        //
4549        UnicodeString resultString;
4550        UnicodeString perlExpr = fields[3];
4551
4552        while (perlExpr.length() > 0) {
4553            groupsMat->reset(perlExpr);
4554            cgMat->reset(perlExpr);
4555
4556            if (perlExpr.startsWith("$&")) {
4557                resultString.append(testMat->group(status));
4558                perlExpr.remove(0, 2);
4559            }
4560
4561            else if (groupsMat->lookingAt(status)) {
4562                // $-[0]   $+[2]  etc.
4563                UnicodeString digitString = groupsMat->group(2, status);
4564                int32_t t = 0;
4565                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4566                UnicodeString plusOrMinus = groupsMat->group(1, status);
4567                int32_t matchPosition;
4568                if (plusOrMinus.compare("+") == 0) {
4569                    matchPosition = testMat->end(groupNum, status);
4570                } else {
4571                    matchPosition = testMat->start(groupNum, status);
4572                }
4573                if (matchPosition != -1) {
4574                    ICU_Utility::appendNumber(resultString, matchPosition);
4575                }
4576                perlExpr.remove(0, groupsMat->end(status));
4577            }
4578
4579            else if (cgMat->lookingAt(status)) {
4580                // $1, $2, $3, etc.
4581                UnicodeString digitString = cgMat->group(1, status);
4582                int32_t t = 0;
4583                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4584                if (U_SUCCESS(status)) {
4585                    resultString.append(testMat->group(groupNum, status));
4586                    status = U_ZERO_ERROR;
4587                }
4588                perlExpr.remove(0, cgMat->end(status));
4589            }
4590
4591            else if (perlExpr.startsWith("@-")) {
4592                int32_t i;
4593                for (i=0; i<=testMat->groupCount(); i++) {
4594                    if (i>0) {
4595                        resultString.append(" ");
4596                    }
4597                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4598                }
4599                perlExpr.remove(0, 2);
4600            }
4601
4602            else if (perlExpr.startsWith("@+")) {
4603                int32_t i;
4604                for (i=0; i<=testMat->groupCount(); i++) {
4605                    if (i>0) {
4606                        resultString.append(" ");
4607                    }
4608                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4609                }
4610                perlExpr.remove(0, 2);
4611            }
4612
4613            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4614                                                     //           or as an escaped sequence (e.g. \n)
4615                if (perlExpr.length() > 1) {
4616                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4617                }
4618                UChar c = perlExpr.charAt(0);
4619                switch (c) {
4620                case 'n':   c = '\n'; break;
4621                // add any other escape sequences that show up in the test expected results.
4622                }
4623                resultString.append(c);
4624                perlExpr.remove(0, 1);
4625            }
4626
4627            else  {
4628                // Any characters from the perl expression that we don't explicitly
4629                //  recognize before here are assumed to be literals and copied
4630                //  as-is to the expected results.
4631                resultString.append(perlExpr.charAt(0));
4632                perlExpr.remove(0, 1);
4633            }
4634
4635            if (U_FAILURE(status)) {
4636                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4637                break;
4638            }
4639        }
4640
4641        //
4642        // Expected Results Compare
4643        //
4644        UnicodeString expectedS(fields[4]);
4645        expectedS.findAndReplace(nulnulSrc, nulnul);
4646        expectedS.findAndReplace(ffffSrc,   ffff);
4647        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4648
4649
4650        if (expectedS.compare(resultString) != 0) {
4651            err("Line %d: Incorrect perl expression results.", lineNum);
4652            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4653        }
4654
4655        delete testMat;
4656        delete testPat;
4657    }
4658
4659    //
4660    // All done.  Clean up allocated stuff.
4661    //
4662    delete cgMat;
4663    delete cgPat;
4664
4665    delete groupsMat;
4666    delete groupsPat;
4667
4668    delete flagMat;
4669    delete flagPat;
4670
4671    delete lineMat;
4672    delete linePat;
4673
4674    delete fieldPat;
4675    delete [] testData;
4676
4677    utext_close(&patternText);
4678    utext_close(&inputText);
4679
4680    delete [] patternChars;
4681    delete [] inputChars;
4682
4683
4684    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4685
4686}
4687
4688
4689//--------------------------------------------------------------
4690//
4691//  Bug6149   Verify limits to heap expansion for backtrack stack.
4692//             Use this pattern,
4693//                 "(a?){1,8000000}"
4694//             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4695//                   This test is likely to be fragile, as further optimizations stop
4696//                   more cases of pointless looping in the match engine.
4697//
4698//---------------------------------------------------------------
4699void RegexTest::Bug6149() {
4700    UnicodeString pattern("(a?){1,8000000}");
4701    UnicodeString s("xyz");
4702    uint32_t flags = 0;
4703    UErrorCode status = U_ZERO_ERROR;
4704
4705    RegexMatcher  matcher(pattern, s, flags, status);
4706    UBool result = false;
4707    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4708    REGEX_ASSERT(result == FALSE);
4709 }
4710
4711
4712//
4713//   Callbacks()    Test the callback function.
4714//                  When set, callbacks occur periodically during matching operations,
4715//                  giving the application code the ability to abort the operation
//                  before its normal completion.
4717//
4718
4719struct callBackContext {
4720    RegexTest        *test;
4721    int32_t          maxCalls;
4722    int32_t          numCalls;
4723    int32_t          lastSteps;
4724    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4725};
4726
4727U_CDECL_BEGIN
4728static UBool U_CALLCONV
4729testCallBackFn(const void *context, int32_t steps) {
4730    callBackContext  *info = (callBackContext *)context;
4731    if (info->lastSteps+1 != steps) {
4732        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4733    }
4734    info->lastSteps = steps;
4735    info->numCalls++;
4736    return (info->numCalls < info->maxCalls);
4737}
4738U_CDECL_END
4739
4740void RegexTest::Callbacks() {
4741   {
4742        // Getter returns NULLs if no callback has been set
4743
4744        //   The variables that the getter will fill in.
4745        //   Init to non-null values so that the action of the getter can be seen.
4746        const void          *returnedContext = &returnedContext;
4747        URegexMatchCallback *returnedFn = &testCallBackFn;
4748
4749        UErrorCode status = U_ZERO_ERROR;
4750        RegexMatcher matcher("x", 0, status);
4751        REGEX_CHECK_STATUS;
4752        matcher.getMatchCallback(returnedFn, returnedContext, status);
4753        REGEX_CHECK_STATUS;
4754        REGEX_ASSERT(returnedFn == NULL);
4755        REGEX_ASSERT(returnedContext == NULL);
4756    }
4757
4758   {
4759        // Set and Get work
4760        callBackContext cbInfo = {this, 0, 0, 0};
4761        const void          *returnedContext;
4762        URegexMatchCallback *returnedFn;
4763        UErrorCode status = U_ZERO_ERROR;
4764        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4765        REGEX_CHECK_STATUS;
4766        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4767        REGEX_CHECK_STATUS;
4768        matcher.getMatchCallback(returnedFn, returnedContext, status);
4769        REGEX_CHECK_STATUS;
4770        REGEX_ASSERT(returnedFn == testCallBackFn);
4771        REGEX_ASSERT(returnedContext == &cbInfo);
4772
4773        // A short-running match shouldn't invoke the callback
4774        status = U_ZERO_ERROR;
4775        cbInfo.reset(1);
4776        UnicodeString s = "xxx";
4777        matcher.reset(s);
4778        REGEX_ASSERT(matcher.matches(status));
4779        REGEX_CHECK_STATUS;
4780        REGEX_ASSERT(cbInfo.numCalls == 0);
4781
4782        // A medium-length match that runs long enough to invoke the
4783        //   callback, but not so long that the callback aborts it.
4784        status = U_ZERO_ERROR;
4785        cbInfo.reset(4);
4786        s = "aaaaaaaaaaaaaaaaaaab";
4787        matcher.reset(s);
4788        REGEX_ASSERT(matcher.matches(status)==FALSE);
4789        REGEX_CHECK_STATUS;
4790        REGEX_ASSERT(cbInfo.numCalls > 0);
4791
4792        // A longer running match that the callback function will abort.
4793        status = U_ZERO_ERROR;
4794        cbInfo.reset(4);
4795        s = "aaaaaaaaaaaaaaaaaaaaaaab";
4796        matcher.reset(s);
4797        REGEX_ASSERT(matcher.matches(status)==FALSE);
4798        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4799        REGEX_ASSERT(cbInfo.numCalls == 4);
4800    }
4801
4802
4803}
4804
4805
4806//
4807//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4811//
4812
4813struct progressCallBackContext {
4814    RegexTest        *test;
4815    int64_t          lastIndex;
4816    int32_t          maxCalls;
4817    int32_t          numCalls;
4818    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4819};
4820
4821U_CDECL_BEGIN
4822static UBool U_CALLCONV
4823testProgressCallBackFn(const void *context, int64_t matchIndex) {
4824    progressCallBackContext  *info = (progressCallBackContext *)context;
4825    info->numCalls++;
4826    info->lastIndex = matchIndex;
4827//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4828    return (info->numCalls < info->maxCalls);
4829}
4830U_CDECL_END
4831
4832void RegexTest::FindProgressCallbacks() {
4833   {
4834        // Getter returns NULLs if no callback has been set
4835
4836        //   The variables that the getter will fill in.
4837        //   Init to non-null values so that the action of the getter can be seen.
4838        const void                  *returnedContext = &returnedContext;
4839        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4840
4841        UErrorCode status = U_ZERO_ERROR;
4842        RegexMatcher matcher("x", 0, status);
4843        REGEX_CHECK_STATUS;
4844        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4845        REGEX_CHECK_STATUS;
4846        REGEX_ASSERT(returnedFn == NULL);
4847        REGEX_ASSERT(returnedContext == NULL);
4848    }
4849
4850   {
4851        // Set and Get work
4852        progressCallBackContext cbInfo = {this, 0, 0, 0};
4853        const void                  *returnedContext;
4854        URegexFindProgressCallback  *returnedFn;
4855        UErrorCode status = U_ZERO_ERROR;
4856        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4857        REGEX_CHECK_STATUS;
4858        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4859        REGEX_CHECK_STATUS;
4860        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4861        REGEX_CHECK_STATUS;
4862        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4863        REGEX_ASSERT(returnedContext == &cbInfo);
4864
4865        // A short-running match should NOT invoke the callback.
4866        status = U_ZERO_ERROR;
4867        cbInfo.reset(100);
4868        UnicodeString s = "abxxx";
4869        matcher.reset(s);
4870#if 0
4871        matcher.setTrace(TRUE);
4872#endif
4873        REGEX_ASSERT(matcher.find(0, status));
4874        REGEX_CHECK_STATUS;
4875        REGEX_ASSERT(cbInfo.numCalls == 0);
4876
4877        // A medium running match that causes matcher.find() to invoke our callback for each index.
4878        status = U_ZERO_ERROR;
4879        s = "aaaaaaaaaaaaaaaaaaab";
4880        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4881        matcher.reset(s);
4882        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4883        REGEX_CHECK_STATUS;
4884        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4885
4886        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4887        status = U_ZERO_ERROR;
4888        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4889        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4890        matcher.reset(s1);
4891        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4892        REGEX_CHECK_STATUS;
4893        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4894
4895#if 0
4896        // Now a match that will succeed, but after an interruption
4897        status = U_ZERO_ERROR;
4898        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4899        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4900        matcher.reset(s2);
4901        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4902        REGEX_CHECK_STATUS;
4903        // Now retry the match from where left off
4904        cbInfo.maxCalls = 100; //  No callback limit
4905        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4906        REGEX_CHECK_STATUS;
4907#endif
4908    }
4909
4910
4911}
4912
4913
4914//---------------------------------------------------------------------------
4915//
4916//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4917//                             UTexts. The pure-C implementation of UText
4918//                             has no mutable backing stores, but we can
4919//                             use UnicodeString here to test the functionality.
4920//
4921//---------------------------------------------------------------------------
4922void RegexTest::PreAllocatedUTextCAPI () {
4923    UErrorCode           status = U_ZERO_ERROR;
4924    URegularExpression  *re;
4925    UText                patternText = UTEXT_INITIALIZER;
4926    UnicodeString        buffer;
4927    UText                bufferText = UTEXT_INITIALIZER;
4928
4929    utext_openUnicodeString(&bufferText, &buffer, &status);
4930
4931    /*
4932     *  getText() and getUText()
4933     */
4934    {
4935        UText  text1 = UTEXT_INITIALIZER;
4936        UText  text2 = UTEXT_INITIALIZER;
4937        UChar  text2Chars[20];
4938        UText  *resultText;
4939
4940        status = U_ZERO_ERROR;
4941        regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4942        regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4943        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4944        utext_openUChars(&text2, text2Chars, -1, &status);
4945
4946        regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4947        re = uregex_openUText(&patternText, 0, NULL, &status);
4948
4949        /* First set a UText */
4950        uregex_setUText(re, &text1, &status);
4951        resultText = uregex_getUText(re, &bufferText, &status);
4952        REGEX_CHECK_STATUS;
4953        REGEX_ASSERT(resultText == &bufferText);
4954        utext_setNativeIndex(resultText, 0);
4955        utext_setNativeIndex(&text1, 0);
4956        REGEX_ASSERT(testUTextEqual(resultText, &text1));
4957
4958        resultText = uregex_getUText(re, &bufferText, &status);
4959        REGEX_CHECK_STATUS;
4960        REGEX_ASSERT(resultText == &bufferText);
4961        utext_setNativeIndex(resultText, 0);
4962        utext_setNativeIndex(&text1, 0);
4963        REGEX_ASSERT(testUTextEqual(resultText, &text1));
4964
4965        /* Then set a UChar * */
4966        uregex_setText(re, text2Chars, 7, &status);
4967        resultText = uregex_getUText(re, &bufferText, &status);
4968        REGEX_CHECK_STATUS;
4969        REGEX_ASSERT(resultText == &bufferText);
4970        utext_setNativeIndex(resultText, 0);
4971        utext_setNativeIndex(&text2, 0);
4972        REGEX_ASSERT(testUTextEqual(resultText, &text2));
4973
4974        uregex_close(re);
4975        utext_close(&text1);
4976        utext_close(&text2);
4977    }
4978
4979    /*
4980     *  group()
4981     */
4982    {
4983        UChar    text1[80];
4984        UText   *actual;
4985        UBool    result;
4986        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4987
4988        status = U_ZERO_ERROR;
4989        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4990        REGEX_CHECK_STATUS;
4991
4992        uregex_setText(re, text1, -1, &status);
4993        result = uregex_find(re, 0, &status);
4994        REGEX_ASSERT(result==TRUE);
4995
4996        /*  Capture Group 0, the full match.  Should succeed.  */
4997        status = U_ZERO_ERROR;
4998        actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4999        REGEX_CHECK_STATUS;
5000        REGEX_ASSERT(actual == &bufferText);
5001        REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
5002
5003        /*  Capture group #1.  Should succeed. */
5004        status = U_ZERO_ERROR;
5005        actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
5006        REGEX_CHECK_STATUS;
5007        REGEX_ASSERT(actual == &bufferText);
5008        REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
5009
5010        /*  Capture group out of range.  Error. */
5011        status = U_ZERO_ERROR;
5012        actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
5013        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5014        REGEX_ASSERT(actual == &bufferText);
5015
5016        uregex_close(re);
5017
5018    }
5019
5020    /*
5021     *  replaceFirst()
5022     */
5023    {
5024        UChar    text1[80];
5025        UChar    text2[80];
5026        UText    replText = UTEXT_INITIALIZER;
5027        UText   *result;
5028
5029        status = U_ZERO_ERROR;
5030        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5031        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5032        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5033
5034        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5035        REGEX_CHECK_STATUS;
5036
5037        /*  Normal case, with match */
5038        uregex_setText(re, text1, -1, &status);
5039        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5040        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5041        REGEX_CHECK_STATUS;
5042        REGEX_ASSERT(result == &bufferText);
5043        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5044
5045        /* No match.  Text should copy to output with no changes.  */
5046        uregex_setText(re, text2, -1, &status);
5047        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5048        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5049        REGEX_CHECK_STATUS;
5050        REGEX_ASSERT(result == &bufferText);
5051        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5052
5053        /* Unicode escapes */
5054        uregex_setText(re, text1, -1, &status);
5055        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5056        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5057        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5058        REGEX_CHECK_STATUS;
5059        REGEX_ASSERT(result == &bufferText);
5060        REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5061
5062        uregex_close(re);
5063        utext_close(&replText);
5064    }
5065
5066
5067    /*
5068     *  replaceAll()
5069     */
5070    {
5071        UChar    text1[80];
5072        UChar    text2[80];
5073        UText    replText = UTEXT_INITIALIZER;
5074        UText   *result;
5075
5076        status = U_ZERO_ERROR;
5077        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5078        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5079        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5080
5081        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5082        REGEX_CHECK_STATUS;
5083
5084        /*  Normal case, with match */
5085        uregex_setText(re, text1, -1, &status);
5086        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5087        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5088        REGEX_CHECK_STATUS;
5089        REGEX_ASSERT(result == &bufferText);
5090        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5091
5092        /* No match.  Text should copy to output with no changes.  */
5093        uregex_setText(re, text2, -1, &status);
5094        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5095        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5096        REGEX_CHECK_STATUS;
5097        REGEX_ASSERT(result == &bufferText);
5098        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5099
5100        uregex_close(re);
5101        utext_close(&replText);
5102    }
5103
5104
5105    /*
5106     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5107     *   so we don't need to test it here.
5108     */
5109
5110    utext_close(&bufferText);
5111    utext_close(&patternText);
5112}
5113
5114//--------------------------------------------------------------
5115//
5116//  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5117//
5118//---------------------------------------------------------------
5119void RegexTest::Bug7651() {
5120    UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5121    //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5122    //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5123    UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5124    UnicodeString s("#ff @abcd This is test");
5125    RegexPattern  *REPattern = NULL;
5126    RegexMatcher  *REMatcher = NULL;
5127    UErrorCode status = U_ZERO_ERROR;
5128    UParseError pe;
5129
5130    REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5131    REGEX_CHECK_STATUS;
5132    REMatcher = REPattern->matcher(s, status);
5133    REGEX_CHECK_STATUS;
5134    REGEX_ASSERT(REMatcher->find());
5135    REGEX_ASSERT(REMatcher->start(status) == 0);
5136    delete REPattern;
5137    delete REMatcher;
5138    status = U_ZERO_ERROR;
5139
5140    REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5141    REGEX_CHECK_STATUS;
5142    REMatcher = REPattern->matcher(s, status);
5143    REGEX_CHECK_STATUS;
5144    REGEX_ASSERT(REMatcher->find());
5145    REGEX_ASSERT(REMatcher->start(status) == 0);
5146    delete REPattern;
5147    delete REMatcher;
5148    status = U_ZERO_ERROR;
5149 }
5150
5151void RegexTest::Bug7740() {
5152    UErrorCode status = U_ZERO_ERROR;
5153    UnicodeString pattern = "(a)";
5154    UnicodeString text = "abcdef";
5155    RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5156    REGEX_CHECK_STATUS;
5157    REGEX_ASSERT(m->lookingAt(status));
5158    REGEX_CHECK_STATUS;
5159    status = U_ILLEGAL_ARGUMENT_ERROR;
5160    UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5161    REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5162    REGEX_ASSERT(s == "");
5163    delete m;
5164}
5165
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5167
5168void RegexTest::Bug8479() {
5169    UErrorCode status = U_ZERO_ERROR;
5170
5171    RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5172    REGEX_CHECK_STATUS;
5173    if (U_SUCCESS(status))
5174    {
5175        UnicodeString str;
5176        str.setToBogus();
5177        pMatcher->reset(str);
5178        status = U_ZERO_ERROR;
5179        pMatcher->matches(status);
5180        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5181        delete pMatcher;
5182    }
5183}
5184
5185
5186// Bug 7029
5187void RegexTest::Bug7029() {
5188    UErrorCode status = U_ZERO_ERROR;
5189
5190    RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5191    UnicodeString text = "abc.def";
5192    UnicodeString splits[10];
5193    REGEX_CHECK_STATUS;
5194    int32_t numFields = pMatcher->split(text, splits, 10, status);
5195    REGEX_CHECK_STATUS;
5196    REGEX_ASSERT(numFields == 8);
5197    delete pMatcher;
5198}
5199
5200// Bug 9283
//   This test is checking for the existence of any supplemental characters that case-fold
//   to a bmp character.
//
//   At the time of this writing there are none. If any should appear in a subsequent release
//   of Unicode, the code in regular expressions compilation that determines the longest
//   possible match for a literal string  will need to be enhanced.
5207//
5208//   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5209//   for details on what to do in case of a failure of this test.
5210//
5211void RegexTest::Bug9283() {
5212#if !UCONFIG_NO_NORMALIZATION
5213    UErrorCode status = U_ZERO_ERROR;
5214    UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5215    REGEX_CHECK_STATUS;
5216    int32_t index;
5217    UChar32 c;
5218    for (index=0; ; index++) {
5219        c = supplementalsWithCaseFolding.charAt(index);
5220        if (c == -1) {
5221            break;
5222        }
5223        UnicodeString cf = UnicodeString(c).foldCase();
5224        REGEX_ASSERT(cf.length() >= 2);
5225    }
5226#endif /* #if !UCONFIG_NO_NORMALIZATION */
5227}
5228
5229
5230void RegexTest::CheckInvBufSize() {
5231  if(inv_next>=INV_BUFSIZ) {
5232    errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5233          __FILE__, INV_BUFSIZ, inv_next);
5234  } else {
5235    logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5236  }
5237}
5238
5239
5240void RegexTest::Bug10459() {
5241    UErrorCode status = U_ZERO_ERROR;
5242    UnicodeString patternString("(txt)");
5243    UnicodeString txtString("txt");
5244
5245    UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5246    REGEX_CHECK_STATUS;
5247    UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5248    REGEX_CHECK_STATUS;
5249
5250    URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5251    REGEX_CHECK_STATUS;
5252
5253    uregex_setUText(icu_re, utext_txt, &status);
5254    REGEX_CHECK_STATUS;
5255
5256    // The bug was that calling uregex_group() before doing a matching operation
5257    //   was causing a segfault. Only for Regular Expressions created from UText.
5258    //   It should set an U_REGEX_INVALID_STATE.
5259
5260    UChar buf[100];
5261    int32_t len = uregex_group(icu_re, 0, buf, LENGTHOF(buf), &status);
5262    REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5263    REGEX_ASSERT(len == 0);
5264
5265    uregex_close(icu_re);
5266    utext_close(utext_pat);
5267    utext_close(utext_txt);
5268}
5269
5270#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5271
5272