1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2011, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8//   regextst.cpp
9//
10//      ICU Regular Expressions test, part of intltest.
11//
12
13/*
14     NOTE!!
15
16     PLEASE be careful about ASCII assumptions in this test.
17     This test is one of the worst repeat offenders.
18     If you have questions, contact someone on the ICU PMC
19     who has access to an EBCDIC system.
20
21 */
22
23#include "intltest.h"
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26#include "unicode/regex.h"
27#include "unicode/uchar.h"
28#include "unicode/ucnv.h"
29#include "unicode/ustring.h"
30#include "regextst.h"
31#include "uvector.h"
32#include "util.h"
33#include <stdlib.h>
34#include <string.h>
35#include <stdio.h>
36#include "cstring.h"
37#include "uinvchar.h"
38
39#define SUPPORT_MUTATING_INPUT_STRING   0
40
41//---------------------------------------------------------------------------
42//
43//  Test class boilerplate
44//
45//---------------------------------------------------------------------------
46RegexTest::RegexTest()
47{
48}
49
50
51RegexTest::~RegexTest()
52{
53}
54
55
56
57void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
58{
59    if (exec) logln("TestSuite RegexTest: ");
60    switch (index) {
61
62        case 0: name = "Basic";
63            if (exec) Basic();
64            break;
65        case 1: name = "API_Match";
66            if (exec) API_Match();
67            break;
68        case 2: name = "API_Replace";
69            if (exec) API_Replace();
70            break;
71        case 3: name = "API_Pattern";
72            if (exec) API_Pattern();
73            break;
74        case 4:
75#if !UCONFIG_NO_FILE_IO
76            name = "Extended";
77            if (exec) Extended();
78#else
79            name = "skip";
80#endif
81            break;
82        case 5: name = "Errors";
83            if (exec) Errors();
84            break;
85        case 6: name = "PerlTests";
86            if (exec) PerlTests();
87            break;
88        case 7: name = "Callbacks";
89            if (exec) Callbacks();
90            break;
91        case 8: name = "FindProgressCallbacks";
92            if (exec) FindProgressCallbacks();
93            break;
94        case 9: name = "Bug 6149";
95             if (exec) Bug6149();
96             break;
97        case 10: name = "UTextBasic";
98          if (exec) UTextBasic();
99          break;
100        case 11: name = "API_Match_UTF8";
101          if (exec) API_Match_UTF8();
102          break;
103        case 12: name = "API_Replace_UTF8";
104          if (exec) API_Replace_UTF8();
105          break;
106        case 13: name = "API_Pattern_UTF8";
107          if (exec) API_Pattern_UTF8();
108          break;
109        case 14: name = "PerlTestsUTF8";
110          if (exec) PerlTestsUTF8();
111          break;
112        case 15: name = "PreAllocatedUTextCAPI";
113          if (exec) PreAllocatedUTextCAPI();
114          break;
115        case 16: name = "Bug 7651";
116             if (exec) Bug7651();
117             break;
118        case 17: name = "Bug 7740";
119            if (exec) Bug7740();
120            break;
121        case 18: name = "Bug 8479";
122            if (exec) Bug8479();
123            break;
124        case 19: name = "Bug 7029";
125            if (exec) Bug7029();
126            break;
127        case 20: name = "CheckInvBufSize";
128            if (exec) CheckInvBufSize();
129            break;
130
131        default: name = "";
132            break; //needed to end loop
133    }
134}
135
136
137
/**
 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 * into ASCII.
 * @param ut      passed through to utext_openUTF8
 * @param inv     invariant-character text in the compilation codepage
 * @param length  length of inv in bytes, or -1 to have it measured with strlen()
 * @param status  ICU error code, passed through to utext_openUTF8
 * @return the value returned by utext_openUTF8; on non-ASCII platforms NULL is
 *         returned, with status set to U_MEMORY_ALLOCATION_ERROR, when the
 *         conversion buffer has no room left for the text
 * @see utext_openUTF8
 */
static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
144
145//---------------------------------------------------------------------------
146//
147//   Error Checking / Reporting macros used in all of the tests.
148//
149//---------------------------------------------------------------------------
150
151static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
152  int64_t oldIndex = utext_getNativeIndex(text);
153  utext_setNativeIndex(text, 0);
154  char *bufPtr = buf;
155  UChar32 c = utext_next32From(text, 0);
156  while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
157    if (0x000020<=c && c<0x00007e) {
158      *bufPtr = c;
159    } else {
160#if 0
161      sprintf(bufPtr,"U+%04X", c);
162      bufPtr+= strlen(bufPtr)-1;
163#else
164      *bufPtr = '%';
165#endif
166    }
167    bufPtr++;
168    c = UTEXT_NEXT32(text);
169  }
170  *bufPtr = 0;
171#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
172  char *ebuf = (char*)malloc(bufLen);
173  uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
174  uprv_strncpy(buf, ebuf, bufLen);
175  free((void*)ebuf);
176#endif
177  utext_setNativeIndex(text, oldIndex);
178}
179
180static inline UChar toHex(int32_t i) {
181    return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10)));
182}
183
184static UnicodeString& escape(const UnicodeString& s, UnicodeString& result) {
185    for (int32_t i=0; i<s.length(); ++i) {
186        UChar c = s[i];
187        if ((c <= (UChar)0x7F) && (c>0)) {
188            result += c;
189        } else {
190            result += (UChar)0x5c;
191            result += (UChar)0x75;
192            result += toHex((c >> 12) & 0xF);
193            result += toHex((c >>  8) & 0xF);
194            result += toHex((c >>  4) & 0xF);
195            result += toHex( c        & 0xF);
196        }
197    }
198    return result;
199}
200
201static char ASSERT_BUF[1024];
202
203static const char* extractToAssertBuf(const UnicodeString& message) {
204  if(message.length()==0) {
205    strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
206  } else {
207    UnicodeString buf;
208    escape(message, buf);
209    if(buf.length()==0) {
210      strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
211    } else {
212      buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
213      if(ASSERT_BUF[0]==0) {
214        ASSERT_BUF[0]=0;
215        for(int32_t i=0;i<buf.length();i++) {
216          UChar ch = buf[i];
217          sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
218        }
219      }
220    }
221  }
222  ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
223  return ASSERT_BUF;
224}
225
226
// REGEX_VERBOSE_TEXT(text): log the printable form of a UText (produced by
// utextToPrintable into a 200-char local buffer) together with the file, line
// and the spelling of the argument expression.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// REGEX_CHECK_STATUS: if the variable named `status` in the enclosing scope
// holds a failure code, report it with dataerrln and return from the
// enclosing function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// REGEX_ASSERT(expr): report an error with errln if expr compares equal to
// FALSE.  Execution continues afterwards; there is no return.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// REGEX_ASSERT_FAIL(expr, errcode): evaluate expr with a fresh local variable
// named `status` (initialized to U_ZERO_ERROR) in scope, then report with
// dataerrln if that status is not errcode.  The local shadows any `status`
// of the enclosing scope, so expr must use the name `status` for the check
// to see its result.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// REGEX_CHECK_STATUS_L(line): like REGEX_CHECK_STATUS, but reports with errln,
// includes the caller-supplied line number, prints the status as a number,
// and does not return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// REGEX_ASSERT_L(expr, line): like REGEX_ASSERT, but includes the
// caller-supplied line number and returns from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// REGEX_ASSERT_UNISTR(ustr,inv): report an error with errln unless ustr==inv.
// The message shows ustr as rendered by extractToAssertBuf, followed by inv.
#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
245
246/**
247 * @param expected expected text in UTF-8 (not platform) codepage
248 */
249void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
250    UErrorCode status = U_ZERO_ERROR;
251    UText expectedText = UTEXT_INITIALIZER;
252    utext_openUTF8(&expectedText, expected, -1, &status);
253    if(U_FAILURE(status)) {
254      errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
255      return;
256    }
257    if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
258      errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
259      return;
260    }
261    utext_setNativeIndex(actual, 0);
262    if (utext_compare(&expectedText, -1, actual, -1) != 0) {
263        char buf[201 /*21*/];
264        char expectedBuf[201];
265        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
266        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
267        errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
268    }
269    utext_close(&expectedText);
270}
271/**
272 * @param expected invariant (platform local text) input
273 */
274
275void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
276    UErrorCode status = U_ZERO_ERROR;
277    UText expectedText = UTEXT_INITIALIZER;
278    regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
279    if(U_FAILURE(status)) {
280      errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
281      return;
282    }
283    utext_setNativeIndex(actual, 0);
284    if (utext_compare(&expectedText, -1, actual, -1) != 0) {
285        char buf[201 /*21*/];
286        char expectedBuf[201];
287        utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
288        utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
289        errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
290    }
291    utext_close(&expectedText);
292}
293
/**
 * Check a UText against expected text given as UTF-8.
 * Forwards to assertUText, adding the caller's file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Check a UText against expected text given as invariant characters.
 * Forwards to assertUTextInvariant, adding the caller's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
302
/**
 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 * passed into utext_openUTF8. An error will be given if
 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems
 * (more precisely: it only exists when the charset family is not ASCII).
 */

#define INV_BUFSIZ 2048 /* increase this if too small */

// Running total of the bytes passed through regextst_openUTF8FromInvariant.
// It is advanced on every platform; where inv_buf exists it is also the
// offset of the next free byte in inv_buf.
static int32_t inv_next=0;

#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
static char inv_buf[INV_BUFSIZ];
#endif
316
317static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
318  if(length==-1) length=strlen(inv);
319#if U_CHARSET_FAMILY==U_ASCII_FAMILY
320  inv_next+=length;
321  return utext_openUTF8(ut, inv, length, status);
322#else
323  if(inv_next+length+1>INV_BUFSIZ) {
324    fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
325            __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
326    *status = U_MEMORY_ALLOCATION_ERROR;
327    return NULL;
328  }
329
330  unsigned char *buf = (unsigned char*)inv_buf+inv_next;
331  uprv_aestrncpy(buf, (const uint8_t*)inv, length);
332  inv_next+=length;
333
334#if 0
335  fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
336#endif
337
338  return utext_openUTF8(ut, (const char*)buf, length, status);
339#endif
340}
341
342
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation functions to simplify writing quick tests
//                       for the lookingAt() and matches() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Each use runs the test twice: once through doRegexLMTest and once
//          through doRegexLMTestUTF8.  Both receive the line number of the
//          macro invocation for use in failure messages.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
358
359UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
360    const UnicodeString pattern(pat, -1, US_INV);
361    const UnicodeString inputText(text, -1, US_INV);
362    UErrorCode          status  = U_ZERO_ERROR;
363    UParseError         pe;
364    RegexPattern        *REPattern = NULL;
365    RegexMatcher        *REMatcher = NULL;
366    UBool               retVal     = TRUE;
367
368    UnicodeString patString(pat, -1, US_INV);
369    REPattern = RegexPattern::compile(patString, 0, pe, status);
370    if (U_FAILURE(status)) {
371        dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
372            line, u_errorName(status));
373        return FALSE;
374    }
375    if (line==376) { RegexPatternDump(REPattern);}
376
377    UnicodeString inputString(inputText);
378    UnicodeString unEscapedInput = inputString.unescape();
379    REMatcher = REPattern->matcher(unEscapedInput, status);
380    if (U_FAILURE(status)) {
381        errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
382            line, u_errorName(status));
383        return FALSE;
384    }
385
386    UBool actualmatch;
387    actualmatch = REMatcher->lookingAt(status);
388    if (U_FAILURE(status)) {
389        errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
390            line, u_errorName(status));
391        retVal =  FALSE;
392    }
393    if (actualmatch != looking) {
394        errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
395        retVal = FALSE;
396    }
397
398    status = U_ZERO_ERROR;
399    actualmatch = REMatcher->matches(status);
400    if (U_FAILURE(status)) {
401        errln("RegexTest failure in matches() at line %d.  Status = %s\n",
402            line, u_errorName(status));
403        retVal = FALSE;
404    }
405    if (actualmatch != match) {
406        errln("RegexTest: wrong return from matches() at line %d.\n", line);
407        retVal = FALSE;
408    }
409
410    if (retVal == FALSE) {
411        RegexPatternDump(REPattern);
412    }
413
414    delete REPattern;
415    delete REMatcher;
416    return retVal;
417}
418
419
420UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
421    UText               pattern    = UTEXT_INITIALIZER;
422    int32_t             inputUTF8Length;
423    char                *textChars = NULL;
424    UText               inputText  = UTEXT_INITIALIZER;
425    UErrorCode          status     = U_ZERO_ERROR;
426    UParseError         pe;
427    RegexPattern        *REPattern = NULL;
428    RegexMatcher        *REMatcher = NULL;
429    UBool               retVal     = TRUE;
430
431    regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
432    REPattern = RegexPattern::compile(&pattern, 0, pe, status);
433    if (U_FAILURE(status)) {
434        dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
435            line, u_errorName(status));
436        return FALSE;
437    }
438
439    UnicodeString inputString(text, -1, US_INV);
440    UnicodeString unEscapedInput = inputString.unescape();
441    LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
442    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
443
444    inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
445    if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
446        // UTF-8 does not allow unpaired surrogates, so this could actually happen
447        logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
448        return TRUE; // not a failure of the Regex engine
449    }
450    status = U_ZERO_ERROR; // buffer overflow
451    textChars = new char[inputUTF8Length+1];
452    unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
453    utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
454
455    REMatcher = &REPattern->matcher(status)->reset(&inputText);
456    if (U_FAILURE(status)) {
457        errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
458            line, u_errorName(status));
459        return FALSE;
460    }
461
462    UBool actualmatch;
463    actualmatch = REMatcher->lookingAt(status);
464    if (U_FAILURE(status)) {
465        errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
466            line, u_errorName(status));
467        retVal =  FALSE;
468    }
469    if (actualmatch != looking) {
470        errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
471        retVal = FALSE;
472    }
473
474    status = U_ZERO_ERROR;
475    actualmatch = REMatcher->matches(status);
476    if (U_FAILURE(status)) {
477        errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
478            line, u_errorName(status));
479        retVal = FALSE;
480    }
481    if (actualmatch != match) {
482        errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
483        retVal = FALSE;
484    }
485
486    if (retVal == FALSE) {
487        RegexPatternDump(REPattern);
488    }
489
490    delete REPattern;
491    delete REMatcher;
492    utext_close(&inputText);
493    utext_close(&pattern);
494    delete[] textChars;
495    return retVal;
496}
497
498
499
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing
//                       regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       The macro forwards to regex_err(), adding the line number of the
//       invocation.  Note that the expansion already ends in a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
510
511void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
512                          UErrorCode expectedStatus, int32_t line) {
513    UnicodeString       pattern(pat);
514
515    UErrorCode          status         = U_ZERO_ERROR;
516    UParseError         pe;
517    RegexPattern        *callerPattern = NULL;
518
519    //
520    //  Compile the caller's pattern
521    //
522    UnicodeString patString(pat);
523    callerPattern = RegexPattern::compile(patString, 0, pe, status);
524    if (status != expectedStatus) {
525        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
526    } else {
527        if (status != U_ZERO_ERROR) {
528            if (pe.line != errLine || pe.offset != errCol) {
529                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
530                    line, errLine, errCol, pe.line, pe.offset);
531            }
532        }
533    }
534
535    delete callerPattern;
536
537    //
538    //  Compile again, using a UTF-8-based UText
539    //
540    UText patternText = UTEXT_INITIALIZER;
541    regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
542    callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
543    if (status != expectedStatus) {
544        dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
545    } else {
546        if (status != U_ZERO_ERROR) {
547            if (pe.line != errLine || pe.offset != errCol) {
548                errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
549                    line, errLine, errCol, pe.line, pe.offset);
550            }
551        }
552    }
553
554    delete callerPattern;
555    utext_close(&patternText);
556}
557
558
559
560//---------------------------------------------------------------------------
561//
562//      Basic      Check for basic functionality of regex pattern matching.
563//                 Avoid the use of REGEX_FIND test macro, which has
564//                 substantial dependencies on basic Regex functionality.
565//
566//---------------------------------------------------------------------------
567void RegexTest::Basic() {
568
569
570//
571// Debug - slide failing test cases early
572//
573#if 0
574    {
575        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
576        UParseError pe;
577        UErrorCode  status = U_ZERO_ERROR;
578        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
579        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
580        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
581    }
582    exit(1);
583#endif
584
585
586    //
587    // Pattern with parentheses
588    //
589    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
590    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
591    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
592
593    //
594    // Patterns with *
595    //
596    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
597    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
598    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
599    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
600    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
601
602    REGEX_TESTLM("a*", "",  TRUE, TRUE);
603    REGEX_TESTLM("a*", "b", TRUE, FALSE);
604
605
606    //
607    //  Patterns with "."
608    //
609    REGEX_TESTLM(".", "abc", TRUE, FALSE);
610    REGEX_TESTLM("...", "abc", TRUE, TRUE);
611    REGEX_TESTLM("....", "abc", FALSE, FALSE);
612    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
613    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
614    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
615    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
616    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
617
618    //
619    //  Patterns with * applied to chars at end of literal string
620    //
621    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
622    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
623
624    //
625    //  Supplemental chars match as single chars, not a pair of surrogates.
626    //
627    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
628    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
629    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
630
631
632    //
633    //  UnicodeSets in the pattern
634    //
635    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
636    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
637    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
638    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
639    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
640    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
641
642    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
643    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
644    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
645    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
646    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
647
648    //
649    //   OR operator in patterns
650    //
651    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
652    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
653    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
654    REGEX_TESTLM("a|b", "b", TRUE, TRUE);
655
656    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
657    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
658    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
659    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
660    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
661    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
662
663    //
664    //  +
665    //
666    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
667    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
668    REGEX_TESTLM("b+", "", FALSE, FALSE);
669    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
670    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
671    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
672
673    //
674    //   ?
675    //
676    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
677    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
678    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
679    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
680    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
681    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
682    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
683    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
684    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
685
686    //
687    //  Escape sequences that become single literal chars, handled internally
688    //   by ICU's Unescape.
689    //
690
691    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
692    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
693    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
694    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
695    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
696    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
697    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
698    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
699    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
700    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
701
702    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
703    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
704
705    // Escape of special chars in patterns
706    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
707}
708
709
710//---------------------------------------------------------------------------
711//
712//    UTextBasic   Check for quirks that are specific to the UText
713//                 implementation.
714//
715//---------------------------------------------------------------------------
716void RegexTest::UTextBasic() {
717    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
718    UErrorCode status = U_ZERO_ERROR;
719    UText pattern = UTEXT_INITIALIZER;
720    utext_openUTF8(&pattern, str_abc, -1, &status);
721    RegexMatcher matcher(&pattern, 0, status);
722    REGEX_CHECK_STATUS;
723
724    UText input = UTEXT_INITIALIZER;
725    utext_openUTF8(&input, str_abc, -1, &status);
726    REGEX_CHECK_STATUS;
727    matcher.reset(&input);
728    REGEX_CHECK_STATUS;
729    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
730
731    matcher.reset(matcher.inputText());
732    REGEX_CHECK_STATUS;
733    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
734
735    utext_close(&pattern);
736    utext_close(&input);
737}
738
739
740//---------------------------------------------------------------------------
741//
742//      API_Match   Test that the API for class RegexMatcher
743//                  is present and nominally working, but excluding functions
744//                  implementing replace operations.
745//
746//---------------------------------------------------------------------------
747void RegexTest::API_Match() {
748    UParseError         pe;
749    UErrorCode          status=U_ZERO_ERROR;
750    int32_t             flags = 0;
751
752    //
753    // Debug - slide failing test cases early
754    //
755#if 0
756    {
757    }
758    return;
759#endif
760
761    //
762    // Simple pattern compilation
763    //
764    {
765        UnicodeString       re("abc");
766        RegexPattern        *pat2;
767        pat2 = RegexPattern::compile(re, flags, pe, status);
768        REGEX_CHECK_STATUS;
769
770        UnicodeString inStr1 = "abcdef this is a test";
771        UnicodeString instr2 = "not abc";
772        UnicodeString empty  = "";
773
774
775        //
776        // Matcher creation and reset.
777        //
778        RegexMatcher *m1 = pat2->matcher(inStr1, status);
779        REGEX_CHECK_STATUS;
780        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
781        REGEX_ASSERT(m1->input() == inStr1);
782        m1->reset(instr2);
783        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
784        REGEX_ASSERT(m1->input() == instr2);
785        m1->reset(inStr1);
786        REGEX_ASSERT(m1->input() == inStr1);
787        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
788        m1->reset(empty);
789        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
790        REGEX_ASSERT(m1->input() == empty);
791        REGEX_ASSERT(&m1->pattern() == pat2);
792
793        //
794        //  reset(pos, status)
795        //
796        m1->reset(inStr1);
797        m1->reset(4, status);
798        REGEX_CHECK_STATUS;
799        REGEX_ASSERT(m1->input() == inStr1);
800        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
801
802        m1->reset(-1, status);
803        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
804        status = U_ZERO_ERROR;
805
806        m1->reset(0, status);
807        REGEX_CHECK_STATUS;
808        status = U_ZERO_ERROR;
809
810        int32_t len = m1->input().length();
811        m1->reset(len-1, status);
812        REGEX_CHECK_STATUS;
813        status = U_ZERO_ERROR;
814
815        m1->reset(len, status);
816        REGEX_CHECK_STATUS;
817        status = U_ZERO_ERROR;
818
819        m1->reset(len+1, status);
820        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
821        status = U_ZERO_ERROR;
822
823        //
824        // match(pos, status)
825        //
826        m1->reset(instr2);
827        REGEX_ASSERT(m1->matches(4, status) == TRUE);
828        m1->reset();
829        REGEX_ASSERT(m1->matches(3, status) == FALSE);
830        m1->reset();
831        REGEX_ASSERT(m1->matches(5, status) == FALSE);
832        REGEX_ASSERT(m1->matches(4, status) == TRUE);
833        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
834        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
835
836        // Match() at end of string should fail, but should not
837        //  be an error.
838        status = U_ZERO_ERROR;
839        len = m1->input().length();
840        REGEX_ASSERT(m1->matches(len, status) == FALSE);
841        REGEX_CHECK_STATUS;
842
843        // Match beyond end of string should fail with an error.
844        status = U_ZERO_ERROR;
845        REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
846        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
847
848        // Successful match at end of string.
849        {
850            status = U_ZERO_ERROR;
851            RegexMatcher m("A?", 0, status);  // will match zero length string.
852            REGEX_CHECK_STATUS;
853            m.reset(inStr1);
854            len = inStr1.length();
855            REGEX_ASSERT(m.matches(len, status) == TRUE);
856            REGEX_CHECK_STATUS;
857            m.reset(empty);
858            REGEX_ASSERT(m.matches(0, status) == TRUE);
859            REGEX_CHECK_STATUS;
860        }
861
862
863        //
864        // lookingAt(pos, status)
865        //
866        status = U_ZERO_ERROR;
867        m1->reset(instr2);  // "not abc"
868        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
869        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
870        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
871        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
872        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
873        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
874        status = U_ZERO_ERROR;
875        len = m1->input().length();
876        REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
877        REGEX_CHECK_STATUS;
878        REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
879        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
880
881        delete m1;
882        delete pat2;
883    }
884
885
886    //
887    // Capture Group.
888    //     RegexMatcher::start();
889    //     RegexMatcher::end();
890    //     RegexMatcher::groupCount();
891    //
892    {
893        int32_t             flags=0;
894        UParseError         pe;
895        UErrorCode          status=U_ZERO_ERROR;
896
897        UnicodeString       re("01(23(45)67)(.*)");
898        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
899        REGEX_CHECK_STATUS;
900        UnicodeString data = "0123456789";
901
902        RegexMatcher *matcher = pat->matcher(data, status);
903        REGEX_CHECK_STATUS;
904        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
905        static const int32_t matchStarts[] = {0,  2, 4, 8};
906        static const int32_t matchEnds[]   = {10, 8, 6, 10};
907        int32_t i;
908        for (i=0; i<4; i++) {
909            int32_t actualStart = matcher->start(i, status);
910            REGEX_CHECK_STATUS;
911            if (actualStart != matchStarts[i]) {
912                errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
913                    __LINE__, i, matchStarts[i], actualStart);
914            }
915            int32_t actualEnd = matcher->end(i, status);
916            REGEX_CHECK_STATUS;
917            if (actualEnd != matchEnds[i]) {
918                errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
919                    __LINE__, i, matchEnds[i], actualEnd);
920            }
921        }
922
923        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
924        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
925
926        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
927        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
928        matcher->reset();
929        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
930
931        matcher->lookingAt(status);
932        REGEX_ASSERT(matcher->group(status)    == "0123456789");
933        REGEX_ASSERT(matcher->group(0, status) == "0123456789");
934        REGEX_ASSERT(matcher->group(1, status) == "234567"    );
935        REGEX_ASSERT(matcher->group(2, status) == "45"        );
936        REGEX_ASSERT(matcher->group(3, status) == "89"        );
937        REGEX_CHECK_STATUS;
938        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
939        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
940        matcher->reset();
941        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
942
943        delete matcher;
944        delete pat;
945
946    }
947
948    //
949    //  find
950    //
951    {
952        int32_t             flags=0;
953        UParseError         pe;
954        UErrorCode          status=U_ZERO_ERROR;
955
956        UnicodeString       re("abc");
957        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
958        REGEX_CHECK_STATUS;
959        UnicodeString data = ".abc..abc...abc..";
960        //                    012345678901234567
961
962        RegexMatcher *matcher = pat->matcher(data, status);
963        REGEX_CHECK_STATUS;
964        REGEX_ASSERT(matcher->find());
965        REGEX_ASSERT(matcher->start(status) == 1);
966        REGEX_ASSERT(matcher->find());
967        REGEX_ASSERT(matcher->start(status) == 6);
968        REGEX_ASSERT(matcher->find());
969        REGEX_ASSERT(matcher->start(status) == 12);
970        REGEX_ASSERT(matcher->find() == FALSE);
971        REGEX_ASSERT(matcher->find() == FALSE);
972
973        matcher->reset();
974        REGEX_ASSERT(matcher->find());
975        REGEX_ASSERT(matcher->start(status) == 1);
976
977        REGEX_ASSERT(matcher->find(0, status));
978        REGEX_ASSERT(matcher->start(status) == 1);
979        REGEX_ASSERT(matcher->find(1, status));
980        REGEX_ASSERT(matcher->start(status) == 1);
981        REGEX_ASSERT(matcher->find(2, status));
982        REGEX_ASSERT(matcher->start(status) == 6);
983        REGEX_ASSERT(matcher->find(12, status));
984        REGEX_ASSERT(matcher->start(status) == 12);
985        REGEX_ASSERT(matcher->find(13, status) == FALSE);
986        REGEX_ASSERT(matcher->find(16, status) == FALSE);
987        REGEX_ASSERT(matcher->find(17, status) == FALSE);
988        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
989
990        status = U_ZERO_ERROR;
991        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
992        status = U_ZERO_ERROR;
993        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
994
995        REGEX_ASSERT(matcher->groupCount() == 0);
996
997        delete matcher;
998        delete pat;
999    }
1000
1001
1002    //
1003    //  find, with \G in pattern (true if at the end of a previous match).
1004    //
1005    {
1006        int32_t             flags=0;
1007        UParseError         pe;
1008        UErrorCode          status=U_ZERO_ERROR;
1009
1010        UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1011        RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1012        REGEX_CHECK_STATUS;
1013        UnicodeString data = ".abcabc.abc..";
1014        //                    012345678901234567
1015
1016        RegexMatcher *matcher = pat->matcher(data, status);
1017        REGEX_CHECK_STATUS;
1018        REGEX_ASSERT(matcher->find());
1019        REGEX_ASSERT(matcher->start(status) == 0);
1020        REGEX_ASSERT(matcher->start(1, status) == -1);
1021        REGEX_ASSERT(matcher->start(2, status) == 1);
1022
1023        REGEX_ASSERT(matcher->find());
1024        REGEX_ASSERT(matcher->start(status) == 4);
1025        REGEX_ASSERT(matcher->start(1, status) == 4);
1026        REGEX_ASSERT(matcher->start(2, status) == -1);
1027        REGEX_CHECK_STATUS;
1028
1029        delete matcher;
1030        delete pat;
1031    }
1032
1033    //
1034    //   find with zero length matches, match position should bump ahead
1035    //     to prevent loops.
1036    //
1037    {
1038        int32_t                 i;
1039        UErrorCode          status=U_ZERO_ERROR;
1040        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1041                                                      //   using an always-true look-ahead.
1042        REGEX_CHECK_STATUS;
1043        UnicodeString s("    ");
1044        m.reset(s);
1045        for (i=0; ; i++) {
1046            if (m.find() == FALSE) {
1047                break;
1048            }
1049            REGEX_ASSERT(m.start(status) == i);
1050            REGEX_ASSERT(m.end(status) == i);
1051        }
1052        REGEX_ASSERT(i==5);
1053
1054        // Check that the bump goes over surrogate pairs OK
1055        s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1056        s = s.unescape();
1057        m.reset(s);
1058        for (i=0; ; i+=2) {
1059            if (m.find() == FALSE) {
1060                break;
1061            }
1062            REGEX_ASSERT(m.start(status) == i);
1063            REGEX_ASSERT(m.end(status) == i);
1064        }
1065        REGEX_ASSERT(i==10);
1066    }
1067    {
1068        // find() loop breaking test.
1069        //        with pattern of /.?/, should see a series of one char matches, then a single
1070        //        match of zero length at the end of the input string.
1071        int32_t                 i;
1072        UErrorCode          status=U_ZERO_ERROR;
1073        RegexMatcher        m(".?", 0, status);
1074        REGEX_CHECK_STATUS;
1075        UnicodeString s("    ");
1076        m.reset(s);
1077        for (i=0; ; i++) {
1078            if (m.find() == FALSE) {
1079                break;
1080            }
1081            REGEX_ASSERT(m.start(status) == i);
1082            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1083        }
1084        REGEX_ASSERT(i==5);
1085    }
1086
1087
1088    //
1089    // Matchers with no input string behave as if they had an empty input string.
1090    //
1091
1092    {
1093        UErrorCode status = U_ZERO_ERROR;
1094        RegexMatcher  m(".?", 0, status);
1095        REGEX_CHECK_STATUS;
1096        REGEX_ASSERT(m.find());
1097        REGEX_ASSERT(m.start(status) == 0);
1098        REGEX_ASSERT(m.input() == "");
1099    }
1100    {
1101        UErrorCode status = U_ZERO_ERROR;
1102        RegexPattern  *p = RegexPattern::compile(".", 0, status);
1103        RegexMatcher  *m = p->matcher(status);
1104        REGEX_CHECK_STATUS;
1105
1106        REGEX_ASSERT(m->find() == FALSE);
1107        REGEX_ASSERT(m->input() == "");
1108        delete m;
1109        delete p;
1110    }
1111
1112    //
1113    // Regions
1114    //
1115    {
1116        UErrorCode status = U_ZERO_ERROR;
1117        UnicodeString testString("This is test data");
1118        RegexMatcher m(".*", testString,  0, status);
1119        REGEX_CHECK_STATUS;
1120        REGEX_ASSERT(m.regionStart() == 0);
1121        REGEX_ASSERT(m.regionEnd() == testString.length());
1122        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1123        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1124
1125        m.region(2,4, status);
1126        REGEX_CHECK_STATUS;
1127        REGEX_ASSERT(m.matches(status));
1128        REGEX_ASSERT(m.start(status)==2);
1129        REGEX_ASSERT(m.end(status)==4);
1130        REGEX_CHECK_STATUS;
1131
1132        m.reset();
1133        REGEX_ASSERT(m.regionStart() == 0);
1134        REGEX_ASSERT(m.regionEnd() == testString.length());
1135
1136        UnicodeString shorterString("short");
1137        m.reset(shorterString);
1138        REGEX_ASSERT(m.regionStart() == 0);
1139        REGEX_ASSERT(m.regionEnd() == shorterString.length());
1140
1141        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1142        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1143        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1144        REGEX_ASSERT(&m == &m.reset());
1145        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1146
1147        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1148        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1149        REGEX_ASSERT(&m == &m.reset());
1150        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1151
1152        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1153        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1154        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1155        REGEX_ASSERT(&m == &m.reset());
1156        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1157
1158        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1159        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1160        REGEX_ASSERT(&m == &m.reset());
1161        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1162
1163    }
1164
1165    //
1166    // hitEnd() and requireEnd()
1167    //
1168    {
1169        UErrorCode status = U_ZERO_ERROR;
1170        UnicodeString testString("aabb");
1171        RegexMatcher m1(".*", testString,  0, status);
1172        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1173        REGEX_ASSERT(m1.hitEnd() == TRUE);
1174        REGEX_ASSERT(m1.requireEnd() == FALSE);
1175        REGEX_CHECK_STATUS;
1176
1177        status = U_ZERO_ERROR;
1178        RegexMatcher m2("a*", testString, 0, status);
1179        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1180        REGEX_ASSERT(m2.hitEnd() == FALSE);
1181        REGEX_ASSERT(m2.requireEnd() == FALSE);
1182        REGEX_CHECK_STATUS;
1183
1184        status = U_ZERO_ERROR;
1185        RegexMatcher m3(".*$", testString, 0, status);
1186        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1187        REGEX_ASSERT(m3.hitEnd() == TRUE);
1188        REGEX_ASSERT(m3.requireEnd() == TRUE);
1189        REGEX_CHECK_STATUS;
1190    }
1191
1192
1193    //
1194    // Compilation error on reset with UChar *
1195    //   These were a hazard that people were stumbling over with runtime errors.
1196    //   Changed them to compiler errors by adding private methods that more closely
1197    //   matched the incorrect use of the functions.
1198    //
1199#if 0
1200    {
1201        UErrorCode status = U_ZERO_ERROR;
1202        UChar ucharString[20];
1203        RegexMatcher m(".", 0, status);
1204        m.reset(ucharString);  // should not compile.
1205
1206        RegexPattern *p = RegexPattern::compile(".", 0, status);
1207        RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1208
1209        RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1210    }
1211#endif
1212
1213    //
1214    //  Time Outs.
1215    //       Note:  These tests will need to be changed when the regexp engine is
1216    //              able to detect and cut short the exponential time behavior on
1217    //              this type of match.
1218    //
1219    {
1220        UErrorCode status = U_ZERO_ERROR;
1221        //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1223        UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1224        RegexMatcher matcher("(a+)+b", testString, 0, status);
1225        REGEX_CHECK_STATUS;
1226        REGEX_ASSERT(matcher.getTimeLimit() == 0);
1227        matcher.setTimeLimit(100, status);
1228        REGEX_ASSERT(matcher.getTimeLimit() == 100);
1229        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1230        REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1231    }
1232    {
1233        UErrorCode status = U_ZERO_ERROR;
1234        //   Few enough 'a's to slip in under the time limit.
1235        UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1236        RegexMatcher matcher("(a+)+b", testString, 0, status);
1237        REGEX_CHECK_STATUS;
1238        matcher.setTimeLimit(100, status);
1239        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1240        REGEX_CHECK_STATUS;
1241    }
1242
1243    //
1244    //  Stack Limits
1245    //
1246    {
1247        UErrorCode status = U_ZERO_ERROR;
1248        UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1249
1250        // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1251        //   of the '+', and makes the stack frames larger.
1252        RegexMatcher matcher("(A)+A$", testString, 0, status);
1253
1254        // With the default stack, this match should fail to run
1255        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1256        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1257
1258        // With unlimited stack, it should run
1259        status = U_ZERO_ERROR;
1260        matcher.setStackLimit(0, status);
1261        REGEX_CHECK_STATUS;
1262        REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1263        REGEX_CHECK_STATUS;
1264        REGEX_ASSERT(matcher.getStackLimit() == 0);
1265
        // With a limited stack, the match should fail
1267        status = U_ZERO_ERROR;
1268        matcher.setStackLimit(10000, status);
1269        REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1270        REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1271        REGEX_ASSERT(matcher.getStackLimit() == 10000);
1272    }
1273
1274        // A pattern that doesn't save state should work with
1275        //   a minimal sized stack
1276    {
1277        UErrorCode status = U_ZERO_ERROR;
1278        UnicodeString testString = "abc";
1279        RegexMatcher matcher("abc", testString, 0, status);
1280        REGEX_CHECK_STATUS;
1281        matcher.setStackLimit(30, status);
1282        REGEX_CHECK_STATUS;
1283        REGEX_ASSERT(matcher.matches(status) == TRUE);
1284        REGEX_CHECK_STATUS;
1285        REGEX_ASSERT(matcher.getStackLimit() == 30);
1286
1287        // Negative stack sizes should fail
1288        status = U_ZERO_ERROR;
1289        matcher.setStackLimit(1000, status);
1290        REGEX_CHECK_STATUS;
1291        matcher.setStackLimit(-1, status);
1292        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1293        REGEX_ASSERT(matcher.getStackLimit() == 1000);
1294    }
1295
1296
1297}
1298
1299
1300
1301
1302
1303
1304//---------------------------------------------------------------------------
1305//
1306//      API_Replace        API test for class RegexMatcher, testing the
1307//                         Replace family of functions.
1308//
1309//---------------------------------------------------------------------------
1310void RegexTest::API_Replace() {
1311    //
1312    //  Replace
1313    //
1314    int32_t             flags=0;
1315    UParseError         pe;
1316    UErrorCode          status=U_ZERO_ERROR;
1317
1318    UnicodeString       re("abc");
1319    RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1320    REGEX_CHECK_STATUS;
1321    UnicodeString data = ".abc..abc...abc..";
1322    //                    012345678901234567
1323    RegexMatcher *matcher = pat->matcher(data, status);
1324
1325    //
1326    //  Plain vanilla matches.
1327    //
1328    UnicodeString  dest;
1329    dest = matcher->replaceFirst("yz", status);
1330    REGEX_CHECK_STATUS;
1331    REGEX_ASSERT(dest == ".yz..abc...abc..");
1332
1333    dest = matcher->replaceAll("yz", status);
1334    REGEX_CHECK_STATUS;
1335    REGEX_ASSERT(dest == ".yz..yz...yz..");
1336
1337    //
1338    //  Plain vanilla non-matches.
1339    //
1340    UnicodeString d2 = ".abx..abx...abx..";
1341    matcher->reset(d2);
1342    dest = matcher->replaceFirst("yz", status);
1343    REGEX_CHECK_STATUS;
1344    REGEX_ASSERT(dest == ".abx..abx...abx..");
1345
1346    dest = matcher->replaceAll("yz", status);
1347    REGEX_CHECK_STATUS;
1348    REGEX_ASSERT(dest == ".abx..abx...abx..");
1349
1350    //
1351    // Empty source string
1352    //
1353    UnicodeString d3 = "";
1354    matcher->reset(d3);
1355    dest = matcher->replaceFirst("yz", status);
1356    REGEX_CHECK_STATUS;
1357    REGEX_ASSERT(dest == "");
1358
1359    dest = matcher->replaceAll("yz", status);
1360    REGEX_CHECK_STATUS;
1361    REGEX_ASSERT(dest == "");
1362
1363    //
1364    // Empty substitution string
1365    //
1366    matcher->reset(data);              // ".abc..abc...abc.."
1367    dest = matcher->replaceFirst("", status);
1368    REGEX_CHECK_STATUS;
1369    REGEX_ASSERT(dest == "...abc...abc..");
1370
1371    dest = matcher->replaceAll("", status);
1372    REGEX_CHECK_STATUS;
1373    REGEX_ASSERT(dest == "........");
1374
1375    //
1376    // match whole string
1377    //
1378    UnicodeString d4 = "abc";
1379    matcher->reset(d4);
1380    dest = matcher->replaceFirst("xyz", status);
1381    REGEX_CHECK_STATUS;
1382    REGEX_ASSERT(dest == "xyz");
1383
1384    dest = matcher->replaceAll("xyz", status);
1385    REGEX_CHECK_STATUS;
1386    REGEX_ASSERT(dest == "xyz");
1387
1388    //
1389    // Capture Group, simple case
1390    //
1391    UnicodeString       re2("a(..)");
1392    RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1393    REGEX_CHECK_STATUS;
1394    UnicodeString d5 = "abcdefg";
1395    RegexMatcher *matcher2 = pat2->matcher(d5, status);
1396    REGEX_CHECK_STATUS;
1397    dest = matcher2->replaceFirst("$1$1", status);
1398    REGEX_CHECK_STATUS;
1399    REGEX_ASSERT(dest == "bcbcdefg");
1400
1401    dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1402    REGEX_CHECK_STATUS;
1403    REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1404
1405    dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1406    REGEX_CHECK_STATUS;
1407    REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1408
1409    UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1410    replacement = replacement.unescape();
1411    dest = matcher2->replaceFirst(replacement, status);
1412    REGEX_CHECK_STATUS;
1413    REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1414
1415    REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1416
1417
1418    //
1419    // Replacement String with \u hex escapes
1420    //
1421    {
1422        UnicodeString  src = "abc 1 abc 2 abc 3";
1423        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1424        matcher->reset(src);
1425        UnicodeString  result = matcher->replaceAll(substitute, status);
1426        REGEX_CHECK_STATUS;
1427        REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1428    }
1429    {
1430        UnicodeString  src = "abc !";
1431        UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1432        matcher->reset(src);
1433        UnicodeString  result = matcher->replaceAll(substitute, status);
1434        REGEX_CHECK_STATUS;
1435        UnicodeString expected = UnicodeString("--");
1436        expected.append((UChar32)0x10000);
1437        expected.append("-- !");
1438        REGEX_ASSERT(result == expected);
1439    }
1440    // TODO:  need more through testing of capture substitutions.
1441
1442    // Bug 4057
1443    //
1444    {
1445        status = U_ZERO_ERROR;
1446        UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1447        RegexMatcher m("ss(.*?)ee", 0, status);
1448        REGEX_CHECK_STATUS;
1449        UnicodeString result;
1450
1451        // Multiple finds do NOT bump up the previous appendReplacement postion.
1452        m.reset(s);
1453        m.find();
1454        m.find();
1455        m.appendReplacement(result, "ooh", status);
1456        REGEX_CHECK_STATUS;
1457        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1458
1459        // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1460        status = U_ZERO_ERROR;
1461        result.truncate(0);
1462        m.reset(10, status);
1463        m.find();
1464        m.find();
1465        m.appendReplacement(result, "ooh", status);
1466        REGEX_CHECK_STATUS;
1467        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1468
1469        // find() at interior of string, appendReplacemnt still starts at beginning.
1470        status = U_ZERO_ERROR;
1471        result.truncate(0);
1472        m.reset();
1473        m.find(10, status);
1474        m.find();
1475        m.appendReplacement(result, "ooh", status);
1476        REGEX_CHECK_STATUS;
1477        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1478
1479        m.appendTail(result);
1480        REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1481
1482    }
1483
1484    delete matcher2;
1485    delete pat2;
1486    delete matcher;
1487    delete pat;
1488}
1489
1490
1491//---------------------------------------------------------------------------
1492//
1493//      API_Pattern       Test that the API for class RegexPattern is
1494//                        present and nominally working.
1495//
1496//---------------------------------------------------------------------------
1497void RegexTest::API_Pattern() {
1498    RegexPattern        pata;    // Test default constructor to not crash.
1499    RegexPattern        patb;
1500
1501    REGEX_ASSERT(pata == patb);
1502    REGEX_ASSERT(pata == pata);
1503
1504    UnicodeString re1("abc[a-l][m-z]");
1505    UnicodeString re2("def");
1506    UErrorCode    status = U_ZERO_ERROR;
1507    UParseError   pe;
1508
1509    RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1510    RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1511    REGEX_CHECK_STATUS;
1512    REGEX_ASSERT(*pat1 == *pat1);
1513    REGEX_ASSERT(*pat1 != pata);
1514
1515    // Assign
1516    patb = *pat1;
1517    REGEX_ASSERT(patb == *pat1);
1518
1519    // Copy Construct
1520    RegexPattern patc(*pat1);
1521    REGEX_ASSERT(patc == *pat1);
1522    REGEX_ASSERT(patb == patc);
1523    REGEX_ASSERT(pat1 != pat2);
1524    patb = *pat2;
1525    REGEX_ASSERT(patb != patc);
1526    REGEX_ASSERT(patb == *pat2);
1527
1528    // Compile with no flags.
1529    RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1530    REGEX_ASSERT(*pat1a == *pat1);
1531
1532    REGEX_ASSERT(pat1a->flags() == 0);
1533
1534    // Compile with different flags should be not equal
1535    RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1536    REGEX_CHECK_STATUS;
1537
1538    REGEX_ASSERT(*pat1b != *pat1a);
1539    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1540    REGEX_ASSERT(pat1a->flags() == 0);
1541    delete pat1b;
1542
1543    // clone
1544    RegexPattern *pat1c = pat1->clone();
1545    REGEX_ASSERT(*pat1c == *pat1);
1546    REGEX_ASSERT(*pat1c != *pat2);
1547
1548    delete pat1c;
1549    delete pat1a;
1550    delete pat1;
1551    delete pat2;
1552
1553
1554    //
1555    //   Verify that a matcher created from a cloned pattern works.
1556    //     (Jitterbug 3423)
1557    //
1558    {
1559        UErrorCode     status     = U_ZERO_ERROR;
1560        RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1561        RegexPattern  *pClone     = pSource->clone();
1562        delete         pSource;
1563        RegexMatcher  *mFromClone = pClone->matcher(status);
1564        REGEX_CHECK_STATUS;
1565        UnicodeString s = "Hello World";
1566        mFromClone->reset(s);
1567        REGEX_ASSERT(mFromClone->find() == TRUE);
1568        REGEX_ASSERT(mFromClone->group(status) == "Hello");
1569        REGEX_ASSERT(mFromClone->find() == TRUE);
1570        REGEX_ASSERT(mFromClone->group(status) == "World");
1571        REGEX_ASSERT(mFromClone->find() == FALSE);
1572        delete mFromClone;
1573        delete pClone;
1574    }
1575
1576    //
1577    //   matches convenience API
1578    //
1579    REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1580    REGEX_CHECK_STATUS;
1581    REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1582    REGEX_CHECK_STATUS;
1583    REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1584    REGEX_CHECK_STATUS;
1585    REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1586    REGEX_CHECK_STATUS;
1587    REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1588    REGEX_CHECK_STATUS;
1589    status = U_INDEX_OUTOFBOUNDS_ERROR;
1590    REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1591    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1592
1593
1594    //
1595    // Split()
1596    //
1597    status = U_ZERO_ERROR;
1598    pat1 = RegexPattern::compile(" +",  pe, status);
1599    REGEX_CHECK_STATUS;
1600    UnicodeString  fields[10];
1601
1602    int32_t n;
1603    n = pat1->split("Now is the time", fields, 10, status);
1604    REGEX_CHECK_STATUS;
1605    REGEX_ASSERT(n==4);
1606    REGEX_ASSERT(fields[0]=="Now");
1607    REGEX_ASSERT(fields[1]=="is");
1608    REGEX_ASSERT(fields[2]=="the");
1609    REGEX_ASSERT(fields[3]=="time");
1610    REGEX_ASSERT(fields[4]=="");
1611
1612    n = pat1->split("Now is the time", fields, 2, status);
1613    REGEX_CHECK_STATUS;
1614    REGEX_ASSERT(n==2);
1615    REGEX_ASSERT(fields[0]=="Now");
1616    REGEX_ASSERT(fields[1]=="is the time");
1617    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1618
1619    fields[1] = "*";
1620    status = U_ZERO_ERROR;
1621    n = pat1->split("Now is the time", fields, 1, status);
1622    REGEX_CHECK_STATUS;
1623    REGEX_ASSERT(n==1);
1624    REGEX_ASSERT(fields[0]=="Now is the time");
1625    REGEX_ASSERT(fields[1]=="*");
1626    status = U_ZERO_ERROR;
1627
1628    n = pat1->split("    Now       is the time   ", fields, 10, status);
1629    REGEX_CHECK_STATUS;
1630    REGEX_ASSERT(n==6);
1631    REGEX_ASSERT(fields[0]=="");
1632    REGEX_ASSERT(fields[1]=="Now");
1633    REGEX_ASSERT(fields[2]=="is");
1634    REGEX_ASSERT(fields[3]=="the");
1635    REGEX_ASSERT(fields[4]=="time");
1636    REGEX_ASSERT(fields[5]=="");
1637
1638    n = pat1->split("     ", fields, 10, status);
1639    REGEX_CHECK_STATUS;
1640    REGEX_ASSERT(n==2);
1641    REGEX_ASSERT(fields[0]=="");
1642    REGEX_ASSERT(fields[1]=="");
1643
1644    fields[0] = "foo";
1645    n = pat1->split("", fields, 10, status);
1646    REGEX_CHECK_STATUS;
1647    REGEX_ASSERT(n==0);
1648    REGEX_ASSERT(fields[0]=="foo");
1649
1650    delete pat1;
1651
1652    //  split, with a pattern with (capture)
1653    pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1654    REGEX_CHECK_STATUS;
1655
1656    status = U_ZERO_ERROR;
1657    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1658    REGEX_CHECK_STATUS;
1659    REGEX_ASSERT(n==7);
1660    REGEX_ASSERT(fields[0]=="");
1661    REGEX_ASSERT(fields[1]=="a");
1662    REGEX_ASSERT(fields[2]=="Now is ");
1663    REGEX_ASSERT(fields[3]=="b");
1664    REGEX_ASSERT(fields[4]=="the time");
1665    REGEX_ASSERT(fields[5]=="c");
1666    REGEX_ASSERT(fields[6]=="");
1667    REGEX_ASSERT(status==U_ZERO_ERROR);
1668
1669    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1670    REGEX_CHECK_STATUS;
1671    REGEX_ASSERT(n==7);
1672    REGEX_ASSERT(fields[0]=="  ");
1673    REGEX_ASSERT(fields[1]=="a");
1674    REGEX_ASSERT(fields[2]=="Now is ");
1675    REGEX_ASSERT(fields[3]=="b");
1676    REGEX_ASSERT(fields[4]=="the time");
1677    REGEX_ASSERT(fields[5]=="c");
1678    REGEX_ASSERT(fields[6]=="");
1679
1680    status = U_ZERO_ERROR;
1681    fields[6] = "foo";
1682    n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1683    REGEX_CHECK_STATUS;
1684    REGEX_ASSERT(n==6);
1685    REGEX_ASSERT(fields[0]=="  ");
1686    REGEX_ASSERT(fields[1]=="a");
1687    REGEX_ASSERT(fields[2]=="Now is ");
1688    REGEX_ASSERT(fields[3]=="b");
1689    REGEX_ASSERT(fields[4]=="the time");
1690    REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1691    REGEX_ASSERT(fields[6]=="foo");
1692
1693    status = U_ZERO_ERROR;
1694    fields[5] = "foo";
1695    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1696    REGEX_CHECK_STATUS;
1697    REGEX_ASSERT(n==5);
1698    REGEX_ASSERT(fields[0]=="  ");
1699    REGEX_ASSERT(fields[1]=="a");
1700    REGEX_ASSERT(fields[2]=="Now is ");
1701    REGEX_ASSERT(fields[3]=="b");
1702    REGEX_ASSERT(fields[4]=="the time<c>");
1703    REGEX_ASSERT(fields[5]=="foo");
1704
1705    status = U_ZERO_ERROR;
1706    fields[5] = "foo";
1707    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1708    REGEX_CHECK_STATUS;
1709    REGEX_ASSERT(n==5);
1710    REGEX_ASSERT(fields[0]=="  ");
1711    REGEX_ASSERT(fields[1]=="a");
1712    REGEX_ASSERT(fields[2]=="Now is ");
1713    REGEX_ASSERT(fields[3]=="b");
1714    REGEX_ASSERT(fields[4]=="the time");
1715    REGEX_ASSERT(fields[5]=="foo");
1716
1717    status = U_ZERO_ERROR;
1718    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1719    REGEX_CHECK_STATUS;
1720    REGEX_ASSERT(n==4);
1721    REGEX_ASSERT(fields[0]=="  ");
1722    REGEX_ASSERT(fields[1]=="a");
1723    REGEX_ASSERT(fields[2]=="Now is ");
1724    REGEX_ASSERT(fields[3]=="the time<c>");
1725    status = U_ZERO_ERROR;
1726    delete pat1;
1727
1728    pat1 = RegexPattern::compile("([-,])",  pe, status);
1729    REGEX_CHECK_STATUS;
1730    n = pat1->split("1-10,20", fields, 10, status);
1731    REGEX_CHECK_STATUS;
1732    REGEX_ASSERT(n==5);
1733    REGEX_ASSERT(fields[0]=="1");
1734    REGEX_ASSERT(fields[1]=="-");
1735    REGEX_ASSERT(fields[2]=="10");
1736    REGEX_ASSERT(fields[3]==",");
1737    REGEX_ASSERT(fields[4]=="20");
1738    delete pat1;
1739
1740    // Test split of string with empty trailing fields
1741    pat1 = RegexPattern::compile(",", pe, status);
1742    REGEX_CHECK_STATUS;
1743    n = pat1->split("a,b,c,", fields, 10, status);
1744    REGEX_CHECK_STATUS;
1745    REGEX_ASSERT(n==4);
1746    REGEX_ASSERT(fields[0]=="a");
1747    REGEX_ASSERT(fields[1]=="b");
1748    REGEX_ASSERT(fields[2]=="c");
1749    REGEX_ASSERT(fields[3]=="");
1750
1751    n = pat1->split("a,,,", fields, 10, status);
1752    REGEX_CHECK_STATUS;
1753    REGEX_ASSERT(n==4);
1754    REGEX_ASSERT(fields[0]=="a");
1755    REGEX_ASSERT(fields[1]=="");
1756    REGEX_ASSERT(fields[2]=="");
1757    REGEX_ASSERT(fields[3]=="");
1758    delete pat1;
1759
1760    // Split Separator with zero length match.
1761    pat1 = RegexPattern::compile(":?", pe, status);
1762    REGEX_CHECK_STATUS;
1763    n = pat1->split("abc", fields, 10, status);
1764    REGEX_CHECK_STATUS;
1765    REGEX_ASSERT(n==5);
1766    REGEX_ASSERT(fields[0]=="");
1767    REGEX_ASSERT(fields[1]=="a");
1768    REGEX_ASSERT(fields[2]=="b");
1769    REGEX_ASSERT(fields[3]=="c");
1770    REGEX_ASSERT(fields[4]=="");
1771
1772    delete pat1;
1773
1774    //
1775    // RegexPattern::pattern()
1776    //
1777    pat1 = new RegexPattern();
1778    REGEX_ASSERT(pat1->pattern() == "");
1779    delete pat1;
1780
1781    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1782    REGEX_CHECK_STATUS;
1783    REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1784    delete pat1;
1785
1786
1787    //
1788    // classID functions
1789    //
1790    pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1791    REGEX_CHECK_STATUS;
1792    REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1793    REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1794    UnicodeString Hello("Hello, world.");
1795    RegexMatcher *m = pat1->matcher(Hello, status);
1796    REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1797    REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1798    REGEX_ASSERT(m->getDynamicClassID() != NULL);
1799    delete m;
1800    delete pat1;
1801
1802}
1803
1804//---------------------------------------------------------------------------
1805//
1806//      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1807//                       is present and working, but excluding functions
1808//                       implementing replace operations.
1809//                       Patterns and inputs are handed to the API as UText objects
1810//-----------------------(mostly opened over UTF-8 byte strings).-------------
1811void RegexTest::API_Match_UTF8() {
1812    UParseError         pe;                    // passed to RegexPattern::compile() in the first test block
1813    UErrorCode          status=U_ZERO_ERROR;
1814    int32_t             flags = 0;             // pattern flags argument for compile(); always 0 here
1815
1816    // Debug scaffold - slide failing test cases early: paste a test case into
1817    // the braces below and change "#if 0" to "#if 1" to run it alone (the
1818    // `return` then skips the remainder of this function).
1819#if 0
1820    {
1821    }
1822    return;
1823#endif
1824
1825    //
1826    // Simple pattern compilation: "abc" is compiled from a UText and the
1827    // resulting pattern (pat2) backs all of the matcher checks in this block.
1828    {
1829        UText               re = UTEXT_INITIALIZER;
1830        regextst_openUTF8FromInvariant(&re, "abc", -1, &status);  // helper not shown here; NOTE(review): presumably converts the invariant-charset literal to UTF-8 -- confirm
1831        REGEX_VERBOSE_TEXT(&re);
1832        RegexPattern        *pat2;
1833        pat2 = RegexPattern::compile(&re, flags, pe, status);
1834        REGEX_CHECK_STATUS;
1835
1836        UText input1 = UTEXT_INITIALIZER;
1837        UText input2 = UTEXT_INITIALIZER;
1838        UText empty  = UTEXT_INITIALIZER;
1839        regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1840        REGEX_VERBOSE_TEXT(&input1);
1841        regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1842        REGEX_VERBOSE_TEXT(&input2);
1843        utext_openUChars(&empty, NULL, 0, &status);   // zero-length text (asserted below via utext_nativeLength)
1844
1845        int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1846        int32_t input2Len = strlen("not abc");
1847
1848
1849        //
1850        // Matcher creation and reset.  reset(UText *) swaps the input text;
1851        // lookingAt() is expected to succeed only for the input starting with "abc".
1852        RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);   // reset() returns a reference, hence the '&'
1853        REGEX_CHECK_STATUS;
1854        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1855        const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1856        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1857        m1->reset(&input2);
1858        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1859        const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1860        REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1861        m1->reset(&input1);
1862        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1863        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1864        m1->reset(&empty);
1865        REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1866        REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1867
1868        //
1869        //  reset(pos, status): positions 0, input1Len-1 and input1Len must be
1870        //  accepted; -1 and input1Len+1 must set U_INDEX_OUTOFBOUNDS_ERROR.
1871        m1->reset(&input1);
1872        m1->reset(4, status);
1873        REGEX_CHECK_STATUS;
1874        REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1875        REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1876
1877        m1->reset(-1, status);
1878        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1879        status = U_ZERO_ERROR;
1880
1881        m1->reset(0, status);
1882        REGEX_CHECK_STATUS;
1883        status = U_ZERO_ERROR;
1884
1885        m1->reset(input1Len-1, status);
1886        REGEX_CHECK_STATUS;
1887        status = U_ZERO_ERROR;
1888
1889        m1->reset(input1Len, status);
1890        REGEX_CHECK_STATUS;
1891        status = U_ZERO_ERROR;
1892
1893        m1->reset(input1Len+1, status);
1894        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1895        status = U_ZERO_ERROR;
1896
1897        //
1898        // matches(pos, status): on "not abc" only position 4 is expected to
1899        // match; a negative position must set U_INDEX_OUTOFBOUNDS_ERROR.
1900        m1->reset(&input2);
1901        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1902        m1->reset();
1903        REGEX_ASSERT(m1->matches(3, status) == FALSE);
1904        m1->reset();
1905        REGEX_ASSERT(m1->matches(5, status) == FALSE);
1906        REGEX_ASSERT(m1->matches(4, status) == TRUE);
1907        REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1908        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1909
1910        // Match() at end of string should fail, but should not
1911        //  be an error.
1912        status = U_ZERO_ERROR;
1913        REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1914        REGEX_CHECK_STATUS;
1915
1916        // Match beyond end of string should fail with an error.
1917        status = U_ZERO_ERROR;
1918        REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1919        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1920
1921        // Successful match at end of string.
1922        {
1923            status = U_ZERO_ERROR;
1924            RegexMatcher m("A?", 0, status);  // will match zero length string.
1925            REGEX_CHECK_STATUS;
1926            m.reset(&input1);
1927            REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1928            REGEX_CHECK_STATUS;
1929            m.reset(&empty);
1930            REGEX_ASSERT(m.matches(0, status) == TRUE);
1931            REGEX_CHECK_STATUS;
1932        }
1933
1934
1935        //
1936        // lookingAt(pos, status): the same position checks as for
1937        // matches(pos, status) above, again on "not abc".
1938        status = U_ZERO_ERROR;
1939        m1->reset(&input2);  // "not abc"
1940        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1941        REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1942        REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1943        REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1944        REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1945        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1946        status = U_ZERO_ERROR;
1947        REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1948        REGEX_CHECK_STATUS;
1949        REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1950        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1951
1952        delete m1;      // the matcher and the pattern are heap objects released by this test
1953        delete pat2;
1954
1955        utext_close(&re);
1956        utext_close(&input1);
1957        utext_close(&input2);
1958        utext_close(&empty);
1959    }
1960
1961
1962    //
1963    // Capture Group.
1964    //     RegexMatcher::start();
1965    //     RegexMatcher::end();
1966    //     RegexMatcher::groupCount();   (asserted in the "find" block further down)
1967    //     RegexMatcher::group();        (UText-returning overloads)
1968    {
1969        int32_t             flags=0;               // these three shadow the function-level variables of the same names
1970        UParseError         pe;
1971        UErrorCode          status=U_ZERO_ERROR;
1972        UText               re=UTEXT_INITIALIZER;
1973        const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1974        utext_openUTF8(&re, str_01234567_pat, -1, &status);
1975
1976        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1977        REGEX_CHECK_STATUS;
1978
1979        UText input = UTEXT_INITIALIZER;
1980        const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1981        utext_openUTF8(&input, str_0123456789, -1, &status);
1982
1983        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1984        REGEX_CHECK_STATUS;
1985        REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1986        static const int32_t matchStarts[] = {0,  2, 4, 8};     // expected start(i) for groups 0..3
1987        static const int32_t matchEnds[]   = {10, 8, 6, 10};    // expected end(i) for groups 0..3
1988        int32_t i;
1989        for (i=0; i<4; i++) {
1990            int32_t actualStart = matcher->start(i, status);
1991            REGEX_CHECK_STATUS;
1992            if (actualStart != matchStarts[i]) {
1993                errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
1994                      __FILE__, __LINE__, i, matchStarts[i], actualStart);
1995            }
1996            int32_t actualEnd = matcher->end(i, status);
1997            REGEX_CHECK_STATUS;
1998            if (actualEnd != matchEnds[i]) {
1999                errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2000                      __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2001            }
2002        }
2003
2004        REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));   // the no-group overloads must agree with group 0
2005        REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2006
2007        REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);   // group numbers outside 0..3
2008        REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2009        matcher->reset();
2010        REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);       // no match attempted since reset()
2011
2012        matcher->lookingAt(status);   // match again so that the group() checks below have a current match
2013
2014        UnicodeString dest;
2015        UText destText = UTEXT_INITIALIZER;
2016        utext_openUnicodeString(&destText, &dest, &status);
2017        UText *result;
2018        //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2019        //  Test shallow-clone API (group() overloads taking group_len); results are compared with the full input text
2020        int64_t   group_len;
2021        result = matcher->group((UText *)NULL, group_len, status);
2022        REGEX_CHECK_STATUS;
2023        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2024        utext_close(result);
2025        result = matcher->group(0, &destText, group_len, status);
2026        REGEX_CHECK_STATUS;
2027        REGEX_ASSERT(result == &destText);
2028        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2029        //  destText is now immutable, reopen it
2030        utext_close(&destText);
2031        utext_openUnicodeString(&destText, &dest, &status);
2032
2033        result = matcher->group(0, NULL, status);          // NULL dest: the returned UText is closed below
2034        REGEX_CHECK_STATUS;
2035        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2036        utext_close(result);
2037        result = matcher->group(0, &destText, status);     // supplied dest: the same UText must come back
2038        REGEX_CHECK_STATUS;
2039        REGEX_ASSERT(result == &destText);
2040        REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2041
2042        result = matcher->group(1, NULL, status);
2043        REGEX_CHECK_STATUS;
2044        const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2045        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2046        utext_close(result);
2047        result = matcher->group(1, &destText, status);
2048        REGEX_CHECK_STATUS;
2049        REGEX_ASSERT(result == &destText);
2050        REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2051
2052        result = matcher->group(2, NULL, status);
2053        REGEX_CHECK_STATUS;
2054        const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2055        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2056        utext_close(result);
2057        result = matcher->group(2, &destText, status);
2058        REGEX_CHECK_STATUS;
2059        REGEX_ASSERT(result == &destText);
2060        REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2061
2062        result = matcher->group(3, NULL, status);
2063        REGEX_CHECK_STATUS;
2064        const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2065        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2066        utext_close(result);
2067        result = matcher->group(3, &destText, status);
2068        REGEX_CHECK_STATUS;
2069        REGEX_ASSERT(result == &destText);
2070        REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2071
2072        REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);   // group numbers outside 0..3
2073        REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2074        matcher->reset();
2075        REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);       // no match attempted since reset()
2076
2077        delete matcher;
2078        delete pat;
2079
2080        utext_close(&destText);
2081        utext_close(&input);
2082        utext_close(&re);
2083    }
2084
2085    //
2086    //  find: the pattern "abc" occurs at offsets 1, 6 and 12 of the input.
2087    //
2088    {
2089        int32_t             flags=0;
2090        UParseError         pe;
2091        UErrorCode          status=U_ZERO_ERROR;
2092        UText               re=UTEXT_INITIALIZER;
2093        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2094        utext_openUTF8(&re, str_abc, -1, &status);
2095
2096        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2097        REGEX_CHECK_STATUS;
2098        UText input = UTEXT_INITIALIZER;
2099        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2100        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2101        //                      012345678901234567
2102
2103        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2104        REGEX_CHECK_STATUS;
2105        REGEX_ASSERT(matcher->find());
2106        REGEX_ASSERT(matcher->start(status) == 1);
2107        REGEX_ASSERT(matcher->find());
2108        REGEX_ASSERT(matcher->start(status) == 6);
2109        REGEX_ASSERT(matcher->find());
2110        REGEX_ASSERT(matcher->start(status) == 12);
2111        REGEX_ASSERT(matcher->find() == FALSE);
2112        REGEX_ASSERT(matcher->find() == FALSE);      // stays FALSE once the matches are exhausted
2113
2114        matcher->reset();
2115        REGEX_ASSERT(matcher->find());
2116        REGEX_ASSERT(matcher->start(status) == 1);
2117
2118        REGEX_ASSERT(matcher->find(0, status));      // find(pos, status): expected start is the first "abc" at or after pos
2119        REGEX_ASSERT(matcher->start(status) == 1);
2120        REGEX_ASSERT(matcher->find(1, status));
2121        REGEX_ASSERT(matcher->start(status) == 1);
2122        REGEX_ASSERT(matcher->find(2, status));
2123        REGEX_ASSERT(matcher->start(status) == 6);
2124        REGEX_ASSERT(matcher->find(12, status));
2125        REGEX_ASSERT(matcher->start(status) == 12);
2126        REGEX_ASSERT(matcher->find(13, status) == FALSE);
2127        REGEX_ASSERT(matcher->find(16, status) == FALSE);
2128        REGEX_ASSERT(matcher->find(17, status) == FALSE);   // 17 is the input length: no match
2129        REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2130
2131        status = U_ZERO_ERROR;
2132        REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2133        status = U_ZERO_ERROR;
2134        REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);   // one past the input length
2135
2136        REGEX_ASSERT(matcher->groupCount() == 0);
2137
2138        delete matcher;
2139        delete pat;
2140
2141        utext_close(&input);
2142        utext_close(&re);
2143    }
2144
2145
2146    //
2147    //  find, with \G in pattern (true if at the end of a previous match).
2148    //
2149    {
2150        int32_t             flags=0;
2151        UParseError         pe;
2152        UErrorCode          status=U_ZERO_ERROR;
2153        UText               re=UTEXT_INITIALIZER;
2154        const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2155        utext_openUTF8(&re, str_Gabcabc, -1, &status);
2156
2157        RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2158
2159        REGEX_CHECK_STATUS;
2160        UText input = UTEXT_INITIALIZER;
2161        const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2162        utext_openUTF8(&input, str_abcabcabc, -1, &status);
2163        //                      012345678901234567
2164
2165        RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2166        REGEX_CHECK_STATUS;
2167        REGEX_ASSERT(matcher->find());
2168        REGEX_ASSERT(matcher->start(status) == 0);
2169        REGEX_ASSERT(matcher->start(1, status) == -1);   // group 1 is the (\Gabc) alternative: expected -1 for the first match
2170        REGEX_ASSERT(matcher->start(2, status) == 1);    // group 2 is the plain (abc) alternative
2171
2172        REGEX_ASSERT(matcher->find());
2173        REGEX_ASSERT(matcher->start(status) == 4);
2174        REGEX_ASSERT(matcher->start(1, status) == 4);    // second match begins where the first one ended, so group 1 is expected
2175        REGEX_ASSERT(matcher->start(2, status) == -1);
2176        REGEX_CHECK_STATUS;
2177
2178        delete matcher;
2179        delete pat;
2180
2181        utext_close(&input);
2182        utext_close(&re);
2183    }
2184
2185    //
2186    //   find with zero length matches, match position should bump ahead
2187    //     to prevent loops.
2188    //
2189    {
2190        int32_t                 i;
2191        UErrorCode          status=U_ZERO_ERROR;
2192        RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2193                                                      //   using an always-true look-ahead.
2194        REGEX_CHECK_STATUS;
2195        UText s = UTEXT_INITIALIZER;
2196        utext_openUTF8(&s, "    ", -1, &status);
2197        m.reset(&s);
2198        for (i=0; ; i++) {                            // i is the position expected for the next match
2199            if (m.find() == FALSE) {
2200                break;
2201            }
2202            REGEX_ASSERT(m.start(status) == i);
2203            REGEX_ASSERT(m.end(status) == i);
2204        }
2205        REGEX_ASSERT(i==5);                           // five matches (positions 0..4) before find() fails
2206
2207        // Check that the bump goes over characters outside the BMP OK
2208        // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2209        unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2210        utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2211        m.reset(&s);
2212        for (i=0; ; i+=4) {                           // each character above occupies 4 bytes, so positions step by 4
2213            if (m.find() == FALSE) {
2214                break;
2215            }
2216            REGEX_ASSERT(m.start(status) == i);
2217            REGEX_ASSERT(m.end(status) == i);
2218        }
2219        REGEX_ASSERT(i==20);                          // five matches (positions 0, 4, 8, 12, 16) before find() fails
2220
2221        utext_close(&s);
2222    }
2223    {
2224        // find() loop breaking test.
2225        //        with pattern of /.?/, should see a series of one char matches, then a single
2226        //        match of zero length at the end of the input string.
2227        int32_t                 i;
2228        UErrorCode          status=U_ZERO_ERROR;
2229        RegexMatcher        m(".?", 0, status);
2230        REGEX_CHECK_STATUS;
2231        UText s = UTEXT_INITIALIZER;
2232        utext_openUTF8(&s, "    ", -1, &status);
2233        m.reset(&s);
2234        for (i=0; ; i++) {
2235            if (m.find() == FALSE) {
2236                break;
2237            }
2238            REGEX_ASSERT(m.start(status) == i);
2239            REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // one-character matches for i<4, zero-length match at i==4
2240        }
2241        REGEX_ASSERT(i==5);
2242
2243        utext_close(&s);
2244    }
2245
2246
2247    //
2248    // Matchers with no input string behave as if they had an empty input string.
2249    //
2250
2251    {
2252        UErrorCode status = U_ZERO_ERROR;
2253        RegexMatcher  m(".?", 0, status);             // no input is ever set on this matcher
2254        REGEX_CHECK_STATUS;
2255        REGEX_ASSERT(m.find());
2256        REGEX_ASSERT(m.start(status) == 0);
2257        REGEX_ASSERT(m.input() == "");
2258    }
2259    {
2260        UErrorCode status = U_ZERO_ERROR;
2261        RegexPattern  *p = RegexPattern::compile(".", 0, status);
2262        RegexMatcher  *m = p->matcher(status);        // matcher created without input
2263        REGEX_CHECK_STATUS;
2264
2265        REGEX_ASSERT(m->find() == FALSE);
2266        REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2267        delete m;
2268        delete p;
2269    }
2270
2271    //
2272    // Regions, plus the anchoring-bounds and transparent-bounds flags.
2273    // reset() is expected to restore the full-input region but keep both flags.
2274    {
2275        UErrorCode status = U_ZERO_ERROR;
2276        UText testPattern = UTEXT_INITIALIZER;
2277        UText testText    = UTEXT_INITIALIZER;
2278        regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2279        REGEX_VERBOSE_TEXT(&testPattern);
2280        regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2281        REGEX_VERBOSE_TEXT(&testText);
2282
2283        RegexMatcher m(&testPattern, &testText, 0, status);
2284        REGEX_CHECK_STATUS;
2285        REGEX_ASSERT(m.regionStart() == 0);           // initial region is the whole input
2286        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2287        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2288        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2289
2290        m.region(2,4, status);                        // with ".*", the match is expected to span exactly 2..4
2291        REGEX_CHECK_STATUS;
2292        REGEX_ASSERT(m.matches(status));
2293        REGEX_ASSERT(m.start(status)==2);
2294        REGEX_ASSERT(m.end(status)==4);
2295        REGEX_CHECK_STATUS;
2296
2297        m.reset();
2298        REGEX_ASSERT(m.regionStart() == 0);
2299        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2300
2301        regextst_openUTF8FromInvariant(&testText, "short", -1, &status);   // re-open the same UText on new text
2302        REGEX_VERBOSE_TEXT(&testText);
2303        m.reset(&testText);
2304        REGEX_ASSERT(m.regionStart() == 0);
2305        REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2306
2307        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2308        REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));   // the setters and reset() must return the matcher itself
2309        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2310        REGEX_ASSERT(&m == &m.reset());
2311        REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2312
2313        REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2314        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2315        REGEX_ASSERT(&m == &m.reset());
2316        REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2317
2318        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2319        REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2320        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2321        REGEX_ASSERT(&m == &m.reset());
2322        REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2323
2324        REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2325        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2326        REGEX_ASSERT(&m == &m.reset());
2327        REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2328
2329        utext_close(&testText);
2330        utext_close(&testPattern);
2331    }
2332
2333    //
2334    // hitEnd() and requireEnd(), checked after lookingAt() on the text "aabb"
2335    // for the three patterns ".*", "a*" and ".*$".
2336    {
2337        UErrorCode status = U_ZERO_ERROR;
2338        UText testPattern = UTEXT_INITIALIZER;
2339        UText testText    = UTEXT_INITIALIZER;
2340        const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2341        const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2342        utext_openUTF8(&testPattern, str_, -1, &status);
2343        utext_openUTF8(&testText, str_aabb, -1, &status);
2344
2345        RegexMatcher m1(&testPattern, &testText,  0, status);
2346        REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2347        REGEX_ASSERT(m1.hitEnd() == TRUE);
2348        REGEX_ASSERT(m1.requireEnd() == FALSE);
2349        REGEX_CHECK_STATUS;
2350
2351        status = U_ZERO_ERROR;
2352        const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2353        utext_openUTF8(&testPattern, str_a, -1, &status);   // re-open the same UText on the next pattern
2354        RegexMatcher m2(&testPattern, &testText, 0, status);
2355        REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2356        REGEX_ASSERT(m2.hitEnd() == FALSE);
2357        REGEX_ASSERT(m2.requireEnd() == FALSE);
2358        REGEX_CHECK_STATUS;
2359
2360        status = U_ZERO_ERROR;
2361        const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2362        utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2363        RegexMatcher m3(&testPattern, &testText, 0, status);
2364        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2365        REGEX_ASSERT(m3.hitEnd() == TRUE);
2366        REGEX_ASSERT(m3.requireEnd() == TRUE);
2367        REGEX_CHECK_STATUS;
2368
2369        utext_close(&testText);
2370        utext_close(&testPattern);
2371    }
2372}
2373
2374
2375//---------------------------------------------------------------------------
2376//
2377//      API_Replace_UTF8   API test for class RegexMatcher, testing the
2378//                         Replace family of functions.
2379//
2380//---------------------------------------------------------------------------
2381void RegexTest::API_Replace_UTF8() {
2382    //
2383    //  Replace
2384    //
2385    int32_t             flags=0;
2386    UParseError         pe;
2387    UErrorCode          status=U_ZERO_ERROR;
2388
2389    UText               re=UTEXT_INITIALIZER;
2390    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2391    REGEX_VERBOSE_TEXT(&re);
2392    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2393    REGEX_CHECK_STATUS;
2394
2395    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2396    //             012345678901234567
2397    UText dataText = UTEXT_INITIALIZER;
2398    utext_openUTF8(&dataText, data, -1, &status);
2399    REGEX_CHECK_STATUS;
2400    REGEX_VERBOSE_TEXT(&dataText);
2401    RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2402
2403    //
2404    //  Plain vanilla matches.
2405    //
2406    UnicodeString  dest;
2407    UText destText = UTEXT_INITIALIZER;
2408    utext_openUnicodeString(&destText, &dest, &status);
2409    UText *result;
2410
2411    UText replText = UTEXT_INITIALIZER;
2412
2413    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2414    utext_openUTF8(&replText, str_yz, -1, &status);
2415    REGEX_VERBOSE_TEXT(&replText);
2416    result = matcher->replaceFirst(&replText, NULL, status);
2417    REGEX_CHECK_STATUS;
2418    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2419    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2420    utext_close(result);
2421    result = matcher->replaceFirst(&replText, &destText, status);
2422    REGEX_CHECK_STATUS;
2423    REGEX_ASSERT(result == &destText);
2424    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2425
2426    result = matcher->replaceAll(&replText, NULL, status);
2427    REGEX_CHECK_STATUS;
2428    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2429    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2430    utext_close(result);
2431
2432    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2433    result = matcher->replaceAll(&replText, &destText, status);
2434    REGEX_CHECK_STATUS;
2435    REGEX_ASSERT(result == &destText);
2436    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2437
2438    //
2439    //  Plain vanilla non-matches.
2440    //
2441    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2442    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2443    matcher->reset(&dataText);
2444
2445    result = matcher->replaceFirst(&replText, NULL, status);
2446    REGEX_CHECK_STATUS;
2447    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2448    utext_close(result);
2449    result = matcher->replaceFirst(&replText, &destText, status);
2450    REGEX_CHECK_STATUS;
2451    REGEX_ASSERT(result == &destText);
2452    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2453
2454    result = matcher->replaceAll(&replText, NULL, status);
2455    REGEX_CHECK_STATUS;
2456    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2457    utext_close(result);
2458    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2459    result = matcher->replaceAll(&replText, &destText, status);
2460    REGEX_CHECK_STATUS;
2461    REGEX_ASSERT(result == &destText);
2462    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2463
2464    //
2465    // Empty source string
2466    //
2467    utext_openUTF8(&dataText, NULL, 0, &status);
2468    matcher->reset(&dataText);
2469
2470    result = matcher->replaceFirst(&replText, NULL, status);
2471    REGEX_CHECK_STATUS;
2472    REGEX_ASSERT_UTEXT_UTF8("", result);
2473    utext_close(result);
2474    result = matcher->replaceFirst(&replText, &destText, status);
2475    REGEX_CHECK_STATUS;
2476    REGEX_ASSERT(result == &destText);
2477    REGEX_ASSERT_UTEXT_UTF8("", result);
2478
2479    result = matcher->replaceAll(&replText, NULL, status);
2480    REGEX_CHECK_STATUS;
2481    REGEX_ASSERT_UTEXT_UTF8("", result);
2482    utext_close(result);
2483    result = matcher->replaceAll(&replText, &destText, status);
2484    REGEX_CHECK_STATUS;
2485    REGEX_ASSERT(result == &destText);
2486    REGEX_ASSERT_UTEXT_UTF8("", result);
2487
2488    //
2489    // Empty substitution string
2490    //
2491    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2492    matcher->reset(&dataText);
2493
2494    utext_openUTF8(&replText, NULL, 0, &status);
2495    result = matcher->replaceFirst(&replText, NULL, status);
2496    REGEX_CHECK_STATUS;
2497    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2498    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2499    utext_close(result);
2500    result = matcher->replaceFirst(&replText, &destText, status);
2501    REGEX_CHECK_STATUS;
2502    REGEX_ASSERT(result == &destText);
2503    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2504
2505    result = matcher->replaceAll(&replText, NULL, status);
2506    REGEX_CHECK_STATUS;
2507    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2508    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2509    utext_close(result);
2510    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2511    result = matcher->replaceAll(&replText, &destText, status);
2512    REGEX_CHECK_STATUS;
2513    REGEX_ASSERT(result == &destText);
2514    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2515
2516    //
2517    // match whole string
2518    //
2519    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2520    utext_openUTF8(&dataText, str_abc, -1, &status);
2521    matcher->reset(&dataText);
2522
2523    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2524    utext_openUTF8(&replText, str_xyz, -1, &status);
2525    result = matcher->replaceFirst(&replText, NULL, status);
2526    REGEX_CHECK_STATUS;
2527    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2528    utext_close(result);
2529    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2530    result = matcher->replaceFirst(&replText, &destText, status);
2531    REGEX_CHECK_STATUS;
2532    REGEX_ASSERT(result == &destText);
2533    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2534
2535    result = matcher->replaceAll(&replText, NULL, status);
2536    REGEX_CHECK_STATUS;
2537    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2538    utext_close(result);
2539    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2540    result = matcher->replaceAll(&replText, &destText, status);
2541    REGEX_CHECK_STATUS;
2542    REGEX_ASSERT(result == &destText);
2543    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2544
2545    //
2546    // Capture Group, simple case
2547    //
2548    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2549    utext_openUTF8(&re, str_add, -1, &status);
2550    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2551    REGEX_CHECK_STATUS;
2552
2553    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2554    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2555    RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2556    REGEX_CHECK_STATUS;
2557
2558    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2559    utext_openUTF8(&replText, str_11, -1, &status);
2560    result = matcher2->replaceFirst(&replText, NULL, status);
2561    REGEX_CHECK_STATUS;
2562    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2563    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2564    utext_close(result);
2565    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2566    result = matcher2->replaceFirst(&replText, &destText, status);
2567    REGEX_CHECK_STATUS;
2568    REGEX_ASSERT(result == &destText);
2569    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2570
2571    const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2572    utext_openUTF8(&replText, str_v, -1, &status);
2573    REGEX_VERBOSE_TEXT(&replText);
2574    result = matcher2->replaceFirst(&replText, NULL, status);
2575    REGEX_CHECK_STATUS;
2576    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2577    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2578    utext_close(result);
2579    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2580    result = matcher2->replaceFirst(&replText, &destText, status);
2581    REGEX_CHECK_STATUS;
2582    REGEX_ASSERT(result == &destText);
2583    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2584
2585    const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2586    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2587    result = matcher2->replaceFirst(&replText, NULL, status);
2588    REGEX_CHECK_STATUS;
2589    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2590    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2591    utext_close(result);
2592    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2593    result = matcher2->replaceFirst(&replText, &destText, status);
2594    REGEX_CHECK_STATUS;
2595    REGEX_ASSERT(result == &destText);
2596    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2597
2598    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2599    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2600    //                                 012345678901234567890123456
2601    supplDigitChars[22] = 0xF0;
2602    supplDigitChars[23] = 0x9D;
2603    supplDigitChars[24] = 0x9F;
2604    supplDigitChars[25] = 0x8F;
2605    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2606
2607    result = matcher2->replaceFirst(&replText, NULL, status);
2608    REGEX_CHECK_STATUS;
2609    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2610    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2611    utext_close(result);
2612    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2613    result = matcher2->replaceFirst(&replText, &destText, status);
2614    REGEX_CHECK_STATUS;
2615    REGEX_ASSERT(result == &destText);
2616    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2617    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2618    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2619    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2620//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2621    utext_close(result);
2622    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2623    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2624    REGEX_ASSERT(result == &destText);
2625//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2626
2627    //
2628    // Replacement String with \u hex escapes
2629    //
2630    {
2631      const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2632      const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2633        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2634        utext_openUTF8(&replText, str_u0043, -1, &status);
2635        matcher->reset(&dataText);
2636
2637        result = matcher->replaceAll(&replText, NULL, status);
2638        REGEX_CHECK_STATUS;
2639        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2640        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2641        utext_close(result);
2642        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2643        result = matcher->replaceAll(&replText, &destText, status);
2644        REGEX_CHECK_STATUS;
2645        REGEX_ASSERT(result == &destText);
2646        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2647    }
2648    {
2649      const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2650        utext_openUTF8(&dataText, str_abc, -1, &status);
2651        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2652        utext_openUTF8(&replText, str_U00010000, -1, &status);
2653        matcher->reset(&dataText);
2654
2655        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2656        //                          0123456789
2657        expected[2] = 0xF0;
2658        expected[3] = 0x90;
2659        expected[4] = 0x80;
2660        expected[5] = 0x80;
2661
2662        result = matcher->replaceAll(&replText, NULL, status);
2663        REGEX_CHECK_STATUS;
2664        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2665        utext_close(result);
2666        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2667        result = matcher->replaceAll(&replText, &destText, status);
2668        REGEX_CHECK_STATUS;
2669        REGEX_ASSERT(result == &destText);
2670        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2671    }
2672    // TODO:  need more through testing of capture substitutions.
2673
2674    // Bug 4057
2675    //
2676    {
2677        status = U_ZERO_ERROR;
2678const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2679const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2680const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2681        utext_openUTF8(&re, str_ssee, -1, &status);
2682        utext_openUTF8(&dataText, str_blah, -1, &status);
2683        utext_openUTF8(&replText, str_ooh, -1, &status);
2684
2685        RegexMatcher m(&re, 0, status);
2686        REGEX_CHECK_STATUS;
2687
2688        UnicodeString result;
2689        UText resultText = UTEXT_INITIALIZER;
2690        utext_openUnicodeString(&resultText, &result, &status);
2691
2692        // Multiple finds do NOT bump up the previous appendReplacement postion.
2693        m.reset(&dataText);
2694        m.find();
2695        m.find();
2696        m.appendReplacement(&resultText, &replText, status);
2697        REGEX_CHECK_STATUS;
2698        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2699        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2700
2701        // After a reset into the interior of a string, appendReplacement still starts at beginning.
2702        status = U_ZERO_ERROR;
2703        result.truncate(0);
2704        utext_openUnicodeString(&resultText, &result, &status);
2705        m.reset(10, status);
2706        m.find();
2707        m.find();
2708        m.appendReplacement(&resultText, &replText, status);
2709        REGEX_CHECK_STATUS;
2710        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2711        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2712
2713        // find() at interior of string, appendReplacement still starts at beginning.
2714        status = U_ZERO_ERROR;
2715        result.truncate(0);
2716        utext_openUnicodeString(&resultText, &result, &status);
2717        m.reset();
2718        m.find(10, status);
2719        m.find();
2720        m.appendReplacement(&resultText, &replText, status);
2721        REGEX_CHECK_STATUS;
2722        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2723        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2724
2725        m.appendTail(&resultText, status);
2726        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2727        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2728
2729        utext_close(&resultText);
2730    }
2731
2732    delete matcher2;
2733    delete pat2;
2734    delete matcher;
2735    delete pat;
2736
2737    utext_close(&dataText);
2738    utext_close(&replText);
2739    utext_close(&destText);
2740    utext_close(&re);
2741}
2742
2743
2744//---------------------------------------------------------------------------
2745//
2746//      API_Pattern_UTF8  Test that the API for class RegexPattern is
2747//                        present and nominally working.
2748//
2749//---------------------------------------------------------------------------
2750void RegexTest::API_Pattern_UTF8() {
2751    RegexPattern        pata;    // Test default constructor to not crash.
2752    RegexPattern        patb;
2753
2754    REGEX_ASSERT(pata == patb);
2755    REGEX_ASSERT(pata == pata);
2756
2757    UText         re1 = UTEXT_INITIALIZER;
2758    UText         re2 = UTEXT_INITIALIZER;
2759    UErrorCode    status = U_ZERO_ERROR;
2760    UParseError   pe;
2761
2762    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2763    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2764    utext_openUTF8(&re1, str_abcalmz, -1, &status);
2765    utext_openUTF8(&re2, str_def, -1, &status);
2766
2767    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2768    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2769    REGEX_CHECK_STATUS;
2770    REGEX_ASSERT(*pat1 == *pat1);
2771    REGEX_ASSERT(*pat1 != pata);
2772
2773    // Assign
2774    patb = *pat1;
2775    REGEX_ASSERT(patb == *pat1);
2776
2777    // Copy Construct
2778    RegexPattern patc(*pat1);
2779    REGEX_ASSERT(patc == *pat1);
2780    REGEX_ASSERT(patb == patc);
2781    REGEX_ASSERT(pat1 != pat2);
2782    patb = *pat2;
2783    REGEX_ASSERT(patb != patc);
2784    REGEX_ASSERT(patb == *pat2);
2785
2786    // Compile with no flags.
2787    RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2788    REGEX_ASSERT(*pat1a == *pat1);
2789
2790    REGEX_ASSERT(pat1a->flags() == 0);
2791
2792    // Compile with different flags should be not equal
2793    RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2794    REGEX_CHECK_STATUS;
2795
2796    REGEX_ASSERT(*pat1b != *pat1a);
2797    REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2798    REGEX_ASSERT(pat1a->flags() == 0);
2799    delete pat1b;
2800
2801    // clone
2802    RegexPattern *pat1c = pat1->clone();
2803    REGEX_ASSERT(*pat1c == *pat1);
2804    REGEX_ASSERT(*pat1c != *pat2);
2805
2806    delete pat1c;
2807    delete pat1a;
2808    delete pat1;
2809    delete pat2;
2810
2811    utext_close(&re1);
2812    utext_close(&re2);
2813
2814
2815    //
2816    //   Verify that a matcher created from a cloned pattern works.
2817    //     (Jitterbug 3423)
2818    //
2819    {
2820        UErrorCode     status     = U_ZERO_ERROR;
2821        UText          pattern    = UTEXT_INITIALIZER;
2822        const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2823        utext_openUTF8(&pattern, str_pL, -1, &status);
2824
2825        RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2826        RegexPattern  *pClone     = pSource->clone();
2827        delete         pSource;
2828        RegexMatcher  *mFromClone = pClone->matcher(status);
2829        REGEX_CHECK_STATUS;
2830
2831        UText          input      = UTEXT_INITIALIZER;
2832        const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2833        utext_openUTF8(&input, str_HelloWorld, -1, &status);
2834        mFromClone->reset(&input);
2835        REGEX_ASSERT(mFromClone->find() == TRUE);
2836        REGEX_ASSERT(mFromClone->group(status) == "Hello");
2837        REGEX_ASSERT(mFromClone->find() == TRUE);
2838        REGEX_ASSERT(mFromClone->group(status) == "World");
2839        REGEX_ASSERT(mFromClone->find() == FALSE);
2840        delete mFromClone;
2841        delete pClone;
2842
2843        utext_close(&input);
2844        utext_close(&pattern);
2845    }
2846
2847    //
2848    //   matches convenience API
2849    //
2850    {
2851        UErrorCode status  = U_ZERO_ERROR;
2852        UText      pattern = UTEXT_INITIALIZER;
2853        UText      input   = UTEXT_INITIALIZER;
2854
2855        const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2856        utext_openUTF8(&input, str_randominput, -1, &status);
2857
2858        const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2859        utext_openUTF8(&pattern, str_dotstar, -1, &status);
2860        REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2861        REGEX_CHECK_STATUS;
2862
2863        const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2864        utext_openUTF8(&pattern, str_abc, -1, &status);
2865        REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2866        REGEX_CHECK_STATUS;
2867
2868        const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2869        utext_openUTF8(&pattern, str_nput, -1, &status);
2870        REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2871        REGEX_CHECK_STATUS;
2872
2873        utext_openUTF8(&pattern, str_randominput, -1, &status);
2874        REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2875        REGEX_CHECK_STATUS;
2876
2877        const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2878        utext_openUTF8(&pattern, str_u, -1, &status);
2879        REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2880        REGEX_CHECK_STATUS;
2881
2882        utext_openUTF8(&input, str_abc, -1, &status);
2883        utext_openUTF8(&pattern, str_abc, -1, &status);
2884        status = U_INDEX_OUTOFBOUNDS_ERROR;
2885        REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2886        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2887
2888        utext_close(&input);
2889        utext_close(&pattern);
2890    }
2891
2892
2893    //
2894    // Split()
2895    //
2896    status = U_ZERO_ERROR;
2897    const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2898    utext_openUTF8(&re1, str_spaceplus, -1, &status);
2899    pat1 = RegexPattern::compile(&re1, pe, status);
2900    REGEX_CHECK_STATUS;
2901    UnicodeString  fields[10];
2902
2903    int32_t n;
2904    n = pat1->split("Now is the time", fields, 10, status);
2905    REGEX_CHECK_STATUS;
2906    REGEX_ASSERT(n==4);
2907    REGEX_ASSERT(fields[0]=="Now");
2908    REGEX_ASSERT(fields[1]=="is");
2909    REGEX_ASSERT(fields[2]=="the");
2910    REGEX_ASSERT(fields[3]=="time");
2911    REGEX_ASSERT(fields[4]=="");
2912
2913    n = pat1->split("Now is the time", fields, 2, status);
2914    REGEX_CHECK_STATUS;
2915    REGEX_ASSERT(n==2);
2916    REGEX_ASSERT(fields[0]=="Now");
2917    REGEX_ASSERT(fields[1]=="is the time");
2918    REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2919
2920    fields[1] = "*";
2921    status = U_ZERO_ERROR;
2922    n = pat1->split("Now is the time", fields, 1, status);
2923    REGEX_CHECK_STATUS;
2924    REGEX_ASSERT(n==1);
2925    REGEX_ASSERT(fields[0]=="Now is the time");
2926    REGEX_ASSERT(fields[1]=="*");
2927    status = U_ZERO_ERROR;
2928
2929    n = pat1->split("    Now       is the time   ", fields, 10, status);
2930    REGEX_CHECK_STATUS;
2931    REGEX_ASSERT(n==6);
2932    REGEX_ASSERT(fields[0]=="");
2933    REGEX_ASSERT(fields[1]=="Now");
2934    REGEX_ASSERT(fields[2]=="is");
2935    REGEX_ASSERT(fields[3]=="the");
2936    REGEX_ASSERT(fields[4]=="time");
2937    REGEX_ASSERT(fields[5]=="");
2938    REGEX_ASSERT(fields[6]=="");
2939
2940    fields[2] = "*";
2941    n = pat1->split("     ", fields, 10, status);
2942    REGEX_CHECK_STATUS;
2943    REGEX_ASSERT(n==2);
2944    REGEX_ASSERT(fields[0]=="");
2945    REGEX_ASSERT(fields[1]=="");
2946    REGEX_ASSERT(fields[2]=="*");
2947
2948    fields[0] = "foo";
2949    n = pat1->split("", fields, 10, status);
2950    REGEX_CHECK_STATUS;
2951    REGEX_ASSERT(n==0);
2952    REGEX_ASSERT(fields[0]=="foo");
2953
2954    delete pat1;
2955
2956    //  split, with a pattern with (capture)
2957    regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2958    pat1 = RegexPattern::compile(&re1,  pe, status);
2959    REGEX_CHECK_STATUS;
2960
2961    status = U_ZERO_ERROR;
2962    fields[6] = fields[7] = "*";
2963    n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2964    REGEX_CHECK_STATUS;
2965    REGEX_ASSERT(n==7);
2966    REGEX_ASSERT(fields[0]=="");
2967    REGEX_ASSERT(fields[1]=="a");
2968    REGEX_ASSERT(fields[2]=="Now is ");
2969    REGEX_ASSERT(fields[3]=="b");
2970    REGEX_ASSERT(fields[4]=="the time");
2971    REGEX_ASSERT(fields[5]=="c");
2972    REGEX_ASSERT(fields[6]=="");
2973    REGEX_ASSERT(fields[7]=="*");
2974    REGEX_ASSERT(status==U_ZERO_ERROR);
2975
2976    fields[6] = fields[7] = "*";
2977    n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2978    REGEX_CHECK_STATUS;
2979    REGEX_ASSERT(n==7);
2980    REGEX_ASSERT(fields[0]=="  ");
2981    REGEX_ASSERT(fields[1]=="a");
2982    REGEX_ASSERT(fields[2]=="Now is ");
2983    REGEX_ASSERT(fields[3]=="b");
2984    REGEX_ASSERT(fields[4]=="the time");
2985    REGEX_ASSERT(fields[5]=="c");
2986    REGEX_ASSERT(fields[6]=="");
2987    REGEX_ASSERT(fields[7]=="*");
2988
2989    status = U_ZERO_ERROR;
2990    fields[6] = "foo";
2991    n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2992    REGEX_CHECK_STATUS;
2993    REGEX_ASSERT(n==6);
2994    REGEX_ASSERT(fields[0]=="  ");
2995    REGEX_ASSERT(fields[1]=="a");
2996    REGEX_ASSERT(fields[2]=="Now is ");
2997    REGEX_ASSERT(fields[3]=="b");
2998    REGEX_ASSERT(fields[4]=="the time");
2999    REGEX_ASSERT(fields[5]==" ");
3000    REGEX_ASSERT(fields[6]=="foo");
3001
3002    status = U_ZERO_ERROR;
3003    fields[5] = "foo";
3004    n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3005    REGEX_CHECK_STATUS;
3006    REGEX_ASSERT(n==5);
3007    REGEX_ASSERT(fields[0]=="  ");
3008    REGEX_ASSERT(fields[1]=="a");
3009    REGEX_ASSERT(fields[2]=="Now is ");
3010    REGEX_ASSERT(fields[3]=="b");
3011    REGEX_ASSERT(fields[4]=="the time<c>");
3012    REGEX_ASSERT(fields[5]=="foo");
3013
3014    status = U_ZERO_ERROR;
3015    fields[5] = "foo";
3016    n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3017    REGEX_CHECK_STATUS;
3018    REGEX_ASSERT(n==5);
3019    REGEX_ASSERT(fields[0]=="  ");
3020    REGEX_ASSERT(fields[1]=="a");
3021    REGEX_ASSERT(fields[2]=="Now is ");
3022    REGEX_ASSERT(fields[3]=="b");
3023    REGEX_ASSERT(fields[4]=="the time");
3024    REGEX_ASSERT(fields[5]=="foo");
3025
3026    status = U_ZERO_ERROR;
3027    n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3028    REGEX_CHECK_STATUS;
3029    REGEX_ASSERT(n==4);
3030    REGEX_ASSERT(fields[0]=="  ");
3031    REGEX_ASSERT(fields[1]=="a");
3032    REGEX_ASSERT(fields[2]=="Now is ");
3033    REGEX_ASSERT(fields[3]=="the time<c>");
3034    status = U_ZERO_ERROR;
3035    delete pat1;
3036
3037    regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3038    pat1 = RegexPattern::compile(&re1, pe, status);
3039    REGEX_CHECK_STATUS;
3040    n = pat1->split("1-10,20", fields, 10, status);
3041    REGEX_CHECK_STATUS;
3042    REGEX_ASSERT(n==5);
3043    REGEX_ASSERT(fields[0]=="1");
3044    REGEX_ASSERT(fields[1]=="-");
3045    REGEX_ASSERT(fields[2]=="10");
3046    REGEX_ASSERT(fields[3]==",");
3047    REGEX_ASSERT(fields[4]=="20");
3048    delete pat1;
3049
3050
3051    //
3052    // RegexPattern::pattern() and patternText()
3053    //
3054    pat1 = new RegexPattern();
3055    REGEX_ASSERT(pat1->pattern() == "");
3056    REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3057    delete pat1;
3058    const char *helloWorldInvariant = "(Hello, world)*";
3059    regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3060    pat1 = RegexPattern::compile(&re1, pe, status);
3061    REGEX_CHECK_STATUS;
3062    REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
3063    REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3064    delete pat1;
3065
3066    utext_close(&re1);
3067}
3068
3069
3070//---------------------------------------------------------------------------
3071//
3072//      Extended       A more thorough check for features of regex patterns
3073//                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3075//                     A description of the test data format is included in that file.
3076//
3077//---------------------------------------------------------------------------
3078
3079const char *
3080RegexTest::getPath(char buffer[2048], const char *filename) {
3081    UErrorCode status=U_ZERO_ERROR;
3082    const char *testDataDirectory = IntlTest::getSourceTestData(status);
3083    if (U_FAILURE(status)) {
3084        errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3085        return NULL;
3086    }
3087
3088    strcpy(buffer, testDataDirectory);
3089    strcat(buffer, filename);
3090    return buffer;
3091}
3092
3093void RegexTest::Extended() {
3094    char tdd[2048];
3095    const char *srcPath;
3096    UErrorCode  status  = U_ZERO_ERROR;
3097    int32_t     lineNum = 0;
3098
3099    //
3100    //  Open and read the test data file.
3101    //
3102    srcPath=getPath(tdd, "regextst.txt");
3103    if(srcPath==NULL) {
3104        return; /* something went wrong, error already output */
3105    }
3106
3107    int32_t    len;
3108    UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3109    if (U_FAILURE(status)) {
3110        return; /* something went wrong, error already output */
3111    }
3112
3113    //
3114    //  Put the test data into a UnicodeString
3115    //
3116    UnicodeString testString(FALSE, testData, len);
3117
3118    RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3119    RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3120    RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3121
3122    RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3123    UnicodeString   testPattern;   // The pattern for test from the test file.
3124    UnicodeString   testFlags;     // the flags   for a test.
3125    UnicodeString   matchString;   // The marked up string to be used as input
3126
3127    if (U_FAILURE(status)){
3128        dataerrln("Construct RegexMatcher() error.");
3129        delete [] testData;
3130        return;
3131    }
3132
3133    //
3134    //  Loop over the test data file, once per line.
3135    //
3136    while (lineMat.find()) {
3137        lineNum++;
3138        if (U_FAILURE(status)) {
3139          errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3140        }
3141
3142        status = U_ZERO_ERROR;
3143        UnicodeString testLine = lineMat.group(1, status);
3144        if (testLine.length() == 0) {
3145            continue;
3146        }
3147
3148        //
3149        // Parse the test line.  Skip blank and comment only lines.
3150        // Separate out the three main fields - pattern, flags, target.
3151        //
3152
3153        commentMat.reset(testLine);
3154        if (commentMat.lookingAt(status)) {
3155            // This line is a comment, or blank.
3156            continue;
3157        }
3158
3159        //
3160        //  Pull out the pattern field, remove it from the test file line.
3161        //
3162        quotedStuffMat.reset(testLine);
3163        if (quotedStuffMat.lookingAt(status)) {
3164            testPattern = quotedStuffMat.group(2, status);
3165            testLine.remove(0, quotedStuffMat.end(0, status));
3166        } else {
3167            errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3168            continue;
3169        }
3170
3171
3172        //
3173        //  Pull out the flags from the test file line.
3174        //
3175        flagsMat.reset(testLine);
3176        flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3177        testFlags = flagsMat.group(1, status);
3178        if (flagsMat.group(2, status).length() > 0) {
3179            errln("Bad Match flag at line %d. Scanning %c\n",
3180                lineNum, flagsMat.group(2, status).charAt(0));
3181            continue;
3182        }
3183        testLine.remove(0, flagsMat.end(0, status));
3184
3185        //
3186        //  Pull out the match string, as a whole.
3187        //    We'll process the <tags> later.
3188        //
3189        quotedStuffMat.reset(testLine);
3190        if (quotedStuffMat.lookingAt(status)) {
3191            matchString = quotedStuffMat.group(2, status);
3192            testLine.remove(0, quotedStuffMat.end(0, status));
3193        } else {
3194            errln("Bad match string at test file line %d", lineNum);
3195            continue;
3196        }
3197
3198        //
3199        //  The only thing left from the input line should be an optional trailing comment.
3200        //
3201        commentMat.reset(testLine);
3202        if (commentMat.lookingAt(status) == FALSE) {
3203            errln("Line %d: unexpected characters at end of test line.", lineNum);
3204            continue;
3205        }
3206
3207        //
3208        //  Run the test
3209        //
3210        regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3211    }
3212
3213    delete [] testData;
3214
3215}
3216
3217
3218
//---------------------------------------------------------------------------
//
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the name of the test data file, used in error messages.
//         lineNumber is the source line in regextst.txt of the test.
//
//---------------------------------------------------------------------------
3229
3230
3231//  Set a value into a UVector at position specified by a decimal number in
3232//   a UnicodeString.   This is a utility function needed by the actual test function,
3233//   which follows.
3234static void set(UVector &vec, int32_t val, UnicodeString index) {
3235    UErrorCode  status=U_ZERO_ERROR;
3236    int32_t  idx = 0;
3237    for (int32_t i=0; i<index.length(); i++) {
3238        int32_t d=u_charDigitValue(index.charAt(i));
3239        if (d<0) {return;}
3240        idx = idx*10 + d;
3241    }
3242    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3243    vec.setElementAt(val, idx);
3244}
3245
3246static void setInt(UVector &vec, int32_t val, int32_t idx) {
3247    UErrorCode  status=U_ZERO_ERROR;
3248    while (vec.size()<idx+1) {vec.addElement(-1, status);}
3249    vec.setElementAt(val, idx);
3250}
3251
3252static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3253{
3254    UBool couldFind = TRUE;
3255    UTEXT_SETNATIVEINDEX(utext, 0);
3256    int32_t i = 0;
3257    while (i < unistrOffset) {
3258        UChar32 c = UTEXT_NEXT32(utext);
3259        if (c != U_SENTINEL) {
3260            i += U16_LENGTH(c);
3261        } else {
3262            couldFind = FALSE;
3263            break;
3264        }
3265    }
3266    nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3267    return couldFind;
3268}
3269
3270
3271void RegexTest::regex_find(const UnicodeString &pattern,
3272                           const UnicodeString &flags,
3273                           const UnicodeString &inputString,
3274                           const char *srcPath,
3275                           int32_t line) {
3276    UnicodeString       unEscapedInput;
3277    UnicodeString       deTaggedInput;
3278
3279    int32_t             patternUTF8Length,      inputUTF8Length;
3280    char                *patternChars  = NULL, *inputChars = NULL;
3281    UText               patternText    = UTEXT_INITIALIZER;
3282    UText               inputText      = UTEXT_INITIALIZER;
3283    UConverter          *UTF8Converter = NULL;
3284
3285    UErrorCode          status         = U_ZERO_ERROR;
3286    UParseError         pe;
3287    RegexPattern        *parsePat      = NULL;
3288    RegexMatcher        *parseMatcher  = NULL;
3289    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3290    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3291    UVector             groupStarts(status);
3292    UVector             groupEnds(status);
3293    UVector             groupStartsUTF8(status);
3294    UVector             groupEndsUTF8(status);
3295    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3296    UBool               failed         = FALSE;
3297    int32_t             numFinds;
3298    int32_t             i;
3299    UBool               useMatchesFunc   = FALSE;
3300    UBool               useLookingAtFunc = FALSE;
3301    int32_t             regionStart      = -1;
3302    int32_t             regionEnd        = -1;
3303    int32_t             regionStartUTF8  = -1;
3304    int32_t             regionEndUTF8    = -1;
3305
3306
3307    //
3308    //  Compile the caller's pattern
3309    //
3310    uint32_t bflags = 0;
3311    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3312        bflags |= UREGEX_CASE_INSENSITIVE;
3313    }
3314    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3315        bflags |= UREGEX_COMMENTS;
3316    }
3317    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3318        bflags |= UREGEX_DOTALL;
3319    }
3320    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3321        bflags |= UREGEX_MULTILINE;
3322    }
3323
3324    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3325        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3326    }
3327    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3328        bflags |= UREGEX_UNIX_LINES;
3329    }
3330
3331
3332    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3333    if (status != U_ZERO_ERROR) {
3334        #if UCONFIG_NO_BREAK_ITERATION==1
3335        // 'v' test flag means that the test pattern should not compile if ICU was configured
3336        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3337        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3338            goto cleanupAndReturn;
3339        }
3340        #endif
3341        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3342            // Expected pattern compilation error.
3343            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3344                logln("Pattern Compile returns \"%s\"", u_errorName(status));
3345            }
3346            goto cleanupAndReturn;
3347        } else {
3348            // Unexpected pattern compilation error.
3349            dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3350            goto cleanupAndReturn;
3351        }
3352    }
3353
3354    UTF8Converter = ucnv_open("UTF8", &status);
3355    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3356
3357    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3358    status = U_ZERO_ERROR; // buffer overflow
3359    patternChars = new char[patternUTF8Length+1];
3360    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3361    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3362
3363    if (status == U_ZERO_ERROR) {
3364        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3365
3366        if (status != U_ZERO_ERROR) {
3367#if UCONFIG_NO_BREAK_ITERATION==1
3368            // 'v' test flag means that the test pattern should not compile if ICU was configured
3369            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3370            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3371                goto cleanupAndReturn;
3372            }
3373#endif
3374            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3375                // Expected pattern compilation error.
3376                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3377                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3378                }
3379                goto cleanupAndReturn;
3380            } else {
3381                // Unexpected pattern compilation error.
3382                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3383                goto cleanupAndReturn;
3384            }
3385        }
3386    }
3387
3388    if (UTF8Pattern == NULL) {
3389        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3390        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3391        status = U_ZERO_ERROR;
3392    }
3393
3394    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3395        RegexPatternDump(callerPattern);
3396    }
3397
3398    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3399        errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3400        goto cleanupAndReturn;
3401    }
3402
3403
3404    //
3405    // Number of times find() should be called on the test string, default to 1
3406    //
3407    numFinds = 1;
3408    for (i=2; i<=9; i++) {
3409        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3410            if (numFinds != 1) {
3411                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3412                goto cleanupAndReturn;
3413            }
3414            numFinds = i;
3415        }
3416    }
3417
3418    // 'M' flag.  Use matches() instead of find()
3419    if (flags.indexOf((UChar)0x4d) >= 0) {
3420        useMatchesFunc = TRUE;
3421    }
3422    if (flags.indexOf((UChar)0x4c) >= 0) {
3423        useLookingAtFunc = TRUE;
3424    }
3425
3426    //
3427    //  Find the tags in the input data, remove them, and record the group boundary
3428    //    positions.
3429    //
3430    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3431    REGEX_CHECK_STATUS_L(line);
3432
3433    unEscapedInput = inputString.unescape();
3434    parseMatcher = parsePat->matcher(unEscapedInput, status);
3435    REGEX_CHECK_STATUS_L(line);
3436    while(parseMatcher->find()) {
3437        parseMatcher->appendReplacement(deTaggedInput, "", status);
3438        REGEX_CHECK_STATUS;
3439        UnicodeString groupNum = parseMatcher->group(2, status);
3440        if (groupNum == "r") {
3441            // <r> or </r>, a region specification within the string
3442            if (parseMatcher->group(1, status) == "/") {
3443                regionEnd = deTaggedInput.length();
3444            } else {
3445                regionStart = deTaggedInput.length();
3446            }
3447        } else {
3448            // <digits> or </digits>, a group match boundary tag.
3449            if (parseMatcher->group(1, status) == "/") {
3450                set(groupEnds, deTaggedInput.length(), groupNum);
3451            } else {
3452                set(groupStarts, deTaggedInput.length(), groupNum);
3453            }
3454        }
3455    }
3456    parseMatcher->appendTail(deTaggedInput);
3457    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3458    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3459      errln("mismatched <r> tags");
3460      failed = TRUE;
3461      goto cleanupAndReturn;
3462    }
3463
3464    //
3465    //  Configure the matcher according to the flags specified with this test.
3466    //
3467    matcher = callerPattern->matcher(deTaggedInput, status);
3468    REGEX_CHECK_STATUS_L(line);
3469    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3470        matcher->setTrace(TRUE);
3471    }
3472
3473    if (UTF8Pattern != NULL) {
3474        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3475        status = U_ZERO_ERROR; // buffer overflow
3476        inputChars = new char[inputUTF8Length+1];
3477        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3478        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3479
3480        if (status == U_ZERO_ERROR) {
3481            UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3482            REGEX_CHECK_STATUS_L(line);
3483        }
3484
3485        if (UTF8Matcher == NULL) {
3486            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3487          logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3488            status = U_ZERO_ERROR;
3489        }
3490    }
3491
3492    //
3493    //  Generate native indices for UTF8 versions of region and capture group info
3494    //
3495    if (UTF8Matcher != NULL) {
3496        if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3497        if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3498
3499        //  Fill out the native index UVector info.
3500        //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3501        for (i=0; i<groupStarts.size(); i++) {
3502            int32_t  start = groupStarts.elementAti(i);
3503            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3504            if (start >= 0) {
3505                int32_t  startUTF8;
3506                if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3507                    errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3508                    failed = TRUE;
3509                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3510                }
3511                setInt(groupStartsUTF8, startUTF8, i);
3512            }
3513
3514            int32_t  end = groupEnds.elementAti(i);
3515            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3516            if (end >= 0) {
3517                int32_t  endUTF8;
3518                if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3519                    errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3520                    failed = TRUE;
3521                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3522                }
3523                setInt(groupEndsUTF8, endUTF8, i);
3524            }
3525        }
3526    }
3527
3528    if (regionStart>=0) {
3529       matcher->region(regionStart, regionEnd, status);
3530       REGEX_CHECK_STATUS_L(line);
3531       if (UTF8Matcher != NULL) {
3532           UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3533           REGEX_CHECK_STATUS_L(line);
3534       }
3535    }
3536    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3537        matcher->useAnchoringBounds(FALSE);
3538        if (UTF8Matcher != NULL) {
3539            UTF8Matcher->useAnchoringBounds(FALSE);
3540        }
3541    }
3542    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3543        matcher->useTransparentBounds(TRUE);
3544        if (UTF8Matcher != NULL) {
3545            UTF8Matcher->useTransparentBounds(TRUE);
3546        }
3547    }
3548
3549
3550
3551    //
3552    // Do a find on the de-tagged input using the caller's pattern
3553    //     TODO: error on count>1 and not find().
3554    //           error on both matches() and lookingAt().
3555    //
3556    for (i=0; i<numFinds; i++) {
3557        if (useMatchesFunc) {
3558            isMatch = matcher->matches(status);
3559            if (UTF8Matcher != NULL) {
3560               isUTF8Match = UTF8Matcher->matches(status);
3561            }
3562        } else  if (useLookingAtFunc) {
3563            isMatch = matcher->lookingAt(status);
3564            if (UTF8Matcher != NULL) {
3565                isUTF8Match = UTF8Matcher->lookingAt(status);
3566            }
3567        } else {
3568            isMatch = matcher->find();
3569            if (UTF8Matcher != NULL) {
3570                isUTF8Match = UTF8Matcher->find();
3571            }
3572        }
3573    }
3574    matcher->setTrace(FALSE);
3575
3576    //
3577    // Match up the groups from the find() with the groups from the tags
3578    //
3579
3580    // number of tags should match number of groups from find operation.
3581    // matcher->groupCount does not include group 0, the entire match, hence the +1.
3582    //   G option in test means that capture group data is not available in the
3583    //     expected results, so the check needs to be suppressed.
3584    if (isMatch == FALSE && groupStarts.size() != 0) {
3585        dataerrln("Error at line %d:  Match expected, but none found.", line);
3586        failed = TRUE;
3587        goto cleanupAndReturn;
3588    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3589        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3590        failed = TRUE;
3591        goto cleanupAndReturn;
3592    }
3593
3594    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3595        // Only check for match / no match.  Don't check capture groups.
3596        if (isMatch && groupStarts.size() == 0) {
3597            errln("Error at line %d:  No match expected, but one found.", line);
3598            failed = TRUE;
3599        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3600            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3601            failed = TRUE;
3602        }
3603        goto cleanupAndReturn;
3604    }
3605
3606    REGEX_CHECK_STATUS_L(line);
3607    for (i=0; i<=matcher->groupCount(); i++) {
3608        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3609        int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3610        if (matcher->start(i, status) != expectedStart) {
3611            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3612                line, i, expectedStart, matcher->start(i, status));
3613            failed = TRUE;
3614            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3615        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3616            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3617                  line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3618            failed = TRUE;
3619            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3620        }
3621
3622        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3623        int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3624        if (matcher->end(i, status) != expectedEnd) {
3625            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3626                line, i, expectedEnd, matcher->end(i, status));
3627            failed = TRUE;
3628            // Error on end position;  keep going; real error is probably yet to come as group
3629            //   end positions work from end of the input data towards the front.
3630        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3631            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3632                  line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3633            failed = TRUE;
3634            // Error on end position;  keep going; real error is probably yet to come as group
3635            //   end positions work from end of the input data towards the front.
3636        }
3637    }
3638    if ( matcher->groupCount()+1 < groupStarts.size()) {
3639        errln("Error at line %d: Expected %d capture groups, found %d.",
3640            line, groupStarts.size()-1, matcher->groupCount());
3641        failed = TRUE;
3642        }
3643    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3644        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3645              line, groupStarts.size()-1, UTF8Matcher->groupCount());
3646        failed = TRUE;
3647    }
3648
3649    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3650        matcher->requireEnd() == TRUE) {
3651        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3652        failed = TRUE;
3653    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3654        UTF8Matcher->requireEnd() == TRUE) {
3655        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3656        failed = TRUE;
3657    }
3658
3659    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3660        matcher->requireEnd() == FALSE) {
3661        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3662        failed = TRUE;
3663    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3664        UTF8Matcher->requireEnd() == FALSE) {
3665        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3666        failed = TRUE;
3667    }
3668
3669    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3670        matcher->hitEnd() == TRUE) {
3671        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3672        failed = TRUE;
3673    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3674               UTF8Matcher->hitEnd() == TRUE) {
3675        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3676        failed = TRUE;
3677    }
3678
3679    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3680        matcher->hitEnd() == FALSE) {
3681        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3682        failed = TRUE;
3683    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3684               UTF8Matcher->hitEnd() == FALSE) {
3685        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3686        failed = TRUE;
3687    }
3688
3689
3690cleanupAndReturn:
3691    if (failed) {
3692        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3693            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3694        // callerPattern->dump();
3695    }
3696    delete parseMatcher;
3697    delete parsePat;
3698    delete UTF8Matcher;
3699    delete UTF8Pattern;
3700    delete matcher;
3701    delete callerPattern;
3702
3703    utext_close(&inputText);
3704    delete[] inputChars;
3705    utext_close(&patternText);
3706    delete[] patternChars;
3707    ucnv_close(UTF8Converter);
3708}
3709
3710
3711
3712
3713//---------------------------------------------------------------------------
3714//
3715//      Errors     Check for error handling in patterns.
3716//
3717//---------------------------------------------------------------------------
3718void RegexTest::Errors() {
3719    // \escape sequences that aren't implemented yet.
3720    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3721
3722    // Missing close parentheses
3723    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3724    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3725    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3726
3727    // Extra close paren
3728    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3729    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3730    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3731
3732    // Look-ahead, Look-behind
3733    //  TODO:  add tests for unbounded length look-behinds.
3734    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3735
3736    // Attempt to use non-default flags
3737    {
3738        UParseError   pe;
3739        UErrorCode    status = U_ZERO_ERROR;
3740        int32_t       flags  = UREGEX_CANON_EQ |
3741                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
3742                               UREGEX_MULTILINE;
3743        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3744        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3745        delete pat1;
3746    }
3747
3748
3749    // Quantifiers are allowed only after something that can be quantified.
3750    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3751    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3752    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3753
3754    // Mal-formed {min,max} quantifiers
3755    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3756    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3757    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3758    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3759    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3760    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3761    REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3762    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3763    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3764
3765    // Ticket 5389
3766    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3767
3768    // Invalid Back Reference \0
3769    //    For ICU 3.8 and earlier
3770    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3771    //
3772    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3773
3774}
3775
3776
3777//-------------------------------------------------------------------------------
3778//
3779//  Read a text data file, convert it to UChars, and return the data
3780//    in one big UChar * buffer, which the caller must delete.
3781//
3782//--------------------------------------------------------------------------------
3783UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3784                                     const char *defEncoding, UErrorCode &status) {
3785    UChar       *retPtr  = NULL;
3786    char        *fileBuf = NULL;
3787    UConverter* conv     = NULL;
3788    FILE        *f       = NULL;
3789
3790    ulen = 0;
3791    if (U_FAILURE(status)) {
3792        return retPtr;
3793    }
3794
3795    //
3796    //  Open the file.
3797    //
3798    f = fopen(fileName, "rb");
3799    if (f == 0) {
3800        dataerrln("Error opening test data file %s\n", fileName);
3801        status = U_FILE_ACCESS_ERROR;
3802        return NULL;
3803    }
3804    //
3805    //  Read it in
3806    //
3807    int32_t            fileSize;
3808    int32_t            amt_read;
3809
3810    fseek( f, 0, SEEK_END);
3811    fileSize = ftell(f);
3812    fileBuf = new char[fileSize];
3813    fseek(f, 0, SEEK_SET);
3814    amt_read = fread(fileBuf, 1, fileSize, f);
3815    if (amt_read != fileSize || fileSize <= 0) {
3816        errln("Error reading test data file.");
3817        goto cleanUpAndReturn;
3818    }
3819
3820    //
3821    // Look for a Unicode Signature (BOM) on the data just read
3822    //
3823    int32_t        signatureLength;
3824    const char *   fileBufC;
3825    const char*    encoding;
3826
3827    fileBufC = fileBuf;
3828    encoding = ucnv_detectUnicodeSignature(
3829        fileBuf, fileSize, &signatureLength, &status);
3830    if(encoding!=NULL ){
3831        fileBufC  += signatureLength;
3832        fileSize  -= signatureLength;
3833    } else {
3834        encoding = defEncoding;
3835        if (strcmp(encoding, "utf-8") == 0) {
3836            errln("file %s is missing its BOM", fileName);
3837        }
3838    }
3839
3840    //
3841    // Open a converter to take the rule file to UTF-16
3842    //
3843    conv = ucnv_open(encoding, &status);
3844    if (U_FAILURE(status)) {
3845        goto cleanUpAndReturn;
3846    }
3847
3848    //
3849    // Convert the rules to UChar.
3850    //  Preflight first to determine required buffer size.
3851    //
3852    ulen = ucnv_toUChars(conv,
3853        NULL,           //  dest,
3854        0,              //  destCapacity,
3855        fileBufC,
3856        fileSize,
3857        &status);
3858    if (status == U_BUFFER_OVERFLOW_ERROR) {
3859        // Buffer Overflow is expected from the preflight operation.
3860        status = U_ZERO_ERROR;
3861
3862        retPtr = new UChar[ulen+1];
3863        ucnv_toUChars(conv,
3864            retPtr,       //  dest,
3865            ulen+1,
3866            fileBufC,
3867            fileSize,
3868            &status);
3869    }
3870
3871cleanUpAndReturn:
3872    fclose(f);
3873    delete[] fileBuf;
3874    ucnv_close(conv);
3875    if (U_FAILURE(status)) {
3876        errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3877        delete []retPtr;
3878        retPtr = 0;
3879        ulen   = 0;
3880    };
3881    return retPtr;
3882}
3883
3884
3885//-------------------------------------------------------------------------------
3886//
3887//   PerlTests  - Run Perl's regular expression tests
3888//                The input file for this test is re_tests, the standard regular
3889//                expression test data distributed with the Perl source code.
3890//
3891//                Here is Perl's description of the test data file:
3892//
3893//        # The tests are in a separate file 't/op/re_tests'.
3894//        # Each line in that file is a separate test.
3895//        # There are five columns, separated by tabs.
3896//        #
3897//        # Column 1 contains the pattern, optionally enclosed in C<''>.
3898//        # Modifiers can be put after the closing C<'>.
3899//        #
3900//        # Column 2 contains the string to be matched.
3901//        #
3902//        # Column 3 contains the expected result:
3903//        #     y   expect a match
3904//        #     n   expect no match
3905//        #     c   expect an error
3906//        # B   test exposes a known bug in Perl, should be skipped
3907//        # b   test exposes a known bug in Perl, should be skipped if noamp
3908//        #
3909//        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3910//        #
3911//        # Column 4 contains a string, usually C<$&>.
3912//        #
3913//        # Column 5 contains the expected result of double-quote
3914//        # interpolating that string after the match, or start of error message.
3915//        #
3916//        # Column 6, if present, contains a reason why the test is skipped.
3917//        # This is printed with "skipped", for harness to pick up.
3918//        #
3919//        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3920//        #
3921//        # If you want to add a regular expression test that can't be expressed
3922//        # in this format, don't add it here: put it in op/pat.t instead.
3923//
3924//        For ICU, if field 3 contains an 'i', the test will be skipped.
//        Such a test exposes some known incompatibility between ICU and Perl regexps.
3926//        (The i is in addition to whatever was there before.)
3927//
3928//-------------------------------------------------------------------------------
3929void RegexTest::PerlTests() {
3930    char tdd[2048];
3931    const char *srcPath;
3932    UErrorCode  status = U_ZERO_ERROR;
3933    UParseError pe;
3934
3935    //
3936    //  Open and read the test data file.
3937    //
3938    srcPath=getPath(tdd, "re_tests.txt");
3939    if(srcPath==NULL) {
3940        return; /* something went wrong, error already output */
3941    }
3942
3943    int32_t    len;
3944    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3945    if (U_FAILURE(status)) {
3946        return; /* something went wrong, error already output */
3947    }
3948
3949    //
3950    //  Put the test data into a UnicodeString
3951    //
3952    UnicodeString testDataString(FALSE, testData, len);
3953
3954    //
3955    //  Regex to break the input file into lines, and strip the new lines.
3956    //     One line per match, capture group one is the desired data.
3957    //
3958    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3959    if (U_FAILURE(status)) {
3960        dataerrln("RegexPattern::compile() error");
3961        return;
3962    }
3963    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3964
3965    //
3966    //  Regex to split a test file line into fields.
3967    //    There are six fields, separated by tabs.
3968    //
3969    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3970
3971    //
3972    //  Regex to identify test patterns with flag settings, and to separate them.
3973    //    Test patterns with flags look like 'pattern'i
3974    //    Test patterns without flags are not quoted:   pattern
3975    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3976    //
3977    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3978    RegexMatcher* flagMat = flagPat->matcher(status);
3979
3980    //
3981    // The Perl tests reference several perl-isms, which are evaluated/substituted
3982    //   in the test data.  Not being perl, this must be done explicitly.  Here
3983    //   are string constants and REs for these constructs.
3984    //
3985    UnicodeString nulnulSrc("${nulnul}");
3986    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3987    nulnul = nulnul.unescape();
3988
3989    UnicodeString ffffSrc("${ffff}");
3990    UnicodeString ffff("\\uffff", -1, US_INV);
3991    ffff = ffff.unescape();
3992
3993    //  regexp for $-[0], $+[2], etc.
3994    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3995    RegexMatcher *groupsMat = groupsPat->matcher(status);
3996
3997    //  regexp for $0, $1, $2, etc.
3998    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3999    RegexMatcher *cgMat = cgPat->matcher(status);
4000
4001
4002    //
4003    // Main Loop for the Perl Tests, runs once per line from the
4004    //   test data file.
4005    //
4006    int32_t  lineNum = 0;
4007    int32_t  skippedUnimplementedCount = 0;
4008    while (lineMat->find()) {
4009        lineNum++;
4010
4011        //
4012        //  Get a line, break it into its fields, do the Perl
4013        //    variable substitutions.
4014        //
4015        UnicodeString line = lineMat->group(1, status);
4016        UnicodeString fields[7];
4017        fieldPat->split(line, fields, 7, status);
4018
4019        flagMat->reset(fields[0]);
4020        flagMat->matches(status);
4021        UnicodeString pattern  = flagMat->group(2, status);
4022        pattern.findAndReplace("${bang}", "!");
4023        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4024        pattern.findAndReplace(ffffSrc, ffff);
4025
4026        //
4027        //  Identify patterns that include match flag settings,
4028        //    split off the flags, remove the extra quotes.
4029        //
4030        UnicodeString flagStr = flagMat->group(3, status);
4031        if (U_FAILURE(status)) {
4032            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4033            return;
4034        }
4035        int32_t flags = 0;
4036        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4037        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4038        const UChar UChar_m = 0x6d;
4039        const UChar UChar_x = 0x78;
4040        const UChar UChar_y = 0x79;
4041        if (flagStr.indexOf(UChar_i) != -1) {
4042            flags |= UREGEX_CASE_INSENSITIVE;
4043        }
4044        if (flagStr.indexOf(UChar_m) != -1) {
4045            flags |= UREGEX_MULTILINE;
4046        }
4047        if (flagStr.indexOf(UChar_x) != -1) {
4048            flags |= UREGEX_COMMENTS;
4049        }
4050
4051        //
4052        // Compile the test pattern.
4053        //
4054        status = U_ZERO_ERROR;
4055        RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4056        if (status == U_REGEX_UNIMPLEMENTED) {
4057            //
4058            // Test of a feature that is planned for ICU, but not yet implemented.
4059            //   skip the test.
4060            skippedUnimplementedCount++;
4061            delete testPat;
4062            status = U_ZERO_ERROR;
4063            continue;
4064        }
4065
4066        if (U_FAILURE(status)) {
4067            // Some tests are supposed to generate errors.
4068            //   Only report an error for tests that are supposed to succeed.
4069            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4070                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4071            {
4072                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4073            }
4074            status = U_ZERO_ERROR;
4075            delete testPat;
4076            continue;
4077        }
4078
4079        if (fields[2].indexOf(UChar_i) >= 0) {
4080            // ICU should skip this test.
4081            delete testPat;
4082            continue;
4083        }
4084
4085        if (fields[2].indexOf(UChar_c) >= 0) {
4086            // This pattern should have caused a compilation error, but didn't/
4087            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4088            delete testPat;
4089            continue;
4090        }
4091
4092        //
4093        // replace the Perl variables that appear in some of the
4094        //   match data strings.
4095        //
4096        UnicodeString matchString = fields[1];
4097        matchString.findAndReplace(nulnulSrc, nulnul);
4098        matchString.findAndReplace(ffffSrc,   ffff);
4099
4100        // Replace any \n in the match string with an actual new-line char.
4101        //  Don't do full unescape, as this unescapes more than Perl does, which
4102        //  causes other spurious failures in the tests.
4103        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4104
4105
4106
4107        //
4108        // Run the test, check for expected match/don't match result.
4109        //
4110        RegexMatcher *testMat = testPat->matcher(matchString, status);
4111        UBool found = testMat->find();
4112        UBool expected = FALSE;
4113        if (fields[2].indexOf(UChar_y) >=0) {
4114            expected = TRUE;
4115        }
4116        if (expected != found) {
4117            errln("line %d: Expected %smatch, got %smatch",
4118                lineNum, expected?"":"no ", found?"":"no " );
4119            continue;
4120        }
4121
4122        // Don't try to check expected results if there is no match.
4123        //   (Some have stuff in the expected fields)
4124        if (!found) {
4125            delete testMat;
4126            delete testPat;
4127            continue;
4128        }
4129
4130        //
4131        // Interpret the Perl expression from the fourth field of the data file,
4132        // building up an ICU string from the results of the ICU match.
4133        //   The Perl expression will contain references to the results of
4134        //     a regex match, including the matched string, capture group strings,
4135        //     group starting and ending indicies, etc.
4136        //
4137        UnicodeString resultString;
4138        UnicodeString perlExpr = fields[3];
4139#if SUPPORT_MUTATING_INPUT_STRING
4140        groupsMat->reset(perlExpr);
4141        cgMat->reset(perlExpr);
4142#endif
4143
4144        while (perlExpr.length() > 0) {
4145#if !SUPPORT_MUTATING_INPUT_STRING
4146            //  Perferred usage.  Reset after any modification to input string.
4147            groupsMat->reset(perlExpr);
4148            cgMat->reset(perlExpr);
4149#endif
4150
4151            if (perlExpr.startsWith("$&")) {
4152                resultString.append(testMat->group(status));
4153                perlExpr.remove(0, 2);
4154            }
4155
4156            else if (groupsMat->lookingAt(status)) {
4157                // $-[0]   $+[2]  etc.
4158                UnicodeString digitString = groupsMat->group(2, status);
4159                int32_t t = 0;
4160                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4161                UnicodeString plusOrMinus = groupsMat->group(1, status);
4162                int32_t matchPosition;
4163                if (plusOrMinus.compare("+") == 0) {
4164                    matchPosition = testMat->end(groupNum, status);
4165                } else {
4166                    matchPosition = testMat->start(groupNum, status);
4167                }
4168                if (matchPosition != -1) {
4169                    ICU_Utility::appendNumber(resultString, matchPosition);
4170                }
4171                perlExpr.remove(0, groupsMat->end(status));
4172            }
4173
4174            else if (cgMat->lookingAt(status)) {
4175                // $1, $2, $3, etc.
4176                UnicodeString digitString = cgMat->group(1, status);
4177                int32_t t = 0;
4178                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4179                if (U_SUCCESS(status)) {
4180                    resultString.append(testMat->group(groupNum, status));
4181                    status = U_ZERO_ERROR;
4182                }
4183                perlExpr.remove(0, cgMat->end(status));
4184            }
4185
4186            else if (perlExpr.startsWith("@-")) {
4187                int32_t i;
4188                for (i=0; i<=testMat->groupCount(); i++) {
4189                    if (i>0) {
4190                        resultString.append(" ");
4191                    }
4192                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4193                }
4194                perlExpr.remove(0, 2);
4195            }
4196
4197            else if (perlExpr.startsWith("@+")) {
4198                int32_t i;
4199                for (i=0; i<=testMat->groupCount(); i++) {
4200                    if (i>0) {
4201                        resultString.append(" ");
4202                    }
4203                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4204                }
4205                perlExpr.remove(0, 2);
4206            }
4207
4208            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4209                                                     //           or as an escaped sequence (e.g. \n)
4210                if (perlExpr.length() > 1) {
4211                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4212                }
4213                UChar c = perlExpr.charAt(0);
4214                switch (c) {
4215                case 'n':   c = '\n'; break;
4216                // add any other escape sequences that show up in the test expected results.
4217                }
4218                resultString.append(c);
4219                perlExpr.remove(0, 1);
4220            }
4221
4222            else  {
4223                // Any characters from the perl expression that we don't explicitly
4224                //  recognize before here are assumed to be literals and copied
4225                //  as-is to the expected results.
4226                resultString.append(perlExpr.charAt(0));
4227                perlExpr.remove(0, 1);
4228            }
4229
4230            if (U_FAILURE(status)) {
4231                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4232                break;
4233            }
4234        }
4235
4236        //
4237        // Expected Results Compare
4238        //
4239        UnicodeString expectedS(fields[4]);
4240        expectedS.findAndReplace(nulnulSrc, nulnul);
4241        expectedS.findAndReplace(ffffSrc,   ffff);
4242        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4243
4244
4245        if (expectedS.compare(resultString) != 0) {
4246            err("Line %d: Incorrect perl expression results.", lineNum);
4247            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4248        }
4249
4250        delete testMat;
4251        delete testPat;
4252    }
4253
4254    //
4255    // All done.  Clean up allocated stuff.
4256    //
4257    delete cgMat;
4258    delete cgPat;
4259
4260    delete groupsMat;
4261    delete groupsPat;
4262
4263    delete flagMat;
4264    delete flagPat;
4265
4266    delete lineMat;
4267    delete linePat;
4268
4269    delete fieldPat;
4270    delete [] testData;
4271
4272
4273    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4274
4275}
4276
4277
4278//-------------------------------------------------------------------------------
4279//
4280//   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4281//                  (instead of using UnicodeStrings) to test the alternate engine.
4282//                  The input file for this test is re_tests, the standard regular
4283//                  expression test data distributed with the Perl source code.
4284//                  See PerlTests() for more information.
4285//
4286//-------------------------------------------------------------------------------
4287void RegexTest::PerlTestsUTF8() {
4288    char tdd[2048];
4289    const char *srcPath;
4290    UErrorCode  status = U_ZERO_ERROR;
4291    UParseError pe;
4292    LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4293    UText       patternText = UTEXT_INITIALIZER;
4294    char       *patternChars = NULL;
4295    int32_t     patternLength;
4296    int32_t     patternCapacity = 0;
4297    UText       inputText = UTEXT_INITIALIZER;
4298    char       *inputChars = NULL;
4299    int32_t     inputLength;
4300    int32_t     inputCapacity = 0;
4301
4302    ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4303
4304    //
4305    //  Open and read the test data file.
4306    //
4307    srcPath=getPath(tdd, "re_tests.txt");
4308    if(srcPath==NULL) {
4309        return; /* something went wrong, error already output */
4310    }
4311
4312    int32_t    len;
4313    UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4314    if (U_FAILURE(status)) {
4315        return; /* something went wrong, error already output */
4316    }
4317
4318    //
4319    //  Put the test data into a UnicodeString
4320    //
4321    UnicodeString testDataString(FALSE, testData, len);
4322
4323    //
4324    //  Regex to break the input file into lines, and strip the new lines.
4325    //     One line per match, capture group one is the desired data.
4326    //
4327    RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4328    if (U_FAILURE(status)) {
4329        dataerrln("RegexPattern::compile() error");
4330        return;
4331    }
4332    RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4333
4334    //
4335    //  Regex to split a test file line into fields.
4336    //    There are six fields, separated by tabs.
4337    //
4338    RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4339
4340    //
4341    //  Regex to identify test patterns with flag settings, and to separate them.
4342    //    Test patterns with flags look like 'pattern'i
4343    //    Test patterns without flags are not quoted:   pattern
4344    //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4345    //
4346    RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4347    RegexMatcher* flagMat = flagPat->matcher(status);
4348
4349    //
4350    // The Perl tests reference several perl-isms, which are evaluated/substituted
4351    //   in the test data.  Not being perl, this must be done explicitly.  Here
4352    //   are string constants and REs for these constructs.
4353    //
4354    UnicodeString nulnulSrc("${nulnul}");
4355    UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4356    nulnul = nulnul.unescape();
4357
4358    UnicodeString ffffSrc("${ffff}");
4359    UnicodeString ffff("\\uffff", -1, US_INV);
4360    ffff = ffff.unescape();
4361
4362    //  regexp for $-[0], $+[2], etc.
4363    RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4364    RegexMatcher *groupsMat = groupsPat->matcher(status);
4365
4366    //  regexp for $0, $1, $2, etc.
4367    RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4368    RegexMatcher *cgMat = cgPat->matcher(status);
4369
4370
4371    //
4372    // Main Loop for the Perl Tests, runs once per line from the
4373    //   test data file.
4374    //
4375    int32_t  lineNum = 0;
4376    int32_t  skippedUnimplementedCount = 0;
4377    while (lineMat->find()) {
4378        lineNum++;
4379
4380        //
4381        //  Get a line, break it into its fields, do the Perl
4382        //    variable substitutions.
4383        //
4384        UnicodeString line = lineMat->group(1, status);
4385        UnicodeString fields[7];
4386        fieldPat->split(line, fields, 7, status);
4387
4388        flagMat->reset(fields[0]);
4389        flagMat->matches(status);
4390        UnicodeString pattern  = flagMat->group(2, status);
4391        pattern.findAndReplace("${bang}", "!");
4392        pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4393        pattern.findAndReplace(ffffSrc, ffff);
4394
4395        //
4396        //  Identify patterns that include match flag settings,
4397        //    split off the flags, remove the extra quotes.
4398        //
4399        UnicodeString flagStr = flagMat->group(3, status);
4400        if (U_FAILURE(status)) {
4401            errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4402            return;
4403        }
4404        int32_t flags = 0;
4405        const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4406        const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4407        const UChar UChar_m = 0x6d;
4408        const UChar UChar_x = 0x78;
4409        const UChar UChar_y = 0x79;
4410        if (flagStr.indexOf(UChar_i) != -1) {
4411            flags |= UREGEX_CASE_INSENSITIVE;
4412        }
4413        if (flagStr.indexOf(UChar_m) != -1) {
4414            flags |= UREGEX_MULTILINE;
4415        }
4416        if (flagStr.indexOf(UChar_x) != -1) {
4417            flags |= UREGEX_COMMENTS;
4418        }
4419
4420        //
4421        // Put the pattern in a UTF-8 UText
4422        //
4423        status = U_ZERO_ERROR;
4424        patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4425        if (status == U_BUFFER_OVERFLOW_ERROR) {
4426            status = U_ZERO_ERROR;
4427            delete[] patternChars;
4428            patternCapacity = patternLength + 1;
4429            patternChars = new char[patternCapacity];
4430            pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4431        }
4432        utext_openUTF8(&patternText, patternChars, patternLength, &status);
4433
4434        //
4435        // Compile the test pattern.
4436        //
4437        RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4438        if (status == U_REGEX_UNIMPLEMENTED) {
4439            //
4440            // Test of a feature that is planned for ICU, but not yet implemented.
4441            //   skip the test.
4442            skippedUnimplementedCount++;
4443            delete testPat;
4444            status = U_ZERO_ERROR;
4445            continue;
4446        }
4447
4448        if (U_FAILURE(status)) {
4449            // Some tests are supposed to generate errors.
4450            //   Only report an error for tests that are supposed to succeed.
4451            if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4452                fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4453            {
4454                errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4455            }
4456            status = U_ZERO_ERROR;
4457            delete testPat;
4458            continue;
4459        }
4460
4461        if (fields[2].indexOf(UChar_i) >= 0) {
4462            // ICU should skip this test.
4463            delete testPat;
4464            continue;
4465        }
4466
4467        if (fields[2].indexOf(UChar_c) >= 0) {
4468            // This pattern should have caused a compilation error, but didn't/
4469            errln("line %d: Expected a pattern compile error, got success.", lineNum);
4470            delete testPat;
4471            continue;
4472        }
4473
4474
4475        //
4476        // replace the Perl variables that appear in some of the
4477        //   match data strings.
4478        //
4479        UnicodeString matchString = fields[1];
4480        matchString.findAndReplace(nulnulSrc, nulnul);
4481        matchString.findAndReplace(ffffSrc,   ffff);
4482
4483        // Replace any \n in the match string with an actual new-line char.
4484        //  Don't do full unescape, as this unescapes more than Perl does, which
4485        //  causes other spurious failures in the tests.
4486        matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4487
4488        //
4489        // Put the input in a UTF-8 UText
4490        //
4491        status = U_ZERO_ERROR;
4492        inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4493        if (status == U_BUFFER_OVERFLOW_ERROR) {
4494            status = U_ZERO_ERROR;
4495            delete[] inputChars;
4496            inputCapacity = inputLength + 1;
4497            inputChars = new char[inputCapacity];
4498            matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4499        }
4500        utext_openUTF8(&inputText, inputChars, inputLength, &status);
4501
4502        //
4503        // Run the test, check for expected match/don't match result.
4504        //
4505        RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4506        UBool found = testMat->find();
4507        UBool expected = FALSE;
4508        if (fields[2].indexOf(UChar_y) >=0) {
4509            expected = TRUE;
4510        }
4511        if (expected != found) {
4512            errln("line %d: Expected %smatch, got %smatch",
4513                lineNum, expected?"":"no ", found?"":"no " );
4514            continue;
4515        }
4516
4517        // Don't try to check expected results if there is no match.
4518        //   (Some have stuff in the expected fields)
4519        if (!found) {
4520            delete testMat;
4521            delete testPat;
4522            continue;
4523        }
4524
4525        //
4526        // Interpret the Perl expression from the fourth field of the data file,
4527        // building up an ICU string from the results of the ICU match.
4528        //   The Perl expression will contain references to the results of
4529        //     a regex match, including the matched string, capture group strings,
4530        //     group starting and ending indicies, etc.
4531        //
4532        UnicodeString resultString;
4533        UnicodeString perlExpr = fields[3];
4534
4535        while (perlExpr.length() > 0) {
4536            groupsMat->reset(perlExpr);
4537            cgMat->reset(perlExpr);
4538
4539            if (perlExpr.startsWith("$&")) {
4540                resultString.append(testMat->group(status));
4541                perlExpr.remove(0, 2);
4542            }
4543
4544            else if (groupsMat->lookingAt(status)) {
4545                // $-[0]   $+[2]  etc.
4546                UnicodeString digitString = groupsMat->group(2, status);
4547                int32_t t = 0;
4548                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4549                UnicodeString plusOrMinus = groupsMat->group(1, status);
4550                int32_t matchPosition;
4551                if (plusOrMinus.compare("+") == 0) {
4552                    matchPosition = testMat->end(groupNum, status);
4553                } else {
4554                    matchPosition = testMat->start(groupNum, status);
4555                }
4556                if (matchPosition != -1) {
4557                    ICU_Utility::appendNumber(resultString, matchPosition);
4558                }
4559                perlExpr.remove(0, groupsMat->end(status));
4560            }
4561
4562            else if (cgMat->lookingAt(status)) {
4563                // $1, $2, $3, etc.
4564                UnicodeString digitString = cgMat->group(1, status);
4565                int32_t t = 0;
4566                int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4567                if (U_SUCCESS(status)) {
4568                    resultString.append(testMat->group(groupNum, status));
4569                    status = U_ZERO_ERROR;
4570                }
4571                perlExpr.remove(0, cgMat->end(status));
4572            }
4573
4574            else if (perlExpr.startsWith("@-")) {
4575                int32_t i;
4576                for (i=0; i<=testMat->groupCount(); i++) {
4577                    if (i>0) {
4578                        resultString.append(" ");
4579                    }
4580                    ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4581                }
4582                perlExpr.remove(0, 2);
4583            }
4584
4585            else if (perlExpr.startsWith("@+")) {
4586                int32_t i;
4587                for (i=0; i<=testMat->groupCount(); i++) {
4588                    if (i>0) {
4589                        resultString.append(" ");
4590                    }
4591                    ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4592                }
4593                perlExpr.remove(0, 2);
4594            }
4595
4596            else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4597                                                     //           or as an escaped sequence (e.g. \n)
4598                if (perlExpr.length() > 1) {
4599                    perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4600                }
4601                UChar c = perlExpr.charAt(0);
4602                switch (c) {
4603                case 'n':   c = '\n'; break;
4604                // add any other escape sequences that show up in the test expected results.
4605                }
4606                resultString.append(c);
4607                perlExpr.remove(0, 1);
4608            }
4609
4610            else  {
4611                // Any characters from the perl expression that we don't explicitly
4612                //  recognize before here are assumed to be literals and copied
4613                //  as-is to the expected results.
4614                resultString.append(perlExpr.charAt(0));
4615                perlExpr.remove(0, 1);
4616            }
4617
4618            if (U_FAILURE(status)) {
4619                errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4620                break;
4621            }
4622        }
4623
4624        //
4625        // Expected Results Compare
4626        //
4627        UnicodeString expectedS(fields[4]);
4628        expectedS.findAndReplace(nulnulSrc, nulnul);
4629        expectedS.findAndReplace(ffffSrc,   ffff);
4630        expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4631
4632
4633        if (expectedS.compare(resultString) != 0) {
4634            err("Line %d: Incorrect perl expression results.", lineNum);
4635            infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4636        }
4637
4638        delete testMat;
4639        delete testPat;
4640    }
4641
4642    //
4643    // All done.  Clean up allocated stuff.
4644    //
4645    delete cgMat;
4646    delete cgPat;
4647
4648    delete groupsMat;
4649    delete groupsPat;
4650
4651    delete flagMat;
4652    delete flagPat;
4653
4654    delete lineMat;
4655    delete linePat;
4656
4657    delete fieldPat;
4658    delete [] testData;
4659
4660    utext_close(&patternText);
4661    utext_close(&inputText);
4662
4663    delete [] patternChars;
4664    delete [] inputChars;
4665
4666
4667    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4668
4669}
4670
4671
4672//--------------------------------------------------------------
4673//
4674//  Bug6149   Verify limits to heap expansion for backtrack stack.
4675//             Use this pattern,
4676//                 "(a?){1,}"
4677//             The zero-length match will repeat forever.
4678//                (That this goes into a loop is another bug)
4679//
4680//---------------------------------------------------------------
4681void RegexTest::Bug6149() {
4682    UnicodeString pattern("(a?){1,}");
4683    UnicodeString s("xyz");
4684    uint32_t flags = 0;
4685    UErrorCode status = U_ZERO_ERROR;
4686
4687    RegexMatcher  matcher(pattern, s, flags, status);
4688    UBool result = false;
4689    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4690    REGEX_ASSERT(result == FALSE);
4691 }
4692
4693
4694//
4695//   Callbacks()    Test the callback function.
4696//                  When set, callbacks occur periodically during matching operations,
4697//                  giving the application code the ability to abort the operation
//                  before its normal completion.
4699//
4700
4701struct callBackContext {
4702    RegexTest        *test;
4703    int32_t          maxCalls;
4704    int32_t          numCalls;
4705    int32_t          lastSteps;
4706    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4707};
4708
4709U_CDECL_BEGIN
4710static UBool U_CALLCONV
4711testCallBackFn(const void *context, int32_t steps) {
4712    callBackContext  *info = (callBackContext *)context;
4713    if (info->lastSteps+1 != steps) {
4714        info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4715    }
4716    info->lastSteps = steps;
4717    info->numCalls++;
4718    return (info->numCalls < info->maxCalls);
4719}
4720U_CDECL_END
4721
4722void RegexTest::Callbacks() {
4723   {
4724        // Getter returns NULLs if no callback has been set
4725
4726        //   The variables that the getter will fill in.
4727        //   Init to non-null values so that the action of the getter can be seen.
4728        const void          *returnedContext = &returnedContext;
4729        URegexMatchCallback *returnedFn = &testCallBackFn;
4730
4731        UErrorCode status = U_ZERO_ERROR;
4732        RegexMatcher matcher("x", 0, status);
4733        REGEX_CHECK_STATUS;
4734        matcher.getMatchCallback(returnedFn, returnedContext, status);
4735        REGEX_CHECK_STATUS;
4736        REGEX_ASSERT(returnedFn == NULL);
4737        REGEX_ASSERT(returnedContext == NULL);
4738    }
4739
4740   {
4741        // Set and Get work
4742        callBackContext cbInfo = {this, 0, 0, 0};
4743        const void          *returnedContext;
4744        URegexMatchCallback *returnedFn;
4745        UErrorCode status = U_ZERO_ERROR;
4746        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4747        REGEX_CHECK_STATUS;
4748        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4749        REGEX_CHECK_STATUS;
4750        matcher.getMatchCallback(returnedFn, returnedContext, status);
4751        REGEX_CHECK_STATUS;
4752        REGEX_ASSERT(returnedFn == testCallBackFn);
4753        REGEX_ASSERT(returnedContext == &cbInfo);
4754
4755        // A short-running match shouldn't invoke the callback
4756        status = U_ZERO_ERROR;
4757        cbInfo.reset(1);
4758        UnicodeString s = "xxx";
4759        matcher.reset(s);
4760        REGEX_ASSERT(matcher.matches(status));
4761        REGEX_CHECK_STATUS;
4762        REGEX_ASSERT(cbInfo.numCalls == 0);
4763
4764        // A medium-length match that runs long enough to invoke the
4765        //   callback, but not so long that the callback aborts it.
4766        status = U_ZERO_ERROR;
4767        cbInfo.reset(4);
4768        s = "aaaaaaaaaaaaaaaaaaab";
4769        matcher.reset(s);
4770        REGEX_ASSERT(matcher.matches(status)==FALSE);
4771        REGEX_CHECK_STATUS;
4772        REGEX_ASSERT(cbInfo.numCalls > 0);
4773
4774        // A longer running match that the callback function will abort.
4775        status = U_ZERO_ERROR;
4776        cbInfo.reset(4);
4777        s = "aaaaaaaaaaaaaaaaaaaaaaab";
4778        matcher.reset(s);
4779        REGEX_ASSERT(matcher.matches(status)==FALSE);
4780        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4781        REGEX_ASSERT(cbInfo.numCalls == 4);
4782    }
4783
4784
4785}
4786
4787
4788//
4789//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4793//
4794
4795struct progressCallBackContext {
4796    RegexTest        *test;
4797    int64_t          lastIndex;
4798    int32_t          maxCalls;
4799    int32_t          numCalls;
4800    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4801};
4802
4803U_CDECL_BEGIN
4804static UBool U_CALLCONV
4805testProgressCallBackFn(const void *context, int64_t matchIndex) {
4806    progressCallBackContext  *info = (progressCallBackContext *)context;
4807    info->numCalls++;
4808    info->lastIndex = matchIndex;
4809//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4810    return (info->numCalls < info->maxCalls);
4811}
4812U_CDECL_END
4813
4814void RegexTest::FindProgressCallbacks() {
4815   {
4816        // Getter returns NULLs if no callback has been set
4817
4818        //   The variables that the getter will fill in.
4819        //   Init to non-null values so that the action of the getter can be seen.
4820        const void                  *returnedContext = &returnedContext;
4821        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4822
4823        UErrorCode status = U_ZERO_ERROR;
4824        RegexMatcher matcher("x", 0, status);
4825        REGEX_CHECK_STATUS;
4826        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4827        REGEX_CHECK_STATUS;
4828        REGEX_ASSERT(returnedFn == NULL);
4829        REGEX_ASSERT(returnedContext == NULL);
4830    }
4831
4832   {
4833        // Set and Get work
4834        progressCallBackContext cbInfo = {this, 0, 0, 0};
4835        const void                  *returnedContext;
4836        URegexFindProgressCallback  *returnedFn;
4837        UErrorCode status = U_ZERO_ERROR;
4838        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4839        REGEX_CHECK_STATUS;
4840        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4841        REGEX_CHECK_STATUS;
4842        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4843        REGEX_CHECK_STATUS;
4844        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4845        REGEX_ASSERT(returnedContext == &cbInfo);
4846
4847        // A short-running match should NOT invoke the callback.
4848        status = U_ZERO_ERROR;
4849        cbInfo.reset(100);
4850        UnicodeString s = "abxxx";
4851        matcher.reset(s);
4852#if 0
4853        matcher.setTrace(TRUE);
4854#endif
4855        REGEX_ASSERT(matcher.find(0, status));
4856        REGEX_CHECK_STATUS;
4857        REGEX_ASSERT(cbInfo.numCalls == 0);
4858
4859        // A medium running match that causes matcher.find() to invoke our callback for each index.
4860        status = U_ZERO_ERROR;
4861        s = "aaaaaaaaaaaaaaaaaaab";
4862        cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4863        matcher.reset(s);
4864        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4865        REGEX_CHECK_STATUS;
4866        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4867
4868        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4869        status = U_ZERO_ERROR;
4870        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4871        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4872        matcher.reset(s1);
4873        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4874        REGEX_CHECK_STATUS;
4875        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4876
4877#if 0
4878        // Now a match that will succeed, but after an interruption
4879        status = U_ZERO_ERROR;
4880        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4881        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4882        matcher.reset(s2);
4883        REGEX_ASSERT(matcher.find(0, status)==FALSE);
4884        REGEX_CHECK_STATUS;
4885        // Now retry the match from where left off
4886        cbInfo.maxCalls = 100; //  No callback limit
4887        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4888        REGEX_CHECK_STATUS;
4889#endif
4890    }
4891
4892
4893}
4894
4895
4896//---------------------------------------------------------------------------
4897//
4898//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4899//                             UTexts. The pure-C implementation of UText
4900//                             has no mutable backing stores, but we can
4901//                             use UnicodeString here to test the functionality.
4902//
4903//---------------------------------------------------------------------------
4904void RegexTest::PreAllocatedUTextCAPI () {
4905    UErrorCode           status = U_ZERO_ERROR;
4906    URegularExpression  *re;
4907    UText                patternText = UTEXT_INITIALIZER;
4908    UnicodeString        buffer;
4909    UText                bufferText = UTEXT_INITIALIZER;
4910
4911    utext_openUnicodeString(&bufferText, &buffer, &status);
4912
4913    /*
4914     *  getText() and getUText()
4915     */
4916    {
4917        UText  text1 = UTEXT_INITIALIZER;
4918        UText  text2 = UTEXT_INITIALIZER;
4919        UChar  text2Chars[20];
4920        UText  *resultText;
4921
4922        status = U_ZERO_ERROR;
4923        regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4924        regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4925        u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4926        utext_openUChars(&text2, text2Chars, -1, &status);
4927
4928        regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4929        re = uregex_openUText(&patternText, 0, NULL, &status);
4930
4931        /* First set a UText */
4932        uregex_setUText(re, &text1, &status);
4933        resultText = uregex_getUText(re, &bufferText, &status);
4934        REGEX_CHECK_STATUS;
4935        REGEX_ASSERT(resultText == &bufferText);
4936        utext_setNativeIndex(resultText, 0);
4937        utext_setNativeIndex(&text1, 0);
4938        REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4939
4940        resultText = uregex_getUText(re, &bufferText, &status);
4941        REGEX_CHECK_STATUS;
4942        REGEX_ASSERT(resultText == &bufferText);
4943        utext_setNativeIndex(resultText, 0);
4944        utext_setNativeIndex(&text1, 0);
4945        REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4946
4947        /* Then set a UChar * */
4948        uregex_setText(re, text2Chars, 7, &status);
4949        resultText = uregex_getUText(re, &bufferText, &status);
4950        REGEX_CHECK_STATUS;
4951        REGEX_ASSERT(resultText == &bufferText);
4952        utext_setNativeIndex(resultText, 0);
4953        utext_setNativeIndex(&text2, 0);
4954        REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
4955
4956        uregex_close(re);
4957        utext_close(&text1);
4958        utext_close(&text2);
4959    }
4960
4961    /*
4962     *  group()
4963     */
4964    {
4965        UChar    text1[80];
4966        UText   *actual;
4967        UBool    result;
4968        u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4969
4970        status = U_ZERO_ERROR;
4971        re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4972        REGEX_CHECK_STATUS;
4973
4974        uregex_setText(re, text1, -1, &status);
4975        result = uregex_find(re, 0, &status);
4976        REGEX_ASSERT(result==TRUE);
4977
4978        /*  Capture Group 0, the full match.  Should succeed.  */
4979        status = U_ZERO_ERROR;
4980        actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4981        REGEX_CHECK_STATUS;
4982        REGEX_ASSERT(actual == &bufferText);
4983        REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4984
4985        /*  Capture group #1.  Should succeed. */
4986        status = U_ZERO_ERROR;
4987        actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
4988        REGEX_CHECK_STATUS;
4989        REGEX_ASSERT(actual == &bufferText);
4990        REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
4991
4992        /*  Capture group out of range.  Error. */
4993        status = U_ZERO_ERROR;
4994        actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
4995        REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
4996        REGEX_ASSERT(actual == &bufferText);
4997
4998        uregex_close(re);
4999
5000    }
5001
5002    /*
5003     *  replaceFirst()
5004     */
5005    {
5006        UChar    text1[80];
5007        UChar    text2[80];
5008        UText    replText = UTEXT_INITIALIZER;
5009        UText   *result;
5010
5011        status = U_ZERO_ERROR;
5012        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5013        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5014        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5015
5016        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5017        REGEX_CHECK_STATUS;
5018
5019        /*  Normal case, with match */
5020        uregex_setText(re, text1, -1, &status);
5021        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5022        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5023        REGEX_CHECK_STATUS;
5024        REGEX_ASSERT(result == &bufferText);
5025        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5026
5027        /* No match.  Text should copy to output with no changes.  */
5028        uregex_setText(re, text2, -1, &status);
5029        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5030        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5031        REGEX_CHECK_STATUS;
5032        REGEX_ASSERT(result == &bufferText);
5033        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5034
5035        /* Unicode escapes */
5036        uregex_setText(re, text1, -1, &status);
5037        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5038        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5039        result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5040        REGEX_CHECK_STATUS;
5041        REGEX_ASSERT(result == &bufferText);
5042        REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5043
5044        uregex_close(re);
5045        utext_close(&replText);
5046    }
5047
5048
5049    /*
5050     *  replaceAll()
5051     */
5052    {
5053        UChar    text1[80];
5054        UChar    text2[80];
5055        UText    replText = UTEXT_INITIALIZER;
5056        UText   *result;
5057
5058        status = U_ZERO_ERROR;
5059        u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5060        u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5061        regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5062
5063        re = uregex_openC("x(.*?)x", 0, NULL, &status);
5064        REGEX_CHECK_STATUS;
5065
5066        /*  Normal case, with match */
5067        uregex_setText(re, text1, -1, &status);
5068        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5069        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5070        REGEX_CHECK_STATUS;
5071        REGEX_ASSERT(result == &bufferText);
5072        REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5073
5074        /* No match.  Text should copy to output with no changes.  */
5075        uregex_setText(re, text2, -1, &status);
5076        utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5077        result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5078        REGEX_CHECK_STATUS;
5079        REGEX_ASSERT(result == &bufferText);
5080        REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5081
5082        uregex_close(re);
5083        utext_close(&replText);
5084    }
5085
5086
5087    /*
5088     *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5089     *   so we don't need to test it here.
5090     */
5091
5092    utext_close(&bufferText);
5093    utext_close(&patternText);
5094}
5095
5096//--------------------------------------------------------------
5097//
5098//  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5099//
5100//---------------------------------------------------------------
5101void RegexTest::Bug7651() {
5102    UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5103    //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5104    //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5105    UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5106    UnicodeString s("#ff @abcd This is test");
5107    RegexPattern  *REPattern = NULL;
5108    RegexMatcher  *REMatcher = NULL;
5109    UErrorCode status = U_ZERO_ERROR;
5110    UParseError pe;
5111
5112    REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5113    REGEX_CHECK_STATUS;
5114    REMatcher = REPattern->matcher(s, status);
5115    REGEX_CHECK_STATUS;
5116    REGEX_ASSERT(REMatcher->find());
5117    REGEX_ASSERT(REMatcher->start(status) == 0);
5118    delete REPattern;
5119    delete REMatcher;
5120    status = U_ZERO_ERROR;
5121
5122    REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5123    REGEX_CHECK_STATUS;
5124    REMatcher = REPattern->matcher(s, status);
5125    REGEX_CHECK_STATUS;
5126    REGEX_ASSERT(REMatcher->find());
5127    REGEX_ASSERT(REMatcher->start(status) == 0);
5128    delete REPattern;
5129    delete REMatcher;
5130    status = U_ZERO_ERROR;
5131 }
5132
5133void RegexTest::Bug7740() {
5134    UErrorCode status = U_ZERO_ERROR;
5135    UnicodeString pattern = "(a)";
5136    UnicodeString text = "abcdef";
5137    RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5138    REGEX_CHECK_STATUS;
5139    REGEX_ASSERT(m->lookingAt(status));
5140    REGEX_CHECK_STATUS;
5141    status = U_ILLEGAL_ARGUMENT_ERROR;
5142    UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5143    REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5144    REGEX_ASSERT(s == "");
5145    delete m;
5146}
5147
// Bug 8479:  was crashing with a bogus UnicodeString as input.
5149
5150void RegexTest::Bug8479() {
5151    UErrorCode status = U_ZERO_ERROR;
5152
5153    RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5154    REGEX_CHECK_STATUS;
5155    if (U_SUCCESS(status))
5156    {
5157        UnicodeString str;
5158        str.setToBogus();
5159        pMatcher->reset(str);
5160        status = U_ZERO_ERROR;
5161        pMatcher->matches(status);
5162        REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5163        delete pMatcher;
5164    }
5165}
5166
5167
5168// Bug 7029
5169void RegexTest::Bug7029() {
5170    UErrorCode status = U_ZERO_ERROR;
5171
5172    RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5173    UnicodeString text = "abc.def";
5174    UnicodeString splits[10];
5175    REGEX_CHECK_STATUS;
5176    int32_t numFields = pMatcher->split(text, splits, 10, status);
5177    REGEX_CHECK_STATUS;
5178    REGEX_ASSERT(numFields == 8);
5179    delete pMatcher;
5180}
5181
5182void RegexTest::CheckInvBufSize() {
5183  if(inv_next>=INV_BUFSIZ) {
5184    errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5185          __FILE__, INV_BUFSIZ, inv_next);
5186  } else {
5187    logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5188  }
5189}
5190
5191#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5192
5193