1/******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 2002-2011, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 7// 8// regextst.cpp 9// 10// ICU Regular Expressions test, part of intltest. 11// 12 13/* 14 NOTE!! 15 16 PLEASE be careful about ASCII assumptions in this test. 17 This test is one of the worst repeat offenders. 18 If you have questions, contact someone on the ICU PMC 19 who has access to an EBCDIC system. 20 21 */ 22 23#include "intltest.h" 24#if !UCONFIG_NO_REGULAR_EXPRESSIONS 25 26#include "unicode/regex.h" 27#include "unicode/uchar.h" 28#include "unicode/ucnv.h" 29#include "unicode/ustring.h" 30#include "regextst.h" 31#include "uvector.h" 32#include "util.h" 33#include <stdlib.h> 34#include <string.h> 35#include <stdio.h> 36#include "cstring.h" 37#include "uinvchar.h" 38 39#define SUPPORT_MUTATING_INPUT_STRING 0 40 41//--------------------------------------------------------------------------- 42// 43// Test class boilerplate 44// 45//--------------------------------------------------------------------------- 46RegexTest::RegexTest() 47{ 48} 49 50 51RegexTest::~RegexTest() 52{ 53} 54 55 56 57void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 58{ 59 if (exec) logln("TestSuite RegexTest: "); 60 switch (index) { 61 62 case 0: name = "Basic"; 63 if (exec) Basic(); 64 break; 65 case 1: name = "API_Match"; 66 if (exec) API_Match(); 67 break; 68 case 2: name = "API_Replace"; 69 if (exec) API_Replace(); 70 break; 71 case 3: name = "API_Pattern"; 72 if (exec) API_Pattern(); 73 break; 74 case 4: 75#if !UCONFIG_NO_FILE_IO 76 name = "Extended"; 77 if (exec) Extended(); 78#else 79 name = "skip"; 80#endif 81 break; 82 case 5: name = "Errors"; 83 if (exec) Errors(); 84 break; 85 case 6: name = "PerlTests"; 86 if (exec) PerlTests(); 87 break; 88 case 7: name = 
"Callbacks"; 89 if (exec) Callbacks(); 90 break; 91 case 8: name = "FindProgressCallbacks"; 92 if (exec) FindProgressCallbacks(); 93 break; 94 case 9: name = "Bug 6149"; 95 if (exec) Bug6149(); 96 break; 97 case 10: name = "UTextBasic"; 98 if (exec) UTextBasic(); 99 break; 100 case 11: name = "API_Match_UTF8"; 101 if (exec) API_Match_UTF8(); 102 break; 103 case 12: name = "API_Replace_UTF8"; 104 if (exec) API_Replace_UTF8(); 105 break; 106 case 13: name = "API_Pattern_UTF8"; 107 if (exec) API_Pattern_UTF8(); 108 break; 109 case 14: name = "PerlTestsUTF8"; 110 if (exec) PerlTestsUTF8(); 111 break; 112 case 15: name = "PreAllocatedUTextCAPI"; 113 if (exec) PreAllocatedUTextCAPI(); 114 break; 115 case 16: name = "Bug 7651"; 116 if (exec) Bug7651(); 117 break; 118 case 17: name = "Bug 7740"; 119 if (exec) Bug7740(); 120 break; 121 case 18: name = "Bug 8479"; 122 if (exec) Bug8479(); 123 break; 124 case 19: name = "Bug 7029"; 125 if (exec) Bug7029(); 126 break; 127 case 20: name = "CheckInvBufSize"; 128 if (exec) CheckInvBufSize(); 129 break; 130 131 default: name = ""; 132 break; //needed to end loop 133 } 134} 135 136 137 138/** 139 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage 140 * into ASCII. 141 * @see utext_openUTF8 142 */ 143static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status); 144 145//--------------------------------------------------------------------------- 146// 147// Error Checking / Reporting macros used in all of the tests. 
148// 149//--------------------------------------------------------------------------- 150 151static void utextToPrintable(char *buf, int32_t bufLen, UText *text) { 152 int64_t oldIndex = utext_getNativeIndex(text); 153 utext_setNativeIndex(text, 0); 154 char *bufPtr = buf; 155 UChar32 c = utext_next32From(text, 0); 156 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) { 157 if (0x000020<=c && c<0x00007e) { 158 *bufPtr = c; 159 } else { 160#if 0 161 sprintf(bufPtr,"U+%04X", c); 162 bufPtr+= strlen(bufPtr)-1; 163#else 164 *bufPtr = '%'; 165#endif 166 } 167 bufPtr++; 168 c = UTEXT_NEXT32(text); 169 } 170 *bufPtr = 0; 171#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY) 172 char *ebuf = (char*)malloc(bufLen); 173 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen); 174 uprv_strncpy(buf, ebuf, bufLen); 175 free((void*)ebuf); 176#endif 177 utext_setNativeIndex(text, oldIndex); 178} 179 180static inline UChar toHex(int32_t i) { 181 return (UChar)(i + (i < 10 ? 0x30 : (0x41 - 10))); 182} 183 184static UnicodeString& escape(const UnicodeString& s, UnicodeString& result) { 185 for (int32_t i=0; i<s.length(); ++i) { 186 UChar c = s[i]; 187 if ((c <= (UChar)0x7F) && (c>0)) { 188 result += c; 189 } else { 190 result += (UChar)0x5c; 191 result += (UChar)0x75; 192 result += toHex((c >> 12) & 0xF); 193 result += toHex((c >> 8) & 0xF); 194 result += toHex((c >> 4) & 0xF); 195 result += toHex( c & 0xF); 196 } 197 } 198 return result; 199} 200 201static char ASSERT_BUF[1024]; 202 203static const char* extractToAssertBuf(const UnicodeString& message) { 204 if(message.length()==0) { 205 strcpy(ASSERT_BUF, "[[empty UnicodeString]]"); 206 } else { 207 UnicodeString buf; 208 escape(message, buf); 209 if(buf.length()==0) { 210 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]"); 211 } else { 212 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1); 213 if(ASSERT_BUF[0]==0) { 214 ASSERT_BUF[0]=0; 215 for(int32_t i=0;i<buf.length();i++) { 216 UChar ch = buf[i]; 217 
sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch); 218 } 219 } 220 } 221 } 222 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0; 223 return ASSERT_BUF; 224} 225 226 227#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);} 228 229#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \ 230 __FILE__, __LINE__, u_errorName(status)); return;}} 231 232#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};} 233 234#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\ 235if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \ 236 __LINE__, u_errorName(errcode), u_errorName(status));};} 237 238#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \ 239 "RegexTest failure at line %d, from %d. 
status=%d\n",__LINE__, (line), status); }} 240 241#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \ 242 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}} 243 244#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};} 245 246/** 247 * @param expected expected text in UTF-8 (not platform) codepage 248 */ 249void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { 250 UErrorCode status = U_ZERO_ERROR; 251 UText expectedText = UTEXT_INITIALIZER; 252 utext_openUTF8(&expectedText, expected, -1, &status); 253 if(U_FAILURE(status)) { 254 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 255 return; 256 } 257 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) { 258 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected)); 259 return; 260 } 261 utext_setNativeIndex(actual, 0); 262 if (utext_compare(&expectedText, -1, actual, -1) != 0) { 263 char buf[201 /*21*/]; 264 char expectedBuf[201]; 265 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 266 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 267 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 268 } 269 utext_close(&expectedText); 270} 271/** 272 * @param expected invariant (platform local text) input 273 */ 274 275void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) { 276 UErrorCode status = U_ZERO_ERROR; 277 UText expectedText = UTEXT_INITIALIZER; 278 regextst_openUTF8FromInvariant(&expectedText, expected, -1, 
&status); 279 if(U_FAILURE(status)) { 280 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 281 return; 282 } 283 utext_setNativeIndex(actual, 0); 284 if (utext_compare(&expectedText, -1, actual, -1) != 0) { 285 char buf[201 /*21*/]; 286 char expectedBuf[201]; 287 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 288 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 289 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 290 } 291 utext_close(&expectedText); 292} 293 294/** 295 * Assumes utf-8 input 296 */ 297#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__) 298/** 299 * Assumes Invariant input 300 */ 301#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__) 302 303/** 304 * This buffer ( inv_buf ) is used to hold the UTF-8 strings 305 * passed into utext_openUTF8. An error will be given if 306 * INV_BUFSIZ is too small. It's only used on EBCDIC systems. 
307 */ 308 309#define INV_BUFSIZ 2048 /* increase this if too small */ 310 311static int32_t inv_next=0; 312 313#if U_CHARSET_FAMILY!=U_ASCII_FAMILY 314static char inv_buf[INV_BUFSIZ]; 315#endif 316 317static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) { 318 if(length==-1) length=strlen(inv); 319#if U_CHARSET_FAMILY==U_ASCII_FAMILY 320 inv_next+=length; 321 return utext_openUTF8(ut, inv, length, status); 322#else 323 if(inv_next+length+1>INV_BUFSIZ) { 324 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n", 325 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1)); 326 *status = U_MEMORY_ALLOCATION_ERROR; 327 return NULL; 328 } 329 330 unsigned char *buf = (unsigned char*)inv_buf+inv_next; 331 uprv_aestrncpy(buf, (const uint8_t*)inv, length); 332 inv_next+=length; 333 334#if 0 335 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next); 336#endif 337 338 return utext_openUTF8(ut, (const char*)buf, length, status); 339#endif 340} 341 342 343//--------------------------------------------------------------------------- 344// 345// REGEX_TESTLM Macro + invocation function to simplify writing quick tests 346// for the LookingAt() and Match() functions. 347// 348// usage: 349// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); 350// 351// The expected results are UBool - TRUE or FALSE. 352// The input text is unescaped. The pattern is not. 
353// 354// 355//--------------------------------------------------------------------------- 356 357#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);} 358 359UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 360 const UnicodeString pattern(pat, -1, US_INV); 361 const UnicodeString inputText(text, -1, US_INV); 362 UErrorCode status = U_ZERO_ERROR; 363 UParseError pe; 364 RegexPattern *REPattern = NULL; 365 RegexMatcher *REMatcher = NULL; 366 UBool retVal = TRUE; 367 368 UnicodeString patString(pat, -1, US_INV); 369 REPattern = RegexPattern::compile(patString, 0, pe, status); 370 if (U_FAILURE(status)) { 371 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", 372 line, u_errorName(status)); 373 return FALSE; 374 } 375 if (line==376) { RegexPatternDump(REPattern);} 376 377 UnicodeString inputString(inputText); 378 UnicodeString unEscapedInput = inputString.unescape(); 379 REMatcher = REPattern->matcher(unEscapedInput, status); 380 if (U_FAILURE(status)) { 381 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", 382 line, u_errorName(status)); 383 return FALSE; 384 } 385 386 UBool actualmatch; 387 actualmatch = REMatcher->lookingAt(status); 388 if (U_FAILURE(status)) { 389 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", 390 line, u_errorName(status)); 391 retVal = FALSE; 392 } 393 if (actualmatch != looking) { 394 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); 395 retVal = FALSE; 396 } 397 398 status = U_ZERO_ERROR; 399 actualmatch = REMatcher->matches(status); 400 if (U_FAILURE(status)) { 401 errln("RegexTest failure in matches() at line %d. 
Status = %s\n", 402 line, u_errorName(status)); 403 retVal = FALSE; 404 } 405 if (actualmatch != match) { 406 errln("RegexTest: wrong return from matches() at line %d.\n", line); 407 retVal = FALSE; 408 } 409 410 if (retVal == FALSE) { 411 RegexPatternDump(REPattern); 412 } 413 414 delete REPattern; 415 delete REMatcher; 416 return retVal; 417} 418 419 420UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 421 UText pattern = UTEXT_INITIALIZER; 422 int32_t inputUTF8Length; 423 char *textChars = NULL; 424 UText inputText = UTEXT_INITIALIZER; 425 UErrorCode status = U_ZERO_ERROR; 426 UParseError pe; 427 RegexPattern *REPattern = NULL; 428 RegexMatcher *REMatcher = NULL; 429 UBool retVal = TRUE; 430 431 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status); 432 REPattern = RegexPattern::compile(&pattern, 0, pe, status); 433 if (U_FAILURE(status)) { 434 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n", 435 line, u_errorName(status)); 436 return FALSE; 437 } 438 439 UnicodeString inputString(text, -1, US_INV); 440 UnicodeString unEscapedInput = inputString.unescape(); 441 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); 442 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 443 444 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); 445 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 446 // UTF-8 does not allow unpaired surrogates, so this could actually happen 447 logln("RegexTest unable to convert input to UTF8 at line %d. 
Status = %s\n", line, u_errorName(status)); 448 return TRUE; // not a failure of the Regex engine 449 } 450 status = U_ZERO_ERROR; // buffer overflow 451 textChars = new char[inputUTF8Length+1]; 452 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); 453 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); 454 455 REMatcher = &REPattern->matcher(status)->reset(&inputText); 456 if (U_FAILURE(status)) { 457 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", 458 line, u_errorName(status)); 459 return FALSE; 460 } 461 462 UBool actualmatch; 463 actualmatch = REMatcher->lookingAt(status); 464 if (U_FAILURE(status)) { 465 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", 466 line, u_errorName(status)); 467 retVal = FALSE; 468 } 469 if (actualmatch != looking) { 470 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); 471 retVal = FALSE; 472 } 473 474 status = U_ZERO_ERROR; 475 actualmatch = REMatcher->matches(status); 476 if (U_FAILURE(status)) { 477 errln("RegexTest failure in matches() at line %d (UTF8). 
Status = %s\n", 478 line, u_errorName(status)); 479 retVal = FALSE; 480 } 481 if (actualmatch != match) { 482 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); 483 retVal = FALSE; 484 } 485 486 if (retVal == FALSE) { 487 RegexPatternDump(REPattern); 488 } 489 490 delete REPattern; 491 delete REMatcher; 492 utext_close(&inputText); 493 utext_close(&pattern); 494 delete[] textChars; 495 return retVal; 496} 497 498 499 500//--------------------------------------------------------------------------- 501// 502// REGEX_ERR Macro + invocation function to simplify writing tests 503// regex tests for incorrect patterns 504// 505// usage: 506// REGEX_ERR("pattern", expected error line, column, expected status); 507// 508//--------------------------------------------------------------------------- 509#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__); 510 511void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, 512 UErrorCode expectedStatus, int32_t line) { 513 UnicodeString pattern(pat); 514 515 UErrorCode status = U_ZERO_ERROR; 516 UParseError pe; 517 RegexPattern *callerPattern = NULL; 518 519 // 520 // Compile the caller's pattern 521 // 522 UnicodeString patString(pat); 523 callerPattern = RegexPattern::compile(patString, 0, pe, status); 524 if (status != expectedStatus) { 525 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 526 } else { 527 if (status != U_ZERO_ERROR) { 528 if (pe.line != errLine || pe.offset != errCol) { 529 errln("Line %d: incorrect line/offset from UParseError. 
Expected %d/%d; got %d/%d.\n", 530 line, errLine, errCol, pe.line, pe.offset); 531 } 532 } 533 } 534 535 delete callerPattern; 536 537 // 538 // Compile again, using a UTF-8-based UText 539 // 540 UText patternText = UTEXT_INITIALIZER; 541 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status); 542 callerPattern = RegexPattern::compile(&patternText, 0, pe, status); 543 if (status != expectedStatus) { 544 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 545 } else { 546 if (status != U_ZERO_ERROR) { 547 if (pe.line != errLine || pe.offset != errCol) { 548 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 549 line, errLine, errCol, pe.line, pe.offset); 550 } 551 } 552 } 553 554 delete callerPattern; 555 utext_close(&patternText); 556} 557 558 559 560//--------------------------------------------------------------------------- 561// 562// Basic Check for basic functionality of regex pattern matching. 563// Avoid the use of REGEX_FIND test macro, which has 564// substantial dependencies on basic Regex functionality. 
//
//---------------------------------------------------------------------------
// Every REGEX_TESTLM(pattern, text, looking, match) line below runs the
// pattern against the text through both doRegexLMTest() and
// doRegexLMTestUTF8(); `looking` and `match` are the expected results of
// lookingAt() and matches() respectively.
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode status = U_ZERO_ERROR;
        RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
        // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    // by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}


//---------------------------------------------------------------------------
//
//      UTextBasic   Check for quirks that are specific to the UText
//                   implementation.
//
//---------------------------------------------------------------------------
// Builds a matcher whose pattern and input are both the UTF-8 text "abc",
// then checks that inputText() still reads back as "abc" after reset(), and
// again after resetting the matcher with its own inputText().
void RegexTest::UTextBasic() {
    // Spelled as explicit byte values rather than a literal so the bytes are
    // ASCII/UTF-8 regardless of the compilation codepage.
    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    UErrorCode status = U_ZERO_ERROR;
    UText pattern = UTEXT_INITIALIZER;
    utext_openUTF8(&pattern, str_abc, -1, &status);
    RegexMatcher matcher(&pattern, 0, status);
    REGEX_CHECK_STATUS;

    UText input = UTEXT_INITIALIZER;
    utext_openUTF8(&input, str_abc, -1, &status);
    REGEX_CHECK_STATUS;
    matcher.reset(&input);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    // Reset the matcher using the UText it currently holds as input.
    matcher.reset(matcher.inputText());
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    utext_close(&pattern);
    utext_close(&input);
}


//---------------------------------------------------------------------------
//
//      API_Match   Test that the API for class RegexMatcher
//                  is present and nominally working, but excluding functions
//                  implementing replace operations.
//
//---------------------------------------------------------------------------
void RegexTest::API_Match() {
    UParseError pe;
    UErrorCode status=U_ZERO_ERROR;
    int32_t flags = 0;

    //
    // Debug - slide failing test cases early
    //
#if 0
    {
    }
    return;
#endif

    //
    // Simple pattern compilation
    //
    {
        UnicodeString re("abc");
        RegexPattern *pat2;
        pat2 = RegexPattern::compile(re, flags, pe, status);
        REGEX_CHECK_STATUS;

        UnicodeString inStr1 = "abcdef this is a test";
        UnicodeString instr2 = "not abc";
        UnicodeString empty = "";


        //
        // Matcher creation and reset.
777 // 778 RegexMatcher *m1 = pat2->matcher(inStr1, status); 779 REGEX_CHECK_STATUS; 780 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 781 REGEX_ASSERT(m1->input() == inStr1); 782 m1->reset(instr2); 783 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 784 REGEX_ASSERT(m1->input() == instr2); 785 m1->reset(inStr1); 786 REGEX_ASSERT(m1->input() == inStr1); 787 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 788 m1->reset(empty); 789 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 790 REGEX_ASSERT(m1->input() == empty); 791 REGEX_ASSERT(&m1->pattern() == pat2); 792 793 // 794 // reset(pos, status) 795 // 796 m1->reset(inStr1); 797 m1->reset(4, status); 798 REGEX_CHECK_STATUS; 799 REGEX_ASSERT(m1->input() == inStr1); 800 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 801 802 m1->reset(-1, status); 803 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 804 status = U_ZERO_ERROR; 805 806 m1->reset(0, status); 807 REGEX_CHECK_STATUS; 808 status = U_ZERO_ERROR; 809 810 int32_t len = m1->input().length(); 811 m1->reset(len-1, status); 812 REGEX_CHECK_STATUS; 813 status = U_ZERO_ERROR; 814 815 m1->reset(len, status); 816 REGEX_CHECK_STATUS; 817 status = U_ZERO_ERROR; 818 819 m1->reset(len+1, status); 820 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 821 status = U_ZERO_ERROR; 822 823 // 824 // match(pos, status) 825 // 826 m1->reset(instr2); 827 REGEX_ASSERT(m1->matches(4, status) == TRUE); 828 m1->reset(); 829 REGEX_ASSERT(m1->matches(3, status) == FALSE); 830 m1->reset(); 831 REGEX_ASSERT(m1->matches(5, status) == FALSE); 832 REGEX_ASSERT(m1->matches(4, status) == TRUE); 833 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 834 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 835 836 // Match() at end of string should fail, but should not 837 // be an error. 838 status = U_ZERO_ERROR; 839 len = m1->input().length(); 840 REGEX_ASSERT(m1->matches(len, status) == FALSE); 841 REGEX_CHECK_STATUS; 842 843 // Match beyond end of string should fail with an error. 
844 status = U_ZERO_ERROR; 845 REGEX_ASSERT(m1->matches(len+1, status) == FALSE); 846 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 847 848 // Successful match at end of string. 849 { 850 status = U_ZERO_ERROR; 851 RegexMatcher m("A?", 0, status); // will match zero length string. 852 REGEX_CHECK_STATUS; 853 m.reset(inStr1); 854 len = inStr1.length(); 855 REGEX_ASSERT(m.matches(len, status) == TRUE); 856 REGEX_CHECK_STATUS; 857 m.reset(empty); 858 REGEX_ASSERT(m.matches(0, status) == TRUE); 859 REGEX_CHECK_STATUS; 860 } 861 862 863 // 864 // lookingAt(pos, status) 865 // 866 status = U_ZERO_ERROR; 867 m1->reset(instr2); // "not abc" 868 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 869 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 870 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 871 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 872 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 873 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 874 status = U_ZERO_ERROR; 875 len = m1->input().length(); 876 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); 877 REGEX_CHECK_STATUS; 878 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); 879 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 880 881 delete m1; 882 delete pat2; 883 } 884 885 886 // 887 // Capture Group. 
888 // RegexMatcher::start(); 889 // RegexMatcher::end(); 890 // RegexMatcher::groupCount(); 891 // 892 { 893 int32_t flags=0; 894 UParseError pe; 895 UErrorCode status=U_ZERO_ERROR; 896 897 UnicodeString re("01(23(45)67)(.*)"); 898 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 899 REGEX_CHECK_STATUS; 900 UnicodeString data = "0123456789"; 901 902 RegexMatcher *matcher = pat->matcher(data, status); 903 REGEX_CHECK_STATUS; 904 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 905 static const int32_t matchStarts[] = {0, 2, 4, 8}; 906 static const int32_t matchEnds[] = {10, 8, 6, 10}; 907 int32_t i; 908 for (i=0; i<4; i++) { 909 int32_t actualStart = matcher->start(i, status); 910 REGEX_CHECK_STATUS; 911 if (actualStart != matchStarts[i]) { 912 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 913 __LINE__, i, matchStarts[i], actualStart); 914 } 915 int32_t actualEnd = matcher->end(i, status); 916 REGEX_CHECK_STATUS; 917 if (actualEnd != matchEnds[i]) { 918 errln("RegexTest failure at line %d index %d. 
Expected %d, got %d\n", 919 __LINE__, i, matchEnds[i], actualEnd); 920 } 921 } 922 923 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 924 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 925 926 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 927 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 928 matcher->reset(); 929 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 930 931 matcher->lookingAt(status); 932 REGEX_ASSERT(matcher->group(status) == "0123456789"); 933 REGEX_ASSERT(matcher->group(0, status) == "0123456789"); 934 REGEX_ASSERT(matcher->group(1, status) == "234567" ); 935 REGEX_ASSERT(matcher->group(2, status) == "45" ); 936 REGEX_ASSERT(matcher->group(3, status) == "89" ); 937 REGEX_CHECK_STATUS; 938 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 939 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 940 matcher->reset(); 941 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 942 943 delete matcher; 944 delete pat; 945 946 } 947 948 // 949 // find 950 // 951 { 952 int32_t flags=0; 953 UParseError pe; 954 UErrorCode status=U_ZERO_ERROR; 955 956 UnicodeString re("abc"); 957 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 958 REGEX_CHECK_STATUS; 959 UnicodeString data = ".abc..abc...abc.."; 960 // 012345678901234567 961 962 RegexMatcher *matcher = pat->matcher(data, status); 963 REGEX_CHECK_STATUS; 964 REGEX_ASSERT(matcher->find()); 965 REGEX_ASSERT(matcher->start(status) == 1); 966 REGEX_ASSERT(matcher->find()); 967 REGEX_ASSERT(matcher->start(status) == 6); 968 REGEX_ASSERT(matcher->find()); 969 REGEX_ASSERT(matcher->start(status) == 12); 970 REGEX_ASSERT(matcher->find() == FALSE); 971 REGEX_ASSERT(matcher->find() == FALSE); 972 973 matcher->reset(); 974 REGEX_ASSERT(matcher->find()); 975 REGEX_ASSERT(matcher->start(status) == 1); 976 977 
REGEX_ASSERT(matcher->find(0, status)); 978 REGEX_ASSERT(matcher->start(status) == 1); 979 REGEX_ASSERT(matcher->find(1, status)); 980 REGEX_ASSERT(matcher->start(status) == 1); 981 REGEX_ASSERT(matcher->find(2, status)); 982 REGEX_ASSERT(matcher->start(status) == 6); 983 REGEX_ASSERT(matcher->find(12, status)); 984 REGEX_ASSERT(matcher->start(status) == 12); 985 REGEX_ASSERT(matcher->find(13, status) == FALSE); 986 REGEX_ASSERT(matcher->find(16, status) == FALSE); 987 REGEX_ASSERT(matcher->find(17, status) == FALSE); 988 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 989 990 status = U_ZERO_ERROR; 991 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 992 status = U_ZERO_ERROR; 993 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 994 995 REGEX_ASSERT(matcher->groupCount() == 0); 996 997 delete matcher; 998 delete pat; 999 } 1000 1001 1002 // 1003 // find, with \G in pattern (true if at the end of a previous match). 1004 // 1005 { 1006 int32_t flags=0; 1007 UParseError pe; 1008 UErrorCode status=U_ZERO_ERROR; 1009 1010 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); 1011 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1012 REGEX_CHECK_STATUS; 1013 UnicodeString data = ".abcabc.abc.."; 1014 // 012345678901234567 1015 1016 RegexMatcher *matcher = pat->matcher(data, status); 1017 REGEX_CHECK_STATUS; 1018 REGEX_ASSERT(matcher->find()); 1019 REGEX_ASSERT(matcher->start(status) == 0); 1020 REGEX_ASSERT(matcher->start(1, status) == -1); 1021 REGEX_ASSERT(matcher->start(2, status) == 1); 1022 1023 REGEX_ASSERT(matcher->find()); 1024 REGEX_ASSERT(matcher->start(status) == 4); 1025 REGEX_ASSERT(matcher->start(1, status) == 4); 1026 REGEX_ASSERT(matcher->start(2, status) == -1); 1027 REGEX_CHECK_STATUS; 1028 1029 delete matcher; 1030 delete pat; 1031 } 1032 1033 // 1034 // find with zero length matches, match position should bump ahead 1035 // to prevent loops. 
1036 // 1037 { 1038 int32_t i; 1039 UErrorCode status=U_ZERO_ERROR; 1040 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 1041 // using an always-true look-ahead. 1042 REGEX_CHECK_STATUS; 1043 UnicodeString s(" "); 1044 m.reset(s); 1045 for (i=0; ; i++) { 1046 if (m.find() == FALSE) { 1047 break; 1048 } 1049 REGEX_ASSERT(m.start(status) == i); 1050 REGEX_ASSERT(m.end(status) == i); 1051 } 1052 REGEX_ASSERT(i==5); 1053 1054 // Check that the bump goes over surrogate pairs OK 1055 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); 1056 s = s.unescape(); 1057 m.reset(s); 1058 for (i=0; ; i+=2) { 1059 if (m.find() == FALSE) { 1060 break; 1061 } 1062 REGEX_ASSERT(m.start(status) == i); 1063 REGEX_ASSERT(m.end(status) == i); 1064 } 1065 REGEX_ASSERT(i==10); 1066 } 1067 { 1068 // find() loop breaking test. 1069 // with pattern of /.?/, should see a series of one char matches, then a single 1070 // match of zero length at the end of the input string. 1071 int32_t i; 1072 UErrorCode status=U_ZERO_ERROR; 1073 RegexMatcher m(".?", 0, status); 1074 REGEX_CHECK_STATUS; 1075 UnicodeString s(" "); 1076 m.reset(s); 1077 for (i=0; ; i++) { 1078 if (m.find() == FALSE) { 1079 break; 1080 } 1081 REGEX_ASSERT(m.start(status) == i); 1082 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 1083 } 1084 REGEX_ASSERT(i==5); 1085 } 1086 1087 1088 // 1089 // Matchers with no input string behave as if they had an empty input string. 
1090 // 1091 1092 { 1093 UErrorCode status = U_ZERO_ERROR; 1094 RegexMatcher m(".?", 0, status); 1095 REGEX_CHECK_STATUS; 1096 REGEX_ASSERT(m.find()); 1097 REGEX_ASSERT(m.start(status) == 0); 1098 REGEX_ASSERT(m.input() == ""); 1099 } 1100 { 1101 UErrorCode status = U_ZERO_ERROR; 1102 RegexPattern *p = RegexPattern::compile(".", 0, status); 1103 RegexMatcher *m = p->matcher(status); 1104 REGEX_CHECK_STATUS; 1105 1106 REGEX_ASSERT(m->find() == FALSE); 1107 REGEX_ASSERT(m->input() == ""); 1108 delete m; 1109 delete p; 1110 } 1111 1112 // 1113 // Regions 1114 // 1115 { 1116 UErrorCode status = U_ZERO_ERROR; 1117 UnicodeString testString("This is test data"); 1118 RegexMatcher m(".*", testString, 0, status); 1119 REGEX_CHECK_STATUS; 1120 REGEX_ASSERT(m.regionStart() == 0); 1121 REGEX_ASSERT(m.regionEnd() == testString.length()); 1122 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1123 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1124 1125 m.region(2,4, status); 1126 REGEX_CHECK_STATUS; 1127 REGEX_ASSERT(m.matches(status)); 1128 REGEX_ASSERT(m.start(status)==2); 1129 REGEX_ASSERT(m.end(status)==4); 1130 REGEX_CHECK_STATUS; 1131 1132 m.reset(); 1133 REGEX_ASSERT(m.regionStart() == 0); 1134 REGEX_ASSERT(m.regionEnd() == testString.length()); 1135 1136 UnicodeString shorterString("short"); 1137 m.reset(shorterString); 1138 REGEX_ASSERT(m.regionStart() == 0); 1139 REGEX_ASSERT(m.regionEnd() == shorterString.length()); 1140 1141 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1142 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 1143 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1144 REGEX_ASSERT(&m == &m.reset()); 1145 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1146 1147 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 1148 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1149 REGEX_ASSERT(&m == &m.reset()); 1150 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1151 1152 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1153 REGEX_ASSERT(&m == 
&m.useTransparentBounds(TRUE)); 1154 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1155 REGEX_ASSERT(&m == &m.reset()); 1156 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1157 1158 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 1159 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1160 REGEX_ASSERT(&m == &m.reset()); 1161 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1162 1163 } 1164 1165 // 1166 // hitEnd() and requireEnd() 1167 // 1168 { 1169 UErrorCode status = U_ZERO_ERROR; 1170 UnicodeString testString("aabb"); 1171 RegexMatcher m1(".*", testString, 0, status); 1172 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 1173 REGEX_ASSERT(m1.hitEnd() == TRUE); 1174 REGEX_ASSERT(m1.requireEnd() == FALSE); 1175 REGEX_CHECK_STATUS; 1176 1177 status = U_ZERO_ERROR; 1178 RegexMatcher m2("a*", testString, 0, status); 1179 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 1180 REGEX_ASSERT(m2.hitEnd() == FALSE); 1181 REGEX_ASSERT(m2.requireEnd() == FALSE); 1182 REGEX_CHECK_STATUS; 1183 1184 status = U_ZERO_ERROR; 1185 RegexMatcher m3(".*$", testString, 0, status); 1186 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 1187 REGEX_ASSERT(m3.hitEnd() == TRUE); 1188 REGEX_ASSERT(m3.requireEnd() == TRUE); 1189 REGEX_CHECK_STATUS; 1190 } 1191 1192 1193 // 1194 // Compilation error on reset with UChar * 1195 // These were a hazard that people were stumbling over with runtime errors. 1196 // Changed them to compiler errors by adding private methods that more closely 1197 // matched the incorrect use of the functions. 1198 // 1199#if 0 1200 { 1201 UErrorCode status = U_ZERO_ERROR; 1202 UChar ucharString[20]; 1203 RegexMatcher m(".", 0, status); 1204 m.reset(ucharString); // should not compile. 1205 1206 RegexPattern *p = RegexPattern::compile(".", 0, status); 1207 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. 1208 1209 RegexMatcher m3(".", ucharString, 0, status); // Should not compile 1210 } 1211#endif 1212 1213 // 1214 // Time Outs. 
1215 // Note: These tests will need to be changed when the regexp engine is 1216 // able to detect and cut short the exponential time behavior on 1217 // this type of match. 1218 // 1219 { 1220 UErrorCode status = U_ZERO_ERROR; 1221 // Enough 'a's in the string to cause the match to time out. 1222 // (Each on additonal 'a' doubles the time) 1223 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); 1224 RegexMatcher matcher("(a+)+b", testString, 0, status); 1225 REGEX_CHECK_STATUS; 1226 REGEX_ASSERT(matcher.getTimeLimit() == 0); 1227 matcher.setTimeLimit(100, status); 1228 REGEX_ASSERT(matcher.getTimeLimit() == 100); 1229 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1230 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 1231 } 1232 { 1233 UErrorCode status = U_ZERO_ERROR; 1234 // Few enough 'a's to slip in under the time limit. 1235 UnicodeString testString("aaaaaaaaaaaaaaaaaa"); 1236 RegexMatcher matcher("(a+)+b", testString, 0, status); 1237 REGEX_CHECK_STATUS; 1238 matcher.setTimeLimit(100, status); 1239 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1240 REGEX_CHECK_STATUS; 1241 } 1242 1243 // 1244 // Stack Limits 1245 // 1246 { 1247 UErrorCode status = U_ZERO_ERROR; 1248 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' 1249 1250 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations 1251 // of the '+', and makes the stack frames larger. 
1252 RegexMatcher matcher("(A)+A$", testString, 0, status); 1253 1254 // With the default stack, this match should fail to run 1255 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1256 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1257 1258 // With unlimited stack, it should run 1259 status = U_ZERO_ERROR; 1260 matcher.setStackLimit(0, status); 1261 REGEX_CHECK_STATUS; 1262 REGEX_ASSERT(matcher.lookingAt(status) == TRUE); 1263 REGEX_CHECK_STATUS; 1264 REGEX_ASSERT(matcher.getStackLimit() == 0); 1265 1266 // With a limited stack, it the match should fail 1267 status = U_ZERO_ERROR; 1268 matcher.setStackLimit(10000, status); 1269 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1270 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1271 REGEX_ASSERT(matcher.getStackLimit() == 10000); 1272 } 1273 1274 // A pattern that doesn't save state should work with 1275 // a minimal sized stack 1276 { 1277 UErrorCode status = U_ZERO_ERROR; 1278 UnicodeString testString = "abc"; 1279 RegexMatcher matcher("abc", testString, 0, status); 1280 REGEX_CHECK_STATUS; 1281 matcher.setStackLimit(30, status); 1282 REGEX_CHECK_STATUS; 1283 REGEX_ASSERT(matcher.matches(status) == TRUE); 1284 REGEX_CHECK_STATUS; 1285 REGEX_ASSERT(matcher.getStackLimit() == 30); 1286 1287 // Negative stack sizes should fail 1288 status = U_ZERO_ERROR; 1289 matcher.setStackLimit(1000, status); 1290 REGEX_CHECK_STATUS; 1291 matcher.setStackLimit(-1, status); 1292 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 1293 REGEX_ASSERT(matcher.getStackLimit() == 1000); 1294 } 1295 1296 1297} 1298 1299 1300 1301 1302 1303 1304//--------------------------------------------------------------------------- 1305// 1306// API_Replace API test for class RegexMatcher, testing the 1307// Replace family of functions. 
1308// 1309//--------------------------------------------------------------------------- 1310void RegexTest::API_Replace() { 1311 // 1312 // Replace 1313 // 1314 int32_t flags=0; 1315 UParseError pe; 1316 UErrorCode status=U_ZERO_ERROR; 1317 1318 UnicodeString re("abc"); 1319 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1320 REGEX_CHECK_STATUS; 1321 UnicodeString data = ".abc..abc...abc.."; 1322 // 012345678901234567 1323 RegexMatcher *matcher = pat->matcher(data, status); 1324 1325 // 1326 // Plain vanilla matches. 1327 // 1328 UnicodeString dest; 1329 dest = matcher->replaceFirst("yz", status); 1330 REGEX_CHECK_STATUS; 1331 REGEX_ASSERT(dest == ".yz..abc...abc.."); 1332 1333 dest = matcher->replaceAll("yz", status); 1334 REGEX_CHECK_STATUS; 1335 REGEX_ASSERT(dest == ".yz..yz...yz.."); 1336 1337 // 1338 // Plain vanilla non-matches. 1339 // 1340 UnicodeString d2 = ".abx..abx...abx.."; 1341 matcher->reset(d2); 1342 dest = matcher->replaceFirst("yz", status); 1343 REGEX_CHECK_STATUS; 1344 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1345 1346 dest = matcher->replaceAll("yz", status); 1347 REGEX_CHECK_STATUS; 1348 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1349 1350 // 1351 // Empty source string 1352 // 1353 UnicodeString d3 = ""; 1354 matcher->reset(d3); 1355 dest = matcher->replaceFirst("yz", status); 1356 REGEX_CHECK_STATUS; 1357 REGEX_ASSERT(dest == ""); 1358 1359 dest = matcher->replaceAll("yz", status); 1360 REGEX_CHECK_STATUS; 1361 REGEX_ASSERT(dest == ""); 1362 1363 // 1364 // Empty substitution string 1365 // 1366 matcher->reset(data); // ".abc..abc...abc.." 
1367 dest = matcher->replaceFirst("", status); 1368 REGEX_CHECK_STATUS; 1369 REGEX_ASSERT(dest == "...abc...abc.."); 1370 1371 dest = matcher->replaceAll("", status); 1372 REGEX_CHECK_STATUS; 1373 REGEX_ASSERT(dest == "........"); 1374 1375 // 1376 // match whole string 1377 // 1378 UnicodeString d4 = "abc"; 1379 matcher->reset(d4); 1380 dest = matcher->replaceFirst("xyz", status); 1381 REGEX_CHECK_STATUS; 1382 REGEX_ASSERT(dest == "xyz"); 1383 1384 dest = matcher->replaceAll("xyz", status); 1385 REGEX_CHECK_STATUS; 1386 REGEX_ASSERT(dest == "xyz"); 1387 1388 // 1389 // Capture Group, simple case 1390 // 1391 UnicodeString re2("a(..)"); 1392 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); 1393 REGEX_CHECK_STATUS; 1394 UnicodeString d5 = "abcdefg"; 1395 RegexMatcher *matcher2 = pat2->matcher(d5, status); 1396 REGEX_CHECK_STATUS; 1397 dest = matcher2->replaceFirst("$1$1", status); 1398 REGEX_CHECK_STATUS; 1399 REGEX_ASSERT(dest == "bcbcdefg"); 1400 1401 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); 1402 REGEX_CHECK_STATUS; 1403 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1404 1405 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1406 REGEX_CHECK_STATUS; 1407 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg"); 1408 1409 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); 1410 replacement = replacement.unescape(); 1411 dest = matcher2->replaceFirst(replacement, status); 1412 REGEX_CHECK_STATUS; 1413 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1414 1415 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); 1416 1417 1418 // 1419 // Replacement String with \u hex escapes 1420 // 1421 { 1422 UnicodeString src = "abc 1 abc 2 abc 3"; 1423 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); 1424 matcher->reset(src); 1425 UnicodeString result = 
matcher->replaceAll(substitute, status); 1426 REGEX_CHECK_STATUS; 1427 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); 1428 } 1429 { 1430 UnicodeString src = "abc !"; 1431 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); 1432 matcher->reset(src); 1433 UnicodeString result = matcher->replaceAll(substitute, status); 1434 REGEX_CHECK_STATUS; 1435 UnicodeString expected = UnicodeString("--"); 1436 expected.append((UChar32)0x10000); 1437 expected.append("-- !"); 1438 REGEX_ASSERT(result == expected); 1439 } 1440 // TODO: need more through testing of capture substitutions. 1441 1442 // Bug 4057 1443 // 1444 { 1445 status = U_ZERO_ERROR; 1446 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; 1447 RegexMatcher m("ss(.*?)ee", 0, status); 1448 REGEX_CHECK_STATUS; 1449 UnicodeString result; 1450 1451 // Multiple finds do NOT bump up the previous appendReplacement postion. 1452 m.reset(s); 1453 m.find(); 1454 m.find(); 1455 m.appendReplacement(result, "ooh", status); 1456 REGEX_CHECK_STATUS; 1457 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1458 1459 // After a reset into the interior of a string, appendReplacemnt still starts at beginning. 1460 status = U_ZERO_ERROR; 1461 result.truncate(0); 1462 m.reset(10, status); 1463 m.find(); 1464 m.find(); 1465 m.appendReplacement(result, "ooh", status); 1466 REGEX_CHECK_STATUS; 1467 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1468 1469 // find() at interior of string, appendReplacemnt still starts at beginning. 
1470 status = U_ZERO_ERROR; 1471 result.truncate(0); 1472 m.reset(); 1473 m.find(10, status); 1474 m.find(); 1475 m.appendReplacement(result, "ooh", status); 1476 REGEX_CHECK_STATUS; 1477 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1478 1479 m.appendTail(result); 1480 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); 1481 1482 } 1483 1484 delete matcher2; 1485 delete pat2; 1486 delete matcher; 1487 delete pat; 1488} 1489 1490 1491//--------------------------------------------------------------------------- 1492// 1493// API_Pattern Test that the API for class RegexPattern is 1494// present and nominally working. 1495// 1496//--------------------------------------------------------------------------- 1497void RegexTest::API_Pattern() { 1498 RegexPattern pata; // Test default constructor to not crash. 1499 RegexPattern patb; 1500 1501 REGEX_ASSERT(pata == patb); 1502 REGEX_ASSERT(pata == pata); 1503 1504 UnicodeString re1("abc[a-l][m-z]"); 1505 UnicodeString re2("def"); 1506 UErrorCode status = U_ZERO_ERROR; 1507 UParseError pe; 1508 1509 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); 1510 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); 1511 REGEX_CHECK_STATUS; 1512 REGEX_ASSERT(*pat1 == *pat1); 1513 REGEX_ASSERT(*pat1 != pata); 1514 1515 // Assign 1516 patb = *pat1; 1517 REGEX_ASSERT(patb == *pat1); 1518 1519 // Copy Construct 1520 RegexPattern patc(*pat1); 1521 REGEX_ASSERT(patc == *pat1); 1522 REGEX_ASSERT(patb == patc); 1523 REGEX_ASSERT(pat1 != pat2); 1524 patb = *pat2; 1525 REGEX_ASSERT(patb != patc); 1526 REGEX_ASSERT(patb == *pat2); 1527 1528 // Compile with no flags. 
1529 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); 1530 REGEX_ASSERT(*pat1a == *pat1); 1531 1532 REGEX_ASSERT(pat1a->flags() == 0); 1533 1534 // Compile with different flags should be not equal 1535 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); 1536 REGEX_CHECK_STATUS; 1537 1538 REGEX_ASSERT(*pat1b != *pat1a); 1539 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 1540 REGEX_ASSERT(pat1a->flags() == 0); 1541 delete pat1b; 1542 1543 // clone 1544 RegexPattern *pat1c = pat1->clone(); 1545 REGEX_ASSERT(*pat1c == *pat1); 1546 REGEX_ASSERT(*pat1c != *pat2); 1547 1548 delete pat1c; 1549 delete pat1a; 1550 delete pat1; 1551 delete pat2; 1552 1553 1554 // 1555 // Verify that a matcher created from a cloned pattern works. 1556 // (Jitterbug 3423) 1557 // 1558 { 1559 UErrorCode status = U_ZERO_ERROR; 1560 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status); 1561 RegexPattern *pClone = pSource->clone(); 1562 delete pSource; 1563 RegexMatcher *mFromClone = pClone->matcher(status); 1564 REGEX_CHECK_STATUS; 1565 UnicodeString s = "Hello World"; 1566 mFromClone->reset(s); 1567 REGEX_ASSERT(mFromClone->find() == TRUE); 1568 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 1569 REGEX_ASSERT(mFromClone->find() == TRUE); 1570 REGEX_ASSERT(mFromClone->group(status) == "World"); 1571 REGEX_ASSERT(mFromClone->find() == FALSE); 1572 delete mFromClone; 1573 delete pClone; 1574 } 1575 1576 // 1577 // matches convenience API 1578 // 1579 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); 1580 REGEX_CHECK_STATUS; 1581 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 1582 REGEX_CHECK_STATUS; 1583 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 1584 REGEX_CHECK_STATUS; 1585 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 1586 REGEX_CHECK_STATUS; 1587 
REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 1588 REGEX_CHECK_STATUS; 1589 status = U_INDEX_OUTOFBOUNDS_ERROR; 1590 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 1591 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1592 1593 1594 // 1595 // Split() 1596 // 1597 status = U_ZERO_ERROR; 1598 pat1 = RegexPattern::compile(" +", pe, status); 1599 REGEX_CHECK_STATUS; 1600 UnicodeString fields[10]; 1601 1602 int32_t n; 1603 n = pat1->split("Now is the time", fields, 10, status); 1604 REGEX_CHECK_STATUS; 1605 REGEX_ASSERT(n==4); 1606 REGEX_ASSERT(fields[0]=="Now"); 1607 REGEX_ASSERT(fields[1]=="is"); 1608 REGEX_ASSERT(fields[2]=="the"); 1609 REGEX_ASSERT(fields[3]=="time"); 1610 REGEX_ASSERT(fields[4]==""); 1611 1612 n = pat1->split("Now is the time", fields, 2, status); 1613 REGEX_CHECK_STATUS; 1614 REGEX_ASSERT(n==2); 1615 REGEX_ASSERT(fields[0]=="Now"); 1616 REGEX_ASSERT(fields[1]=="is the time"); 1617 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 1618 1619 fields[1] = "*"; 1620 status = U_ZERO_ERROR; 1621 n = pat1->split("Now is the time", fields, 1, status); 1622 REGEX_CHECK_STATUS; 1623 REGEX_ASSERT(n==1); 1624 REGEX_ASSERT(fields[0]=="Now is the time"); 1625 REGEX_ASSERT(fields[1]=="*"); 1626 status = U_ZERO_ERROR; 1627 1628 n = pat1->split(" Now is the time ", fields, 10, status); 1629 REGEX_CHECK_STATUS; 1630 REGEX_ASSERT(n==6); 1631 REGEX_ASSERT(fields[0]==""); 1632 REGEX_ASSERT(fields[1]=="Now"); 1633 REGEX_ASSERT(fields[2]=="is"); 1634 REGEX_ASSERT(fields[3]=="the"); 1635 REGEX_ASSERT(fields[4]=="time"); 1636 REGEX_ASSERT(fields[5]==""); 1637 1638 n = pat1->split(" ", fields, 10, status); 1639 REGEX_CHECK_STATUS; 1640 REGEX_ASSERT(n==2); 1641 REGEX_ASSERT(fields[0]==""); 1642 REGEX_ASSERT(fields[1]==""); 1643 1644 fields[0] = "foo"; 1645 n = pat1->split("", fields, 10, status); 1646 REGEX_CHECK_STATUS; 1647 REGEX_ASSERT(n==0); 1648 REGEX_ASSERT(fields[0]=="foo"); 1649 1650 delete 
pat1; 1651 1652 // split, with a pattern with (capture) 1653 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status); 1654 REGEX_CHECK_STATUS; 1655 1656 status = U_ZERO_ERROR; 1657 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 1658 REGEX_CHECK_STATUS; 1659 REGEX_ASSERT(n==7); 1660 REGEX_ASSERT(fields[0]==""); 1661 REGEX_ASSERT(fields[1]=="a"); 1662 REGEX_ASSERT(fields[2]=="Now is "); 1663 REGEX_ASSERT(fields[3]=="b"); 1664 REGEX_ASSERT(fields[4]=="the time"); 1665 REGEX_ASSERT(fields[5]=="c"); 1666 REGEX_ASSERT(fields[6]==""); 1667 REGEX_ASSERT(status==U_ZERO_ERROR); 1668 1669 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 1670 REGEX_CHECK_STATUS; 1671 REGEX_ASSERT(n==7); 1672 REGEX_ASSERT(fields[0]==" "); 1673 REGEX_ASSERT(fields[1]=="a"); 1674 REGEX_ASSERT(fields[2]=="Now is "); 1675 REGEX_ASSERT(fields[3]=="b"); 1676 REGEX_ASSERT(fields[4]=="the time"); 1677 REGEX_ASSERT(fields[5]=="c"); 1678 REGEX_ASSERT(fields[6]==""); 1679 1680 status = U_ZERO_ERROR; 1681 fields[6] = "foo"; 1682 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 1683 REGEX_CHECK_STATUS; 1684 REGEX_ASSERT(n==6); 1685 REGEX_ASSERT(fields[0]==" "); 1686 REGEX_ASSERT(fields[1]=="a"); 1687 REGEX_ASSERT(fields[2]=="Now is "); 1688 REGEX_ASSERT(fields[3]=="b"); 1689 REGEX_ASSERT(fields[4]=="the time"); 1690 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter. 
1691 REGEX_ASSERT(fields[6]=="foo"); 1692 1693 status = U_ZERO_ERROR; 1694 fields[5] = "foo"; 1695 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 1696 REGEX_CHECK_STATUS; 1697 REGEX_ASSERT(n==5); 1698 REGEX_ASSERT(fields[0]==" "); 1699 REGEX_ASSERT(fields[1]=="a"); 1700 REGEX_ASSERT(fields[2]=="Now is "); 1701 REGEX_ASSERT(fields[3]=="b"); 1702 REGEX_ASSERT(fields[4]=="the time<c>"); 1703 REGEX_ASSERT(fields[5]=="foo"); 1704 1705 status = U_ZERO_ERROR; 1706 fields[5] = "foo"; 1707 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 1708 REGEX_CHECK_STATUS; 1709 REGEX_ASSERT(n==5); 1710 REGEX_ASSERT(fields[0]==" "); 1711 REGEX_ASSERT(fields[1]=="a"); 1712 REGEX_ASSERT(fields[2]=="Now is "); 1713 REGEX_ASSERT(fields[3]=="b"); 1714 REGEX_ASSERT(fields[4]=="the time"); 1715 REGEX_ASSERT(fields[5]=="foo"); 1716 1717 status = U_ZERO_ERROR; 1718 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 1719 REGEX_CHECK_STATUS; 1720 REGEX_ASSERT(n==4); 1721 REGEX_ASSERT(fields[0]==" "); 1722 REGEX_ASSERT(fields[1]=="a"); 1723 REGEX_ASSERT(fields[2]=="Now is "); 1724 REGEX_ASSERT(fields[3]=="the time<c>"); 1725 status = U_ZERO_ERROR; 1726 delete pat1; 1727 1728 pat1 = RegexPattern::compile("([-,])", pe, status); 1729 REGEX_CHECK_STATUS; 1730 n = pat1->split("1-10,20", fields, 10, status); 1731 REGEX_CHECK_STATUS; 1732 REGEX_ASSERT(n==5); 1733 REGEX_ASSERT(fields[0]=="1"); 1734 REGEX_ASSERT(fields[1]=="-"); 1735 REGEX_ASSERT(fields[2]=="10"); 1736 REGEX_ASSERT(fields[3]==","); 1737 REGEX_ASSERT(fields[4]=="20"); 1738 delete pat1; 1739 1740 // Test split of string with empty trailing fields 1741 pat1 = RegexPattern::compile(",", pe, status); 1742 REGEX_CHECK_STATUS; 1743 n = pat1->split("a,b,c,", fields, 10, status); 1744 REGEX_CHECK_STATUS; 1745 REGEX_ASSERT(n==4); 1746 REGEX_ASSERT(fields[0]=="a"); 1747 REGEX_ASSERT(fields[1]=="b"); 1748 REGEX_ASSERT(fields[2]=="c"); 1749 REGEX_ASSERT(fields[3]==""); 1750 1751 n = pat1->split("a,,,", 
fields, 10, status); 1752 REGEX_CHECK_STATUS; 1753 REGEX_ASSERT(n==4); 1754 REGEX_ASSERT(fields[0]=="a"); 1755 REGEX_ASSERT(fields[1]==""); 1756 REGEX_ASSERT(fields[2]==""); 1757 REGEX_ASSERT(fields[3]==""); 1758 delete pat1; 1759 1760 // Split Separator with zero length match. 1761 pat1 = RegexPattern::compile(":?", pe, status); 1762 REGEX_CHECK_STATUS; 1763 n = pat1->split("abc", fields, 10, status); 1764 REGEX_CHECK_STATUS; 1765 REGEX_ASSERT(n==5); 1766 REGEX_ASSERT(fields[0]==""); 1767 REGEX_ASSERT(fields[1]=="a"); 1768 REGEX_ASSERT(fields[2]=="b"); 1769 REGEX_ASSERT(fields[3]=="c"); 1770 REGEX_ASSERT(fields[4]==""); 1771 1772 delete pat1; 1773 1774 // 1775 // RegexPattern::pattern() 1776 // 1777 pat1 = new RegexPattern(); 1778 REGEX_ASSERT(pat1->pattern() == ""); 1779 delete pat1; 1780 1781 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1782 REGEX_CHECK_STATUS; 1783 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 1784 delete pat1; 1785 1786 1787 // 1788 // classID functions 1789 // 1790 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1791 REGEX_CHECK_STATUS; 1792 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); 1793 REGEX_ASSERT(pat1->getDynamicClassID() != NULL); 1794 UnicodeString Hello("Hello, world."); 1795 RegexMatcher *m = pat1->matcher(Hello, status); 1796 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); 1797 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); 1798 REGEX_ASSERT(m->getDynamicClassID() != NULL); 1799 delete m; 1800 delete pat1; 1801 1802} 1803 1804//--------------------------------------------------------------------------- 1805// 1806// API_Match_UTF8 Test that the alternate engine for class RegexMatcher 1807// is present and working, but excluding functions 1808// implementing replace operations. 
1809// 1810//--------------------------------------------------------------------------- 1811void RegexTest::API_Match_UTF8() { 1812 UParseError pe; 1813 UErrorCode status=U_ZERO_ERROR; 1814 int32_t flags = 0; 1815 1816 // 1817 // Debug - slide failing test cases early 1818 // 1819#if 0 1820 { 1821 } 1822 return; 1823#endif 1824 1825 // 1826 // Simple pattern compilation 1827 // 1828 { 1829 UText re = UTEXT_INITIALIZER; 1830 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 1831 REGEX_VERBOSE_TEXT(&re); 1832 RegexPattern *pat2; 1833 pat2 = RegexPattern::compile(&re, flags, pe, status); 1834 REGEX_CHECK_STATUS; 1835 1836 UText input1 = UTEXT_INITIALIZER; 1837 UText input2 = UTEXT_INITIALIZER; 1838 UText empty = UTEXT_INITIALIZER; 1839 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status); 1840 REGEX_VERBOSE_TEXT(&input1); 1841 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status); 1842 REGEX_VERBOSE_TEXT(&input2); 1843 utext_openUChars(&empty, NULL, 0, &status); 1844 1845 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */ 1846 int32_t input2Len = strlen("not abc"); 1847 1848 1849 // 1850 // Matcher creation and reset. 
1851 // 1852 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1); 1853 REGEX_CHECK_STATUS; 1854 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1855 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */ 1856 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1857 m1->reset(&input2); 1858 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1859 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */ 1860 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText()); 1861 m1->reset(&input1); 1862 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1863 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1864 m1->reset(&empty); 1865 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1866 REGEX_ASSERT(utext_nativeLength(&empty) == 0); 1867 1868 // 1869 // reset(pos, status) 1870 // 1871 m1->reset(&input1); 1872 m1->reset(4, status); 1873 REGEX_CHECK_STATUS; 1874 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1875 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1876 1877 m1->reset(-1, status); 1878 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1879 status = U_ZERO_ERROR; 1880 1881 m1->reset(0, status); 1882 REGEX_CHECK_STATUS; 1883 status = U_ZERO_ERROR; 1884 1885 m1->reset(input1Len-1, status); 1886 REGEX_CHECK_STATUS; 1887 status = U_ZERO_ERROR; 1888 1889 m1->reset(input1Len, status); 1890 REGEX_CHECK_STATUS; 1891 status = U_ZERO_ERROR; 1892 1893 m1->reset(input1Len+1, status); 1894 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1895 status = U_ZERO_ERROR; 1896 1897 // 1898 // match(pos, status) 1899 // 1900 m1->reset(&input2); 1901 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1902 m1->reset(); 1903 REGEX_ASSERT(m1->matches(3, status) == FALSE); 1904 m1->reset(); 1905 REGEX_ASSERT(m1->matches(5, status) == FALSE); 1906 REGEX_ASSERT(m1->matches(4, status) == 
TRUE); 1907 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 1908 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1909 1910 // Match() at end of string should fail, but should not 1911 // be an error. 1912 status = U_ZERO_ERROR; 1913 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); 1914 REGEX_CHECK_STATUS; 1915 1916 // Match beyond end of string should fail with an error. 1917 status = U_ZERO_ERROR; 1918 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); 1919 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1920 1921 // Successful match at end of string. 1922 { 1923 status = U_ZERO_ERROR; 1924 RegexMatcher m("A?", 0, status); // will match zero length string. 1925 REGEX_CHECK_STATUS; 1926 m.reset(&input1); 1927 REGEX_ASSERT(m.matches(input1Len, status) == TRUE); 1928 REGEX_CHECK_STATUS; 1929 m.reset(&empty); 1930 REGEX_ASSERT(m.matches(0, status) == TRUE); 1931 REGEX_CHECK_STATUS; 1932 } 1933 1934 1935 // 1936 // lookingAt(pos, status) 1937 // 1938 status = U_ZERO_ERROR; 1939 m1->reset(&input2); // "not abc" 1940 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1941 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 1942 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 1943 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1944 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 1945 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1946 status = U_ZERO_ERROR; 1947 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); 1948 REGEX_CHECK_STATUS; 1949 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); 1950 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1951 1952 delete m1; 1953 delete pat2; 1954 1955 utext_close(&re); 1956 utext_close(&input1); 1957 utext_close(&input2); 1958 utext_close(&empty); 1959 } 1960 1961 1962 // 1963 // Capture Group. 
1964 // RegexMatcher::start(); 1965 // RegexMatcher::end(); 1966 // RegexMatcher::groupCount(); 1967 // 1968 { 1969 int32_t flags=0; 1970 UParseError pe; 1971 UErrorCode status=U_ZERO_ERROR; 1972 UText re=UTEXT_INITIALIZER; 1973 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */ 1974 utext_openUTF8(&re, str_01234567_pat, -1, &status); 1975 1976 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 1977 REGEX_CHECK_STATUS; 1978 1979 UText input = UTEXT_INITIALIZER; 1980 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 1981 utext_openUTF8(&input, str_0123456789, -1, &status); 1982 1983 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 1984 REGEX_CHECK_STATUS; 1985 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 1986 static const int32_t matchStarts[] = {0, 2, 4, 8}; 1987 static const int32_t matchEnds[] = {10, 8, 6, 10}; 1988 int32_t i; 1989 for (i=0; i<4; i++) { 1990 int32_t actualStart = matcher->start(i, status); 1991 REGEX_CHECK_STATUS; 1992 if (actualStart != matchStarts[i]) { 1993 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n", 1994 __FILE__, __LINE__, i, matchStarts[i], actualStart); 1995 } 1996 int32_t actualEnd = matcher->end(i, status); 1997 REGEX_CHECK_STATUS; 1998 if (actualEnd != matchEnds[i]) { 1999 errln("RegexTest failure at %s:%d index %d. 
Expected %d, got %d\n", 2000 __FILE__, __LINE__, i, matchEnds[i], actualEnd); 2001 } 2002 } 2003 2004 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 2005 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 2006 2007 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2008 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2009 matcher->reset(); 2010 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 2011 2012 matcher->lookingAt(status); 2013 2014 UnicodeString dest; 2015 UText destText = UTEXT_INITIALIZER; 2016 utext_openUnicodeString(&destText, &dest, &status); 2017 UText *result; 2018 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 2019 // Test shallow-clone API 2020 int64_t group_len; 2021 result = matcher->group((UText *)NULL, group_len, status); 2022 REGEX_CHECK_STATUS; 2023 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2024 utext_close(result); 2025 result = matcher->group(0, &destText, group_len, status); 2026 REGEX_CHECK_STATUS; 2027 REGEX_ASSERT(result == &destText); 2028 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2029 // destText is now immutable, reopen it 2030 utext_close(&destText); 2031 utext_openUnicodeString(&destText, &dest, &status); 2032 2033 result = matcher->group(0, NULL, status); 2034 REGEX_CHECK_STATUS; 2035 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2036 utext_close(result); 2037 result = matcher->group(0, &destText, status); 2038 REGEX_CHECK_STATUS; 2039 REGEX_ASSERT(result == &destText); 2040 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2041 2042 result = matcher->group(1, NULL, status); 2043 REGEX_CHECK_STATUS; 2044 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */ 2045 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); 2046 utext_close(result); 2047 result = matcher->group(1, &destText, status); 2048 REGEX_CHECK_STATUS; 2049 
REGEX_ASSERT(result == &destText); 2050 REGEX_ASSERT_UTEXT_UTF8(str_234567, result); 2051 2052 result = matcher->group(2, NULL, status); 2053 REGEX_CHECK_STATUS; 2054 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */ 2055 REGEX_ASSERT_UTEXT_UTF8(str_45, result); 2056 utext_close(result); 2057 result = matcher->group(2, &destText, status); 2058 REGEX_CHECK_STATUS; 2059 REGEX_ASSERT(result == &destText); 2060 REGEX_ASSERT_UTEXT_UTF8(str_45, result); 2061 2062 result = matcher->group(3, NULL, status); 2063 REGEX_CHECK_STATUS; 2064 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */ 2065 REGEX_ASSERT_UTEXT_UTF8(str_89, result); 2066 utext_close(result); 2067 result = matcher->group(3, &destText, status); 2068 REGEX_CHECK_STATUS; 2069 REGEX_ASSERT(result == &destText); 2070 REGEX_ASSERT_UTEXT_UTF8(str_89, result); 2071 2072 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2073 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2074 matcher->reset(); 2075 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 2076 2077 delete matcher; 2078 delete pat; 2079 2080 utext_close(&destText); 2081 utext_close(&input); 2082 utext_close(&re); 2083 } 2084 2085 // 2086 // find 2087 // 2088 { 2089 int32_t flags=0; 2090 UParseError pe; 2091 UErrorCode status=U_ZERO_ERROR; 2092 UText re=UTEXT_INITIALIZER; 2093 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2094 utext_openUTF8(&re, str_abc, -1, &status); 2095 2096 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2097 REGEX_CHECK_STATUS; 2098 UText input = UTEXT_INITIALIZER; 2099 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. 
*/ 2100 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2101 // 012345678901234567 2102 2103 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2104 REGEX_CHECK_STATUS; 2105 REGEX_ASSERT(matcher->find()); 2106 REGEX_ASSERT(matcher->start(status) == 1); 2107 REGEX_ASSERT(matcher->find()); 2108 REGEX_ASSERT(matcher->start(status) == 6); 2109 REGEX_ASSERT(matcher->find()); 2110 REGEX_ASSERT(matcher->start(status) == 12); 2111 REGEX_ASSERT(matcher->find() == FALSE); 2112 REGEX_ASSERT(matcher->find() == FALSE); 2113 2114 matcher->reset(); 2115 REGEX_ASSERT(matcher->find()); 2116 REGEX_ASSERT(matcher->start(status) == 1); 2117 2118 REGEX_ASSERT(matcher->find(0, status)); 2119 REGEX_ASSERT(matcher->start(status) == 1); 2120 REGEX_ASSERT(matcher->find(1, status)); 2121 REGEX_ASSERT(matcher->start(status) == 1); 2122 REGEX_ASSERT(matcher->find(2, status)); 2123 REGEX_ASSERT(matcher->start(status) == 6); 2124 REGEX_ASSERT(matcher->find(12, status)); 2125 REGEX_ASSERT(matcher->start(status) == 12); 2126 REGEX_ASSERT(matcher->find(13, status) == FALSE); 2127 REGEX_ASSERT(matcher->find(16, status) == FALSE); 2128 REGEX_ASSERT(matcher->find(17, status) == FALSE); 2129 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 2130 2131 status = U_ZERO_ERROR; 2132 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2133 status = U_ZERO_ERROR; 2134 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 2135 2136 REGEX_ASSERT(matcher->groupCount() == 0); 2137 2138 delete matcher; 2139 delete pat; 2140 2141 utext_close(&input); 2142 utext_close(&re); 2143 } 2144 2145 2146 // 2147 // find, with \G in pattern (true if at the end of a previous match). 
2148 // 2149 { 2150 int32_t flags=0; 2151 UParseError pe; 2152 UErrorCode status=U_ZERO_ERROR; 2153 UText re=UTEXT_INITIALIZER; 2154 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */ 2155 utext_openUTF8(&re, str_Gabcabc, -1, &status); 2156 2157 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2158 2159 REGEX_CHECK_STATUS; 2160 UText input = UTEXT_INITIALIZER; 2161 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */ 2162 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2163 // 012345678901234567 2164 2165 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2166 REGEX_CHECK_STATUS; 2167 REGEX_ASSERT(matcher->find()); 2168 REGEX_ASSERT(matcher->start(status) == 0); 2169 REGEX_ASSERT(matcher->start(1, status) == -1); 2170 REGEX_ASSERT(matcher->start(2, status) == 1); 2171 2172 REGEX_ASSERT(matcher->find()); 2173 REGEX_ASSERT(matcher->start(status) == 4); 2174 REGEX_ASSERT(matcher->start(1, status) == 4); 2175 REGEX_ASSERT(matcher->start(2, status) == -1); 2176 REGEX_CHECK_STATUS; 2177 2178 delete matcher; 2179 delete pat; 2180 2181 utext_close(&input); 2182 utext_close(&re); 2183 } 2184 2185 // 2186 // find with zero length matches, match position should bump ahead 2187 // to prevent loops. 2188 // 2189 { 2190 int32_t i; 2191 UErrorCode status=U_ZERO_ERROR; 2192 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 2193 // using an always-true look-ahead. 
2194 REGEX_CHECK_STATUS; 2195 UText s = UTEXT_INITIALIZER; 2196 utext_openUTF8(&s, " ", -1, &status); 2197 m.reset(&s); 2198 for (i=0; ; i++) { 2199 if (m.find() == FALSE) { 2200 break; 2201 } 2202 REGEX_ASSERT(m.start(status) == i); 2203 REGEX_ASSERT(m.end(status) == i); 2204 } 2205 REGEX_ASSERT(i==5); 2206 2207 // Check that the bump goes over characters outside the BMP OK 2208 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 2209 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; 2210 utext_openUTF8(&s, (char *)aboveBMP, -1, &status); 2211 m.reset(&s); 2212 for (i=0; ; i+=4) { 2213 if (m.find() == FALSE) { 2214 break; 2215 } 2216 REGEX_ASSERT(m.start(status) == i); 2217 REGEX_ASSERT(m.end(status) == i); 2218 } 2219 REGEX_ASSERT(i==20); 2220 2221 utext_close(&s); 2222 } 2223 { 2224 // find() loop breaking test. 2225 // with pattern of /.?/, should see a series of one char matches, then a single 2226 // match of zero length at the end of the input string. 2227 int32_t i; 2228 UErrorCode status=U_ZERO_ERROR; 2229 RegexMatcher m(".?", 0, status); 2230 REGEX_CHECK_STATUS; 2231 UText s = UTEXT_INITIALIZER; 2232 utext_openUTF8(&s, " ", -1, &status); 2233 m.reset(&s); 2234 for (i=0; ; i++) { 2235 if (m.find() == FALSE) { 2236 break; 2237 } 2238 REGEX_ASSERT(m.start(status) == i); 2239 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 2240 } 2241 REGEX_ASSERT(i==5); 2242 2243 utext_close(&s); 2244 } 2245 2246 2247 // 2248 // Matchers with no input string behave as if they had an empty input string. 
2249 // 2250 2251 { 2252 UErrorCode status = U_ZERO_ERROR; 2253 RegexMatcher m(".?", 0, status); 2254 REGEX_CHECK_STATUS; 2255 REGEX_ASSERT(m.find()); 2256 REGEX_ASSERT(m.start(status) == 0); 2257 REGEX_ASSERT(m.input() == ""); 2258 } 2259 { 2260 UErrorCode status = U_ZERO_ERROR; 2261 RegexPattern *p = RegexPattern::compile(".", 0, status); 2262 RegexMatcher *m = p->matcher(status); 2263 REGEX_CHECK_STATUS; 2264 2265 REGEX_ASSERT(m->find() == FALSE); 2266 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); 2267 delete m; 2268 delete p; 2269 } 2270 2271 // 2272 // Regions 2273 // 2274 { 2275 UErrorCode status = U_ZERO_ERROR; 2276 UText testPattern = UTEXT_INITIALIZER; 2277 UText testText = UTEXT_INITIALIZER; 2278 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status); 2279 REGEX_VERBOSE_TEXT(&testPattern); 2280 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status); 2281 REGEX_VERBOSE_TEXT(&testText); 2282 2283 RegexMatcher m(&testPattern, &testText, 0, status); 2284 REGEX_CHECK_STATUS; 2285 REGEX_ASSERT(m.regionStart() == 0); 2286 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2287 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2288 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2289 2290 m.region(2,4, status); 2291 REGEX_CHECK_STATUS; 2292 REGEX_ASSERT(m.matches(status)); 2293 REGEX_ASSERT(m.start(status)==2); 2294 REGEX_ASSERT(m.end(status)==4); 2295 REGEX_CHECK_STATUS; 2296 2297 m.reset(); 2298 REGEX_ASSERT(m.regionStart() == 0); 2299 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2300 2301 regextst_openUTF8FromInvariant(&testText, "short", -1, &status); 2302 REGEX_VERBOSE_TEXT(&testText); 2303 m.reset(&testText); 2304 REGEX_ASSERT(m.regionStart() == 0); 2305 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); 2306 2307 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2308 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 2309 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2310 
REGEX_ASSERT(&m == &m.reset()); 2311 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2312 2313 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 2314 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2315 REGEX_ASSERT(&m == &m.reset()); 2316 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2317 2318 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2319 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 2320 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2321 REGEX_ASSERT(&m == &m.reset()); 2322 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2323 2324 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 2325 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2326 REGEX_ASSERT(&m == &m.reset()); 2327 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2328 2329 utext_close(&testText); 2330 utext_close(&testPattern); 2331 } 2332 2333 // 2334 // hitEnd() and requireEnd() 2335 // 2336 { 2337 UErrorCode status = U_ZERO_ERROR; 2338 UText testPattern = UTEXT_INITIALIZER; 2339 UText testText = UTEXT_INITIALIZER; 2340 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2341 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */ 2342 utext_openUTF8(&testPattern, str_, -1, &status); 2343 utext_openUTF8(&testText, str_aabb, -1, &status); 2344 2345 RegexMatcher m1(&testPattern, &testText, 0, status); 2346 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 2347 REGEX_ASSERT(m1.hitEnd() == TRUE); 2348 REGEX_ASSERT(m1.requireEnd() == FALSE); 2349 REGEX_CHECK_STATUS; 2350 2351 status = U_ZERO_ERROR; 2352 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */ 2353 utext_openUTF8(&testPattern, str_a, -1, &status); 2354 RegexMatcher m2(&testPattern, &testText, 0, status); 2355 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 2356 REGEX_ASSERT(m2.hitEnd() == FALSE); 2357 REGEX_ASSERT(m2.requireEnd() == FALSE); 2358 REGEX_CHECK_STATUS; 2359 2360 status = U_ZERO_ERROR; 2361 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */ 2362 utext_openUTF8(&testPattern, str_dotstardollar, -1, 
&status); 2363 RegexMatcher m3(&testPattern, &testText, 0, status); 2364 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 2365 REGEX_ASSERT(m3.hitEnd() == TRUE); 2366 REGEX_ASSERT(m3.requireEnd() == TRUE); 2367 REGEX_CHECK_STATUS; 2368 2369 utext_close(&testText); 2370 utext_close(&testPattern); 2371 } 2372} 2373 2374 2375//--------------------------------------------------------------------------- 2376// 2377// API_Replace_UTF8 API test for class RegexMatcher, testing the 2378// Replace family of functions. 2379// 2380//--------------------------------------------------------------------------- 2381void RegexTest::API_Replace_UTF8() { 2382 // 2383 // Replace 2384 // 2385 int32_t flags=0; 2386 UParseError pe; 2387 UErrorCode status=U_ZERO_ERROR; 2388 2389 UText re=UTEXT_INITIALIZER; 2390 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 2391 REGEX_VERBOSE_TEXT(&re); 2392 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2393 REGEX_CHECK_STATUS; 2394 2395 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */ 2396 // 012345678901234567 2397 UText dataText = UTEXT_INITIALIZER; 2398 utext_openUTF8(&dataText, data, -1, &status); 2399 REGEX_CHECK_STATUS; 2400 REGEX_VERBOSE_TEXT(&dataText); 2401 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText); 2402 2403 // 2404 // Plain vanilla matches. 
2405 // 2406 UnicodeString dest; 2407 UText destText = UTEXT_INITIALIZER; 2408 utext_openUnicodeString(&destText, &dest, &status); 2409 UText *result; 2410 2411 UText replText = UTEXT_INITIALIZER; 2412 2413 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */ 2414 utext_openUTF8(&replText, str_yz, -1, &status); 2415 REGEX_VERBOSE_TEXT(&replText); 2416 result = matcher->replaceFirst(&replText, NULL, status); 2417 REGEX_CHECK_STATUS; 2418 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */ 2419 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2420 utext_close(result); 2421 result = matcher->replaceFirst(&replText, &destText, status); 2422 REGEX_CHECK_STATUS; 2423 REGEX_ASSERT(result == &destText); 2424 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result); 2425 2426 result = matcher->replaceAll(&replText, NULL, status); 2427 REGEX_CHECK_STATUS; 2428 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */ 2429 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2430 utext_close(result); 2431 2432 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2433 result = matcher->replaceAll(&replText, &destText, status); 2434 REGEX_CHECK_STATUS; 2435 REGEX_ASSERT(result == &destText); 2436 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result); 2437 2438 // 2439 // Plain vanilla non-matches. 2440 // 2441 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. 
*/ 2442 utext_openUTF8(&dataText, str_abxabxabx, -1, &status); 2443 matcher->reset(&dataText); 2444 2445 result = matcher->replaceFirst(&replText, NULL, status); 2446 REGEX_CHECK_STATUS; 2447 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2448 utext_close(result); 2449 result = matcher->replaceFirst(&replText, &destText, status); 2450 REGEX_CHECK_STATUS; 2451 REGEX_ASSERT(result == &destText); 2452 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2453 2454 result = matcher->replaceAll(&replText, NULL, status); 2455 REGEX_CHECK_STATUS; 2456 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2457 utext_close(result); 2458 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2459 result = matcher->replaceAll(&replText, &destText, status); 2460 REGEX_CHECK_STATUS; 2461 REGEX_ASSERT(result == &destText); 2462 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result); 2463 2464 // 2465 // Empty source string 2466 // 2467 utext_openUTF8(&dataText, NULL, 0, &status); 2468 matcher->reset(&dataText); 2469 2470 result = matcher->replaceFirst(&replText, NULL, status); 2471 REGEX_CHECK_STATUS; 2472 REGEX_ASSERT_UTEXT_UTF8("", result); 2473 utext_close(result); 2474 result = matcher->replaceFirst(&replText, &destText, status); 2475 REGEX_CHECK_STATUS; 2476 REGEX_ASSERT(result == &destText); 2477 REGEX_ASSERT_UTEXT_UTF8("", result); 2478 2479 result = matcher->replaceAll(&replText, NULL, status); 2480 REGEX_CHECK_STATUS; 2481 REGEX_ASSERT_UTEXT_UTF8("", result); 2482 utext_close(result); 2483 result = matcher->replaceAll(&replText, &destText, status); 2484 REGEX_CHECK_STATUS; 2485 REGEX_ASSERT(result == &destText); 2486 REGEX_ASSERT_UTEXT_UTF8("", result); 2487 2488 // 2489 // Empty substitution string 2490 // 2491 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.." 
2492 matcher->reset(&dataText); 2493 2494 utext_openUTF8(&replText, NULL, 0, &status); 2495 result = matcher->replaceFirst(&replText, NULL, status); 2496 REGEX_CHECK_STATUS; 2497 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */ 2498 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2499 utext_close(result); 2500 result = matcher->replaceFirst(&replText, &destText, status); 2501 REGEX_CHECK_STATUS; 2502 REGEX_ASSERT(result == &destText); 2503 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result); 2504 2505 result = matcher->replaceAll(&replText, NULL, status); 2506 REGEX_CHECK_STATUS; 2507 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */ 2508 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2509 utext_close(result); 2510 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2511 result = matcher->replaceAll(&replText, &destText, status); 2512 REGEX_CHECK_STATUS; 2513 REGEX_ASSERT(result == &destText); 2514 REGEX_ASSERT_UTEXT_UTF8(str_dots, result); 2515 2516 // 2517 // match whole string 2518 // 2519 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2520 utext_openUTF8(&dataText, str_abc, -1, &status); 2521 matcher->reset(&dataText); 2522 2523 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */ 2524 utext_openUTF8(&replText, str_xyz, -1, &status); 2525 result = matcher->replaceFirst(&replText, NULL, status); 2526 REGEX_CHECK_STATUS; 2527 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2528 utext_close(result); 2529 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2530 result = matcher->replaceFirst(&replText, &destText, status); 2531 REGEX_CHECK_STATUS; 2532 REGEX_ASSERT(result == &destText); 2533 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2534 2535 result = matcher->replaceAll(&replText, NULL, status); 2536 REGEX_CHECK_STATUS; 2537 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2538 
utext_close(result); 2539 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2540 result = matcher->replaceAll(&replText, &destText, status); 2541 REGEX_CHECK_STATUS; 2542 REGEX_ASSERT(result == &destText); 2543 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result); 2544 2545 // 2546 // Capture Group, simple case 2547 // 2548 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */ 2549 utext_openUTF8(&re, str_add, -1, &status); 2550 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status); 2551 REGEX_CHECK_STATUS; 2552 2553 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */ 2554 utext_openUTF8(&dataText, str_abcdefg, -1, &status); 2555 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText); 2556 REGEX_CHECK_STATUS; 2557 2558 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */ 2559 utext_openUTF8(&replText, str_11, -1, &status); 2560 result = matcher2->replaceFirst(&replText, NULL, status); 2561 REGEX_CHECK_STATUS; 2562 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */ 2563 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2564 utext_close(result); 2565 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2566 result = matcher2->replaceFirst(&replText, &destText, status); 2567 REGEX_CHECK_STATUS; 2568 REGEX_ASSERT(result == &destText); 2569 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result); 2570 2571 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. 
*/ 2572 utext_openUTF8(&replText, str_v, -1, &status); 2573 REGEX_VERBOSE_TEXT(&replText); 2574 result = matcher2->replaceFirst(&replText, NULL, status); 2575 REGEX_CHECK_STATUS; 2576 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */ 2577 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2578 utext_close(result); 2579 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2580 result = matcher2->replaceFirst(&replText, &destText, status); 2581 REGEX_CHECK_STATUS; 2582 REGEX_ASSERT(result == &destText); 2583 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result); 2584 2585 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */ 2586 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status); 2587 result = matcher2->replaceFirst(&replText, NULL, status); 2588 REGEX_CHECK_STATUS; 2589 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */ 2590 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2591 utext_close(result); 2592 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2593 result = matcher2->replaceFirst(&replText, &destText, status); 2594 REGEX_CHECK_STATUS; 2595 REGEX_ASSERT(result == &destText); 2596 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result); 2597 2598 unsigned char supplDigitChars[] = { 0x53, 0x75, 
0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */ 2599 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE 2600 // 012345678901234567890123456 2601 supplDigitChars[22] = 0xF0; 2602 supplDigitChars[23] = 0x9D; 2603 supplDigitChars[24] = 0x9F; 2604 supplDigitChars[25] = 0x8F; 2605 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status); 2606 2607 result = matcher2->replaceFirst(&replText, NULL, status); 2608 REGEX_CHECK_STATUS; 2609 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */ 2610 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2611 utext_close(result); 2612 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2613 result = matcher2->replaceFirst(&replText, &destText, status); 2614 REGEX_CHECK_STATUS; 2615 REGEX_ASSERT(result == &destText); 2616 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result); 2617 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." 
*/ 2618 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status); 2619 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2620// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2621 utext_close(result); 2622 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2623 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR); 2624 REGEX_ASSERT(result == &destText); 2625// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result); 2626 2627 // 2628 // Replacement String with \u hex escapes 2629 // 2630 { 2631 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */ 2632 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */ 2633 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status); 2634 utext_openUTF8(&replText, str_u0043, -1, &status); 2635 matcher->reset(&dataText); 2636 2637 result = matcher->replaceAll(&replText, NULL, status); 2638 REGEX_CHECK_STATUS; 2639 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */ 2640 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2641 utext_close(result); 2642 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2643 result = matcher->replaceAll(&replText, &destText, status); 2644 REGEX_CHECK_STATUS; 2645 REGEX_ASSERT(result == &destText); 2646 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result); 2647 } 2648 { 2649 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! 
*/ 2650 utext_openUTF8(&dataText, str_abc, -1, &status); 2651 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */ 2652 utext_openUTF8(&replText, str_U00010000, -1, &status); 2653 matcher->reset(&dataText); 2654 2655 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A" 2656 // 0123456789 2657 expected[2] = 0xF0; 2658 expected[3] = 0x90; 2659 expected[4] = 0x80; 2660 expected[5] = 0x80; 2661 2662 result = matcher->replaceAll(&replText, NULL, status); 2663 REGEX_CHECK_STATUS; 2664 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2665 utext_close(result); 2666 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status); 2667 result = matcher->replaceAll(&replText, &destText, status); 2668 REGEX_CHECK_STATUS; 2669 REGEX_ASSERT(result == &destText); 2670 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result); 2671 } 2672 // TODO: need more through testing of capture substitutions. 
2673 2674 // Bug 4057 2675 // 2676 { 2677 status = U_ZERO_ERROR; 2678const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */ 2679const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */ 2680const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */ 2681 utext_openUTF8(&re, str_ssee, -1, &status); 2682 utext_openUTF8(&dataText, str_blah, -1, &status); 2683 utext_openUTF8(&replText, str_ooh, -1, &status); 2684 2685 RegexMatcher m(&re, 0, status); 2686 REGEX_CHECK_STATUS; 2687 2688 UnicodeString result; 2689 UText resultText = UTEXT_INITIALIZER; 2690 utext_openUnicodeString(&resultText, &result, &status); 2691 2692 // Multiple finds do NOT bump up the previous appendReplacement postion. 2693 m.reset(&dataText); 2694 m.find(); 2695 m.find(); 2696 m.appendReplacement(&resultText, &replText, status); 2697 REGEX_CHECK_STATUS; 2698 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2699 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText); 2700 2701 // After a reset into the interior of a string, appendReplacement still starts at beginning. 
2702 status = U_ZERO_ERROR; 2703 result.truncate(0); 2704 utext_openUnicodeString(&resultText, &result, &status); 2705 m.reset(10, status); 2706 m.find(); 2707 m.find(); 2708 m.appendReplacement(&resultText, &replText, status); 2709 REGEX_CHECK_STATUS; 2710 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2711 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText); 2712 2713 // find() at interior of string, appendReplacement still starts at beginning. 2714 status = U_ZERO_ERROR; 2715 result.truncate(0); 2716 utext_openUnicodeString(&resultText, &result, &status); 2717 m.reset(); 2718 m.find(10, status); 2719 m.find(); 2720 m.appendReplacement(&resultText, &replText, status); 2721 REGEX_CHECK_STATUS; 2722 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */ 2723 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText); 2724 2725 m.appendTail(&resultText, status); 2726 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */ 2727 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText); 2728 2729 utext_close(&resultText); 2730 } 2731 2732 delete matcher2; 2733 delete 
pat2; 2734 delete matcher; 2735 delete pat; 2736 2737 utext_close(&dataText); 2738 utext_close(&replText); 2739 utext_close(&destText); 2740 utext_close(&re); 2741} 2742 2743 2744//--------------------------------------------------------------------------- 2745// 2746// API_Pattern_UTF8 Test that the API for class RegexPattern is 2747// present and nominally working. 2748// 2749//--------------------------------------------------------------------------- 2750void RegexTest::API_Pattern_UTF8() { 2751 RegexPattern pata; // Test default constructor to not crash. 2752 RegexPattern patb; 2753 2754 REGEX_ASSERT(pata == patb); 2755 REGEX_ASSERT(pata == pata); 2756 2757 UText re1 = UTEXT_INITIALIZER; 2758 UText re2 = UTEXT_INITIALIZER; 2759 UErrorCode status = U_ZERO_ERROR; 2760 UParseError pe; 2761 2762 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */ 2763 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */ 2764 utext_openUTF8(&re1, str_abcalmz, -1, &status); 2765 utext_openUTF8(&re2, str_def, -1, &status); 2766 2767 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status); 2768 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status); 2769 REGEX_CHECK_STATUS; 2770 REGEX_ASSERT(*pat1 == *pat1); 2771 REGEX_ASSERT(*pat1 != pata); 2772 2773 // Assign 2774 patb = *pat1; 2775 REGEX_ASSERT(patb == *pat1); 2776 2777 // Copy Construct 2778 RegexPattern patc(*pat1); 2779 REGEX_ASSERT(patc == *pat1); 2780 REGEX_ASSERT(patb == patc); 2781 REGEX_ASSERT(pat1 != pat2); 2782 patb = *pat2; 2783 REGEX_ASSERT(patb != patc); 2784 REGEX_ASSERT(patb == *pat2); 2785 2786 // Compile with no flags. 
2787 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status); 2788 REGEX_ASSERT(*pat1a == *pat1); 2789 2790 REGEX_ASSERT(pat1a->flags() == 0); 2791 2792 // Compile with different flags should be not equal 2793 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status); 2794 REGEX_CHECK_STATUS; 2795 2796 REGEX_ASSERT(*pat1b != *pat1a); 2797 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 2798 REGEX_ASSERT(pat1a->flags() == 0); 2799 delete pat1b; 2800 2801 // clone 2802 RegexPattern *pat1c = pat1->clone(); 2803 REGEX_ASSERT(*pat1c == *pat1); 2804 REGEX_ASSERT(*pat1c != *pat2); 2805 2806 delete pat1c; 2807 delete pat1a; 2808 delete pat1; 2809 delete pat2; 2810 2811 utext_close(&re1); 2812 utext_close(&re2); 2813 2814 2815 // 2816 // Verify that a matcher created from a cloned pattern works. 2817 // (Jitterbug 3423) 2818 // 2819 { 2820 UErrorCode status = U_ZERO_ERROR; 2821 UText pattern = UTEXT_INITIALIZER; 2822 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */ 2823 utext_openUTF8(&pattern, str_pL, -1, &status); 2824 2825 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status); 2826 RegexPattern *pClone = pSource->clone(); 2827 delete pSource; 2828 RegexMatcher *mFromClone = pClone->matcher(status); 2829 REGEX_CHECK_STATUS; 2830 2831 UText input = UTEXT_INITIALIZER; 2832 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */ 2833 utext_openUTF8(&input, str_HelloWorld, -1, &status); 2834 mFromClone->reset(&input); 2835 REGEX_ASSERT(mFromClone->find() == TRUE); 2836 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 2837 REGEX_ASSERT(mFromClone->find() == TRUE); 2838 REGEX_ASSERT(mFromClone->group(status) == "World"); 2839 REGEX_ASSERT(mFromClone->find() == FALSE); 2840 delete mFromClone; 2841 delete pClone; 2842 2843 utext_close(&input); 2844 utext_close(&pattern); 2845 } 2846 2847 // 2848 // matches convenience API 
2849 // 2850 { 2851 UErrorCode status = U_ZERO_ERROR; 2852 UText pattern = UTEXT_INITIALIZER; 2853 UText input = UTEXT_INITIALIZER; 2854 2855 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */ 2856 utext_openUTF8(&input, str_randominput, -1, &status); 2857 2858 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2859 utext_openUTF8(&pattern, str_dotstar, -1, &status); 2860 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE); 2861 REGEX_CHECK_STATUS; 2862 2863 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2864 utext_openUTF8(&pattern, str_abc, -1, &status); 2865 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 2866 REGEX_CHECK_STATUS; 2867 2868 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */ 2869 utext_openUTF8(&pattern, str_nput, -1, &status); 2870 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 2871 REGEX_CHECK_STATUS; 2872 2873 utext_openUTF8(&pattern, str_randominput, -1, &status); 2874 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 2875 REGEX_CHECK_STATUS; 2876 2877 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */ 2878 utext_openUTF8(&pattern, str_u, -1, &status); 2879 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 2880 REGEX_CHECK_STATUS; 2881 2882 utext_openUTF8(&input, str_abc, -1, &status); 2883 utext_openUTF8(&pattern, str_abc, -1, &status); 2884 status = U_INDEX_OUTOFBOUNDS_ERROR; 2885 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 2886 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 2887 2888 utext_close(&input); 2889 utext_close(&pattern); 2890 } 2891 2892 2893 // 2894 // Split() 2895 // 2896 status = U_ZERO_ERROR; 2897 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */ 2898 utext_openUTF8(&re1, str_spaceplus, 
-1, &status); 2899 pat1 = RegexPattern::compile(&re1, pe, status); 2900 REGEX_CHECK_STATUS; 2901 UnicodeString fields[10]; 2902 2903 int32_t n; 2904 n = pat1->split("Now is the time", fields, 10, status); 2905 REGEX_CHECK_STATUS; 2906 REGEX_ASSERT(n==4); 2907 REGEX_ASSERT(fields[0]=="Now"); 2908 REGEX_ASSERT(fields[1]=="is"); 2909 REGEX_ASSERT(fields[2]=="the"); 2910 REGEX_ASSERT(fields[3]=="time"); 2911 REGEX_ASSERT(fields[4]==""); 2912 2913 n = pat1->split("Now is the time", fields, 2, status); 2914 REGEX_CHECK_STATUS; 2915 REGEX_ASSERT(n==2); 2916 REGEX_ASSERT(fields[0]=="Now"); 2917 REGEX_ASSERT(fields[1]=="is the time"); 2918 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 2919 2920 fields[1] = "*"; 2921 status = U_ZERO_ERROR; 2922 n = pat1->split("Now is the time", fields, 1, status); 2923 REGEX_CHECK_STATUS; 2924 REGEX_ASSERT(n==1); 2925 REGEX_ASSERT(fields[0]=="Now is the time"); 2926 REGEX_ASSERT(fields[1]=="*"); 2927 status = U_ZERO_ERROR; 2928 2929 n = pat1->split(" Now is the time ", fields, 10, status); 2930 REGEX_CHECK_STATUS; 2931 REGEX_ASSERT(n==6); 2932 REGEX_ASSERT(fields[0]==""); 2933 REGEX_ASSERT(fields[1]=="Now"); 2934 REGEX_ASSERT(fields[2]=="is"); 2935 REGEX_ASSERT(fields[3]=="the"); 2936 REGEX_ASSERT(fields[4]=="time"); 2937 REGEX_ASSERT(fields[5]==""); 2938 REGEX_ASSERT(fields[6]==""); 2939 2940 fields[2] = "*"; 2941 n = pat1->split(" ", fields, 10, status); 2942 REGEX_CHECK_STATUS; 2943 REGEX_ASSERT(n==2); 2944 REGEX_ASSERT(fields[0]==""); 2945 REGEX_ASSERT(fields[1]==""); 2946 REGEX_ASSERT(fields[2]=="*"); 2947 2948 fields[0] = "foo"; 2949 n = pat1->split("", fields, 10, status); 2950 REGEX_CHECK_STATUS; 2951 REGEX_ASSERT(n==0); 2952 REGEX_ASSERT(fields[0]=="foo"); 2953 2954 delete pat1; 2955 2956 // split, with a pattern with (capture) 2957 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status); 2958 pat1 = RegexPattern::compile(&re1, pe, status); 2959 REGEX_CHECK_STATUS; 2960 2961 status = U_ZERO_ERROR; 2962 
fields[6] = fields[7] = "*"; 2963 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 2964 REGEX_CHECK_STATUS; 2965 REGEX_ASSERT(n==7); 2966 REGEX_ASSERT(fields[0]==""); 2967 REGEX_ASSERT(fields[1]=="a"); 2968 REGEX_ASSERT(fields[2]=="Now is "); 2969 REGEX_ASSERT(fields[3]=="b"); 2970 REGEX_ASSERT(fields[4]=="the time"); 2971 REGEX_ASSERT(fields[5]=="c"); 2972 REGEX_ASSERT(fields[6]==""); 2973 REGEX_ASSERT(fields[7]=="*"); 2974 REGEX_ASSERT(status==U_ZERO_ERROR); 2975 2976 fields[6] = fields[7] = "*"; 2977 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 2978 REGEX_CHECK_STATUS; 2979 REGEX_ASSERT(n==7); 2980 REGEX_ASSERT(fields[0]==" "); 2981 REGEX_ASSERT(fields[1]=="a"); 2982 REGEX_ASSERT(fields[2]=="Now is "); 2983 REGEX_ASSERT(fields[3]=="b"); 2984 REGEX_ASSERT(fields[4]=="the time"); 2985 REGEX_ASSERT(fields[5]=="c"); 2986 REGEX_ASSERT(fields[6]==""); 2987 REGEX_ASSERT(fields[7]=="*"); 2988 2989 status = U_ZERO_ERROR; 2990 fields[6] = "foo"; 2991 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status); 2992 REGEX_CHECK_STATUS; 2993 REGEX_ASSERT(n==6); 2994 REGEX_ASSERT(fields[0]==" "); 2995 REGEX_ASSERT(fields[1]=="a"); 2996 REGEX_ASSERT(fields[2]=="Now is "); 2997 REGEX_ASSERT(fields[3]=="b"); 2998 REGEX_ASSERT(fields[4]=="the time"); 2999 REGEX_ASSERT(fields[5]==" "); 3000 REGEX_ASSERT(fields[6]=="foo"); 3001 3002 status = U_ZERO_ERROR; 3003 fields[5] = "foo"; 3004 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 3005 REGEX_CHECK_STATUS; 3006 REGEX_ASSERT(n==5); 3007 REGEX_ASSERT(fields[0]==" "); 3008 REGEX_ASSERT(fields[1]=="a"); 3009 REGEX_ASSERT(fields[2]=="Now is "); 3010 REGEX_ASSERT(fields[3]=="b"); 3011 REGEX_ASSERT(fields[4]=="the time<c>"); 3012 REGEX_ASSERT(fields[5]=="foo"); 3013 3014 status = U_ZERO_ERROR; 3015 fields[5] = "foo"; 3016 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 3017 REGEX_CHECK_STATUS; 3018 REGEX_ASSERT(n==5); 3019 REGEX_ASSERT(fields[0]==" "); 3020 
REGEX_ASSERT(fields[1]=="a"); 3021 REGEX_ASSERT(fields[2]=="Now is "); 3022 REGEX_ASSERT(fields[3]=="b"); 3023 REGEX_ASSERT(fields[4]=="the time"); 3024 REGEX_ASSERT(fields[5]=="foo"); 3025 3026 status = U_ZERO_ERROR; 3027 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 3028 REGEX_CHECK_STATUS; 3029 REGEX_ASSERT(n==4); 3030 REGEX_ASSERT(fields[0]==" "); 3031 REGEX_ASSERT(fields[1]=="a"); 3032 REGEX_ASSERT(fields[2]=="Now is "); 3033 REGEX_ASSERT(fields[3]=="the time<c>"); 3034 status = U_ZERO_ERROR; 3035 delete pat1; 3036 3037 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status); 3038 pat1 = RegexPattern::compile(&re1, pe, status); 3039 REGEX_CHECK_STATUS; 3040 n = pat1->split("1-10,20", fields, 10, status); 3041 REGEX_CHECK_STATUS; 3042 REGEX_ASSERT(n==5); 3043 REGEX_ASSERT(fields[0]=="1"); 3044 REGEX_ASSERT(fields[1]=="-"); 3045 REGEX_ASSERT(fields[2]=="10"); 3046 REGEX_ASSERT(fields[3]==","); 3047 REGEX_ASSERT(fields[4]=="20"); 3048 delete pat1; 3049 3050 3051 // 3052 // RegexPattern::pattern() and patternText() 3053 // 3054 pat1 = new RegexPattern(); 3055 REGEX_ASSERT(pat1->pattern() == ""); 3056 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); 3057 delete pat1; 3058 const char *helloWorldInvariant = "(Hello, world)*"; 3059 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status); 3060 pat1 = RegexPattern::compile(&re1, pe, status); 3061 REGEX_CHECK_STATUS; 3062 REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*"); 3063 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); 3064 delete pat1; 3065 3066 utext_close(&re1); 3067} 3068 3069 3070//--------------------------------------------------------------------------- 3071// 3072// Extended A more thorough check for features of regex patterns 3073// The test cases are in a separate data file, 3074// source/tests/testdata/regextst.txt 3075// A description of the test data format is included in that file. 
3076// 3077//--------------------------------------------------------------------------- 3078 3079const char * 3080RegexTest::getPath(char buffer[2048], const char *filename) { 3081 UErrorCode status=U_ZERO_ERROR; 3082 const char *testDataDirectory = IntlTest::getSourceTestData(status); 3083 if (U_FAILURE(status)) { 3084 errln("ERROR: loadTestData() failed - %s", u_errorName(status)); 3085 return NULL; 3086 } 3087 3088 strcpy(buffer, testDataDirectory); 3089 strcat(buffer, filename); 3090 return buffer; 3091} 3092 3093void RegexTest::Extended() { 3094 char tdd[2048]; 3095 const char *srcPath; 3096 UErrorCode status = U_ZERO_ERROR; 3097 int32_t lineNum = 0; 3098 3099 // 3100 // Open and read the test data file. 3101 // 3102 srcPath=getPath(tdd, "regextst.txt"); 3103 if(srcPath==NULL) { 3104 return; /* something went wrong, error already output */ 3105 } 3106 3107 int32_t len; 3108 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status); 3109 if (U_FAILURE(status)) { 3110 return; /* something went wrong, error already output */ 3111 } 3112 3113 // 3114 // Put the test data into a UnicodeString 3115 // 3116 UnicodeString testString(FALSE, testData, len); 3117 3118 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); 3119 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); 3120 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status); 3121 3122 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); 3123 UnicodeString testPattern; // The pattern for test from the test file. 3124 UnicodeString testFlags; // the flags for a test. 3125 UnicodeString matchString; // The marked up string to be used as input 3126 3127 if (U_FAILURE(status)){ 3128 dataerrln("Construct RegexMatcher() error."); 3129 delete [] testData; 3130 return; 3131 } 3132 3133 // 3134 // Loop over the test data file, once per line. 
3135 // 3136 while (lineMat.find()) { 3137 lineNum++; 3138 if (U_FAILURE(status)) { 3139 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status)); 3140 } 3141 3142 status = U_ZERO_ERROR; 3143 UnicodeString testLine = lineMat.group(1, status); 3144 if (testLine.length() == 0) { 3145 continue; 3146 } 3147 3148 // 3149 // Parse the test line. Skip blank and comment only lines. 3150 // Separate out the three main fields - pattern, flags, target. 3151 // 3152 3153 commentMat.reset(testLine); 3154 if (commentMat.lookingAt(status)) { 3155 // This line is a comment, or blank. 3156 continue; 3157 } 3158 3159 // 3160 // Pull out the pattern field, remove it from the test file line. 3161 // 3162 quotedStuffMat.reset(testLine); 3163 if (quotedStuffMat.lookingAt(status)) { 3164 testPattern = quotedStuffMat.group(2, status); 3165 testLine.remove(0, quotedStuffMat.end(0, status)); 3166 } else { 3167 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum); 3168 continue; 3169 } 3170 3171 3172 // 3173 // Pull out the flags from the test file line. 3174 // 3175 flagsMat.reset(testLine); 3176 flagsMat.lookingAt(status); // Will always match, possibly an empty string. 3177 testFlags = flagsMat.group(1, status); 3178 if (flagsMat.group(2, status).length() > 0) { 3179 errln("Bad Match flag at line %d. Scanning %c\n", 3180 lineNum, flagsMat.group(2, status).charAt(0)); 3181 continue; 3182 } 3183 testLine.remove(0, flagsMat.end(0, status)); 3184 3185 // 3186 // Pull out the match string, as a whole. 3187 // We'll process the <tags> later. 3188 // 3189 quotedStuffMat.reset(testLine); 3190 if (quotedStuffMat.lookingAt(status)) { 3191 matchString = quotedStuffMat.group(2, status); 3192 testLine.remove(0, quotedStuffMat.end(0, status)); 3193 } else { 3194 errln("Bad match string at test file line %d", lineNum); 3195 continue; 3196 } 3197 3198 // 3199 // The only thing left from the input line should be an optional trailing comment. 
3200 // 3201 commentMat.reset(testLine); 3202 if (commentMat.lookingAt(status) == FALSE) { 3203 errln("Line %d: unexpected characters at end of test line.", lineNum); 3204 continue; 3205 } 3206 3207 // 3208 // Run the test 3209 // 3210 regex_find(testPattern, testFlags, matchString, srcPath, lineNum); 3211 } 3212 3213 delete [] testData; 3214 3215} 3216 3217 3218 3219//--------------------------------------------------------------------------- 3220// 3221// regex_find(pattern, flags, inputString, lineNumber) 3222// 3223// Function to run a single test from the Extended (data driven) tests. 3224// See file test/testdata/regextst.txt for a description of the 3225// pattern and inputString fields, and the allowed flags. 3226// lineNumber is the source line in regextst.txt of the test. 3227// 3228//--------------------------------------------------------------------------- 3229 3230 3231// Set a value into a UVector at position specified by a decimal number in 3232// a UnicodeString. This is a utility function needed by the actual test function, 3233// which follows. 
3234static void set(UVector &vec, int32_t val, UnicodeString index) { 3235 UErrorCode status=U_ZERO_ERROR; 3236 int32_t idx = 0; 3237 for (int32_t i=0; i<index.length(); i++) { 3238 int32_t d=u_charDigitValue(index.charAt(i)); 3239 if (d<0) {return;} 3240 idx = idx*10 + d; 3241 } 3242 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3243 vec.setElementAt(val, idx); 3244} 3245 3246static void setInt(UVector &vec, int32_t val, int32_t idx) { 3247 UErrorCode status=U_ZERO_ERROR; 3248 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3249 vec.setElementAt(val, idx); 3250} 3251 3252static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex) 3253{ 3254 UBool couldFind = TRUE; 3255 UTEXT_SETNATIVEINDEX(utext, 0); 3256 int32_t i = 0; 3257 while (i < unistrOffset) { 3258 UChar32 c = UTEXT_NEXT32(utext); 3259 if (c != U_SENTINEL) { 3260 i += U16_LENGTH(c); 3261 } else { 3262 couldFind = FALSE; 3263 break; 3264 } 3265 } 3266 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext); 3267 return couldFind; 3268} 3269 3270 3271void RegexTest::regex_find(const UnicodeString &pattern, 3272 const UnicodeString &flags, 3273 const UnicodeString &inputString, 3274 const char *srcPath, 3275 int32_t line) { 3276 UnicodeString unEscapedInput; 3277 UnicodeString deTaggedInput; 3278 3279 int32_t patternUTF8Length, inputUTF8Length; 3280 char *patternChars = NULL, *inputChars = NULL; 3281 UText patternText = UTEXT_INITIALIZER; 3282 UText inputText = UTEXT_INITIALIZER; 3283 UConverter *UTF8Converter = NULL; 3284 3285 UErrorCode status = U_ZERO_ERROR; 3286 UParseError pe; 3287 RegexPattern *parsePat = NULL; 3288 RegexMatcher *parseMatcher = NULL; 3289 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL; 3290 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL; 3291 UVector groupStarts(status); 3292 UVector groupEnds(status); 3293 UVector groupStartsUTF8(status); 3294 UVector groupEndsUTF8(status); 3295 UBool isMatch = FALSE, isUTF8Match = FALSE; 3296 UBool 
failed = FALSE; 3297 int32_t numFinds; 3298 int32_t i; 3299 UBool useMatchesFunc = FALSE; 3300 UBool useLookingAtFunc = FALSE; 3301 int32_t regionStart = -1; 3302 int32_t regionEnd = -1; 3303 int32_t regionStartUTF8 = -1; 3304 int32_t regionEndUTF8 = -1; 3305 3306 3307 // 3308 // Compile the caller's pattern 3309 // 3310 uint32_t bflags = 0; 3311 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag 3312 bflags |= UREGEX_CASE_INSENSITIVE; 3313 } 3314 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag 3315 bflags |= UREGEX_COMMENTS; 3316 } 3317 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag 3318 bflags |= UREGEX_DOTALL; 3319 } 3320 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag 3321 bflags |= UREGEX_MULTILINE; 3322 } 3323 3324 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag 3325 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; 3326 } 3327 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag 3328 bflags |= UREGEX_UNIX_LINES; 3329 } 3330 3331 3332 callerPattern = RegexPattern::compile(pattern, bflags, pe, status); 3333 if (status != U_ZERO_ERROR) { 3334 #if UCONFIG_NO_BREAK_ITERATION==1 3335 // 'v' test flag means that the test pattern should not compile if ICU was configured 3336 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3337 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3338 goto cleanupAndReturn; 3339 } 3340 #endif 3341 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3342 // Expected pattern compilation error. 3343 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3344 logln("Pattern Compile returns \"%s\"", u_errorName(status)); 3345 } 3346 goto cleanupAndReturn; 3347 } else { 3348 // Unexpected pattern compilation error. 
3349 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status)); 3350 goto cleanupAndReturn; 3351 } 3352 } 3353 3354 UTF8Converter = ucnv_open("UTF8", &status); 3355 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 3356 3357 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); 3358 status = U_ZERO_ERROR; // buffer overflow 3359 patternChars = new char[patternUTF8Length+1]; 3360 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); 3361 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); 3362 3363 if (status == U_ZERO_ERROR) { 3364 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); 3365 3366 if (status != U_ZERO_ERROR) { 3367#if UCONFIG_NO_BREAK_ITERATION==1 3368 // 'v' test flag means that the test pattern should not compile if ICU was configured 3369 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3370 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3371 goto cleanupAndReturn; 3372 } 3373#endif 3374 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3375 // Expected pattern compilation error. 3376 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3377 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status)); 3378 } 3379 goto cleanupAndReturn; 3380 } else { 3381 // Unexpected pattern compilation error. 3382 errln("Line %d: error %s compiling pattern. 
(UTF8)", line, u_errorName(status)); 3383 goto cleanupAndReturn; 3384 } 3385 } 3386 } 3387 3388 if (UTF8Pattern == NULL) { 3389 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3390 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line); 3391 status = U_ZERO_ERROR; 3392 } 3393 3394 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag 3395 RegexPatternDump(callerPattern); 3396 } 3397 3398 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag 3399 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line); 3400 goto cleanupAndReturn; 3401 } 3402 3403 3404 // 3405 // Number of times find() should be called on the test string, default to 1 3406 // 3407 numFinds = 1; 3408 for (i=2; i<=9; i++) { 3409 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag 3410 if (numFinds != 1) { 3411 errln("Line %d: more than one digit flag. Scanning %d.", line, i); 3412 goto cleanupAndReturn; 3413 } 3414 numFinds = i; 3415 } 3416 } 3417 3418 // 'M' flag. Use matches() instead of find() 3419 if (flags.indexOf((UChar)0x4d) >= 0) { 3420 useMatchesFunc = TRUE; 3421 } 3422 if (flags.indexOf((UChar)0x4c) >= 0) { 3423 useLookingAtFunc = TRUE; 3424 } 3425 3426 // 3427 // Find the tags in the input data, remove them, and record the group boundary 3428 // positions. 
3429 // 3430 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); 3431 REGEX_CHECK_STATUS_L(line); 3432 3433 unEscapedInput = inputString.unescape(); 3434 parseMatcher = parsePat->matcher(unEscapedInput, status); 3435 REGEX_CHECK_STATUS_L(line); 3436 while(parseMatcher->find()) { 3437 parseMatcher->appendReplacement(deTaggedInput, "", status); 3438 REGEX_CHECK_STATUS; 3439 UnicodeString groupNum = parseMatcher->group(2, status); 3440 if (groupNum == "r") { 3441 // <r> or </r>, a region specification within the string 3442 if (parseMatcher->group(1, status) == "/") { 3443 regionEnd = deTaggedInput.length(); 3444 } else { 3445 regionStart = deTaggedInput.length(); 3446 } 3447 } else { 3448 // <digits> or </digits>, a group match boundary tag. 3449 if (parseMatcher->group(1, status) == "/") { 3450 set(groupEnds, deTaggedInput.length(), groupNum); 3451 } else { 3452 set(groupStarts, deTaggedInput.length(), groupNum); 3453 } 3454 } 3455 } 3456 parseMatcher->appendTail(deTaggedInput); 3457 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); 3458 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { 3459 errln("mismatched <r> tags"); 3460 failed = TRUE; 3461 goto cleanupAndReturn; 3462 } 3463 3464 // 3465 // Configure the matcher according to the flags specified with this test. 
3466 // 3467 matcher = callerPattern->matcher(deTaggedInput, status); 3468 REGEX_CHECK_STATUS_L(line); 3469 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3470 matcher->setTrace(TRUE); 3471 } 3472 3473 if (UTF8Pattern != NULL) { 3474 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); 3475 status = U_ZERO_ERROR; // buffer overflow 3476 inputChars = new char[inputUTF8Length+1]; 3477 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status); 3478 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status); 3479 3480 if (status == U_ZERO_ERROR) { 3481 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText); 3482 REGEX_CHECK_STATUS_L(line); 3483 } 3484 3485 if (UTF8Matcher == NULL) { 3486 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3487 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); 3488 status = U_ZERO_ERROR; 3489 } 3490 } 3491 3492 // 3493 // Generate native indices for UTF8 versions of region and capture group info 3494 // 3495 if (UTF8Matcher != NULL) { 3496 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8); 3497 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8); 3498 3499 // Fill out the native index UVector info. 3500 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size() 3501 for (i=0; i<groupStarts.size(); i++) { 3502 int32_t start = groupStarts.elementAti(i); 3503 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3504 if (start >= 0) { 3505 int32_t startUTF8; 3506 if (!utextOffsetToNative(&inputText, start, startUTF8)) { 3507 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start); 3508 failed = TRUE; 3509 goto cleanupAndReturn; // Good chance of subsequent bogus errors. 
Stop now. 3510 } 3511 setInt(groupStartsUTF8, startUTF8, i); 3512 } 3513 3514 int32_t end = groupEnds.elementAti(i); 3515 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3516 if (end >= 0) { 3517 int32_t endUTF8; 3518 if (!utextOffsetToNative(&inputText, end, endUTF8)) { 3519 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end); 3520 failed = TRUE; 3521 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3522 } 3523 setInt(groupEndsUTF8, endUTF8, i); 3524 } 3525 } 3526 } 3527 3528 if (regionStart>=0) { 3529 matcher->region(regionStart, regionEnd, status); 3530 REGEX_CHECK_STATUS_L(line); 3531 if (UTF8Matcher != NULL) { 3532 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status); 3533 REGEX_CHECK_STATUS_L(line); 3534 } 3535 } 3536 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag 3537 matcher->useAnchoringBounds(FALSE); 3538 if (UTF8Matcher != NULL) { 3539 UTF8Matcher->useAnchoringBounds(FALSE); 3540 } 3541 } 3542 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag 3543 matcher->useTransparentBounds(TRUE); 3544 if (UTF8Matcher != NULL) { 3545 UTF8Matcher->useTransparentBounds(TRUE); 3546 } 3547 } 3548 3549 3550 3551 // 3552 // Do a find on the de-tagged input using the caller's pattern 3553 // TODO: error on count>1 and not find(). 3554 // error on both matches() and lookingAt(). 
3555 // 3556 for (i=0; i<numFinds; i++) { 3557 if (useMatchesFunc) { 3558 isMatch = matcher->matches(status); 3559 if (UTF8Matcher != NULL) { 3560 isUTF8Match = UTF8Matcher->matches(status); 3561 } 3562 } else if (useLookingAtFunc) { 3563 isMatch = matcher->lookingAt(status); 3564 if (UTF8Matcher != NULL) { 3565 isUTF8Match = UTF8Matcher->lookingAt(status); 3566 } 3567 } else { 3568 isMatch = matcher->find(); 3569 if (UTF8Matcher != NULL) { 3570 isUTF8Match = UTF8Matcher->find(); 3571 } 3572 } 3573 } 3574 matcher->setTrace(FALSE); 3575 3576 // 3577 // Match up the groups from the find() with the groups from the tags 3578 // 3579 3580 // number of tags should match number of groups from find operation. 3581 // matcher->groupCount does not include group 0, the entire match, hence the +1. 3582 // G option in test means that capture group data is not available in the 3583 // expected results, so the check needs to be suppressed. 3584 if (isMatch == FALSE && groupStarts.size() != 0) { 3585 dataerrln("Error at line %d: Match expected, but none found.", line); 3586 failed = TRUE; 3587 goto cleanupAndReturn; 3588 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { 3589 errln("Error at line %d: Match expected, but none found. (UTF8)", line); 3590 failed = TRUE; 3591 goto cleanupAndReturn; 3592 } 3593 3594 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { 3595 // Only check for match / no match. Don't check capture groups. 3596 if (isMatch && groupStarts.size() == 0) { 3597 errln("Error at line %d: No match expected, but one found.", line); 3598 failed = TRUE; 3599 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) { 3600 errln("Error at line %d: No match expected, but one found. (UTF8)", line); 3601 failed = TRUE; 3602 } 3603 goto cleanupAndReturn; 3604 } 3605 3606 REGEX_CHECK_STATUS_L(line); 3607 for (i=0; i<=matcher->groupCount(); i++) { 3608 int32_t expectedStart = (i >= groupStarts.size()? 
-1 : groupStarts.elementAti(i)); 3609 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i)); 3610 if (matcher->start(i, status) != expectedStart) { 3611 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", 3612 line, i, expectedStart, matcher->start(i, status)); 3613 failed = TRUE; 3614 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3615 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) { 3616 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)", 3617 line, i, expectedStartUTF8, UTF8Matcher->start(i, status)); 3618 failed = TRUE; 3619 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3620 } 3621 3622 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); 3623 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i)); 3624 if (matcher->end(i, status) != expectedEnd) { 3625 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", 3626 line, i, expectedEnd, matcher->end(i, status)); 3627 failed = TRUE; 3628 // Error on end position; keep going; real error is probably yet to come as group 3629 // end positions work from end of the input data towards the front. 3630 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) { 3631 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)", 3632 line, i, expectedEndUTF8, UTF8Matcher->end(i, status)); 3633 failed = TRUE; 3634 // Error on end position; keep going; real error is probably yet to come as group 3635 // end positions work from end of the input data towards the front. 
3636 } 3637 } 3638 if ( matcher->groupCount()+1 < groupStarts.size()) { 3639 errln("Error at line %d: Expected %d capture groups, found %d.", 3640 line, groupStarts.size()-1, matcher->groupCount()); 3641 failed = TRUE; 3642 } 3643 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) { 3644 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)", 3645 line, groupStarts.size()-1, UTF8Matcher->groupCount()); 3646 failed = TRUE; 3647 } 3648 3649 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3650 matcher->requireEnd() == TRUE) { 3651 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); 3652 failed = TRUE; 3653 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3654 UTF8Matcher->requireEnd() == TRUE) { 3655 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); 3656 failed = TRUE; 3657 } 3658 3659 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true 3660 matcher->requireEnd() == FALSE) { 3661 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); 3662 failed = TRUE; 3663 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false 3664 UTF8Matcher->requireEnd() == FALSE) { 3665 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); 3666 failed = TRUE; 3667 } 3668 3669 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3670 matcher->hitEnd() == TRUE) { 3671 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); 3672 failed = TRUE; 3673 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3674 UTF8Matcher->hitEnd() == TRUE) { 3675 errln("Error at line %d: hitEnd() returned TRUE. 
Expected FALSE (UTF8)", line); 3676 failed = TRUE; 3677 } 3678 3679 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3680 matcher->hitEnd() == FALSE) { 3681 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line); 3682 failed = TRUE; 3683 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true 3684 UTF8Matcher->hitEnd() == FALSE) { 3685 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line); 3686 failed = TRUE; 3687 } 3688 3689 3690cleanupAndReturn: 3691 if (failed) { 3692 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" " 3693 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\""); 3694 // callerPattern->dump(); 3695 } 3696 delete parseMatcher; 3697 delete parsePat; 3698 delete UTF8Matcher; 3699 delete UTF8Pattern; 3700 delete matcher; 3701 delete callerPattern; 3702 3703 utext_close(&inputText); 3704 delete[] inputChars; 3705 utext_close(&patternText); 3706 delete[] patternChars; 3707 ucnv_close(UTF8Converter); 3708} 3709 3710 3711 3712 3713//--------------------------------------------------------------------------- 3714// 3715// Errors Check for error handling in patterns. 3716// 3717//--------------------------------------------------------------------------- 3718void RegexTest::Errors() { 3719 // \escape sequences that aren't implemented yet. 
3720 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED); 3721 3722 // Missing close parentheses 3723 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN); 3724 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN); 3725 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN); 3726 3727 // Extra close paren 3728 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN); 3729 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN); 3730 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN); 3731 3732 // Look-ahead, Look-behind 3733 // TODO: add tests for unbounded length look-behinds. 3734 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct 3735 3736 // Attempt to use non-default flags 3737 { 3738 UParseError pe; 3739 UErrorCode status = U_ZERO_ERROR; 3740 int32_t flags = UREGEX_CANON_EQ | 3741 UREGEX_COMMENTS | UREGEX_DOTALL | 3742 UREGEX_MULTILINE; 3743 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status); 3744 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED); 3745 delete pat1; 3746 } 3747 3748 3749 // Quantifiers are allowed only after something that can be quantified. 
3750 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); 3751 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 3752 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 3753 3754 // Mal-formed {min,max} quantifiers 3755 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 3756 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 3757 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 3758 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 3759 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 3760 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 3761 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 3762 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 3763 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 3764 3765 // Ticket 5389 3766 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 3767 3768 // Invalid Back Reference \0 3769 // For ICU 3.8 and earlier 3770 // For ICU versions newer than 3.8, \0 introduces an octal escape. 3771 // 3772 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE); 3773 3774} 3775 3776 3777//------------------------------------------------------------------------------- 3778// 3779// Read a text data file, convert it to UChars, and return the data 3780// in one big UChar * buffer, which the caller must delete. 3781// 3782//-------------------------------------------------------------------------------- 3783UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, 3784 const char *defEncoding, UErrorCode &status) { 3785 UChar *retPtr = NULL; 3786 char *fileBuf = NULL; 3787 UConverter* conv = NULL; 3788 FILE *f = NULL; 3789 3790 ulen = 0; 3791 if (U_FAILURE(status)) { 3792 return retPtr; 3793 } 3794 3795 // 3796 // Open the file. 
3797 // 3798 f = fopen(fileName, "rb"); 3799 if (f == 0) { 3800 dataerrln("Error opening test data file %s\n", fileName); 3801 status = U_FILE_ACCESS_ERROR; 3802 return NULL; 3803 } 3804 // 3805 // Read it in 3806 // 3807 int32_t fileSize; 3808 int32_t amt_read; 3809 3810 fseek( f, 0, SEEK_END); 3811 fileSize = ftell(f); 3812 fileBuf = new char[fileSize]; 3813 fseek(f, 0, SEEK_SET); 3814 amt_read = fread(fileBuf, 1, fileSize, f); 3815 if (amt_read != fileSize || fileSize <= 0) { 3816 errln("Error reading test data file."); 3817 goto cleanUpAndReturn; 3818 } 3819 3820 // 3821 // Look for a Unicode Signature (BOM) on the data just read 3822 // 3823 int32_t signatureLength; 3824 const char * fileBufC; 3825 const char* encoding; 3826 3827 fileBufC = fileBuf; 3828 encoding = ucnv_detectUnicodeSignature( 3829 fileBuf, fileSize, &signatureLength, &status); 3830 if(encoding!=NULL ){ 3831 fileBufC += signatureLength; 3832 fileSize -= signatureLength; 3833 } else { 3834 encoding = defEncoding; 3835 if (strcmp(encoding, "utf-8") == 0) { 3836 errln("file %s is missing its BOM", fileName); 3837 } 3838 } 3839 3840 // 3841 // Open a converter to take the rule file to UTF-16 3842 // 3843 conv = ucnv_open(encoding, &status); 3844 if (U_FAILURE(status)) { 3845 goto cleanUpAndReturn; 3846 } 3847 3848 // 3849 // Convert the rules to UChar. 3850 // Preflight first to determine required buffer size. 3851 // 3852 ulen = ucnv_toUChars(conv, 3853 NULL, // dest, 3854 0, // destCapacity, 3855 fileBufC, 3856 fileSize, 3857 &status); 3858 if (status == U_BUFFER_OVERFLOW_ERROR) { 3859 // Buffer Overflow is expected from the preflight operation. 
3860 status = U_ZERO_ERROR; 3861 3862 retPtr = new UChar[ulen+1]; 3863 ucnv_toUChars(conv, 3864 retPtr, // dest, 3865 ulen+1, 3866 fileBufC, 3867 fileSize, 3868 &status); 3869 } 3870 3871cleanUpAndReturn: 3872 fclose(f); 3873 delete[] fileBuf; 3874 ucnv_close(conv); 3875 if (U_FAILURE(status)) { 3876 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3877 delete []retPtr; 3878 retPtr = 0; 3879 ulen = 0; 3880 }; 3881 return retPtr; 3882} 3883 3884 3885//------------------------------------------------------------------------------- 3886// 3887// PerlTests - Run Perl's regular expression tests 3888// The input file for this test is re_tests, the standard regular 3889// expression test data distributed with the Perl source code. 3890// 3891// Here is Perl's description of the test data file: 3892// 3893// # The tests are in a separate file 't/op/re_tests'. 3894// # Each line in that file is a separate test. 3895// # There are five columns, separated by tabs. 3896// # 3897// # Column 1 contains the pattern, optionally enclosed in C<''>. 3898// # Modifiers can be put after the closing C<'>. 3899// # 3900// # Column 2 contains the string to be matched. 3901// # 3902// # Column 3 contains the expected result: 3903// # y expect a match 3904// # n expect no match 3905// # c expect an error 3906// # B test exposes a known bug in Perl, should be skipped 3907// # b test exposes a known bug in Perl, should be skipped if noamp 3908// # 3909// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 3910// # 3911// # Column 4 contains a string, usually C<$&>. 3912// # 3913// # Column 5 contains the expected result of double-quote 3914// # interpolating that string after the match, or start of error message. 3915// # 3916// # Column 6, if present, contains a reason why the test is skipped. 3917// # This is printed with "skipped", for harness to pick up. 3918// # 3919// # \n in the tests are interpolated, as are variables of the form ${\w+}. 
3920// # 3921// # If you want to add a regular expression test that can't be expressed 3922// # in this format, don't add it here: put it in op/pat.t instead. 3923// 3924// For ICU, if field 3 contains an 'i', the test will be skipped. 3925// The test exposes is some known incompatibility between ICU and Perl regexps. 3926// (The i is in addition to whatever was there before.) 3927// 3928//------------------------------------------------------------------------------- 3929void RegexTest::PerlTests() { 3930 char tdd[2048]; 3931 const char *srcPath; 3932 UErrorCode status = U_ZERO_ERROR; 3933 UParseError pe; 3934 3935 // 3936 // Open and read the test data file. 3937 // 3938 srcPath=getPath(tdd, "re_tests.txt"); 3939 if(srcPath==NULL) { 3940 return; /* something went wrong, error already output */ 3941 } 3942 3943 int32_t len; 3944 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 3945 if (U_FAILURE(status)) { 3946 return; /* something went wrong, error already output */ 3947 } 3948 3949 // 3950 // Put the test data into a UnicodeString 3951 // 3952 UnicodeString testDataString(FALSE, testData, len); 3953 3954 // 3955 // Regex to break the input file into lines, and strip the new lines. 3956 // One line per match, capture group one is the desired data. 3957 // 3958 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 3959 if (U_FAILURE(status)) { 3960 dataerrln("RegexPattern::compile() error"); 3961 return; 3962 } 3963 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 3964 3965 // 3966 // Regex to split a test file line into fields. 3967 // There are six fields, separated by tabs. 3968 // 3969 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 3970 3971 // 3972 // Regex to identify test patterns with flag settings, and to separate them. 
3973 // Test patterns with flags look like 'pattern'i 3974 // Test patterns without flags are not quoted: pattern 3975 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 3976 // 3977 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 3978 RegexMatcher* flagMat = flagPat->matcher(status); 3979 3980 // 3981 // The Perl tests reference several perl-isms, which are evaluated/substituted 3982 // in the test data. Not being perl, this must be done explicitly. Here 3983 // are string constants and REs for these constructs. 3984 // 3985 UnicodeString nulnulSrc("${nulnul}"); 3986 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 3987 nulnul = nulnul.unescape(); 3988 3989 UnicodeString ffffSrc("${ffff}"); 3990 UnicodeString ffff("\\uffff", -1, US_INV); 3991 ffff = ffff.unescape(); 3992 3993 // regexp for $-[0], $+[2], etc. 3994 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 3995 RegexMatcher *groupsMat = groupsPat->matcher(status); 3996 3997 // regexp for $0, $1, $2, etc. 3998 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 3999 RegexMatcher *cgMat = cgPat->matcher(status); 4000 4001 4002 // 4003 // Main Loop for the Perl Tests, runs once per line from the 4004 // test data file. 4005 // 4006 int32_t lineNum = 0; 4007 int32_t skippedUnimplementedCount = 0; 4008 while (lineMat->find()) { 4009 lineNum++; 4010 4011 // 4012 // Get a line, break it into its fields, do the Perl 4013 // variable substitutions. 
4014 // 4015 UnicodeString line = lineMat->group(1, status); 4016 UnicodeString fields[7]; 4017 fieldPat->split(line, fields, 7, status); 4018 4019 flagMat->reset(fields[0]); 4020 flagMat->matches(status); 4021 UnicodeString pattern = flagMat->group(2, status); 4022 pattern.findAndReplace("${bang}", "!"); 4023 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4024 pattern.findAndReplace(ffffSrc, ffff); 4025 4026 // 4027 // Identify patterns that include match flag settings, 4028 // split off the flags, remove the extra quotes. 4029 // 4030 UnicodeString flagStr = flagMat->group(3, status); 4031 if (U_FAILURE(status)) { 4032 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4033 return; 4034 } 4035 int32_t flags = 0; 4036 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4037 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4038 const UChar UChar_m = 0x6d; 4039 const UChar UChar_x = 0x78; 4040 const UChar UChar_y = 0x79; 4041 if (flagStr.indexOf(UChar_i) != -1) { 4042 flags |= UREGEX_CASE_INSENSITIVE; 4043 } 4044 if (flagStr.indexOf(UChar_m) != -1) { 4045 flags |= UREGEX_MULTILINE; 4046 } 4047 if (flagStr.indexOf(UChar_x) != -1) { 4048 flags |= UREGEX_COMMENTS; 4049 } 4050 4051 // 4052 // Compile the test pattern. 4053 // 4054 status = U_ZERO_ERROR; 4055 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); 4056 if (status == U_REGEX_UNIMPLEMENTED) { 4057 // 4058 // Test of a feature that is planned for ICU, but not yet implemented. 4059 // skip the test. 4060 skippedUnimplementedCount++; 4061 delete testPat; 4062 status = U_ZERO_ERROR; 4063 continue; 4064 } 4065 4066 if (U_FAILURE(status)) { 4067 // Some tests are supposed to generate errors. 4068 // Only report an error for tests that are supposed to succeed. 
4069 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4070 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4071 { 4072 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4073 } 4074 status = U_ZERO_ERROR; 4075 delete testPat; 4076 continue; 4077 } 4078 4079 if (fields[2].indexOf(UChar_i) >= 0) { 4080 // ICU should skip this test. 4081 delete testPat; 4082 continue; 4083 } 4084 4085 if (fields[2].indexOf(UChar_c) >= 0) { 4086 // This pattern should have caused a compilation error, but didn't/ 4087 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4088 delete testPat; 4089 continue; 4090 } 4091 4092 // 4093 // replace the Perl variables that appear in some of the 4094 // match data strings. 4095 // 4096 UnicodeString matchString = fields[1]; 4097 matchString.findAndReplace(nulnulSrc, nulnul); 4098 matchString.findAndReplace(ffffSrc, ffff); 4099 4100 // Replace any \n in the match string with an actual new-line char. 4101 // Don't do full unescape, as this unescapes more than Perl does, which 4102 // causes other spurious failures in the tests. 4103 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4104 4105 4106 4107 // 4108 // Run the test, check for expected match/don't match result. 4109 // 4110 RegexMatcher *testMat = testPat->matcher(matchString, status); 4111 UBool found = testMat->find(); 4112 UBool expected = FALSE; 4113 if (fields[2].indexOf(UChar_y) >=0) { 4114 expected = TRUE; 4115 } 4116 if (expected != found) { 4117 errln("line %d: Expected %smatch, got %smatch", 4118 lineNum, expected?"":"no ", found?"":"no " ); 4119 continue; 4120 } 4121 4122 // Don't try to check expected results if there is no match. 
4123 // (Some have stuff in the expected fields) 4124 if (!found) { 4125 delete testMat; 4126 delete testPat; 4127 continue; 4128 } 4129 4130 // 4131 // Interpret the Perl expression from the fourth field of the data file, 4132 // building up an ICU string from the results of the ICU match. 4133 // The Perl expression will contain references to the results of 4134 // a regex match, including the matched string, capture group strings, 4135 // group starting and ending indicies, etc. 4136 // 4137 UnicodeString resultString; 4138 UnicodeString perlExpr = fields[3]; 4139#if SUPPORT_MUTATING_INPUT_STRING 4140 groupsMat->reset(perlExpr); 4141 cgMat->reset(perlExpr); 4142#endif 4143 4144 while (perlExpr.length() > 0) { 4145#if !SUPPORT_MUTATING_INPUT_STRING 4146 // Perferred usage. Reset after any modification to input string. 4147 groupsMat->reset(perlExpr); 4148 cgMat->reset(perlExpr); 4149#endif 4150 4151 if (perlExpr.startsWith("$&")) { 4152 resultString.append(testMat->group(status)); 4153 perlExpr.remove(0, 2); 4154 } 4155 4156 else if (groupsMat->lookingAt(status)) { 4157 // $-[0] $+[2] etc. 4158 UnicodeString digitString = groupsMat->group(2, status); 4159 int32_t t = 0; 4160 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4161 UnicodeString plusOrMinus = groupsMat->group(1, status); 4162 int32_t matchPosition; 4163 if (plusOrMinus.compare("+") == 0) { 4164 matchPosition = testMat->end(groupNum, status); 4165 } else { 4166 matchPosition = testMat->start(groupNum, status); 4167 } 4168 if (matchPosition != -1) { 4169 ICU_Utility::appendNumber(resultString, matchPosition); 4170 } 4171 perlExpr.remove(0, groupsMat->end(status)); 4172 } 4173 4174 else if (cgMat->lookingAt(status)) { 4175 // $1, $2, $3, etc. 
4176 UnicodeString digitString = cgMat->group(1, status); 4177 int32_t t = 0; 4178 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4179 if (U_SUCCESS(status)) { 4180 resultString.append(testMat->group(groupNum, status)); 4181 status = U_ZERO_ERROR; 4182 } 4183 perlExpr.remove(0, cgMat->end(status)); 4184 } 4185 4186 else if (perlExpr.startsWith("@-")) { 4187 int32_t i; 4188 for (i=0; i<=testMat->groupCount(); i++) { 4189 if (i>0) { 4190 resultString.append(" "); 4191 } 4192 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4193 } 4194 perlExpr.remove(0, 2); 4195 } 4196 4197 else if (perlExpr.startsWith("@+")) { 4198 int32_t i; 4199 for (i=0; i<=testMat->groupCount(); i++) { 4200 if (i>0) { 4201 resultString.append(" "); 4202 } 4203 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4204 } 4205 perlExpr.remove(0, 2); 4206 } 4207 4208 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 4209 // or as an escaped sequence (e.g. \n) 4210 if (perlExpr.length() > 1) { 4211 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4212 } 4213 UChar c = perlExpr.charAt(0); 4214 switch (c) { 4215 case 'n': c = '\n'; break; 4216 // add any other escape sequences that show up in the test expected results. 4217 } 4218 resultString.append(c); 4219 perlExpr.remove(0, 1); 4220 } 4221 4222 else { 4223 // Any characters from the perl expression that we don't explicitly 4224 // recognize before here are assumed to be literals and copied 4225 // as-is to the expected results. 
4226 resultString.append(perlExpr.charAt(0)); 4227 perlExpr.remove(0, 1); 4228 } 4229 4230 if (U_FAILURE(status)) { 4231 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4232 break; 4233 } 4234 } 4235 4236 // 4237 // Expected Results Compare 4238 // 4239 UnicodeString expectedS(fields[4]); 4240 expectedS.findAndReplace(nulnulSrc, nulnul); 4241 expectedS.findAndReplace(ffffSrc, ffff); 4242 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4243 4244 4245 if (expectedS.compare(resultString) != 0) { 4246 err("Line %d: Incorrect perl expression results.", lineNum); 4247 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4248 } 4249 4250 delete testMat; 4251 delete testPat; 4252 } 4253 4254 // 4255 // All done. Clean up allocated stuff. 4256 // 4257 delete cgMat; 4258 delete cgPat; 4259 4260 delete groupsMat; 4261 delete groupsPat; 4262 4263 delete flagMat; 4264 delete flagPat; 4265 4266 delete lineMat; 4267 delete linePat; 4268 4269 delete fieldPat; 4270 delete [] testData; 4271 4272 4273 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4274 4275} 4276 4277 4278//------------------------------------------------------------------------------- 4279// 4280// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts 4281// (instead of using UnicodeStrings) to test the alternate engine. 4282// The input file for this test is re_tests, the standard regular 4283// expression test data distributed with the Perl source code. 4284// See PerlTests() for more information. 
4285// 4286//------------------------------------------------------------------------------- 4287void RegexTest::PerlTestsUTF8() { 4288 char tdd[2048]; 4289 const char *srcPath; 4290 UErrorCode status = U_ZERO_ERROR; 4291 UParseError pe; 4292 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status)); 4293 UText patternText = UTEXT_INITIALIZER; 4294 char *patternChars = NULL; 4295 int32_t patternLength; 4296 int32_t patternCapacity = 0; 4297 UText inputText = UTEXT_INITIALIZER; 4298 char *inputChars = NULL; 4299 int32_t inputLength; 4300 int32_t inputCapacity = 0; 4301 4302 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 4303 4304 // 4305 // Open and read the test data file. 4306 // 4307 srcPath=getPath(tdd, "re_tests.txt"); 4308 if(srcPath==NULL) { 4309 return; /* something went wrong, error already output */ 4310 } 4311 4312 int32_t len; 4313 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4314 if (U_FAILURE(status)) { 4315 return; /* something went wrong, error already output */ 4316 } 4317 4318 // 4319 // Put the test data into a UnicodeString 4320 // 4321 UnicodeString testDataString(FALSE, testData, len); 4322 4323 // 4324 // Regex to break the input file into lines, and strip the new lines. 4325 // One line per match, capture group one is the desired data. 4326 // 4327 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4328 if (U_FAILURE(status)) { 4329 dataerrln("RegexPattern::compile() error"); 4330 return; 4331 } 4332 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4333 4334 // 4335 // Regex to split a test file line into fields. 4336 // There are six fields, separated by tabs. 4337 // 4338 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4339 4340 // 4341 // Regex to identify test patterns with flag settings, and to separate them. 
4342 // Test patterns with flags look like 'pattern'i 4343 // Test patterns without flags are not quoted: pattern 4344 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4345 // 4346 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4347 RegexMatcher* flagMat = flagPat->matcher(status); 4348 4349 // 4350 // The Perl tests reference several perl-isms, which are evaluated/substituted 4351 // in the test data. Not being perl, this must be done explicitly. Here 4352 // are string constants and REs for these constructs. 4353 // 4354 UnicodeString nulnulSrc("${nulnul}"); 4355 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4356 nulnul = nulnul.unescape(); 4357 4358 UnicodeString ffffSrc("${ffff}"); 4359 UnicodeString ffff("\\uffff", -1, US_INV); 4360 ffff = ffff.unescape(); 4361 4362 // regexp for $-[0], $+[2], etc. 4363 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4364 RegexMatcher *groupsMat = groupsPat->matcher(status); 4365 4366 // regexp for $0, $1, $2, etc. 4367 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4368 RegexMatcher *cgMat = cgPat->matcher(status); 4369 4370 4371 // 4372 // Main Loop for the Perl Tests, runs once per line from the 4373 // test data file. 4374 // 4375 int32_t lineNum = 0; 4376 int32_t skippedUnimplementedCount = 0; 4377 while (lineMat->find()) { 4378 lineNum++; 4379 4380 // 4381 // Get a line, break it into its fields, do the Perl 4382 // variable substitutions. 
4383 // 4384 UnicodeString line = lineMat->group(1, status); 4385 UnicodeString fields[7]; 4386 fieldPat->split(line, fields, 7, status); 4387 4388 flagMat->reset(fields[0]); 4389 flagMat->matches(status); 4390 UnicodeString pattern = flagMat->group(2, status); 4391 pattern.findAndReplace("${bang}", "!"); 4392 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4393 pattern.findAndReplace(ffffSrc, ffff); 4394 4395 // 4396 // Identify patterns that include match flag settings, 4397 // split off the flags, remove the extra quotes. 4398 // 4399 UnicodeString flagStr = flagMat->group(3, status); 4400 if (U_FAILURE(status)) { 4401 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4402 return; 4403 } 4404 int32_t flags = 0; 4405 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4406 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4407 const UChar UChar_m = 0x6d; 4408 const UChar UChar_x = 0x78; 4409 const UChar UChar_y = 0x79; 4410 if (flagStr.indexOf(UChar_i) != -1) { 4411 flags |= UREGEX_CASE_INSENSITIVE; 4412 } 4413 if (flagStr.indexOf(UChar_m) != -1) { 4414 flags |= UREGEX_MULTILINE; 4415 } 4416 if (flagStr.indexOf(UChar_x) != -1) { 4417 flags |= UREGEX_COMMENTS; 4418 } 4419 4420 // 4421 // Put the pattern in a UTF-8 UText 4422 // 4423 status = U_ZERO_ERROR; 4424 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4425 if (status == U_BUFFER_OVERFLOW_ERROR) { 4426 status = U_ZERO_ERROR; 4427 delete[] patternChars; 4428 patternCapacity = patternLength + 1; 4429 patternChars = new char[patternCapacity]; 4430 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4431 } 4432 utext_openUTF8(&patternText, patternChars, patternLength, &status); 4433 4434 // 4435 // Compile the test pattern. 
4436 // 4437 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status); 4438 if (status == U_REGEX_UNIMPLEMENTED) { 4439 // 4440 // Test of a feature that is planned for ICU, but not yet implemented. 4441 // skip the test. 4442 skippedUnimplementedCount++; 4443 delete testPat; 4444 status = U_ZERO_ERROR; 4445 continue; 4446 } 4447 4448 if (U_FAILURE(status)) { 4449 // Some tests are supposed to generate errors. 4450 // Only report an error for tests that are supposed to succeed. 4451 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4452 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4453 { 4454 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4455 } 4456 status = U_ZERO_ERROR; 4457 delete testPat; 4458 continue; 4459 } 4460 4461 if (fields[2].indexOf(UChar_i) >= 0) { 4462 // ICU should skip this test. 4463 delete testPat; 4464 continue; 4465 } 4466 4467 if (fields[2].indexOf(UChar_c) >= 0) { 4468 // This pattern should have caused a compilation error, but didn't/ 4469 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4470 delete testPat; 4471 continue; 4472 } 4473 4474 4475 // 4476 // replace the Perl variables that appear in some of the 4477 // match data strings. 4478 // 4479 UnicodeString matchString = fields[1]; 4480 matchString.findAndReplace(nulnulSrc, nulnul); 4481 matchString.findAndReplace(ffffSrc, ffff); 4482 4483 // Replace any \n in the match string with an actual new-line char. 4484 // Don't do full unescape, as this unescapes more than Perl does, which 4485 // causes other spurious failures in the tests. 
4486 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4487 4488 // 4489 // Put the input in a UTF-8 UText 4490 // 4491 status = U_ZERO_ERROR; 4492 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4493 if (status == U_BUFFER_OVERFLOW_ERROR) { 4494 status = U_ZERO_ERROR; 4495 delete[] inputChars; 4496 inputCapacity = inputLength + 1; 4497 inputChars = new char[inputCapacity]; 4498 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4499 } 4500 utext_openUTF8(&inputText, inputChars, inputLength, &status); 4501 4502 // 4503 // Run the test, check for expected match/don't match result. 4504 // 4505 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText); 4506 UBool found = testMat->find(); 4507 UBool expected = FALSE; 4508 if (fields[2].indexOf(UChar_y) >=0) { 4509 expected = TRUE; 4510 } 4511 if (expected != found) { 4512 errln("line %d: Expected %smatch, got %smatch", 4513 lineNum, expected?"":"no ", found?"":"no " ); 4514 continue; 4515 } 4516 4517 // Don't try to check expected results if there is no match. 4518 // (Some have stuff in the expected fields) 4519 if (!found) { 4520 delete testMat; 4521 delete testPat; 4522 continue; 4523 } 4524 4525 // 4526 // Interpret the Perl expression from the fourth field of the data file, 4527 // building up an ICU string from the results of the ICU match. 4528 // The Perl expression will contain references to the results of 4529 // a regex match, including the matched string, capture group strings, 4530 // group starting and ending indicies, etc. 
4531 // 4532 UnicodeString resultString; 4533 UnicodeString perlExpr = fields[3]; 4534 4535 while (perlExpr.length() > 0) { 4536 groupsMat->reset(perlExpr); 4537 cgMat->reset(perlExpr); 4538 4539 if (perlExpr.startsWith("$&")) { 4540 resultString.append(testMat->group(status)); 4541 perlExpr.remove(0, 2); 4542 } 4543 4544 else if (groupsMat->lookingAt(status)) { 4545 // $-[0] $+[2] etc. 4546 UnicodeString digitString = groupsMat->group(2, status); 4547 int32_t t = 0; 4548 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4549 UnicodeString plusOrMinus = groupsMat->group(1, status); 4550 int32_t matchPosition; 4551 if (plusOrMinus.compare("+") == 0) { 4552 matchPosition = testMat->end(groupNum, status); 4553 } else { 4554 matchPosition = testMat->start(groupNum, status); 4555 } 4556 if (matchPosition != -1) { 4557 ICU_Utility::appendNumber(resultString, matchPosition); 4558 } 4559 perlExpr.remove(0, groupsMat->end(status)); 4560 } 4561 4562 else if (cgMat->lookingAt(status)) { 4563 // $1, $2, $3, etc. 4564 UnicodeString digitString = cgMat->group(1, status); 4565 int32_t t = 0; 4566 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4567 if (U_SUCCESS(status)) { 4568 resultString.append(testMat->group(groupNum, status)); 4569 status = U_ZERO_ERROR; 4570 } 4571 perlExpr.remove(0, cgMat->end(status)); 4572 } 4573 4574 else if (perlExpr.startsWith("@-")) { 4575 int32_t i; 4576 for (i=0; i<=testMat->groupCount(); i++) { 4577 if (i>0) { 4578 resultString.append(" "); 4579 } 4580 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4581 } 4582 perlExpr.remove(0, 2); 4583 } 4584 4585 else if (perlExpr.startsWith("@+")) { 4586 int32_t i; 4587 for (i=0; i<=testMat->groupCount(); i++) { 4588 if (i>0) { 4589 resultString.append(" "); 4590 } 4591 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4592 } 4593 perlExpr.remove(0, 2); 4594 } 4595 4596 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. 
Take following char as a literal. 4597 // or as an escaped sequence (e.g. \n) 4598 if (perlExpr.length() > 1) { 4599 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4600 } 4601 UChar c = perlExpr.charAt(0); 4602 switch (c) { 4603 case 'n': c = '\n'; break; 4604 // add any other escape sequences that show up in the test expected results. 4605 } 4606 resultString.append(c); 4607 perlExpr.remove(0, 1); 4608 } 4609 4610 else { 4611 // Any characters from the perl expression that we don't explicitly 4612 // recognize before here are assumed to be literals and copied 4613 // as-is to the expected results. 4614 resultString.append(perlExpr.charAt(0)); 4615 perlExpr.remove(0, 1); 4616 } 4617 4618 if (U_FAILURE(status)) { 4619 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4620 break; 4621 } 4622 } 4623 4624 // 4625 // Expected Results Compare 4626 // 4627 UnicodeString expectedS(fields[4]); 4628 expectedS.findAndReplace(nulnulSrc, nulnul); 4629 expectedS.findAndReplace(ffffSrc, ffff); 4630 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4631 4632 4633 if (expectedS.compare(resultString) != 0) { 4634 err("Line %d: Incorrect perl expression results.", lineNum); 4635 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4636 } 4637 4638 delete testMat; 4639 delete testPat; 4640 } 4641 4642 // 4643 // All done. Clean up allocated stuff. 
//
    delete cgMat;
    delete cgPat;

    delete groupsMat;
    delete groupsPat;

    delete flagMat;
    delete flagPat;

    delete lineMat;
    delete linePat;

    delete fieldPat;
    delete [] testData;

    utext_close(&patternText);
    utext_close(&inputText);

    delete [] patternChars;
    delete [] inputChars;


    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);

}


//--------------------------------------------------------------
//
//  Bug6149   Verify limits to heap expansion for backtrack stack.
//             Use this pattern,
//                 "(a?){1,}"
//             The zero-length match will repeat forever.
//                (That this goes into a loop is another bug)
//
//             The test runs matches() against the input "xyz" and checks,
//             through REGEX_ASSERT_FAIL (defined elsewhere in this file),
//             for U_REGEX_STACK_OVERFLOW; it then checks that the match
//             result is FALSE.
//
//---------------------------------------------------------------
void RegexTest::Bug6149() {
    UnicodeString pattern("(a?){1,}");
    UnicodeString s("xyz");
    uint32_t flags = 0;
    UErrorCode status = U_ZERO_ERROR;

    RegexMatcher  matcher(pattern, s, flags, status);
    UBool result = false;
    // The assignment inside the macro argument captures the return value of matches().
    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
    REGEX_ASSERT(result == FALSE);
 }


//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//

// State shared between a test and testCallBackFn() through the callback's context pointer.
struct callBackContext {
    RegexTest        *test;       // Test object used by the callback to report errors.
    int32_t          maxCalls;    // The callback returns FALSE once numCalls reaches this value.
    int32_t          numCalls;    // Number of times the callback has been invoked since reset().
    int32_t          lastSteps;   // The "steps" argument seen on the previous invocation.
    // Set a new call limit and clear the counters.
    void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
};

U_CDECL_BEGIN
// Match callback.  Reports an error if "steps" is not exactly one more than on the
// previous call, records the call, and returns TRUE only while the number of calls
// is still below the context's maxCalls.
static UBool U_CALLCONV
testCallBackFn(const void *context, int32_t steps) {
    callBackContext  *info = (callBackContext *)context;
    if (info->lastSteps+1 != steps) {
        info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
    }
    info->lastSteps = steps;
    info->numCalls++;
    return (info->numCalls < info->maxCalls);
}
U_CDECL_END

// Exercises getMatchCallback() / setMatchCallback() and the effect of the callback
// on matches() for short, medium and long running matches.
void RegexTest::Callbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void          *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        callBackContext cbInfo = {this, 0, 0, 0};
        const void          *returnedContext;
        URegexMatchCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        //   callback, but not so long that the callback aborts it.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        //   The same call limit (4) is used; only the input is longer.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}


//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//

// State shared between a test and testProgressCallBackFn() through the callback's context pointer.
struct progressCallBackContext {
    RegexTest        *test;       // Owning test object (only referenced by the disabled trace line below).
    int64_t          lastIndex;   // The matchIndex passed to the most recent invocation.
    int32_t          maxCalls;    // The callback returns FALSE once numCalls reaches this value.
    int32_t          numCalls;    // Number of times the callback has been invoked since reset().
    // Set a new call limit and clear the counter and the last index.
    void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
};

U_CDECL_BEGIN
// Find-progress callback.  Counts the call, remembers the reported index, and returns
// TRUE only while the number of calls is still below the context's maxCalls.
static UBool U_CALLCONV
testProgressCallBackFn(const void *context, int64_t matchIndex) {
    progressCallBackContext  *info = (progressCallBackContext *)context;
    info->numCalls++;
    info->lastIndex = matchIndex;
//    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
    return (info->numCalls < info->maxCalls);
}
U_CDECL_END

// Exercises getFindProgressCallback() / setFindProgressCallback() and the effect of the
// callback on find() for short, medium and long running searches.
void RegexTest::FindProgressCallbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void                  *returnedContext = &returnedContext;
        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        progressCallBackContext cbInfo = {this, 0, 0, 0};
        const void                  *returnedContext;
        URegexFindProgressCallback  *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match should NOT invoke the callback.
        status = U_ZERO_ERROR;
        cbInfo.reset(100);
        UnicodeString s = "abxxx";
        matcher.reset(s);
#if 0
        matcher.setTrace(TRUE);
#endif
        REGEX_ASSERT(matcher.find(0, status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium running match that causes matcher.find() to invoke our callback for each index.
        status = U_ZERO_ERROR;
        s = "aaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s.length()); //  Limit the callback to s.length() calls; meant as a generous upper bound.
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
        status = U_ZERO_ERROR;
        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
        matcher.reset(s1);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

#if 0
        // Now a match that will succeed, but after an interruption
        status = U_ZERO_ERROR;
        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
        matcher.reset(s2);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        // Now retry the match from where left off
        cbInfo.maxCalls = 100; //  No callback limit
        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
        REGEX_CHECK_STATUS;
#endif
    }


}


//---------------------------------------------------------------------------
//
//    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
//                             UTexts. The pure-C implementation of UText
//                             has no mutable backing stores, but we can
//                             use UnicodeString here to test the functionality.
4902// 4903//--------------------------------------------------------------------------- 4904void RegexTest::PreAllocatedUTextCAPI () { 4905 UErrorCode status = U_ZERO_ERROR; 4906 URegularExpression *re; 4907 UText patternText = UTEXT_INITIALIZER; 4908 UnicodeString buffer; 4909 UText bufferText = UTEXT_INITIALIZER; 4910 4911 utext_openUnicodeString(&bufferText, &buffer, &status); 4912 4913 /* 4914 * getText() and getUText() 4915 */ 4916 { 4917 UText text1 = UTEXT_INITIALIZER; 4918 UText text2 = UTEXT_INITIALIZER; 4919 UChar text2Chars[20]; 4920 UText *resultText; 4921 4922 status = U_ZERO_ERROR; 4923 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status); 4924 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status); 4925 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2); 4926 utext_openUChars(&text2, text2Chars, -1, &status); 4927 4928 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status); 4929 re = uregex_openUText(&patternText, 0, NULL, &status); 4930 4931 /* First set a UText */ 4932 uregex_setUText(re, &text1, &status); 4933 resultText = uregex_getUText(re, &bufferText, &status); 4934 REGEX_CHECK_STATUS; 4935 REGEX_ASSERT(resultText == &bufferText); 4936 utext_setNativeIndex(resultText, 0); 4937 utext_setNativeIndex(&text1, 0); 4938 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0); 4939 4940 resultText = uregex_getUText(re, &bufferText, &status); 4941 REGEX_CHECK_STATUS; 4942 REGEX_ASSERT(resultText == &bufferText); 4943 utext_setNativeIndex(resultText, 0); 4944 utext_setNativeIndex(&text1, 0); 4945 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0); 4946 4947 /* Then set a UChar * */ 4948 uregex_setText(re, text2Chars, 7, &status); 4949 resultText = uregex_getUText(re, &bufferText, &status); 4950 REGEX_CHECK_STATUS; 4951 REGEX_ASSERT(resultText == &bufferText); 4952 utext_setNativeIndex(resultText, 0); 4953 utext_setNativeIndex(&text2, 0); 4954 REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0); 
4955 4956 uregex_close(re); 4957 utext_close(&text1); 4958 utext_close(&text2); 4959 } 4960 4961 /* 4962 * group() 4963 */ 4964 { 4965 UChar text1[80]; 4966 UText *actual; 4967 UBool result; 4968 u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2); 4969 4970 status = U_ZERO_ERROR; 4971 re = uregex_openC("abc(.*?)def", 0, NULL, &status); 4972 REGEX_CHECK_STATUS; 4973 4974 uregex_setText(re, text1, -1, &status); 4975 result = uregex_find(re, 0, &status); 4976 REGEX_ASSERT(result==TRUE); 4977 4978 /* Capture Group 0, the full match. Should succeed. */ 4979 status = U_ZERO_ERROR; 4980 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status); 4981 REGEX_CHECK_STATUS; 4982 REGEX_ASSERT(actual == &bufferText); 4983 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual); 4984 4985 /* Capture group #1. Should succeed. */ 4986 status = U_ZERO_ERROR; 4987 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status); 4988 REGEX_CHECK_STATUS; 4989 REGEX_ASSERT(actual == &bufferText); 4990 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual); 4991 4992 /* Capture group out of range. Error. 
*/ 4993 status = U_ZERO_ERROR; 4994 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status); 4995 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 4996 REGEX_ASSERT(actual == &bufferText); 4997 4998 uregex_close(re); 4999 5000 } 5001 5002 /* 5003 * replaceFirst() 5004 */ 5005 { 5006 UChar text1[80]; 5007 UChar text2[80]; 5008 UText replText = UTEXT_INITIALIZER; 5009 UText *result; 5010 5011 status = U_ZERO_ERROR; 5012 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 5013 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 5014 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5015 5016 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5017 REGEX_CHECK_STATUS; 5018 5019 /* Normal case, with match */ 5020 uregex_setText(re, text1, -1, &status); 5021 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5022 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5023 REGEX_CHECK_STATUS; 5024 REGEX_ASSERT(result == &bufferText); 5025 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); 5026 5027 /* No match. Text should copy to output with no changes. 
*/ 5028 uregex_setText(re, text2, -1, &status); 5029 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5030 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5031 REGEX_CHECK_STATUS; 5032 REGEX_ASSERT(result == &bufferText); 5033 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5034 5035 /* Unicode escapes */ 5036 uregex_setText(re, text1, -1, &status); 5037 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status); 5038 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5039 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5040 REGEX_CHECK_STATUS; 5041 REGEX_ASSERT(result == &bufferText); 5042 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); 5043 5044 uregex_close(re); 5045 utext_close(&replText); 5046 } 5047 5048 5049 /* 5050 * replaceAll() 5051 */ 5052 { 5053 UChar text1[80]; 5054 UChar text2[80]; 5055 UText replText = UTEXT_INITIALIZER; 5056 UText *result; 5057 5058 status = U_ZERO_ERROR; 5059 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 5060 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 5061 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5062 5063 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5064 REGEX_CHECK_STATUS; 5065 5066 /* Normal case, with match */ 5067 uregex_setText(re, text1, -1, &status); 5068 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5069 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5070 REGEX_CHECK_STATUS; 5071 REGEX_ASSERT(result == &bufferText); 5072 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result); 5073 5074 /* No match. Text should copy to output with no changes. 
*/ 5075 uregex_setText(re, text2, -1, &status); 5076 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5077 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5078 REGEX_CHECK_STATUS; 5079 REGEX_ASSERT(result == &bufferText); 5080 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5081 5082 uregex_close(re); 5083 utext_close(&replText); 5084 } 5085 5086 5087 /* 5088 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, 5089 * so we don't need to test it here. 5090 */ 5091 5092 utext_close(&bufferText); 5093 utext_close(&patternText); 5094} 5095 5096//-------------------------------------------------------------- 5097// 5098// Bug7651 Regex pattern that exceeds default operator stack depth in matcher. 5099// 5100//--------------------------------------------------------------- 5101void RegexTest::Bug7651() { 5102 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)"); 5103 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData. 5104 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation. 
5105 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)"); 5106 UnicodeString s("#ff @abcd This is test"); 5107 RegexPattern *REPattern = NULL; 5108 RegexMatcher *REMatcher = NULL; 5109 UErrorCode status = U_ZERO_ERROR; 5110 UParseError pe; 5111 5112 REPattern = RegexPattern::compile(pattern1, 0, pe, status); 5113 REGEX_CHECK_STATUS; 5114 REMatcher = REPattern->matcher(s, status); 5115 REGEX_CHECK_STATUS; 5116 REGEX_ASSERT(REMatcher->find()); 5117 REGEX_ASSERT(REMatcher->start(status) == 0); 5118 delete REPattern; 5119 delete REMatcher; 5120 status = U_ZERO_ERROR; 5121 5122 REPattern = RegexPattern::compile(pattern2, 0, pe, status); 5123 REGEX_CHECK_STATUS; 5124 REMatcher = REPattern->matcher(s, status); 5125 REGEX_CHECK_STATUS; 5126 REGEX_ASSERT(REMatcher->find()); 5127 REGEX_ASSERT(REMatcher->start(status) == 0); 5128 delete REPattern; 5129 delete REMatcher; 5130 status = U_ZERO_ERROR; 5131 } 5132 5133void RegexTest::Bug7740() { 5134 UErrorCode status = U_ZERO_ERROR; 5135 UnicodeString pattern = "(a)"; 5136 UnicodeString text = "abcdef"; 5137 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status); 5138 REGEX_CHECK_STATUS; 5139 REGEX_ASSERT(m->lookingAt(status)); 5140 REGEX_CHECK_STATUS; 5141 status = U_ILLEGAL_ARGUMENT_ERROR; 5142 UnicodeString s = m->group(1, status); // Bug 7740: segfault here. 5143 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5144 REGEX_ASSERT(s == ""); 5145 delete m; 5146} 5147 5148// Bug 8479: was crashing whith a Bogus UnicodeString as input. 
5149 5150void RegexTest::Bug8479() { 5151 UErrorCode status = U_ZERO_ERROR; 5152 5153 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status); 5154 REGEX_CHECK_STATUS; 5155 if (U_SUCCESS(status)) 5156 { 5157 UnicodeString str; 5158 str.setToBogus(); 5159 pMatcher->reset(str); 5160 status = U_ZERO_ERROR; 5161 pMatcher->matches(status); 5162 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5163 delete pMatcher; 5164 } 5165} 5166 5167 5168// Bug 7029 5169void RegexTest::Bug7029() { 5170 UErrorCode status = U_ZERO_ERROR; 5171 5172 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status); 5173 UnicodeString text = "abc.def"; 5174 UnicodeString splits[10]; 5175 REGEX_CHECK_STATUS; 5176 int32_t numFields = pMatcher->split(text, splits, 10, status); 5177 REGEX_CHECK_STATUS; 5178 REGEX_ASSERT(numFields == 8); 5179 delete pMatcher; 5180} 5181 5182void RegexTest::CheckInvBufSize() { 5183 if(inv_next>=INV_BUFSIZ) { 5184 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n", 5185 __FILE__, INV_BUFSIZ, inv_next); 5186 } else { 5187 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next); 5188 } 5189} 5190 5191#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 5192 5193