1/******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 2002-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 7// 8// regextst.cpp 9// 10// ICU Regular Expressions test, part of intltest. 11// 12 13/* 14 NOTE!! 15 16 PLEASE be careful about ASCII assumptions in this test. 17 This test is one of the worst repeat offenders. 18 If you have questions, contact someone on the ICU PMC 19 who has access to an EBCDIC system. 20 21 */ 22 23#include "intltest.h" 24#if !UCONFIG_NO_REGULAR_EXPRESSIONS 25 26#include "unicode/localpointer.h" 27#include "unicode/regex.h" 28#include "unicode/uchar.h" 29#include "unicode/ucnv.h" 30#include "unicode/uniset.h" 31#include "unicode/uregex.h" 32#include "unicode/usetiter.h" 33#include "unicode/ustring.h" 34#include "regextst.h" 35#include "regexcmp.h" 36#include "uvector.h" 37#include "util.h" 38#include <stdlib.h> 39#include <string.h> 40#include <stdio.h> 41#include "cmemory.h" 42#include "cstring.h" 43#include "uinvchar.h" 44 45#define SUPPORT_MUTATING_INPUT_STRING 0 46 47//--------------------------------------------------------------------------- 48// 49// Test class boilerplate 50// 51//--------------------------------------------------------------------------- 52RegexTest::RegexTest() 53{ 54} 55 56 57RegexTest::~RegexTest() 58{ 59} 60 61 62 63void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 64{ 65 if (exec) logln("TestSuite RegexTest: "); 66 switch (index) { 67 68 case 0: name = "Basic"; 69 if (exec) Basic(); 70 break; 71 case 1: name = "API_Match"; 72 if (exec) API_Match(); 73 break; 74 case 2: name = "API_Replace"; 75 if (exec) API_Replace(); 76 break; 77 case 3: name = "API_Pattern"; 78 if (exec) API_Pattern(); 79 break; 80 case 4: 81#if !UCONFIG_NO_FILE_IO 82 name = "Extended"; 83 if (exec) Extended(); 84#else 85 name = 
"skip"; 86#endif 87 break; 88 case 5: name = "Errors"; 89 if (exec) Errors(); 90 break; 91 case 6: name = "PerlTests"; 92 if (exec) PerlTests(); 93 break; 94 case 7: name = "Callbacks"; 95 if (exec) Callbacks(); 96 break; 97 case 8: name = "FindProgressCallbacks"; 98 if (exec) FindProgressCallbacks(); 99 break; 100 case 9: name = "Bug 6149"; 101 if (exec) Bug6149(); 102 break; 103 case 10: name = "UTextBasic"; 104 if (exec) UTextBasic(); 105 break; 106 case 11: name = "API_Match_UTF8"; 107 if (exec) API_Match_UTF8(); 108 break; 109 case 12: name = "API_Replace_UTF8"; 110 if (exec) API_Replace_UTF8(); 111 break; 112 case 13: name = "API_Pattern_UTF8"; 113 if (exec) API_Pattern_UTF8(); 114 break; 115 case 14: name = "PerlTestsUTF8"; 116 if (exec) PerlTestsUTF8(); 117 break; 118 case 15: name = "PreAllocatedUTextCAPI"; 119 if (exec) PreAllocatedUTextCAPI(); 120 break; 121 case 16: name = "Bug 7651"; 122 if (exec) Bug7651(); 123 break; 124 case 17: name = "Bug 7740"; 125 if (exec) Bug7740(); 126 break; 127 case 18: name = "Bug 8479"; 128 if (exec) Bug8479(); 129 break; 130 case 19: name = "Bug 7029"; 131 if (exec) Bug7029(); 132 break; 133 case 20: name = "CheckInvBufSize"; 134 if (exec) CheckInvBufSize(); 135 break; 136 case 21: name = "Bug 9283"; 137 if (exec) Bug9283(); 138 break; 139 case 22: name = "Bug10459"; 140 if (exec) Bug10459(); 141 break; 142 case 23: name = "TestCaseInsensitiveStarters"; 143 if (exec) TestCaseInsensitiveStarters(); 144 break; 145 case 24: name = "TestBug11049"; 146 if (exec) TestBug11049(); 147 break; 148 case 25: name = "TestBug11371"; 149 if (exec) TestBug11371(); 150 break; 151 case 26: name = "TestBug11480"; 152 if (exec) TestBug11480(); 153 break; 154 case 27: name = "NamedCapture"; 155 if (exec) NamedCapture(); 156 break; 157 case 28: name = "NamedCaptureLimits"; 158 if (exec) NamedCaptureLimits(); 159 break; 160 default: name = ""; 161 break; //needed to end loop 162 } 163} 164 165 166 167/** 168 * Calls utext_openUTF8 after, 
potentially, converting invariant text from the compilation codepage 169 * into ASCII. 170 * @see utext_openUTF8 171 */ 172static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status); 173 174//--------------------------------------------------------------------------- 175// 176// Error Checking / Reporting macros used in all of the tests. 177// 178//--------------------------------------------------------------------------- 179 180static void utextToPrintable(char *buf, int32_t bufLen, UText *text) { 181 int64_t oldIndex = utext_getNativeIndex(text); 182 utext_setNativeIndex(text, 0); 183 char *bufPtr = buf; 184 UChar32 c = utext_next32From(text, 0); 185 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) { 186 if (0x000020<=c && c<0x00007e) { 187 *bufPtr = c; 188 } else { 189#if 0 190 sprintf(bufPtr,"U+%04X", c); 191 bufPtr+= strlen(bufPtr)-1; 192#else 193 *bufPtr = '%'; 194#endif 195 } 196 bufPtr++; 197 c = UTEXT_NEXT32(text); 198 } 199 *bufPtr = 0; 200#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY) 201 char *ebuf = (char*)malloc(bufLen); 202 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen); 203 uprv_strncpy(buf, ebuf, bufLen); 204 free((void*)ebuf); 205#endif 206 utext_setNativeIndex(text, oldIndex); 207} 208 209 210static char ASSERT_BUF[1024]; 211 212const char* RegexTest::extractToAssertBuf(const UnicodeString& message) { 213 if(message.length()==0) { 214 strcpy(ASSERT_BUF, "[[empty UnicodeString]]"); 215 } else { 216 UnicodeString buf; 217 IntlTest::prettify(message,buf); 218 if(buf.length()==0) { 219 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]"); 220 } else { 221 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1); 222 if(ASSERT_BUF[0]==0) { 223 ASSERT_BUF[0]=0; 224 for(int32_t i=0;i<buf.length();i++) { 225 UChar ch = buf[i]; 226 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch); 227 } 228 } 229 } 230 } 231 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0; 232 return ASSERT_BUF; 233} 234 
// Log the printable form of a UText, tagged with the source location and
// the spelling of the macro argument.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the in-scope variable 'status' holds a failure, report it and RETURN
// from the enclosing function.  Only usable in functions returning void.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
    __FILE__, __LINE__, u_errorName(status)); return;}}

// Report an error if expr evaluates to FALSE.  Does not return.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluate expr against a fresh, macro-local 'status' (which shadows any
// outer variable of that name) and report an error unless it ends up equal
// to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also reports a caller-supplied line number
// and does NOT return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also reports a caller-supplied line number and
// RETURNS from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// expected: const char * , restricted to invariant characters.
253// actual: const UnicodeString & 254#define REGEX_ASSERT_UNISTR(expected, actual) { \ 255 if (UnicodeString(expected, -1, US_INV) != (actual)) { \ 256 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \ 257 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};} 258 259 260static UBool testUTextEqual(UText *uta, UText *utb) { 261 UChar32 ca = 0; 262 UChar32 cb = 0; 263 utext_setNativeIndex(uta, 0); 264 utext_setNativeIndex(utb, 0); 265 do { 266 ca = utext_next32(uta); 267 cb = utext_next32(utb); 268 if (ca != cb) { 269 break; 270 } 271 } while (ca != U_SENTINEL); 272 return ca == cb; 273} 274 275 276/** 277 * @param expected expected text in UTF-8 (not platform) codepage 278 */ 279void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) { 280 UErrorCode status = U_ZERO_ERROR; 281 UText expectedText = UTEXT_INITIALIZER; 282 utext_openUTF8(&expectedText, expected, -1, &status); 283 if(U_FAILURE(status)) { 284 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 285 return; 286 } 287 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) { 288 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected)); 289 return; 290 } 291 utext_setNativeIndex(actual, 0); 292 if (!testUTextEqual(&expectedText, actual)) { 293 char buf[201 /*21*/]; 294 char expectedBuf[201]; 295 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 296 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 297 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 298 } 299 utext_close(&expectedText); 300} 301/** 302 * @param expected invariant (platform local text) input 303 */ 304 305void 
RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) { 306 UErrorCode status = U_ZERO_ERROR; 307 UText expectedText = UTEXT_INITIALIZER; 308 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status); 309 if(U_FAILURE(status)) { 310 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected)); 311 return; 312 } 313 utext_setNativeIndex(actual, 0); 314 if (!testUTextEqual(&expectedText, actual)) { 315 char buf[201 /*21*/]; 316 char expectedBuf[201]; 317 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual); 318 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText); 319 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual)); 320 } 321 utext_close(&expectedText); 322} 323 324/** 325 * Assumes utf-8 input 326 */ 327#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__) 328/** 329 * Assumes Invariant input 330 */ 331#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__) 332 333/** 334 * This buffer ( inv_buf ) is used to hold the UTF-8 strings 335 * passed into utext_openUTF8. An error will be given if 336 * INV_BUFSIZ is too small. It's only used on EBCDIC systems. 
337 */ 338 339#define INV_BUFSIZ 2048 /* increase this if too small */ 340 341static int64_t inv_next=0; 342 343#if U_CHARSET_FAMILY!=U_ASCII_FAMILY 344static char inv_buf[INV_BUFSIZ]; 345#endif 346 347static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) { 348 if(length==-1) length=strlen(inv); 349#if U_CHARSET_FAMILY==U_ASCII_FAMILY 350 inv_next+=length; 351 return utext_openUTF8(ut, inv, length, status); 352#else 353 if(inv_next+length+1>INV_BUFSIZ) { 354 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n", 355 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1)); 356 *status = U_MEMORY_ALLOCATION_ERROR; 357 return NULL; 358 } 359 360 unsigned char *buf = (unsigned char*)inv_buf+inv_next; 361 uprv_aestrncpy(buf, (const uint8_t*)inv, length); 362 inv_next+=length; 363 364#if 0 365 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next); 366#endif 367 368 return utext_openUTF8(ut, (const char*)buf, length, status); 369#endif 370} 371 372 373//--------------------------------------------------------------------------- 374// 375// REGEX_TESTLM Macro + invocation function to simplify writing quick tests 376// for the LookingAt() and Match() functions. 377// 378// usage: 379// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected); 380// 381// The expected results are UBool - TRUE or FALSE. 382// The input text is unescaped. The pattern is not. 
383// 384// 385//--------------------------------------------------------------------------- 386 387#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);} 388 389UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 390 const UnicodeString pattern(pat, -1, US_INV); 391 const UnicodeString inputText(text, -1, US_INV); 392 UErrorCode status = U_ZERO_ERROR; 393 UParseError pe; 394 RegexPattern *REPattern = NULL; 395 RegexMatcher *REMatcher = NULL; 396 UBool retVal = TRUE; 397 398 UnicodeString patString(pat, -1, US_INV); 399 REPattern = RegexPattern::compile(patString, 0, pe, status); 400 if (U_FAILURE(status)) { 401 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s", 402 line, u_errorName(status)); 403 return FALSE; 404 } 405 if (line==376) { REPattern->dumpPattern();} 406 407 UnicodeString inputString(inputText); 408 UnicodeString unEscapedInput = inputString.unescape(); 409 REMatcher = REPattern->matcher(unEscapedInput, status); 410 if (U_FAILURE(status)) { 411 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n", 412 line, u_errorName(status)); 413 return FALSE; 414 } 415 416 UBool actualmatch; 417 actualmatch = REMatcher->lookingAt(status); 418 if (U_FAILURE(status)) { 419 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n", 420 line, u_errorName(status)); 421 retVal = FALSE; 422 } 423 if (actualmatch != looking) { 424 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line); 425 retVal = FALSE; 426 } 427 428 status = U_ZERO_ERROR; 429 actualmatch = REMatcher->matches(status); 430 if (U_FAILURE(status)) { 431 errln("RegexTest failure in matches() at line %d. 
Status = %s\n", 432 line, u_errorName(status)); 433 retVal = FALSE; 434 } 435 if (actualmatch != match) { 436 errln("RegexTest: wrong return from matches() at line %d.\n", line); 437 retVal = FALSE; 438 } 439 440 if (retVal == FALSE) { 441 REPattern->dumpPattern(); 442 } 443 444 delete REPattern; 445 delete REMatcher; 446 return retVal; 447} 448 449 450UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) { 451 UText pattern = UTEXT_INITIALIZER; 452 int32_t inputUTF8Length; 453 char *textChars = NULL; 454 UText inputText = UTEXT_INITIALIZER; 455 UErrorCode status = U_ZERO_ERROR; 456 UParseError pe; 457 RegexPattern *REPattern = NULL; 458 RegexMatcher *REMatcher = NULL; 459 UBool retVal = TRUE; 460 461 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status); 462 REPattern = RegexPattern::compile(&pattern, 0, pe, status); 463 if (U_FAILURE(status)) { 464 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n", 465 line, u_errorName(status)); 466 return FALSE; 467 } 468 469 UnicodeString inputString(text, -1, US_INV); 470 UnicodeString unEscapedInput = inputString.unescape(); 471 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status)); 472 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 473 474 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status); 475 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) { 476 // UTF-8 does not allow unpaired surrogates, so this could actually happen 477 logln("RegexTest unable to convert input to UTF8 at line %d. 
Status = %s\n", line, u_errorName(status)); 478 return TRUE; // not a failure of the Regex engine 479 } 480 status = U_ZERO_ERROR; // buffer overflow 481 textChars = new char[inputUTF8Length+1]; 482 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status); 483 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status); 484 485 REMatcher = &REPattern->matcher(status)->reset(&inputText); 486 if (U_FAILURE(status)) { 487 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n", 488 line, u_errorName(status)); 489 return FALSE; 490 } 491 492 UBool actualmatch; 493 actualmatch = REMatcher->lookingAt(status); 494 if (U_FAILURE(status)) { 495 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n", 496 line, u_errorName(status)); 497 retVal = FALSE; 498 } 499 if (actualmatch != looking) { 500 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line); 501 retVal = FALSE; 502 } 503 504 status = U_ZERO_ERROR; 505 actualmatch = REMatcher->matches(status); 506 if (U_FAILURE(status)) { 507 errln("RegexTest failure in matches() at line %d (UTF8). 
Status = %s\n", 508 line, u_errorName(status)); 509 retVal = FALSE; 510 } 511 if (actualmatch != match) { 512 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line); 513 retVal = FALSE; 514 } 515 516 if (retVal == FALSE) { 517 REPattern->dumpPattern(); 518 } 519 520 delete REPattern; 521 delete REMatcher; 522 utext_close(&inputText); 523 utext_close(&pattern); 524 delete[] textChars; 525 return retVal; 526} 527 528 529 530//--------------------------------------------------------------------------- 531// 532// REGEX_ERR Macro + invocation function to simplify writing tests 533// regex tests for incorrect patterns 534// 535// usage: 536// REGEX_ERR("pattern", expected error line, column, expected status); 537// 538//--------------------------------------------------------------------------- 539#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__); 540 541void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol, 542 UErrorCode expectedStatus, int32_t line) { 543 UnicodeString pattern(pat); 544 545 UErrorCode status = U_ZERO_ERROR; 546 UParseError pe; 547 RegexPattern *callerPattern = NULL; 548 549 // 550 // Compile the caller's pattern 551 // 552 UnicodeString patString(pat); 553 callerPattern = RegexPattern::compile(patString, 0, pe, status); 554 if (status != expectedStatus) { 555 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 556 } else { 557 if (status != U_ZERO_ERROR) { 558 if (pe.line != errLine || pe.offset != errCol) { 559 errln("Line %d: incorrect line/offset from UParseError. 
Expected %d/%d; got %d/%d.\n", 560 line, errLine, errCol, pe.line, pe.offset); 561 } 562 } 563 } 564 565 delete callerPattern; 566 567 // 568 // Compile again, using a UTF-8-based UText 569 // 570 UText patternText = UTEXT_INITIALIZER; 571 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status); 572 callerPattern = RegexPattern::compile(&patternText, 0, pe, status); 573 if (status != expectedStatus) { 574 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status)); 575 } else { 576 if (status != U_ZERO_ERROR) { 577 if (pe.line != errLine || pe.offset != errCol) { 578 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n", 579 line, errLine, errCol, pe.line, pe.offset); 580 } 581 } 582 } 583 584 delete callerPattern; 585 utext_close(&patternText); 586} 587 588 589 590//--------------------------------------------------------------------------- 591// 592// Basic Check for basic functionality of regex pattern matching. 593// Avoid the use of REGEX_FIND test macro, which has 594// substantial dependencies on basic Regex functionality. 
//
//---------------------------------------------------------------------------
void RegexTest::Basic() {
    // Every REGEX_TESTLM line below takes
    //     (pattern, input, expected lookingAt() result, expected matches() result)
    // and runs the check twice: once through doRegexLMTest() and once
    // through doRegexLMTestUTF8().


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    // NOTE(review): the next line repeats the previous case verbatim.
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    // by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}


//---------------------------------------------------------------------------
//
//      UTextBasic   Check for quirks that are specific to the UText
//                   implementation.
//
//---------------------------------------------------------------------------
void RegexTest::UTextBasic() {
    // Builds a matcher from a UTF-8 UText pattern "abc", resets it onto a
    // UTF-8 UText input "abc", and then resets it onto its own inputText().
    // After each reset, inputText() is expected to still read "abc".
    //
    // NOTE(review): each REGEX_CHECK_STATUS returns from the function on
    // failure, which skips the utext_close() calls at the bottom.
    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    UErrorCode status = U_ZERO_ERROR;
    UText pattern = UTEXT_INITIALIZER;
    utext_openUTF8(&pattern, str_abc, -1, &status);
    RegexMatcher matcher(&pattern, 0, status);
    REGEX_CHECK_STATUS;

    UText input = UTEXT_INITIALIZER;
    utext_openUTF8(&input, str_abc, -1, &status);
    REGEX_CHECK_STATUS;
    matcher.reset(&input);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    // Reset the matcher onto the UText it is already using as input.
    matcher.reset(matcher.inputText());
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    utext_close(&pattern);
    utext_close(&input);
}


//---------------------------------------------------------------------------
//
//      API_Match   Test that the API for class RegexMatcher
//                  is present and nominally working, but excluding functions
//                  implementing replace operations.
//
//---------------------------------------------------------------------------
void RegexTest::API_Match() {
    UParseError         pe;
    UErrorCode          status=U_ZERO_ERROR;
    int32_t             flags = 0;

    //
    // Debug - slide failing test cases early
    //
#if 0
    {
    }
    return;
#endif

    //
    // Simple pattern compilation
    //
    {
        UnicodeString       re("abc");
        RegexPattern        *pat2;
        pat2 = RegexPattern::compile(re, flags, pe, status);
        REGEX_CHECK_STATUS;

        UnicodeString inStr1 = "abcdef this is a test";
        UnicodeString instr2 = "not abc";
        UnicodeString empty  = "";


        //
        // Matcher creation and reset.
812 // 813 RegexMatcher *m1 = pat2->matcher(inStr1, status); 814 REGEX_CHECK_STATUS; 815 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 816 REGEX_ASSERT(m1->input() == inStr1); 817 m1->reset(instr2); 818 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 819 REGEX_ASSERT(m1->input() == instr2); 820 m1->reset(inStr1); 821 REGEX_ASSERT(m1->input() == inStr1); 822 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 823 m1->reset(empty); 824 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 825 REGEX_ASSERT(m1->input() == empty); 826 REGEX_ASSERT(&m1->pattern() == pat2); 827 828 // 829 // reset(pos, status) 830 // 831 m1->reset(inStr1); 832 m1->reset(4, status); 833 REGEX_CHECK_STATUS; 834 REGEX_ASSERT(m1->input() == inStr1); 835 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 836 837 m1->reset(-1, status); 838 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 839 status = U_ZERO_ERROR; 840 841 m1->reset(0, status); 842 REGEX_CHECK_STATUS; 843 status = U_ZERO_ERROR; 844 845 int32_t len = m1->input().length(); 846 m1->reset(len-1, status); 847 REGEX_CHECK_STATUS; 848 status = U_ZERO_ERROR; 849 850 m1->reset(len, status); 851 REGEX_CHECK_STATUS; 852 status = U_ZERO_ERROR; 853 854 m1->reset(len+1, status); 855 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 856 status = U_ZERO_ERROR; 857 858 // 859 // match(pos, status) 860 // 861 m1->reset(instr2); 862 REGEX_ASSERT(m1->matches(4, status) == TRUE); 863 m1->reset(); 864 REGEX_ASSERT(m1->matches(3, status) == FALSE); 865 m1->reset(); 866 REGEX_ASSERT(m1->matches(5, status) == FALSE); 867 REGEX_ASSERT(m1->matches(4, status) == TRUE); 868 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 869 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 870 871 // Match() at end of string should fail, but should not 872 // be an error. 873 status = U_ZERO_ERROR; 874 len = m1->input().length(); 875 REGEX_ASSERT(m1->matches(len, status) == FALSE); 876 REGEX_CHECK_STATUS; 877 878 // Match beyond end of string should fail with an error. 
879 status = U_ZERO_ERROR; 880 REGEX_ASSERT(m1->matches(len+1, status) == FALSE); 881 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 882 883 // Successful match at end of string. 884 { 885 status = U_ZERO_ERROR; 886 RegexMatcher m("A?", 0, status); // will match zero length string. 887 REGEX_CHECK_STATUS; 888 m.reset(inStr1); 889 len = inStr1.length(); 890 REGEX_ASSERT(m.matches(len, status) == TRUE); 891 REGEX_CHECK_STATUS; 892 m.reset(empty); 893 REGEX_ASSERT(m.matches(0, status) == TRUE); 894 REGEX_CHECK_STATUS; 895 } 896 897 898 // 899 // lookingAt(pos, status) 900 // 901 status = U_ZERO_ERROR; 902 m1->reset(instr2); // "not abc" 903 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 904 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 905 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 906 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 907 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 908 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 909 status = U_ZERO_ERROR; 910 len = m1->input().length(); 911 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE); 912 REGEX_CHECK_STATUS; 913 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE); 914 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 915 916 delete m1; 917 delete pat2; 918 } 919 920 921 // 922 // Capture Group. 
923 // RegexMatcher::start(); 924 // RegexMatcher::end(); 925 // RegexMatcher::groupCount(); 926 // 927 { 928 int32_t flags=0; 929 UParseError pe; 930 UErrorCode status=U_ZERO_ERROR; 931 932 UnicodeString re("01(23(45)67)(.*)"); 933 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 934 REGEX_CHECK_STATUS; 935 UnicodeString data = "0123456789"; 936 937 RegexMatcher *matcher = pat->matcher(data, status); 938 REGEX_CHECK_STATUS; 939 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 940 static const int32_t matchStarts[] = {0, 2, 4, 8}; 941 static const int32_t matchEnds[] = {10, 8, 6, 10}; 942 int32_t i; 943 for (i=0; i<4; i++) { 944 int32_t actualStart = matcher->start(i, status); 945 REGEX_CHECK_STATUS; 946 if (actualStart != matchStarts[i]) { 947 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n", 948 __LINE__, i, matchStarts[i], actualStart); 949 } 950 int32_t actualEnd = matcher->end(i, status); 951 REGEX_CHECK_STATUS; 952 if (actualEnd != matchEnds[i]) { 953 errln("RegexTest failure at line %d index %d. 
Expected %d, got %d\n", 954 __LINE__, i, matchEnds[i], actualEnd); 955 } 956 } 957 958 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 959 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 960 961 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 962 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 963 matcher->reset(); 964 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 965 966 matcher->lookingAt(status); 967 REGEX_ASSERT(matcher->group(status) == "0123456789"); 968 REGEX_ASSERT(matcher->group(0, status) == "0123456789"); 969 REGEX_ASSERT(matcher->group(1, status) == "234567" ); 970 REGEX_ASSERT(matcher->group(2, status) == "45" ); 971 REGEX_ASSERT(matcher->group(3, status) == "89" ); 972 REGEX_CHECK_STATUS; 973 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 974 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 975 matcher->reset(); 976 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 977 978 delete matcher; 979 delete pat; 980 981 } 982 983 // 984 // find 985 // 986 { 987 int32_t flags=0; 988 UParseError pe; 989 UErrorCode status=U_ZERO_ERROR; 990 991 UnicodeString re("abc"); 992 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 993 REGEX_CHECK_STATUS; 994 UnicodeString data = ".abc..abc...abc.."; 995 // 012345678901234567 996 997 RegexMatcher *matcher = pat->matcher(data, status); 998 REGEX_CHECK_STATUS; 999 REGEX_ASSERT(matcher->find()); 1000 REGEX_ASSERT(matcher->start(status) == 1); 1001 REGEX_ASSERT(matcher->find()); 1002 REGEX_ASSERT(matcher->start(status) == 6); 1003 REGEX_ASSERT(matcher->find()); 1004 REGEX_ASSERT(matcher->start(status) == 12); 1005 REGEX_ASSERT(matcher->find() == FALSE); 1006 REGEX_ASSERT(matcher->find() == FALSE); 1007 1008 matcher->reset(); 1009 REGEX_ASSERT(matcher->find()); 1010 REGEX_ASSERT(matcher->start(status) == 1); 1011 1012 
REGEX_ASSERT(matcher->find(0, status)); 1013 REGEX_ASSERT(matcher->start(status) == 1); 1014 REGEX_ASSERT(matcher->find(1, status)); 1015 REGEX_ASSERT(matcher->start(status) == 1); 1016 REGEX_ASSERT(matcher->find(2, status)); 1017 REGEX_ASSERT(matcher->start(status) == 6); 1018 REGEX_ASSERT(matcher->find(12, status)); 1019 REGEX_ASSERT(matcher->start(status) == 12); 1020 REGEX_ASSERT(matcher->find(13, status) == FALSE); 1021 REGEX_ASSERT(matcher->find(16, status) == FALSE); 1022 REGEX_ASSERT(matcher->find(17, status) == FALSE); 1023 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 1024 1025 status = U_ZERO_ERROR; 1026 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 1027 status = U_ZERO_ERROR; 1028 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 1029 1030 REGEX_ASSERT(matcher->groupCount() == 0); 1031 1032 delete matcher; 1033 delete pat; 1034 } 1035 1036 1037 // 1038 // find, with \G in pattern (true if at the end of a previous match). 1039 // 1040 { 1041 int32_t flags=0; 1042 UParseError pe; 1043 UErrorCode status=U_ZERO_ERROR; 1044 1045 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV); 1046 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1047 REGEX_CHECK_STATUS; 1048 UnicodeString data = ".abcabc.abc.."; 1049 // 012345678901234567 1050 1051 RegexMatcher *matcher = pat->matcher(data, status); 1052 REGEX_CHECK_STATUS; 1053 REGEX_ASSERT(matcher->find()); 1054 REGEX_ASSERT(matcher->start(status) == 0); 1055 REGEX_ASSERT(matcher->start(1, status) == -1); 1056 REGEX_ASSERT(matcher->start(2, status) == 1); 1057 1058 REGEX_ASSERT(matcher->find()); 1059 REGEX_ASSERT(matcher->start(status) == 4); 1060 REGEX_ASSERT(matcher->start(1, status) == 4); 1061 REGEX_ASSERT(matcher->start(2, status) == -1); 1062 REGEX_CHECK_STATUS; 1063 1064 delete matcher; 1065 delete pat; 1066 } 1067 1068 // 1069 // find with zero length matches, match position should bump ahead 1070 // to prevent loops. 
1071 // 1072 { 1073 int32_t i; 1074 UErrorCode status=U_ZERO_ERROR; 1075 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 1076 // using an always-true look-ahead. 1077 REGEX_CHECK_STATUS; 1078 UnicodeString s(" "); 1079 m.reset(s); 1080 for (i=0; ; i++) { 1081 if (m.find() == FALSE) { 1082 break; 1083 } 1084 REGEX_ASSERT(m.start(status) == i); 1085 REGEX_ASSERT(m.end(status) == i); 1086 } 1087 REGEX_ASSERT(i==5); 1088 1089 // Check that the bump goes over surrogate pairs OK 1090 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004"); 1091 s = s.unescape(); 1092 m.reset(s); 1093 for (i=0; ; i+=2) { 1094 if (m.find() == FALSE) { 1095 break; 1096 } 1097 REGEX_ASSERT(m.start(status) == i); 1098 REGEX_ASSERT(m.end(status) == i); 1099 } 1100 REGEX_ASSERT(i==10); 1101 } 1102 { 1103 // find() loop breaking test. 1104 // with pattern of /.?/, should see a series of one char matches, then a single 1105 // match of zero length at the end of the input string. 1106 int32_t i; 1107 UErrorCode status=U_ZERO_ERROR; 1108 RegexMatcher m(".?", 0, status); 1109 REGEX_CHECK_STATUS; 1110 UnicodeString s(" "); 1111 m.reset(s); 1112 for (i=0; ; i++) { 1113 if (m.find() == FALSE) { 1114 break; 1115 } 1116 REGEX_ASSERT(m.start(status) == i); 1117 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 1118 } 1119 REGEX_ASSERT(i==5); 1120 } 1121 1122 1123 // 1124 // Matchers with no input string behave as if they had an empty input string. 
1125 // 1126 1127 { 1128 UErrorCode status = U_ZERO_ERROR; 1129 RegexMatcher m(".?", 0, status); 1130 REGEX_CHECK_STATUS; 1131 REGEX_ASSERT(m.find()); 1132 REGEX_ASSERT(m.start(status) == 0); 1133 REGEX_ASSERT(m.input() == ""); 1134 } 1135 { 1136 UErrorCode status = U_ZERO_ERROR; 1137 RegexPattern *p = RegexPattern::compile(".", 0, status); 1138 RegexMatcher *m = p->matcher(status); 1139 REGEX_CHECK_STATUS; 1140 1141 REGEX_ASSERT(m->find() == FALSE); 1142 REGEX_ASSERT(m->input() == ""); 1143 delete m; 1144 delete p; 1145 } 1146 1147 // 1148 // Regions 1149 // 1150 { 1151 UErrorCode status = U_ZERO_ERROR; 1152 UnicodeString testString("This is test data"); 1153 RegexMatcher m(".*", testString, 0, status); 1154 REGEX_CHECK_STATUS; 1155 REGEX_ASSERT(m.regionStart() == 0); 1156 REGEX_ASSERT(m.regionEnd() == testString.length()); 1157 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1158 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1159 1160 m.region(2,4, status); 1161 REGEX_CHECK_STATUS; 1162 REGEX_ASSERT(m.matches(status)); 1163 REGEX_ASSERT(m.start(status)==2); 1164 REGEX_ASSERT(m.end(status)==4); 1165 REGEX_CHECK_STATUS; 1166 1167 m.reset(); 1168 REGEX_ASSERT(m.regionStart() == 0); 1169 REGEX_ASSERT(m.regionEnd() == testString.length()); 1170 1171 UnicodeString shorterString("short"); 1172 m.reset(shorterString); 1173 REGEX_ASSERT(m.regionStart() == 0); 1174 REGEX_ASSERT(m.regionEnd() == shorterString.length()); 1175 1176 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1177 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 1178 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1179 REGEX_ASSERT(&m == &m.reset()); 1180 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 1181 1182 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 1183 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1184 REGEX_ASSERT(&m == &m.reset()); 1185 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 1186 1187 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1188 REGEX_ASSERT(&m == 
&m.useTransparentBounds(TRUE)); 1189 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1190 REGEX_ASSERT(&m == &m.reset()); 1191 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 1192 1193 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 1194 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1195 REGEX_ASSERT(&m == &m.reset()); 1196 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 1197 1198 } 1199 1200 // 1201 // hitEnd() and requireEnd() 1202 // 1203 { 1204 UErrorCode status = U_ZERO_ERROR; 1205 UnicodeString testString("aabb"); 1206 RegexMatcher m1(".*", testString, 0, status); 1207 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 1208 REGEX_ASSERT(m1.hitEnd() == TRUE); 1209 REGEX_ASSERT(m1.requireEnd() == FALSE); 1210 REGEX_CHECK_STATUS; 1211 1212 status = U_ZERO_ERROR; 1213 RegexMatcher m2("a*", testString, 0, status); 1214 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 1215 REGEX_ASSERT(m2.hitEnd() == FALSE); 1216 REGEX_ASSERT(m2.requireEnd() == FALSE); 1217 REGEX_CHECK_STATUS; 1218 1219 status = U_ZERO_ERROR; 1220 RegexMatcher m3(".*$", testString, 0, status); 1221 REGEX_ASSERT(m3.lookingAt(status) == TRUE); 1222 REGEX_ASSERT(m3.hitEnd() == TRUE); 1223 REGEX_ASSERT(m3.requireEnd() == TRUE); 1224 REGEX_CHECK_STATUS; 1225 } 1226 1227 1228 // 1229 // Compilation error on reset with UChar * 1230 // These were a hazard that people were stumbling over with runtime errors. 1231 // Changed them to compiler errors by adding private methods that more closely 1232 // matched the incorrect use of the functions. 1233 // 1234#if 0 1235 { 1236 UErrorCode status = U_ZERO_ERROR; 1237 UChar ucharString[20]; 1238 RegexMatcher m(".", 0, status); 1239 m.reset(ucharString); // should not compile. 1240 1241 RegexPattern *p = RegexPattern::compile(".", 0, status); 1242 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile. 1243 1244 RegexMatcher m3(".", ucharString, 0, status); // Should not compile 1245 } 1246#endif 1247 1248 // 1249 // Time Outs. 
1250 // Note: These tests will need to be changed when the regexp engine is 1251 // able to detect and cut short the exponential time behavior on 1252 // this type of match. 1253 // 1254 { 1255 UErrorCode status = U_ZERO_ERROR; 1256 // Enough 'a's in the string to cause the match to time out. 1257 // (Each on additonal 'a' doubles the time) 1258 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa"); 1259 RegexMatcher matcher("(a+)+b", testString, 0, status); 1260 REGEX_CHECK_STATUS; 1261 REGEX_ASSERT(matcher.getTimeLimit() == 0); 1262 matcher.setTimeLimit(100, status); 1263 REGEX_ASSERT(matcher.getTimeLimit() == 100); 1264 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1265 REGEX_ASSERT(status == U_REGEX_TIME_OUT); 1266 } 1267 { 1268 UErrorCode status = U_ZERO_ERROR; 1269 // Few enough 'a's to slip in under the time limit. 1270 UnicodeString testString("aaaaaaaaaaaaaaaaaa"); 1271 RegexMatcher matcher("(a+)+b", testString, 0, status); 1272 REGEX_CHECK_STATUS; 1273 matcher.setTimeLimit(100, status); 1274 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1275 REGEX_CHECK_STATUS; 1276 } 1277 1278 // 1279 // Stack Limits 1280 // 1281 { 1282 UErrorCode status = U_ZERO_ERROR; 1283 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A' 1284 1285 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations 1286 // of the '+', and makes the stack frames larger. 
1287 RegexMatcher matcher("(A)+A$", testString, 0, status); 1288 1289 // With the default stack, this match should fail to run 1290 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1291 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1292 1293 // With unlimited stack, it should run 1294 status = U_ZERO_ERROR; 1295 matcher.setStackLimit(0, status); 1296 REGEX_CHECK_STATUS; 1297 REGEX_ASSERT(matcher.lookingAt(status) == TRUE); 1298 REGEX_CHECK_STATUS; 1299 REGEX_ASSERT(matcher.getStackLimit() == 0); 1300 1301 // With a limited stack, it the match should fail 1302 status = U_ZERO_ERROR; 1303 matcher.setStackLimit(10000, status); 1304 REGEX_ASSERT(matcher.lookingAt(status) == FALSE); 1305 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW); 1306 REGEX_ASSERT(matcher.getStackLimit() == 10000); 1307 } 1308 1309 // A pattern that doesn't save state should work with 1310 // a minimal sized stack 1311 { 1312 UErrorCode status = U_ZERO_ERROR; 1313 UnicodeString testString = "abc"; 1314 RegexMatcher matcher("abc", testString, 0, status); 1315 REGEX_CHECK_STATUS; 1316 matcher.setStackLimit(30, status); 1317 REGEX_CHECK_STATUS; 1318 REGEX_ASSERT(matcher.matches(status) == TRUE); 1319 REGEX_CHECK_STATUS; 1320 REGEX_ASSERT(matcher.getStackLimit() == 30); 1321 1322 // Negative stack sizes should fail 1323 status = U_ZERO_ERROR; 1324 matcher.setStackLimit(1000, status); 1325 REGEX_CHECK_STATUS; 1326 matcher.setStackLimit(-1, status); 1327 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 1328 REGEX_ASSERT(matcher.getStackLimit() == 1000); 1329 } 1330 1331 1332} 1333 1334 1335 1336 1337 1338 1339//--------------------------------------------------------------------------- 1340// 1341// API_Replace API test for class RegexMatcher, testing the 1342// Replace family of functions. 
1343// 1344//--------------------------------------------------------------------------- 1345void RegexTest::API_Replace() { 1346 // 1347 // Replace 1348 // 1349 int32_t flags=0; 1350 UParseError pe; 1351 UErrorCode status=U_ZERO_ERROR; 1352 1353 UnicodeString re("abc"); 1354 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status); 1355 REGEX_CHECK_STATUS; 1356 UnicodeString data = ".abc..abc...abc.."; 1357 // 012345678901234567 1358 RegexMatcher *matcher = pat->matcher(data, status); 1359 1360 // 1361 // Plain vanilla matches. 1362 // 1363 UnicodeString dest; 1364 dest = matcher->replaceFirst("yz", status); 1365 REGEX_CHECK_STATUS; 1366 REGEX_ASSERT(dest == ".yz..abc...abc.."); 1367 1368 dest = matcher->replaceAll("yz", status); 1369 REGEX_CHECK_STATUS; 1370 REGEX_ASSERT(dest == ".yz..yz...yz.."); 1371 1372 // 1373 // Plain vanilla non-matches. 1374 // 1375 UnicodeString d2 = ".abx..abx...abx.."; 1376 matcher->reset(d2); 1377 dest = matcher->replaceFirst("yz", status); 1378 REGEX_CHECK_STATUS; 1379 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1380 1381 dest = matcher->replaceAll("yz", status); 1382 REGEX_CHECK_STATUS; 1383 REGEX_ASSERT(dest == ".abx..abx...abx.."); 1384 1385 // 1386 // Empty source string 1387 // 1388 UnicodeString d3 = ""; 1389 matcher->reset(d3); 1390 dest = matcher->replaceFirst("yz", status); 1391 REGEX_CHECK_STATUS; 1392 REGEX_ASSERT(dest == ""); 1393 1394 dest = matcher->replaceAll("yz", status); 1395 REGEX_CHECK_STATUS; 1396 REGEX_ASSERT(dest == ""); 1397 1398 // 1399 // Empty substitution string 1400 // 1401 matcher->reset(data); // ".abc..abc...abc.." 
1402 dest = matcher->replaceFirst("", status); 1403 REGEX_CHECK_STATUS; 1404 REGEX_ASSERT(dest == "...abc...abc.."); 1405 1406 dest = matcher->replaceAll("", status); 1407 REGEX_CHECK_STATUS; 1408 REGEX_ASSERT(dest == "........"); 1409 1410 // 1411 // match whole string 1412 // 1413 UnicodeString d4 = "abc"; 1414 matcher->reset(d4); 1415 dest = matcher->replaceFirst("xyz", status); 1416 REGEX_CHECK_STATUS; 1417 REGEX_ASSERT(dest == "xyz"); 1418 1419 dest = matcher->replaceAll("xyz", status); 1420 REGEX_CHECK_STATUS; 1421 REGEX_ASSERT(dest == "xyz"); 1422 1423 // 1424 // Capture Group, simple case 1425 // 1426 UnicodeString re2("a(..)"); 1427 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status); 1428 REGEX_CHECK_STATUS; 1429 UnicodeString d5 = "abcdefg"; 1430 RegexMatcher *matcher2 = pat2->matcher(d5, status); 1431 REGEX_CHECK_STATUS; 1432 dest = matcher2->replaceFirst("$1$1", status); 1433 REGEX_CHECK_STATUS; 1434 REGEX_ASSERT(dest == "bcbcdefg"); 1435 1436 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status); 1437 REGEX_CHECK_STATUS; 1438 REGEX_ASSERT(dest == "The value of $1 is bc.defg"); 1439 1440 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status); 1441 REGEX_ASSERT(U_FAILURE(status)); 1442 status = U_ZERO_ERROR; 1443 1444 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF."); 1445 replacement = replacement.unescape(); 1446 dest = matcher2->replaceFirst(replacement, status); 1447 REGEX_CHECK_STATUS; 1448 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg"); 1449 1450 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR); 1451 1452 1453 // 1454 // Replacement String with \u hex escapes 1455 // 1456 { 1457 UnicodeString src = "abc 1 abc 2 abc 3"; 1458 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--"); 1459 matcher->reset(src); 1460 UnicodeString result = matcher->replaceAll(substitute, 
status); 1461 REGEX_CHECK_STATUS; 1462 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3"); 1463 } 1464 { 1465 UnicodeString src = "abc !"; 1466 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--"); 1467 matcher->reset(src); 1468 UnicodeString result = matcher->replaceAll(substitute, status); 1469 REGEX_CHECK_STATUS; 1470 UnicodeString expected = UnicodeString("--"); 1471 expected.append((UChar32)0x10000); 1472 expected.append("-- !"); 1473 REGEX_ASSERT(result == expected); 1474 } 1475 // TODO: need more through testing of capture substitutions. 1476 1477 // Bug 4057 1478 // 1479 { 1480 status = U_ZERO_ERROR; 1481 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin"; 1482 RegexMatcher m("ss(.*?)ee", 0, status); 1483 REGEX_CHECK_STATUS; 1484 UnicodeString result; 1485 1486 // Multiple finds do NOT bump up the previous appendReplacement postion. 1487 m.reset(s); 1488 m.find(); 1489 m.find(); 1490 m.appendReplacement(result, "ooh", status); 1491 REGEX_CHECK_STATUS; 1492 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1493 1494 // After a reset into the interior of a string, appendReplacemnt still starts at beginning. 1495 status = U_ZERO_ERROR; 1496 result.truncate(0); 1497 m.reset(10, status); 1498 m.find(); 1499 m.find(); 1500 m.appendReplacement(result, "ooh", status); 1501 REGEX_CHECK_STATUS; 1502 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1503 1504 // find() at interior of string, appendReplacemnt still starts at beginning. 
1505 status = U_ZERO_ERROR; 1506 result.truncate(0); 1507 m.reset(); 1508 m.find(10, status); 1509 m.find(); 1510 m.appendReplacement(result, "ooh", status); 1511 REGEX_CHECK_STATUS; 1512 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh"); 1513 1514 m.appendTail(result); 1515 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin"); 1516 1517 } 1518 1519 delete matcher2; 1520 delete pat2; 1521 delete matcher; 1522 delete pat; 1523} 1524 1525 1526//--------------------------------------------------------------------------- 1527// 1528// API_Pattern Test that the API for class RegexPattern is 1529// present and nominally working. 1530// 1531//--------------------------------------------------------------------------- 1532void RegexTest::API_Pattern() { 1533 RegexPattern pata; // Test default constructor to not crash. 1534 RegexPattern patb; 1535 1536 REGEX_ASSERT(pata == patb); 1537 REGEX_ASSERT(pata == pata); 1538 1539 UnicodeString re1("abc[a-l][m-z]"); 1540 UnicodeString re2("def"); 1541 UErrorCode status = U_ZERO_ERROR; 1542 UParseError pe; 1543 1544 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status); 1545 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status); 1546 REGEX_CHECK_STATUS; 1547 REGEX_ASSERT(*pat1 == *pat1); 1548 REGEX_ASSERT(*pat1 != pata); 1549 1550 // Assign 1551 patb = *pat1; 1552 REGEX_ASSERT(patb == *pat1); 1553 1554 // Copy Construct 1555 RegexPattern patc(*pat1); 1556 REGEX_ASSERT(patc == *pat1); 1557 REGEX_ASSERT(patb == patc); 1558 REGEX_ASSERT(pat1 != pat2); 1559 patb = *pat2; 1560 REGEX_ASSERT(patb != patc); 1561 REGEX_ASSERT(patb == *pat2); 1562 1563 // Compile with no flags. 
1564 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status); 1565 REGEX_ASSERT(*pat1a == *pat1); 1566 1567 REGEX_ASSERT(pat1a->flags() == 0); 1568 1569 // Compile with different flags should be not equal 1570 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status); 1571 REGEX_CHECK_STATUS; 1572 1573 REGEX_ASSERT(*pat1b != *pat1a); 1574 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 1575 REGEX_ASSERT(pat1a->flags() == 0); 1576 delete pat1b; 1577 1578 // clone 1579 RegexPattern *pat1c = pat1->clone(); 1580 REGEX_ASSERT(*pat1c == *pat1); 1581 REGEX_ASSERT(*pat1c != *pat2); 1582 1583 delete pat1c; 1584 delete pat1a; 1585 delete pat1; 1586 delete pat2; 1587 1588 1589 // 1590 // Verify that a matcher created from a cloned pattern works. 1591 // (Jitterbug 3423) 1592 // 1593 { 1594 UErrorCode status = U_ZERO_ERROR; 1595 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status); 1596 RegexPattern *pClone = pSource->clone(); 1597 delete pSource; 1598 RegexMatcher *mFromClone = pClone->matcher(status); 1599 REGEX_CHECK_STATUS; 1600 UnicodeString s = "Hello World"; 1601 mFromClone->reset(s); 1602 REGEX_ASSERT(mFromClone->find() == TRUE); 1603 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 1604 REGEX_ASSERT(mFromClone->find() == TRUE); 1605 REGEX_ASSERT(mFromClone->group(status) == "World"); 1606 REGEX_ASSERT(mFromClone->find() == FALSE); 1607 delete mFromClone; 1608 delete pClone; 1609 } 1610 1611 // 1612 // matches convenience API 1613 // 1614 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE); 1615 REGEX_CHECK_STATUS; 1616 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 1617 REGEX_CHECK_STATUS; 1618 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 1619 REGEX_CHECK_STATUS; 1620 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 1621 REGEX_CHECK_STATUS; 1622 
REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 1623 REGEX_CHECK_STATUS; 1624 status = U_INDEX_OUTOFBOUNDS_ERROR; 1625 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 1626 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1627 1628 1629 // 1630 // Split() 1631 // 1632 status = U_ZERO_ERROR; 1633 pat1 = RegexPattern::compile(" +", pe, status); 1634 REGEX_CHECK_STATUS; 1635 UnicodeString fields[10]; 1636 1637 int32_t n; 1638 n = pat1->split("Now is the time", fields, 10, status); 1639 REGEX_CHECK_STATUS; 1640 REGEX_ASSERT(n==4); 1641 REGEX_ASSERT(fields[0]=="Now"); 1642 REGEX_ASSERT(fields[1]=="is"); 1643 REGEX_ASSERT(fields[2]=="the"); 1644 REGEX_ASSERT(fields[3]=="time"); 1645 REGEX_ASSERT(fields[4]==""); 1646 1647 n = pat1->split("Now is the time", fields, 2, status); 1648 REGEX_CHECK_STATUS; 1649 REGEX_ASSERT(n==2); 1650 REGEX_ASSERT(fields[0]=="Now"); 1651 REGEX_ASSERT(fields[1]=="is the time"); 1652 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 1653 1654 fields[1] = "*"; 1655 status = U_ZERO_ERROR; 1656 n = pat1->split("Now is the time", fields, 1, status); 1657 REGEX_CHECK_STATUS; 1658 REGEX_ASSERT(n==1); 1659 REGEX_ASSERT(fields[0]=="Now is the time"); 1660 REGEX_ASSERT(fields[1]=="*"); 1661 status = U_ZERO_ERROR; 1662 1663 n = pat1->split(" Now is the time ", fields, 10, status); 1664 REGEX_CHECK_STATUS; 1665 REGEX_ASSERT(n==6); 1666 REGEX_ASSERT(fields[0]==""); 1667 REGEX_ASSERT(fields[1]=="Now"); 1668 REGEX_ASSERT(fields[2]=="is"); 1669 REGEX_ASSERT(fields[3]=="the"); 1670 REGEX_ASSERT(fields[4]=="time"); 1671 REGEX_ASSERT(fields[5]==""); 1672 1673 n = pat1->split(" ", fields, 10, status); 1674 REGEX_CHECK_STATUS; 1675 REGEX_ASSERT(n==2); 1676 REGEX_ASSERT(fields[0]==""); 1677 REGEX_ASSERT(fields[1]==""); 1678 1679 fields[0] = "foo"; 1680 n = pat1->split("", fields, 10, status); 1681 REGEX_CHECK_STATUS; 1682 REGEX_ASSERT(n==0); 1683 REGEX_ASSERT(fields[0]=="foo"); 1684 1685 delete 
pat1; 1686 1687 // split, with a pattern with (capture) 1688 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status); 1689 REGEX_CHECK_STATUS; 1690 1691 status = U_ZERO_ERROR; 1692 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 1693 REGEX_CHECK_STATUS; 1694 REGEX_ASSERT(n==7); 1695 REGEX_ASSERT(fields[0]==""); 1696 REGEX_ASSERT(fields[1]=="a"); 1697 REGEX_ASSERT(fields[2]=="Now is "); 1698 REGEX_ASSERT(fields[3]=="b"); 1699 REGEX_ASSERT(fields[4]=="the time"); 1700 REGEX_ASSERT(fields[5]=="c"); 1701 REGEX_ASSERT(fields[6]==""); 1702 REGEX_ASSERT(status==U_ZERO_ERROR); 1703 1704 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 1705 REGEX_CHECK_STATUS; 1706 REGEX_ASSERT(n==7); 1707 REGEX_ASSERT(fields[0]==" "); 1708 REGEX_ASSERT(fields[1]=="a"); 1709 REGEX_ASSERT(fields[2]=="Now is "); 1710 REGEX_ASSERT(fields[3]=="b"); 1711 REGEX_ASSERT(fields[4]=="the time"); 1712 REGEX_ASSERT(fields[5]=="c"); 1713 REGEX_ASSERT(fields[6]==""); 1714 1715 status = U_ZERO_ERROR; 1716 fields[6] = "foo"; 1717 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status); 1718 REGEX_CHECK_STATUS; 1719 REGEX_ASSERT(n==6); 1720 REGEX_ASSERT(fields[0]==" "); 1721 REGEX_ASSERT(fields[1]=="a"); 1722 REGEX_ASSERT(fields[2]=="Now is "); 1723 REGEX_ASSERT(fields[3]=="b"); 1724 REGEX_ASSERT(fields[4]=="the time"); 1725 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter. 
1726 REGEX_ASSERT(fields[6]=="foo"); 1727 1728 status = U_ZERO_ERROR; 1729 fields[5] = "foo"; 1730 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 1731 REGEX_CHECK_STATUS; 1732 REGEX_ASSERT(n==5); 1733 REGEX_ASSERT(fields[0]==" "); 1734 REGEX_ASSERT(fields[1]=="a"); 1735 REGEX_ASSERT(fields[2]=="Now is "); 1736 REGEX_ASSERT(fields[3]=="b"); 1737 REGEX_ASSERT(fields[4]=="the time<c>"); 1738 REGEX_ASSERT(fields[5]=="foo"); 1739 1740 status = U_ZERO_ERROR; 1741 fields[5] = "foo"; 1742 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 1743 REGEX_CHECK_STATUS; 1744 REGEX_ASSERT(n==5); 1745 REGEX_ASSERT(fields[0]==" "); 1746 REGEX_ASSERT(fields[1]=="a"); 1747 REGEX_ASSERT(fields[2]=="Now is "); 1748 REGEX_ASSERT(fields[3]=="b"); 1749 REGEX_ASSERT(fields[4]=="the time"); 1750 REGEX_ASSERT(fields[5]=="foo"); 1751 1752 status = U_ZERO_ERROR; 1753 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 1754 REGEX_CHECK_STATUS; 1755 REGEX_ASSERT(n==4); 1756 REGEX_ASSERT(fields[0]==" "); 1757 REGEX_ASSERT(fields[1]=="a"); 1758 REGEX_ASSERT(fields[2]=="Now is "); 1759 REGEX_ASSERT(fields[3]=="the time<c>"); 1760 status = U_ZERO_ERROR; 1761 delete pat1; 1762 1763 pat1 = RegexPattern::compile("([-,])", pe, status); 1764 REGEX_CHECK_STATUS; 1765 n = pat1->split("1-10,20", fields, 10, status); 1766 REGEX_CHECK_STATUS; 1767 REGEX_ASSERT(n==5); 1768 REGEX_ASSERT(fields[0]=="1"); 1769 REGEX_ASSERT(fields[1]=="-"); 1770 REGEX_ASSERT(fields[2]=="10"); 1771 REGEX_ASSERT(fields[3]==","); 1772 REGEX_ASSERT(fields[4]=="20"); 1773 delete pat1; 1774 1775 // Test split of string with empty trailing fields 1776 pat1 = RegexPattern::compile(",", pe, status); 1777 REGEX_CHECK_STATUS; 1778 n = pat1->split("a,b,c,", fields, 10, status); 1779 REGEX_CHECK_STATUS; 1780 REGEX_ASSERT(n==4); 1781 REGEX_ASSERT(fields[0]=="a"); 1782 REGEX_ASSERT(fields[1]=="b"); 1783 REGEX_ASSERT(fields[2]=="c"); 1784 REGEX_ASSERT(fields[3]==""); 1785 1786 n = pat1->split("a,,,", 
fields, 10, status); 1787 REGEX_CHECK_STATUS; 1788 REGEX_ASSERT(n==4); 1789 REGEX_ASSERT(fields[0]=="a"); 1790 REGEX_ASSERT(fields[1]==""); 1791 REGEX_ASSERT(fields[2]==""); 1792 REGEX_ASSERT(fields[3]==""); 1793 delete pat1; 1794 1795 // Split Separator with zero length match. 1796 pat1 = RegexPattern::compile(":?", pe, status); 1797 REGEX_CHECK_STATUS; 1798 n = pat1->split("abc", fields, 10, status); 1799 REGEX_CHECK_STATUS; 1800 REGEX_ASSERT(n==5); 1801 REGEX_ASSERT(fields[0]==""); 1802 REGEX_ASSERT(fields[1]=="a"); 1803 REGEX_ASSERT(fields[2]=="b"); 1804 REGEX_ASSERT(fields[3]=="c"); 1805 REGEX_ASSERT(fields[4]==""); 1806 1807 delete pat1; 1808 1809 // 1810 // RegexPattern::pattern() 1811 // 1812 pat1 = new RegexPattern(); 1813 REGEX_ASSERT(pat1->pattern() == ""); 1814 delete pat1; 1815 1816 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1817 REGEX_CHECK_STATUS; 1818 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*"); 1819 delete pat1; 1820 1821 1822 // 1823 // classID functions 1824 // 1825 pat1 = RegexPattern::compile("(Hello, world)*", pe, status); 1826 REGEX_CHECK_STATUS; 1827 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID()); 1828 REGEX_ASSERT(pat1->getDynamicClassID() != NULL); 1829 UnicodeString Hello("Hello, world."); 1830 RegexMatcher *m = pat1->matcher(Hello, status); 1831 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID()); 1832 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID()); 1833 REGEX_ASSERT(m->getDynamicClassID() != NULL); 1834 delete m; 1835 delete pat1; 1836 1837} 1838 1839//--------------------------------------------------------------------------- 1840// 1841// API_Match_UTF8 Test that the alternate engine for class RegexMatcher 1842// is present and working, but excluding functions 1843// implementing replace operations. 
1844// 1845//--------------------------------------------------------------------------- 1846void RegexTest::API_Match_UTF8() { 1847 UParseError pe; 1848 UErrorCode status=U_ZERO_ERROR; 1849 int32_t flags = 0; 1850 1851 // 1852 // Debug - slide failing test cases early 1853 // 1854#if 0 1855 { 1856 } 1857 return; 1858#endif 1859 1860 // 1861 // Simple pattern compilation 1862 // 1863 { 1864 UText re = UTEXT_INITIALIZER; 1865 regextst_openUTF8FromInvariant(&re, "abc", -1, &status); 1866 REGEX_VERBOSE_TEXT(&re); 1867 RegexPattern *pat2; 1868 pat2 = RegexPattern::compile(&re, flags, pe, status); 1869 REGEX_CHECK_STATUS; 1870 1871 UText input1 = UTEXT_INITIALIZER; 1872 UText input2 = UTEXT_INITIALIZER; 1873 UText empty = UTEXT_INITIALIZER; 1874 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status); 1875 REGEX_VERBOSE_TEXT(&input1); 1876 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status); 1877 REGEX_VERBOSE_TEXT(&input2); 1878 utext_openUChars(&empty, NULL, 0, &status); 1879 1880 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */ 1881 int32_t input2Len = strlen("not abc"); 1882 1883 1884 // 1885 // Matcher creation and reset. 
1886 // 1887 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1); 1888 REGEX_CHECK_STATUS; 1889 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1890 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */ 1891 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1892 m1->reset(&input2); 1893 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1894 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */ 1895 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText()); 1896 m1->reset(&input1); 1897 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1898 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1899 m1->reset(&empty); 1900 REGEX_ASSERT(m1->lookingAt(status) == FALSE); 1901 REGEX_ASSERT(utext_nativeLength(&empty) == 0); 1902 1903 // 1904 // reset(pos, status) 1905 // 1906 m1->reset(&input1); 1907 m1->reset(4, status); 1908 REGEX_CHECK_STATUS; 1909 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText()); 1910 REGEX_ASSERT(m1->lookingAt(status) == TRUE); 1911 1912 m1->reset(-1, status); 1913 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1914 status = U_ZERO_ERROR; 1915 1916 m1->reset(0, status); 1917 REGEX_CHECK_STATUS; 1918 status = U_ZERO_ERROR; 1919 1920 m1->reset(input1Len-1, status); 1921 REGEX_CHECK_STATUS; 1922 status = U_ZERO_ERROR; 1923 1924 m1->reset(input1Len, status); 1925 REGEX_CHECK_STATUS; 1926 status = U_ZERO_ERROR; 1927 1928 m1->reset(input1Len+1, status); 1929 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1930 status = U_ZERO_ERROR; 1931 1932 // 1933 // match(pos, status) 1934 // 1935 m1->reset(&input2); 1936 REGEX_ASSERT(m1->matches(4, status) == TRUE); 1937 m1->reset(); 1938 REGEX_ASSERT(m1->matches(3, status) == FALSE); 1939 m1->reset(); 1940 REGEX_ASSERT(m1->matches(5, status) == FALSE); 1941 REGEX_ASSERT(m1->matches(4, status) == 
TRUE); 1942 REGEX_ASSERT(m1->matches(-1, status) == FALSE); 1943 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1944 1945 // Match() at end of string should fail, but should not 1946 // be an error. 1947 status = U_ZERO_ERROR; 1948 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE); 1949 REGEX_CHECK_STATUS; 1950 1951 // Match beyond end of string should fail with an error. 1952 status = U_ZERO_ERROR; 1953 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE); 1954 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1955 1956 // Successful match at end of string. 1957 { 1958 status = U_ZERO_ERROR; 1959 RegexMatcher m("A?", 0, status); // will match zero length string. 1960 REGEX_CHECK_STATUS; 1961 m.reset(&input1); 1962 REGEX_ASSERT(m.matches(input1Len, status) == TRUE); 1963 REGEX_CHECK_STATUS; 1964 m.reset(&empty); 1965 REGEX_ASSERT(m.matches(0, status) == TRUE); 1966 REGEX_CHECK_STATUS; 1967 } 1968 1969 1970 // 1971 // lookingAt(pos, status) 1972 // 1973 status = U_ZERO_ERROR; 1974 m1->reset(&input2); // "not abc" 1975 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1976 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE); 1977 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE); 1978 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE); 1979 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE); 1980 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1981 status = U_ZERO_ERROR; 1982 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE); 1983 REGEX_CHECK_STATUS; 1984 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE); 1985 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 1986 1987 delete m1; 1988 delete pat2; 1989 1990 utext_close(&re); 1991 utext_close(&input1); 1992 utext_close(&input2); 1993 utext_close(&empty); 1994 } 1995 1996 1997 // 1998 // Capture Group. 
1999 // RegexMatcher::start(); 2000 // RegexMatcher::end(); 2001 // RegexMatcher::groupCount(); 2002 // 2003 { 2004 int32_t flags=0; 2005 UParseError pe; 2006 UErrorCode status=U_ZERO_ERROR; 2007 UText re=UTEXT_INITIALIZER; 2008 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */ 2009 utext_openUTF8(&re, str_01234567_pat, -1, &status); 2010 2011 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2012 REGEX_CHECK_STATUS; 2013 2014 UText input = UTEXT_INITIALIZER; 2015 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 2016 utext_openUTF8(&input, str_0123456789, -1, &status); 2017 2018 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2019 REGEX_CHECK_STATUS; 2020 REGEX_ASSERT(matcher->lookingAt(status) == TRUE); 2021 static const int32_t matchStarts[] = {0, 2, 4, 8}; 2022 static const int32_t matchEnds[] = {10, 8, 6, 10}; 2023 int32_t i; 2024 for (i=0; i<4; i++) { 2025 int32_t actualStart = matcher->start(i, status); 2026 REGEX_CHECK_STATUS; 2027 if (actualStart != matchStarts[i]) { 2028 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n", 2029 __FILE__, __LINE__, i, matchStarts[i], actualStart); 2030 } 2031 int32_t actualEnd = matcher->end(i, status); 2032 REGEX_CHECK_STATUS; 2033 if (actualEnd != matchEnds[i]) { 2034 errln("RegexTest failure at %s:%d index %d. 
Expected %d, got %d\n", 2035 __FILE__, __LINE__, i, matchEnds[i], actualEnd); 2036 } 2037 } 2038 2039 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status)); 2040 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status)); 2041 2042 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2043 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2044 matcher->reset(); 2045 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE); 2046 2047 matcher->lookingAt(status); 2048 2049 UnicodeString dest; 2050 UText destText = UTEXT_INITIALIZER; 2051 utext_openUnicodeString(&destText, &dest, &status); 2052 UText *result; 2053 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */ 2054 // Test shallow-clone API 2055 int64_t group_len; 2056 result = matcher->group((UText *)NULL, group_len, status); 2057 REGEX_CHECK_STATUS; 2058 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2059 utext_close(result); 2060 result = matcher->group(0, &destText, group_len, status); 2061 REGEX_CHECK_STATUS; 2062 REGEX_ASSERT(result == &destText); 2063 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2064 // destText is now immutable, reopen it 2065 utext_close(&destText); 2066 utext_openUnicodeString(&destText, &dest, &status); 2067 2068 int64_t length; 2069 result = matcher->group(0, NULL, length, status); 2070 REGEX_CHECK_STATUS; 2071 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result); 2072 utext_close(result); 2073 result = matcher->group(0, &destText, length, status); 2074 REGEX_CHECK_STATUS; 2075 REGEX_ASSERT(result == &destText); 2076 REGEX_ASSERT(utext_getNativeIndex(result) == 0); 2077 REGEX_ASSERT(length == 10); 2078 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2079 2080 // Capture Group 1 == "234567" 2081 result = matcher->group(1, NULL, length, status); 2082 REGEX_CHECK_STATUS; 2083 REGEX_ASSERT(utext_getNativeIndex(result) == 2); 2084 REGEX_ASSERT(length 
== 6); 2085 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2086 utext_close(result); 2087 2088 result = matcher->group(1, &destText, length, status); 2089 REGEX_CHECK_STATUS; 2090 REGEX_ASSERT(result == &destText); 2091 REGEX_ASSERT(utext_getNativeIndex(result) == 2); 2092 REGEX_ASSERT(length == 6); 2093 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2094 utext_close(result); 2095 2096 // Capture Group 2 == "45" 2097 result = matcher->group(2, NULL, length, status); 2098 REGEX_CHECK_STATUS; 2099 REGEX_ASSERT(utext_getNativeIndex(result) == 4); 2100 REGEX_ASSERT(length == 2); 2101 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2102 utext_close(result); 2103 2104 result = matcher->group(2, &destText, length, status); 2105 REGEX_CHECK_STATUS; 2106 REGEX_ASSERT(result == &destText); 2107 REGEX_ASSERT(utext_getNativeIndex(result) == 4); 2108 REGEX_ASSERT(length == 2); 2109 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2110 utext_close(result); 2111 2112 // Capture Group 3 == "89" 2113 result = matcher->group(3, NULL, length, status); 2114 REGEX_CHECK_STATUS; 2115 REGEX_ASSERT(utext_getNativeIndex(result) == 8); 2116 REGEX_ASSERT(length == 2); 2117 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2118 utext_close(result); 2119 2120 result = matcher->group(3, &destText, length, status); 2121 REGEX_CHECK_STATUS; 2122 REGEX_ASSERT(result == &destText); 2123 REGEX_ASSERT(utext_getNativeIndex(result) == 8); 2124 REGEX_ASSERT(length == 2); 2125 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result); 2126 utext_close(result); 2127 2128 // Capture Group number out of range. 
2129 status = U_ZERO_ERROR; 2130 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2131 status = U_ZERO_ERROR; 2132 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR); 2133 status = U_ZERO_ERROR; 2134 matcher->reset(); 2135 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE); 2136 2137 delete matcher; 2138 delete pat; 2139 2140 utext_close(&destText); 2141 utext_close(&input); 2142 utext_close(&re); 2143 } 2144 2145 // 2146 // find 2147 // 2148 { 2149 int32_t flags=0; 2150 UParseError pe; 2151 UErrorCode status=U_ZERO_ERROR; 2152 UText re=UTEXT_INITIALIZER; 2153 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2154 utext_openUTF8(&re, str_abc, -1, &status); 2155 2156 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2157 REGEX_CHECK_STATUS; 2158 UText input = UTEXT_INITIALIZER; 2159 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. 
*/ 2160 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2161 // 012345678901234567 2162 2163 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2164 REGEX_CHECK_STATUS; 2165 REGEX_ASSERT(matcher->find()); 2166 REGEX_ASSERT(matcher->start(status) == 1); 2167 REGEX_ASSERT(matcher->find()); 2168 REGEX_ASSERT(matcher->start(status) == 6); 2169 REGEX_ASSERT(matcher->find()); 2170 REGEX_ASSERT(matcher->start(status) == 12); 2171 REGEX_ASSERT(matcher->find() == FALSE); 2172 REGEX_ASSERT(matcher->find() == FALSE); 2173 2174 matcher->reset(); 2175 REGEX_ASSERT(matcher->find()); 2176 REGEX_ASSERT(matcher->start(status) == 1); 2177 2178 REGEX_ASSERT(matcher->find(0, status)); 2179 REGEX_ASSERT(matcher->start(status) == 1); 2180 REGEX_ASSERT(matcher->find(1, status)); 2181 REGEX_ASSERT(matcher->start(status) == 1); 2182 REGEX_ASSERT(matcher->find(2, status)); 2183 REGEX_ASSERT(matcher->start(status) == 6); 2184 REGEX_ASSERT(matcher->find(12, status)); 2185 REGEX_ASSERT(matcher->start(status) == 12); 2186 REGEX_ASSERT(matcher->find(13, status) == FALSE); 2187 REGEX_ASSERT(matcher->find(16, status) == FALSE); 2188 REGEX_ASSERT(matcher->find(17, status) == FALSE); 2189 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE); 2190 2191 status = U_ZERO_ERROR; 2192 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR); 2193 status = U_ZERO_ERROR; 2194 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR); 2195 2196 REGEX_ASSERT(matcher->groupCount() == 0); 2197 2198 delete matcher; 2199 delete pat; 2200 2201 utext_close(&input); 2202 utext_close(&re); 2203 } 2204 2205 2206 // 2207 // find, with \G in pattern (true if at the end of a previous match). 
2208 // 2209 { 2210 int32_t flags=0; 2211 UParseError pe; 2212 UErrorCode status=U_ZERO_ERROR; 2213 UText re=UTEXT_INITIALIZER; 2214 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */ 2215 utext_openUTF8(&re, str_Gabcabc, -1, &status); 2216 2217 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status); 2218 2219 REGEX_CHECK_STATUS; 2220 UText input = UTEXT_INITIALIZER; 2221 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */ 2222 utext_openUTF8(&input, str_abcabcabc, -1, &status); 2223 // 012345678901234567 2224 2225 RegexMatcher *matcher = &pat->matcher(status)->reset(&input); 2226 REGEX_CHECK_STATUS; 2227 REGEX_ASSERT(matcher->find()); 2228 REGEX_ASSERT(matcher->start(status) == 0); 2229 REGEX_ASSERT(matcher->start(1, status) == -1); 2230 REGEX_ASSERT(matcher->start(2, status) == 1); 2231 2232 REGEX_ASSERT(matcher->find()); 2233 REGEX_ASSERT(matcher->start(status) == 4); 2234 REGEX_ASSERT(matcher->start(1, status) == 4); 2235 REGEX_ASSERT(matcher->start(2, status) == -1); 2236 REGEX_CHECK_STATUS; 2237 2238 delete matcher; 2239 delete pat; 2240 2241 utext_close(&input); 2242 utext_close(&re); 2243 } 2244 2245 // 2246 // find with zero length matches, match position should bump ahead 2247 // to prevent loops. 2248 // 2249 { 2250 int32_t i; 2251 UErrorCode status=U_ZERO_ERROR; 2252 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere, 2253 // using an always-true look-ahead. 
2254 REGEX_CHECK_STATUS; 2255 UText s = UTEXT_INITIALIZER; 2256 utext_openUTF8(&s, " ", -1, &status); 2257 m.reset(&s); 2258 for (i=0; ; i++) { 2259 if (m.find() == FALSE) { 2260 break; 2261 } 2262 REGEX_ASSERT(m.start(status) == i); 2263 REGEX_ASSERT(m.end(status) == i); 2264 } 2265 REGEX_ASSERT(i==5); 2266 2267 // Check that the bump goes over characters outside the BMP OK 2268 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8 2269 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00}; 2270 utext_openUTF8(&s, (char *)aboveBMP, -1, &status); 2271 m.reset(&s); 2272 for (i=0; ; i+=4) { 2273 if (m.find() == FALSE) { 2274 break; 2275 } 2276 REGEX_ASSERT(m.start(status) == i); 2277 REGEX_ASSERT(m.end(status) == i); 2278 } 2279 REGEX_ASSERT(i==20); 2280 2281 utext_close(&s); 2282 } 2283 { 2284 // find() loop breaking test. 2285 // with pattern of /.?/, should see a series of one char matches, then a single 2286 // match of zero length at the end of the input string. 2287 int32_t i; 2288 UErrorCode status=U_ZERO_ERROR; 2289 RegexMatcher m(".?", 0, status); 2290 REGEX_CHECK_STATUS; 2291 UText s = UTEXT_INITIALIZER; 2292 utext_openUTF8(&s, " ", -1, &status); 2293 m.reset(&s); 2294 for (i=0; ; i++) { 2295 if (m.find() == FALSE) { 2296 break; 2297 } 2298 REGEX_ASSERT(m.start(status) == i); 2299 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i)); 2300 } 2301 REGEX_ASSERT(i==5); 2302 2303 utext_close(&s); 2304 } 2305 2306 2307 // 2308 // Matchers with no input string behave as if they had an empty input string. 
2309 // 2310 2311 { 2312 UErrorCode status = U_ZERO_ERROR; 2313 RegexMatcher m(".?", 0, status); 2314 REGEX_CHECK_STATUS; 2315 REGEX_ASSERT(m.find()); 2316 REGEX_ASSERT(m.start(status) == 0); 2317 REGEX_ASSERT(m.input() == ""); 2318 } 2319 { 2320 UErrorCode status = U_ZERO_ERROR; 2321 RegexPattern *p = RegexPattern::compile(".", 0, status); 2322 RegexMatcher *m = p->matcher(status); 2323 REGEX_CHECK_STATUS; 2324 2325 REGEX_ASSERT(m->find() == FALSE); 2326 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0); 2327 delete m; 2328 delete p; 2329 } 2330 2331 // 2332 // Regions 2333 // 2334 { 2335 UErrorCode status = U_ZERO_ERROR; 2336 UText testPattern = UTEXT_INITIALIZER; 2337 UText testText = UTEXT_INITIALIZER; 2338 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status); 2339 REGEX_VERBOSE_TEXT(&testPattern); 2340 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status); 2341 REGEX_VERBOSE_TEXT(&testText); 2342 2343 RegexMatcher m(&testPattern, &testText, 0, status); 2344 REGEX_CHECK_STATUS; 2345 REGEX_ASSERT(m.regionStart() == 0); 2346 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2347 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2348 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2349 2350 m.region(2,4, status); 2351 REGEX_CHECK_STATUS; 2352 REGEX_ASSERT(m.matches(status)); 2353 REGEX_ASSERT(m.start(status)==2); 2354 REGEX_ASSERT(m.end(status)==4); 2355 REGEX_CHECK_STATUS; 2356 2357 m.reset(); 2358 REGEX_ASSERT(m.regionStart() == 0); 2359 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data")); 2360 2361 regextst_openUTF8FromInvariant(&testText, "short", -1, &status); 2362 REGEX_VERBOSE_TEXT(&testText); 2363 m.reset(&testText); 2364 REGEX_ASSERT(m.regionStart() == 0); 2365 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short")); 2366 2367 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2368 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE)); 2369 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2370 
REGEX_ASSERT(&m == &m.reset()); 2371 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE); 2372 2373 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE)); 2374 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2375 REGEX_ASSERT(&m == &m.reset()); 2376 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE); 2377 2378 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2379 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE)); 2380 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2381 REGEX_ASSERT(&m == &m.reset()); 2382 REGEX_ASSERT(m.hasTransparentBounds() == TRUE); 2383 2384 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE)); 2385 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2386 REGEX_ASSERT(&m == &m.reset()); 2387 REGEX_ASSERT(m.hasTransparentBounds() == FALSE); 2388 2389 utext_close(&testText); 2390 utext_close(&testPattern); 2391 } 2392 2393 // 2394 // hitEnd() and requireEnd() 2395 // 2396 { 2397 UErrorCode status = U_ZERO_ERROR; 2398 UText testPattern = UTEXT_INITIALIZER; 2399 UText testText = UTEXT_INITIALIZER; 2400 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2401 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */ 2402 utext_openUTF8(&testPattern, str_, -1, &status); 2403 utext_openUTF8(&testText, str_aabb, -1, &status); 2404 2405 RegexMatcher m1(&testPattern, &testText, 0, status); 2406 REGEX_ASSERT(m1.lookingAt(status) == TRUE); 2407 REGEX_ASSERT(m1.hitEnd() == TRUE); 2408 REGEX_ASSERT(m1.requireEnd() == FALSE); 2409 REGEX_CHECK_STATUS; 2410 2411 status = U_ZERO_ERROR; 2412 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */ 2413 utext_openUTF8(&testPattern, str_a, -1, &status); 2414 RegexMatcher m2(&testPattern, &testText, 0, status); 2415 REGEX_ASSERT(m2.lookingAt(status) == TRUE); 2416 REGEX_ASSERT(m2.hitEnd() == FALSE); 2417 REGEX_ASSERT(m2.requireEnd() == FALSE); 2418 REGEX_CHECK_STATUS; 2419 2420 status = U_ZERO_ERROR; 2421 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */ 2422 utext_openUTF8(&testPattern, str_dotstardollar, -1, 
&status);
        RegexMatcher m3(&testPattern, &testText, 0, status);
        REGEX_ASSERT(m3.lookingAt(status) == TRUE);
        REGEX_ASSERT(m3.hitEnd() == TRUE);
        REGEX_ASSERT(m3.requireEnd() == TRUE);
        REGEX_CHECK_STATUS;

        utext_close(&testText);
        utext_close(&testPattern);
    }
}


//---------------------------------------------------------------------------
//
//      API_Replace_UTF8   API test for class RegexMatcher, testing the
//                         Replace family of functions.
//
//      Exercises the UText-based overloads of replaceFirst() and
//      replaceAll(), and (in the Bug 4057 section) appendReplacement()
//      and appendTail(), on UTF-8 input.
//
//      Most replace calls are made twice: once with a NULL destination
//      (the returned UText is closed by the test) and once with the
//      caller-supplied destText (the call must return &destText).
//
//      Test strings are spelled out as explicit byte values with the
//      readable text in a trailing comment - presumably so the bytes do
//      not depend on the compiler's execution character set (see the
//      ASCII / EBCDIC note at the top of this file).
//
//      NOTE(review): pat, pat2, matcher, matcher2 and the UTexts are only
//      released at the very end of the function. If REGEX_CHECK_STATUS
//      leaves the function early on failure they are not cleaned up -
//      confirm against the macro definition.
//
//---------------------------------------------------------------------------
void RegexTest::API_Replace_UTF8() {
    //
    //  Replace
    //
    int32_t             flags=0;
    UParseError         pe;
    UErrorCode          status=U_ZERO_ERROR;

    // Pattern under test for the first half of the function: the literal "abc".
    UText               re=UTEXT_INITIALIZER;
    regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
    REGEX_VERBOSE_TEXT(&re);
    RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
    REGEX_CHECK_STATUS;

    char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
    // The input is 17 characters long; "abc" occurs at offsets 1, 6 and 12.
    UText dataText = UTEXT_INITIALIZER;
    utext_openUTF8(&dataText, data, -1, &status);
    REGEX_CHECK_STATUS;
    REGEX_VERBOSE_TEXT(&dataText);
    RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);

    //
    //  Plain vanilla matches.
    //
    // destText is a UText opened over the UnicodeString dest; it is reused as
    // the caller-supplied destination throughout the rest of the function.
    UnicodeString dest;
    UText destText = UTEXT_INITIALIZER;
    utext_openUnicodeString(&destText, &dest, &status);
    UText *result;

    // replText holds the replacement string; it is re-opened in place over a
    // new string for each scenario below.
    UText replText = UTEXT_INITIALIZER;

    const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
    utext_openUTF8(&replText, str_yz, -1, &status);
    REGEX_VERBOSE_TEXT(&replText);
    // replaceFirst, NULL destination: only the first "abc" becomes "yz".
    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
    utext_close(result);
    // replaceFirst, supplied destination: same text, returned through destText.
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);

    // replaceAll, NULL destination: every "abc" becomes "yz".
    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
    utext_close(result);

    // Empty destText (replace its entire contents with nothing) before
    // handing it to replaceAll as the destination.
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);

    //
    //  Plain vanilla non-matches.
    //
    // The input contains "abx" instead of "abc", so the pattern never matches
    // and every call is expected to return the input text unchanged.
    const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
    utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
    matcher->reset(&dataText);

    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
    utext_close(result);
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);

    //
    // Empty source string
    //
    // dataText is re-opened over a NULL pointer with length 0; every replace
    // variant is expected to yield an empty result.
    utext_openUTF8(&dataText, NULL, 0, &status);
    matcher->reset(&dataText);

    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8("", result);
    utext_close(result);
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8("", result);

    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8("", result);
    utext_close(result);
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8("", result);

    //
    // Empty substitution string
    //
    // Replacing with an empty string deletes the matched text.
    utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
    matcher->reset(&dataText);

    utext_openUTF8(&replText, NULL, 0, &status);
    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
    utext_close(result);
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);

    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_dots, result);

    //
    // match whole string
    //
    // The input is exactly "abc", so the whole input is replaced by "xyz".
    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    utext_openUTF8(&dataText, str_abc, -1, &status);
    matcher->reset(&dataText);

    const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
    utext_openUTF8(&replText, str_xyz, -1, &status);
    result = matcher->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

    result = matcher->replaceAll(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher->replaceAll(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);

    //
    // Capture Group, simple case
    //
    // Second pattern/matcher pair: "a(..)" against "abcdefg". The match is
    // "abc" and capture group 1 is "bc"; "defg" is the unmatched tail.
    const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
    utext_openUTF8(&re, str_add, -1, &status);
    RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
    REGEX_CHECK_STATUS;

    const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
    utext_openUTF8(&dataText, str_abcdefg, -1, &status);
    RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
    REGEX_CHECK_STATUS;

    // "$1$1" substitutes group 1 twice: "bc" + "bc" + "defg".
    const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
    utext_openUTF8(&replText, str_11, -1, &status);
    result = matcher2->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher2->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);

    // A backslash-escaped "\$1" must come out as a literal "$1", while the
    // unescaped "$1" is replaced by the group text.
    const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
    utext_openUTF8(&replText, str_v, -1, &status);
    REGEX_VERBOSE_TEXT(&replText);
    result = matcher2->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher2->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);

    // Escaped dollar signs with no group number after them: each "\$" is
    // expected to come out as a plain "$".
    const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
               0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
               0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
    utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
    result = matcher2->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher2->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);

    // Group number written with a supplementary-plane digit. The four 'x'
    // placeholder bytes at offsets 22..25 (right after the '$') are overwritten
    // below with F0 9D 9F 8F, the UTF-8 form of U+1D7CF. The expected output
    // shows that "$" followed by that digit is substituted as group 1 ("bc").
    unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
    //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
    //                                   012345678901234567890123456
    supplDigitChars[22] = 0xF0;
    supplDigitChars[23] = 0x9D;
    supplDigitChars[24] = 0x9F;
    supplDigitChars[25] = 0x8F;
    utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);

    result = matcher2->replaceFirst(&replText, NULL, status);
    REGEX_CHECK_STATUS;
    const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    result = matcher2->replaceFirst(&replText, &destText, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(result == &destText);
    REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);

    // The replacement refers to group 5, but the pattern "a(..)" has only one
    // capture group, so both calls are expected to report
    // U_INDEX_OUTOFBOUNDS_ERROR. With a supplied destination the call must
    // still hand back &destText.
    const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5... */
    utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
    utext_close(result);
    utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
    REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
    REGEX_ASSERT(result == &destText);
//    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);

    //
    // Replacement String with \u hex escapes
    //
    // These two scopes go back to the first matcher (pattern "abc").
    {
        // "\u0043" in the replacement text must be expanded to 'C' (0x43).
        const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
        const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
        utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
        utext_openUTF8(&replText, str_u0043, -1, &status);
        matcher->reset(&dataText);

        result = matcher->replaceAll(&replText, NULL, status);
        REGEX_CHECK_STATUS;
        const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
        utext_close(result);
        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
        result = matcher->replaceAll(&replText, &destText, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == &destText);
        REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
    }
    {
        // "\U00010000" in the replacement text must be expanded to the
        // supplementary code point U+10000.
        const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
        utext_openUTF8(&dataText, str_abc, -1, &status);
        const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
        utext_openUTF8(&replText, str_U00010000, -1, &status);
        matcher->reset(&dataText);

        // Expected output: the 'x' placeholders at offsets 2..5 are overwritten
        // with F0 90 80 80, the four-byte UTF-8 form of U+10000.
        unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
        //                          0123456789
        expected[2] = 0xF0;
        expected[3] = 0x90;
        expected[4] = 0x80;
        expected[5] = 0x80;

        result = matcher->replaceAll(&replText, NULL, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
        utext_close(result);
        utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
        result = matcher->replaceAll(&replText, &destText, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(result == &destText);
        REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
    }
    // TODO:  need more thorough testing of capture substitutions.

    // Bug 4057
    //
    // appendReplacement() must always copy input starting from where the
    // previous appendReplacement() left off (here: the start of the input),
    // no matter how many find() calls were made or where the search started.
    {
        status = U_ZERO_ERROR;
        const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
        const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
        const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
        utext_openUTF8(&re, str_ssee, -1, &status);
        utext_openUTF8(&dataText, str_blah, -1, &status);
        utext_openUTF8(&replText, str_ooh, -1, &status);

        RegexMatcher m(&re, 0, status);
        REGEX_CHECK_STATUS;

        // Note: this UnicodeString deliberately-or-not shadows the outer
        // "UText *result" for the rest of this scope.
        UnicodeString result;
        UText resultText = UTEXT_INITIALIZER;
        utext_openUnicodeString(&resultText, &result, &status);

        // Multiple finds do NOT bump up the previous appendReplacement position.
        m.reset(&dataText);
        m.find();
        m.find();
        m.appendReplacement(&resultText, &replText, status);
        REGEX_CHECK_STATUS;
        const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
        REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);

        // After a reset into the interior of a string, appendReplacement still starts at beginning.
        status = U_ZERO_ERROR;
        // Clear the backing string and re-open resultText over it.
        result.truncate(0);
        utext_openUnicodeString(&resultText, &result, &status);
        m.reset(10, status);
        m.find();
        m.find();
        m.appendReplacement(&resultText, &replText, status);
        REGEX_CHECK_STATUS;
        const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
        REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);

        // find() at interior of string, appendReplacement still starts at beginning.
        status = U_ZERO_ERROR;
        result.truncate(0);
        utext_openUnicodeString(&resultText, &result, &status);
        m.reset();
        m.find(10, status);
        m.find();
        m.appendReplacement(&resultText, &replText, status);
        REGEX_CHECK_STATUS;
        const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
        REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);

        // appendTail() adds the input that follows the last match (" fin").
        // NOTE(review): status is not checked after this call.
        m.appendTail(&resultText, status);
        const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
        REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);

        utext_close(&resultText);
    }

    // Release the matchers and patterns created above, then close the
    // stack-allocated UTexts.
    delete matcher2;
    delete pat2;
    delete matcher;
    delete pat;

    utext_close(&dataText);
    utext_close(&replText);
    utext_close(&destText);
    utext_close(&re);
}


//---------------------------------------------------------------------------
//
//      API_Pattern_UTF8  Test that the API for class RegexPattern is
//                        present and nominally working.
//
//---------------------------------------------------------------------------
void RegexTest::API_Pattern_UTF8() {
    RegexPattern        pata;    // Test default constructor to not crash.
    RegexPattern        patb;

    REGEX_ASSERT(pata == patb);
    REGEX_ASSERT(pata == pata);

    UText         re1 = UTEXT_INITIALIZER;
    UText         re2 = UTEXT_INITIALIZER;
    UErrorCode    status = U_ZERO_ERROR;
    UParseError   pe;

    const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
    const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
    utext_openUTF8(&re1, str_abcalmz, -1, &status);
    utext_openUTF8(&re2, str_def, -1, &status);

    RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
    RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT(*pat1 == *pat1);
    REGEX_ASSERT(*pat1 != pata);

    // Assign
    patb = *pat1;
    REGEX_ASSERT(patb == *pat1);

    // Copy Construct
    RegexPattern patc(*pat1);
    REGEX_ASSERT(patc == *pat1);
    REGEX_ASSERT(patb == patc);
    REGEX_ASSERT(pat1 != pat2);
    patb = *pat2;
    REGEX_ASSERT(patb != patc);
    REGEX_ASSERT(patb == *pat2);

    // Compile with no flags.
2849 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status); 2850 REGEX_ASSERT(*pat1a == *pat1); 2851 2852 REGEX_ASSERT(pat1a->flags() == 0); 2853 2854 // Compile with different flags should be not equal 2855 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status); 2856 REGEX_CHECK_STATUS; 2857 2858 REGEX_ASSERT(*pat1b != *pat1a); 2859 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE); 2860 REGEX_ASSERT(pat1a->flags() == 0); 2861 delete pat1b; 2862 2863 // clone 2864 RegexPattern *pat1c = pat1->clone(); 2865 REGEX_ASSERT(*pat1c == *pat1); 2866 REGEX_ASSERT(*pat1c != *pat2); 2867 2868 delete pat1c; 2869 delete pat1a; 2870 delete pat1; 2871 delete pat2; 2872 2873 utext_close(&re1); 2874 utext_close(&re2); 2875 2876 2877 // 2878 // Verify that a matcher created from a cloned pattern works. 2879 // (Jitterbug 3423) 2880 // 2881 { 2882 UErrorCode status = U_ZERO_ERROR; 2883 UText pattern = UTEXT_INITIALIZER; 2884 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */ 2885 utext_openUTF8(&pattern, str_pL, -1, &status); 2886 2887 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status); 2888 RegexPattern *pClone = pSource->clone(); 2889 delete pSource; 2890 RegexMatcher *mFromClone = pClone->matcher(status); 2891 REGEX_CHECK_STATUS; 2892 2893 UText input = UTEXT_INITIALIZER; 2894 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */ 2895 utext_openUTF8(&input, str_HelloWorld, -1, &status); 2896 mFromClone->reset(&input); 2897 REGEX_ASSERT(mFromClone->find() == TRUE); 2898 REGEX_ASSERT(mFromClone->group(status) == "Hello"); 2899 REGEX_ASSERT(mFromClone->find() == TRUE); 2900 REGEX_ASSERT(mFromClone->group(status) == "World"); 2901 REGEX_ASSERT(mFromClone->find() == FALSE); 2902 delete mFromClone; 2903 delete pClone; 2904 2905 utext_close(&input); 2906 utext_close(&pattern); 2907 } 2908 2909 // 2910 // matches convenience API 
2911 // 2912 { 2913 UErrorCode status = U_ZERO_ERROR; 2914 UText pattern = UTEXT_INITIALIZER; 2915 UText input = UTEXT_INITIALIZER; 2916 2917 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */ 2918 utext_openUTF8(&input, str_randominput, -1, &status); 2919 2920 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */ 2921 utext_openUTF8(&pattern, str_dotstar, -1, &status); 2922 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE); 2923 REGEX_CHECK_STATUS; 2924 2925 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */ 2926 utext_openUTF8(&pattern, str_abc, -1, &status); 2927 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE); 2928 REGEX_CHECK_STATUS; 2929 2930 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */ 2931 utext_openUTF8(&pattern, str_nput, -1, &status); 2932 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE); 2933 REGEX_CHECK_STATUS; 2934 2935 utext_openUTF8(&pattern, str_randominput, -1, &status); 2936 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE); 2937 REGEX_CHECK_STATUS; 2938 2939 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */ 2940 utext_openUTF8(&pattern, str_u, -1, &status); 2941 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE); 2942 REGEX_CHECK_STATUS; 2943 2944 utext_openUTF8(&input, str_abc, -1, &status); 2945 utext_openUTF8(&pattern, str_abc, -1, &status); 2946 status = U_INDEX_OUTOFBOUNDS_ERROR; 2947 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE); 2948 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 2949 2950 utext_close(&input); 2951 utext_close(&pattern); 2952 } 2953 2954 2955 // 2956 // Split() 2957 // 2958 status = U_ZERO_ERROR; 2959 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */ 2960 utext_openUTF8(&re1, str_spaceplus, 
-1, &status); 2961 pat1 = RegexPattern::compile(&re1, pe, status); 2962 REGEX_CHECK_STATUS; 2963 UnicodeString fields[10]; 2964 2965 int32_t n; 2966 n = pat1->split("Now is the time", fields, 10, status); 2967 REGEX_CHECK_STATUS; 2968 REGEX_ASSERT(n==4); 2969 REGEX_ASSERT(fields[0]=="Now"); 2970 REGEX_ASSERT(fields[1]=="is"); 2971 REGEX_ASSERT(fields[2]=="the"); 2972 REGEX_ASSERT(fields[3]=="time"); 2973 REGEX_ASSERT(fields[4]==""); 2974 2975 n = pat1->split("Now is the time", fields, 2, status); 2976 REGEX_CHECK_STATUS; 2977 REGEX_ASSERT(n==2); 2978 REGEX_ASSERT(fields[0]=="Now"); 2979 REGEX_ASSERT(fields[1]=="is the time"); 2980 REGEX_ASSERT(fields[2]=="the"); // left over from previous test 2981 2982 fields[1] = "*"; 2983 status = U_ZERO_ERROR; 2984 n = pat1->split("Now is the time", fields, 1, status); 2985 REGEX_CHECK_STATUS; 2986 REGEX_ASSERT(n==1); 2987 REGEX_ASSERT(fields[0]=="Now is the time"); 2988 REGEX_ASSERT(fields[1]=="*"); 2989 status = U_ZERO_ERROR; 2990 2991 n = pat1->split(" Now is the time ", fields, 10, status); 2992 REGEX_CHECK_STATUS; 2993 REGEX_ASSERT(n==6); 2994 REGEX_ASSERT(fields[0]==""); 2995 REGEX_ASSERT(fields[1]=="Now"); 2996 REGEX_ASSERT(fields[2]=="is"); 2997 REGEX_ASSERT(fields[3]=="the"); 2998 REGEX_ASSERT(fields[4]=="time"); 2999 REGEX_ASSERT(fields[5]==""); 3000 REGEX_ASSERT(fields[6]==""); 3001 3002 fields[2] = "*"; 3003 n = pat1->split(" ", fields, 10, status); 3004 REGEX_CHECK_STATUS; 3005 REGEX_ASSERT(n==2); 3006 REGEX_ASSERT(fields[0]==""); 3007 REGEX_ASSERT(fields[1]==""); 3008 REGEX_ASSERT(fields[2]=="*"); 3009 3010 fields[0] = "foo"; 3011 n = pat1->split("", fields, 10, status); 3012 REGEX_CHECK_STATUS; 3013 REGEX_ASSERT(n==0); 3014 REGEX_ASSERT(fields[0]=="foo"); 3015 3016 delete pat1; 3017 3018 // split, with a pattern with (capture) 3019 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status); 3020 pat1 = RegexPattern::compile(&re1, pe, status); 3021 REGEX_CHECK_STATUS; 3022 3023 status = U_ZERO_ERROR; 3024 
fields[6] = fields[7] = "*"; 3025 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status); 3026 REGEX_CHECK_STATUS; 3027 REGEX_ASSERT(n==7); 3028 REGEX_ASSERT(fields[0]==""); 3029 REGEX_ASSERT(fields[1]=="a"); 3030 REGEX_ASSERT(fields[2]=="Now is "); 3031 REGEX_ASSERT(fields[3]=="b"); 3032 REGEX_ASSERT(fields[4]=="the time"); 3033 REGEX_ASSERT(fields[5]=="c"); 3034 REGEX_ASSERT(fields[6]==""); 3035 REGEX_ASSERT(fields[7]=="*"); 3036 REGEX_ASSERT(status==U_ZERO_ERROR); 3037 3038 fields[6] = fields[7] = "*"; 3039 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status); 3040 REGEX_CHECK_STATUS; 3041 REGEX_ASSERT(n==7); 3042 REGEX_ASSERT(fields[0]==" "); 3043 REGEX_ASSERT(fields[1]=="a"); 3044 REGEX_ASSERT(fields[2]=="Now is "); 3045 REGEX_ASSERT(fields[3]=="b"); 3046 REGEX_ASSERT(fields[4]=="the time"); 3047 REGEX_ASSERT(fields[5]=="c"); 3048 REGEX_ASSERT(fields[6]==""); 3049 REGEX_ASSERT(fields[7]=="*"); 3050 3051 status = U_ZERO_ERROR; 3052 fields[6] = "foo"; 3053 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status); 3054 REGEX_CHECK_STATUS; 3055 REGEX_ASSERT(n==6); 3056 REGEX_ASSERT(fields[0]==" "); 3057 REGEX_ASSERT(fields[1]=="a"); 3058 REGEX_ASSERT(fields[2]=="Now is "); 3059 REGEX_ASSERT(fields[3]=="b"); 3060 REGEX_ASSERT(fields[4]=="the time"); 3061 REGEX_ASSERT(fields[5]==" "); 3062 REGEX_ASSERT(fields[6]=="foo"); 3063 3064 status = U_ZERO_ERROR; 3065 fields[5] = "foo"; 3066 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status); 3067 REGEX_CHECK_STATUS; 3068 REGEX_ASSERT(n==5); 3069 REGEX_ASSERT(fields[0]==" "); 3070 REGEX_ASSERT(fields[1]=="a"); 3071 REGEX_ASSERT(fields[2]=="Now is "); 3072 REGEX_ASSERT(fields[3]=="b"); 3073 REGEX_ASSERT(fields[4]=="the time<c>"); 3074 REGEX_ASSERT(fields[5]=="foo"); 3075 3076 status = U_ZERO_ERROR; 3077 fields[5] = "foo"; 3078 n = pat1->split(" <a>Now is <b>the time", fields, 5, status); 3079 REGEX_CHECK_STATUS; 3080 REGEX_ASSERT(n==5); 3081 REGEX_ASSERT(fields[0]==" "); 3082 
REGEX_ASSERT(fields[1]=="a"); 3083 REGEX_ASSERT(fields[2]=="Now is "); 3084 REGEX_ASSERT(fields[3]=="b"); 3085 REGEX_ASSERT(fields[4]=="the time"); 3086 REGEX_ASSERT(fields[5]=="foo"); 3087 3088 status = U_ZERO_ERROR; 3089 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status); 3090 REGEX_CHECK_STATUS; 3091 REGEX_ASSERT(n==4); 3092 REGEX_ASSERT(fields[0]==" "); 3093 REGEX_ASSERT(fields[1]=="a"); 3094 REGEX_ASSERT(fields[2]=="Now is "); 3095 REGEX_ASSERT(fields[3]=="the time<c>"); 3096 status = U_ZERO_ERROR; 3097 delete pat1; 3098 3099 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status); 3100 pat1 = RegexPattern::compile(&re1, pe, status); 3101 REGEX_CHECK_STATUS; 3102 n = pat1->split("1-10,20", fields, 10, status); 3103 REGEX_CHECK_STATUS; 3104 REGEX_ASSERT(n==5); 3105 REGEX_ASSERT(fields[0]=="1"); 3106 REGEX_ASSERT(fields[1]=="-"); 3107 REGEX_ASSERT(fields[2]=="10"); 3108 REGEX_ASSERT(fields[3]==","); 3109 REGEX_ASSERT(fields[4]=="20"); 3110 delete pat1; 3111 3112 3113 // 3114 // split of a UText based string, with library allocating output UTexts. 
3115 // 3116 { 3117 status = U_ZERO_ERROR; 3118 RegexMatcher matcher(UnicodeString("(:)"), 0, status); 3119 UnicodeString stringToSplit("first:second:third"); 3120 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status); 3121 REGEX_CHECK_STATUS; 3122 3123 UText *splits[10] = {NULL}; 3124 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status); 3125 REGEX_CHECK_STATUS; 3126 REGEX_ASSERT(numFields == 5); 3127 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]); 3128 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]); 3129 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]); 3130 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]); 3131 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]); 3132 REGEX_ASSERT(splits[5] == NULL); 3133 3134 for (int i=0; i<UPRV_LENGTHOF(splits); i++) { 3135 if (splits[i]) { 3136 utext_close(splits[i]); 3137 splits[i] = NULL; 3138 } 3139 } 3140 utext_close(textToSplit); 3141 } 3142 3143 3144 // 3145 // RegexPattern::pattern() and patternText() 3146 // 3147 pat1 = new RegexPattern(); 3148 REGEX_ASSERT(pat1->pattern() == ""); 3149 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status)); 3150 delete pat1; 3151 const char *helloWorldInvariant = "(Hello, world)*"; 3152 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status); 3153 pat1 = RegexPattern::compile(&re1, pe, status); 3154 REGEX_CHECK_STATUS; 3155 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern()); 3156 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status)); 3157 delete pat1; 3158 3159 utext_close(&re1); 3160} 3161 3162 3163//--------------------------------------------------------------------------- 3164// 3165// Extended A more thorough check for features of regex patterns 3166// The test cases are in a separate data file, 3167// source/tests/testdata/regextst.txt 3168// A description of the test data format is included in that file. 
3169// 3170//--------------------------------------------------------------------------- 3171 3172const char * 3173RegexTest::getPath(char buffer[2048], const char *filename) { 3174 UErrorCode status=U_ZERO_ERROR; 3175 const char *testDataDirectory = IntlTest::getSourceTestData(status); 3176 if (U_FAILURE(status)) { 3177 errln("ERROR: loadTestData() failed - %s", u_errorName(status)); 3178 return NULL; 3179 } 3180 3181 strcpy(buffer, testDataDirectory); 3182 strcat(buffer, filename); 3183 return buffer; 3184} 3185 3186void RegexTest::Extended() { 3187 char tdd[2048]; 3188 const char *srcPath; 3189 UErrorCode status = U_ZERO_ERROR; 3190 int32_t lineNum = 0; 3191 3192 // 3193 // Open and read the test data file. 3194 // 3195 srcPath=getPath(tdd, "regextst.txt"); 3196 if(srcPath==NULL) { 3197 return; /* something went wrong, error already output */ 3198 } 3199 3200 int32_t len; 3201 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status); 3202 if (U_FAILURE(status)) { 3203 return; /* something went wrong, error already output */ 3204 } 3205 3206 // 3207 // Put the test data into a UnicodeString 3208 // 3209 UnicodeString testString(FALSE, testData, len); 3210 3211 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status); 3212 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status); 3213 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status); 3214 3215 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status); 3216 UnicodeString testPattern; // The pattern for test from the test file. 3217 UnicodeString testFlags; // the flags for a test. 3218 UnicodeString matchString; // The marked up string to be used as input 3219 3220 if (U_FAILURE(status)){ 3221 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status)); 3222 delete [] testData; 3223 return; 3224 } 3225 3226 // 3227 // Loop over the test data file, once per line. 
3228 // 3229 while (lineMat.find()) { 3230 lineNum++; 3231 if (U_FAILURE(status)) { 3232 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status)); 3233 } 3234 3235 status = U_ZERO_ERROR; 3236 UnicodeString testLine = lineMat.group(1, status); 3237 if (testLine.length() == 0) { 3238 continue; 3239 } 3240 3241 // 3242 // Parse the test line. Skip blank and comment only lines. 3243 // Separate out the three main fields - pattern, flags, target. 3244 // 3245 3246 commentMat.reset(testLine); 3247 if (commentMat.lookingAt(status)) { 3248 // This line is a comment, or blank. 3249 continue; 3250 } 3251 3252 // 3253 // Pull out the pattern field, remove it from the test file line. 3254 // 3255 quotedStuffMat.reset(testLine); 3256 if (quotedStuffMat.lookingAt(status)) { 3257 testPattern = quotedStuffMat.group(2, status); 3258 testLine.remove(0, quotedStuffMat.end(0, status)); 3259 } else { 3260 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum); 3261 continue; 3262 } 3263 3264 3265 // 3266 // Pull out the flags from the test file line. 3267 // 3268 flagsMat.reset(testLine); 3269 flagsMat.lookingAt(status); // Will always match, possibly an empty string. 3270 testFlags = flagsMat.group(1, status); 3271 if (flagsMat.group(2, status).length() > 0) { 3272 errln("Bad Match flag at line %d. Scanning %c\n", 3273 lineNum, flagsMat.group(2, status).charAt(0)); 3274 continue; 3275 } 3276 testLine.remove(0, flagsMat.end(0, status)); 3277 3278 // 3279 // Pull out the match string, as a whole. 3280 // We'll process the <tags> later. 3281 // 3282 quotedStuffMat.reset(testLine); 3283 if (quotedStuffMat.lookingAt(status)) { 3284 matchString = quotedStuffMat.group(2, status); 3285 testLine.remove(0, quotedStuffMat.end(0, status)); 3286 } else { 3287 errln("Bad match string at test file line %d", lineNum); 3288 continue; 3289 } 3290 3291 // 3292 // The only thing left from the input line should be an optional trailing comment. 
3293 // 3294 commentMat.reset(testLine); 3295 if (commentMat.lookingAt(status) == FALSE) { 3296 errln("Line %d: unexpected characters at end of test line.", lineNum); 3297 continue; 3298 } 3299 3300 // 3301 // Run the test 3302 // 3303 regex_find(testPattern, testFlags, matchString, srcPath, lineNum); 3304 } 3305 3306 delete [] testData; 3307 3308} 3309 3310 3311 3312//--------------------------------------------------------------------------- 3313// 3314// regex_find(pattern, flags, inputString, lineNumber) 3315// 3316// Function to run a single test from the Extended (data driven) tests. 3317// See file test/testdata/regextst.txt for a description of the 3318// pattern and inputString fields, and the allowed flags. 3319// lineNumber is the source line in regextst.txt of the test. 3320// 3321//--------------------------------------------------------------------------- 3322 3323 3324// Set a value into a UVector at position specified by a decimal number in 3325// a UnicodeString. This is a utility function needed by the actual test function, 3326// which follows. 
3327static void set(UVector &vec, int32_t val, UnicodeString index) { 3328 UErrorCode status=U_ZERO_ERROR; 3329 int32_t idx = 0; 3330 for (int32_t i=0; i<index.length(); i++) { 3331 int32_t d=u_charDigitValue(index.charAt(i)); 3332 if (d<0) {return;} 3333 idx = idx*10 + d; 3334 } 3335 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3336 vec.setElementAt(val, idx); 3337} 3338 3339static void setInt(UVector &vec, int32_t val, int32_t idx) { 3340 UErrorCode status=U_ZERO_ERROR; 3341 while (vec.size()<idx+1) {vec.addElement(-1, status);} 3342 vec.setElementAt(val, idx); 3343} 3344 3345static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex) 3346{ 3347 UBool couldFind = TRUE; 3348 UTEXT_SETNATIVEINDEX(utext, 0); 3349 int32_t i = 0; 3350 while (i < unistrOffset) { 3351 UChar32 c = UTEXT_NEXT32(utext); 3352 if (c != U_SENTINEL) { 3353 i += U16_LENGTH(c); 3354 } else { 3355 couldFind = FALSE; 3356 break; 3357 } 3358 } 3359 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext); 3360 return couldFind; 3361} 3362 3363 3364void RegexTest::regex_find(const UnicodeString &pattern, 3365 const UnicodeString &flags, 3366 const UnicodeString &inputString, 3367 const char *srcPath, 3368 int32_t line) { 3369 UnicodeString unEscapedInput; 3370 UnicodeString deTaggedInput; 3371 3372 int32_t patternUTF8Length, inputUTF8Length; 3373 char *patternChars = NULL, *inputChars = NULL; 3374 UText patternText = UTEXT_INITIALIZER; 3375 UText inputText = UTEXT_INITIALIZER; 3376 UConverter *UTF8Converter = NULL; 3377 3378 UErrorCode status = U_ZERO_ERROR; 3379 UParseError pe; 3380 RegexPattern *parsePat = NULL; 3381 RegexMatcher *parseMatcher = NULL; 3382 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL; 3383 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL; 3384 UVector groupStarts(status); 3385 UVector groupEnds(status); 3386 UVector groupStartsUTF8(status); 3387 UVector groupEndsUTF8(status); 3388 UBool isMatch = FALSE, isUTF8Match = FALSE; 3389 UBool 
failed = FALSE; 3390 int32_t numFinds; 3391 int32_t i; 3392 UBool useMatchesFunc = FALSE; 3393 UBool useLookingAtFunc = FALSE; 3394 int32_t regionStart = -1; 3395 int32_t regionEnd = -1; 3396 int32_t regionStartUTF8 = -1; 3397 int32_t regionEndUTF8 = -1; 3398 3399 3400 // 3401 // Compile the caller's pattern 3402 // 3403 uint32_t bflags = 0; 3404 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag 3405 bflags |= UREGEX_CASE_INSENSITIVE; 3406 } 3407 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag 3408 bflags |= UREGEX_COMMENTS; 3409 } 3410 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag 3411 bflags |= UREGEX_DOTALL; 3412 } 3413 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag 3414 bflags |= UREGEX_MULTILINE; 3415 } 3416 3417 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag 3418 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES; 3419 } 3420 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag 3421 bflags |= UREGEX_UNIX_LINES; 3422 } 3423 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag 3424 bflags |= UREGEX_LITERAL; 3425 } 3426 3427 3428 callerPattern = RegexPattern::compile(pattern, bflags, pe, status); 3429 if (status != U_ZERO_ERROR) { 3430 #if UCONFIG_NO_BREAK_ITERATION==1 3431 // 'v' test flag means that the test pattern should not compile if ICU was configured 3432 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3433 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3434 goto cleanupAndReturn; 3435 } 3436 #endif 3437 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3438 // Expected pattern compilation error. 3439 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3440 logln("Pattern Compile returns \"%s\"", u_errorName(status)); 3441 } 3442 goto cleanupAndReturn; 3443 } else { 3444 // Unexpected pattern compilation error. 
3445 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status)); 3446 goto cleanupAndReturn; 3447 } 3448 } 3449 3450 UTF8Converter = ucnv_open("UTF8", &status); 3451 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 3452 3453 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status); 3454 status = U_ZERO_ERROR; // buffer overflow 3455 patternChars = new char[patternUTF8Length+1]; 3456 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status); 3457 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status); 3458 3459 if (status == U_ZERO_ERROR) { 3460 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status); 3461 3462 if (status != U_ZERO_ERROR) { 3463#if UCONFIG_NO_BREAK_ITERATION==1 3464 // 'v' test flag means that the test pattern should not compile if ICU was configured 3465 // to not include break iteration. RBBI is needed for Unicode word boundaries. 3466 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) { 3467 goto cleanupAndReturn; 3468 } 3469#endif 3470 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E' 3471 // Expected pattern compilation error. 3472 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd' 3473 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status)); 3474 } 3475 goto cleanupAndReturn; 3476 } else { 3477 // Unexpected pattern compilation error. 3478 errln("Line %d: error %s compiling pattern. 
(UTF8)", line, u_errorName(status)); 3479 goto cleanupAndReturn; 3480 } 3481 } 3482 } 3483 3484 if (UTF8Pattern == NULL) { 3485 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3486 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line); 3487 status = U_ZERO_ERROR; 3488 } 3489 3490 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag 3491 callerPattern->dumpPattern(); 3492 } 3493 3494 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag 3495 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line); 3496 goto cleanupAndReturn; 3497 } 3498 3499 3500 // 3501 // Number of times find() should be called on the test string, default to 1 3502 // 3503 numFinds = 1; 3504 for (i=2; i<=9; i++) { 3505 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag 3506 if (numFinds != 1) { 3507 errln("Line %d: more than one digit flag. Scanning %d.", line, i); 3508 goto cleanupAndReturn; 3509 } 3510 numFinds = i; 3511 } 3512 } 3513 3514 // 'M' flag. Use matches() instead of find() 3515 if (flags.indexOf((UChar)0x4d) >= 0) { 3516 useMatchesFunc = TRUE; 3517 } 3518 if (flags.indexOf((UChar)0x4c) >= 0) { 3519 useLookingAtFunc = TRUE; 3520 } 3521 3522 // 3523 // Find the tags in the input data, remove them, and record the group boundary 3524 // positions. 
3525 // 3526 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status); 3527 REGEX_CHECK_STATUS_L(line); 3528 3529 unEscapedInput = inputString.unescape(); 3530 parseMatcher = parsePat->matcher(unEscapedInput, status); 3531 REGEX_CHECK_STATUS_L(line); 3532 while(parseMatcher->find()) { 3533 parseMatcher->appendReplacement(deTaggedInput, "", status); 3534 REGEX_CHECK_STATUS; 3535 UnicodeString groupNum = parseMatcher->group(2, status); 3536 if (groupNum == "r") { 3537 // <r> or </r>, a region specification within the string 3538 if (parseMatcher->group(1, status) == "/") { 3539 regionEnd = deTaggedInput.length(); 3540 } else { 3541 regionStart = deTaggedInput.length(); 3542 } 3543 } else { 3544 // <digits> or </digits>, a group match boundary tag. 3545 if (parseMatcher->group(1, status) == "/") { 3546 set(groupEnds, deTaggedInput.length(), groupNum); 3547 } else { 3548 set(groupStarts, deTaggedInput.length(), groupNum); 3549 } 3550 } 3551 } 3552 parseMatcher->appendTail(deTaggedInput); 3553 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line); 3554 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) { 3555 errln("mismatched <r> tags"); 3556 failed = TRUE; 3557 goto cleanupAndReturn; 3558 } 3559 3560 // 3561 // Configure the matcher according to the flags specified with this test. 
3562 // 3563 matcher = callerPattern->matcher(deTaggedInput, status); 3564 REGEX_CHECK_STATUS_L(line); 3565 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag 3566 matcher->setTrace(TRUE); 3567 } 3568 3569 if (UTF8Pattern != NULL) { 3570 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status); 3571 status = U_ZERO_ERROR; // buffer overflow 3572 inputChars = new char[inputUTF8Length+1]; 3573 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status); 3574 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status); 3575 3576 if (status == U_ZERO_ERROR) { 3577 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText); 3578 REGEX_CHECK_STATUS_L(line); 3579 } 3580 3581 if (UTF8Matcher == NULL) { 3582 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine 3583 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line); 3584 status = U_ZERO_ERROR; 3585 } 3586 } 3587 3588 // 3589 // Generate native indices for UTF8 versions of region and capture group info 3590 // 3591 if (UTF8Matcher != NULL) { 3592 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8); 3593 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8); 3594 3595 // Fill out the native index UVector info. 3596 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size() 3597 for (i=0; i<groupStarts.size(); i++) { 3598 int32_t start = groupStarts.elementAti(i); 3599 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3600 if (start >= 0) { 3601 int32_t startUTF8; 3602 if (!utextOffsetToNative(&inputText, start, startUTF8)) { 3603 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start); 3604 failed = TRUE; 3605 goto cleanupAndReturn; // Good chance of subsequent bogus errors. 
Stop now. 3606 } 3607 setInt(groupStartsUTF8, startUTF8, i); 3608 } 3609 3610 int32_t end = groupEnds.elementAti(i); 3611 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting 3612 if (end >= 0) { 3613 int32_t endUTF8; 3614 if (!utextOffsetToNative(&inputText, end, endUTF8)) { 3615 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end); 3616 failed = TRUE; 3617 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3618 } 3619 setInt(groupEndsUTF8, endUTF8, i); 3620 } 3621 } 3622 } 3623 3624 if (regionStart>=0) { 3625 matcher->region(regionStart, regionEnd, status); 3626 REGEX_CHECK_STATUS_L(line); 3627 if (UTF8Matcher != NULL) { 3628 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status); 3629 REGEX_CHECK_STATUS_L(line); 3630 } 3631 } 3632 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag 3633 matcher->useAnchoringBounds(FALSE); 3634 if (UTF8Matcher != NULL) { 3635 UTF8Matcher->useAnchoringBounds(FALSE); 3636 } 3637 } 3638 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag 3639 matcher->useTransparentBounds(TRUE); 3640 if (UTF8Matcher != NULL) { 3641 UTF8Matcher->useTransparentBounds(TRUE); 3642 } 3643 } 3644 3645 3646 3647 // 3648 // Do a find on the de-tagged input using the caller's pattern 3649 // TODO: error on count>1 and not find(). 3650 // error on both matches() and lookingAt(). 
3651 // 3652 for (i=0; i<numFinds; i++) { 3653 if (useMatchesFunc) { 3654 isMatch = matcher->matches(status); 3655 if (UTF8Matcher != NULL) { 3656 isUTF8Match = UTF8Matcher->matches(status); 3657 } 3658 } else if (useLookingAtFunc) { 3659 isMatch = matcher->lookingAt(status); 3660 if (UTF8Matcher != NULL) { 3661 isUTF8Match = UTF8Matcher->lookingAt(status); 3662 } 3663 } else { 3664 isMatch = matcher->find(); 3665 if (UTF8Matcher != NULL) { 3666 isUTF8Match = UTF8Matcher->find(); 3667 } 3668 } 3669 } 3670 matcher->setTrace(FALSE); 3671 if (U_FAILURE(status)) { 3672 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status)); 3673 } 3674 3675 // 3676 // Match up the groups from the find() with the groups from the tags 3677 // 3678 3679 // number of tags should match number of groups from find operation. 3680 // matcher->groupCount does not include group 0, the entire match, hence the +1. 3681 // G option in test means that capture group data is not available in the 3682 // expected results, so the check needs to be suppressed. 3683 if (isMatch == FALSE && groupStarts.size() != 0) { 3684 dataerrln("Error at line %d: Match expected, but none found.", line); 3685 failed = TRUE; 3686 goto cleanupAndReturn; 3687 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) { 3688 errln("Error at line %d: Match expected, but none found. (UTF8)", line); 3689 failed = TRUE; 3690 goto cleanupAndReturn; 3691 } 3692 3693 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) { 3694 // Only check for match / no match. Don't check capture groups. 3695 if (isMatch && groupStarts.size() == 0) { 3696 errln("Error at line %d: No match expected, but one found.", line); 3697 failed = TRUE; 3698 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) { 3699 errln("Error at line %d: No match expected, but one found. 
(UTF8)", line); 3700 failed = TRUE; 3701 } 3702 goto cleanupAndReturn; 3703 } 3704 3705 REGEX_CHECK_STATUS_L(line); 3706 for (i=0; i<=matcher->groupCount(); i++) { 3707 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i)); 3708 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i)); 3709 if (matcher->start(i, status) != expectedStart) { 3710 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d", 3711 line, i, expectedStart, matcher->start(i, status)); 3712 failed = TRUE; 3713 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3714 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) { 3715 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)", 3716 line, i, expectedStartUTF8, UTF8Matcher->start(i, status)); 3717 failed = TRUE; 3718 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now. 3719 } 3720 3721 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i)); 3722 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i)); 3723 if (matcher->end(i, status) != expectedEnd) { 3724 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d", 3725 line, i, expectedEnd, matcher->end(i, status)); 3726 failed = TRUE; 3727 // Error on end position; keep going; real error is probably yet to come as group 3728 // end positions work from end of the input data towards the front. 3729 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) { 3730 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)", 3731 line, i, expectedEndUTF8, UTF8Matcher->end(i, status)); 3732 failed = TRUE; 3733 // Error on end position; keep going; real error is probably yet to come as group 3734 // end positions work from end of the input data towards the front. 
3735 } 3736 } 3737 if ( matcher->groupCount()+1 < groupStarts.size()) { 3738 errln("Error at line %d: Expected %d capture groups, found %d.", 3739 line, groupStarts.size()-1, matcher->groupCount()); 3740 failed = TRUE; 3741 } 3742 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) { 3743 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)", 3744 line, groupStarts.size()-1, UTF8Matcher->groupCount()); 3745 failed = TRUE; 3746 } 3747 3748 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3749 matcher->requireEnd() == TRUE) { 3750 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line); 3751 failed = TRUE; 3752 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false 3753 UTF8Matcher->requireEnd() == TRUE) { 3754 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line); 3755 failed = TRUE; 3756 } 3757 3758 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true 3759 matcher->requireEnd() == FALSE) { 3760 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line); 3761 failed = TRUE; 3762 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false 3763 UTF8Matcher->requireEnd() == FALSE) { 3764 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line); 3765 failed = TRUE; 3766 } 3767 3768 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3769 matcher->hitEnd() == TRUE) { 3770 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line); 3771 failed = TRUE; 3772 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false 3773 UTF8Matcher->hitEnd() == TRUE) { 3774 errln("Error at line %d: hitEnd() returned TRUE. 
Expected FALSE (UTF8)", line);
        failed = TRUE;
    }

    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
        matcher->hitEnd() == FALSE) {
        errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
        failed = TRUE;
    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
               UTF8Matcher->hitEnd() == FALSE) {
        errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
        failed = TRUE;
    }


cleanupAndReturn:
    if (failed) {
        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
            +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
        // callerPattern->dump();
    }
    delete parseMatcher;
    delete parsePat;
    delete UTF8Matcher;
    delete UTF8Pattern;
    delete matcher;
    delete callerPattern;

    utext_close(&inputText);
    delete[] inputChars;
    utext_close(&patternText);
    delete[] patternChars;
    ucnv_close(UTF8Converter);
}




//---------------------------------------------------------------------------
//
//      Errors     Check for error handling in patterns.
//
//                 Each REGEX_ERR line below names a malformed pattern together
//                 with the error code that compiling it is expected to report.
//                 NOTE(review): the two integers look like the expected error
//                 line and column (compare "abc\ndef(*2)", 2, 5) - confirm
//                 against the REGEX_ERR macro definition.
//
//---------------------------------------------------------------------------
void RegexTest::Errors() {
    // \escape sequences that aren't implemented yet.
    //   (Disabled: kept only as a reminder of the case.)
    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);

    // Extra close paren
    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

    // Look-ahead, Look-behind
    //  TODO:  add tests for unbounded length look-behinds.
    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct

    // Attempt to use non-default flags
    //   Compiling with this flag combination is expected to fail with
    //   U_REGEX_UNIMPLEMENTED.  NOTE(review): presumably UREGEX_CANON_EQ is
    //   the unsupported flag here - confirm against the regex implementation.
    {
        UParseError   pe;
        UErrorCode    status = U_ZERO_ERROR;
        int32_t       flags  = UREGEX_CANON_EQ |
                               UREGEX_COMMENTS | UREGEX_DOTALL |
                               UREGEX_MULTILINE;
        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
        delete pat1;    // released unconditionally, whatever compile() returned
    }


    // Quantifiers are allowed only after something that can be quantified.
3849 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX); 3850 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX); 3851 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX); 3852 3853 // Mal-formed {min,max} quantifiers 3854 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL); 3855 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN); 3856 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL); 3857 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL); 3858 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL); 3859 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG); 3860 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan 3861 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format 3862 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG); 3863 3864 // Ticket 5389 3865 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX); 3866 3867 // Invalid Back Reference \0 3868 // For ICU 3.8 and earlier 3869 // For ICU versions newer than 3.8, \0 introduces an octal escape. 3870 // 3871 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE); 3872 3873} 3874 3875 3876//------------------------------------------------------------------------------- 3877// 3878// Read a text data file, convert it to UChars, and return the data 3879// in one big UChar * buffer, which the caller must delete. 3880// 3881//-------------------------------------------------------------------------------- 3882UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen, 3883 const char *defEncoding, UErrorCode &status) { 3884 UChar *retPtr = NULL; 3885 char *fileBuf = NULL; 3886 UConverter* conv = NULL; 3887 FILE *f = NULL; 3888 3889 ulen = 0; 3890 if (U_FAILURE(status)) { 3891 return retPtr; 3892 } 3893 3894 // 3895 // Open the file. 
3896 // 3897 f = fopen(fileName, "rb"); 3898 if (f == 0) { 3899 dataerrln("Error opening test data file %s\n", fileName); 3900 status = U_FILE_ACCESS_ERROR; 3901 return NULL; 3902 } 3903 // 3904 // Read it in 3905 // 3906 int32_t fileSize; 3907 int32_t amt_read; 3908 3909 fseek( f, 0, SEEK_END); 3910 fileSize = ftell(f); 3911 fileBuf = new char[fileSize]; 3912 fseek(f, 0, SEEK_SET); 3913 amt_read = fread(fileBuf, 1, fileSize, f); 3914 if (amt_read != fileSize || fileSize <= 0) { 3915 errln("Error reading test data file."); 3916 goto cleanUpAndReturn; 3917 } 3918 3919 // 3920 // Look for a Unicode Signature (BOM) on the data just read 3921 // 3922 int32_t signatureLength; 3923 const char * fileBufC; 3924 const char* encoding; 3925 3926 fileBufC = fileBuf; 3927 encoding = ucnv_detectUnicodeSignature( 3928 fileBuf, fileSize, &signatureLength, &status); 3929 if(encoding!=NULL ){ 3930 fileBufC += signatureLength; 3931 fileSize -= signatureLength; 3932 } else { 3933 encoding = defEncoding; 3934 if (strcmp(encoding, "utf-8") == 0) { 3935 errln("file %s is missing its BOM", fileName); 3936 } 3937 } 3938 3939 // 3940 // Open a converter to take the rule file to UTF-16 3941 // 3942 conv = ucnv_open(encoding, &status); 3943 if (U_FAILURE(status)) { 3944 goto cleanUpAndReturn; 3945 } 3946 3947 // 3948 // Convert the rules to UChar. 3949 // Preflight first to determine required buffer size. 3950 // 3951 ulen = ucnv_toUChars(conv, 3952 NULL, // dest, 3953 0, // destCapacity, 3954 fileBufC, 3955 fileSize, 3956 &status); 3957 if (status == U_BUFFER_OVERFLOW_ERROR) { 3958 // Buffer Overflow is expected from the preflight operation. 
3959 status = U_ZERO_ERROR; 3960 3961 retPtr = new UChar[ulen+1]; 3962 ucnv_toUChars(conv, 3963 retPtr, // dest, 3964 ulen+1, 3965 fileBufC, 3966 fileSize, 3967 &status); 3968 } 3969 3970cleanUpAndReturn: 3971 fclose(f); 3972 delete[] fileBuf; 3973 ucnv_close(conv); 3974 if (U_FAILURE(status)) { 3975 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 3976 delete []retPtr; 3977 retPtr = 0; 3978 ulen = 0; 3979 }; 3980 return retPtr; 3981} 3982 3983 3984//------------------------------------------------------------------------------- 3985// 3986// PerlTests - Run Perl's regular expression tests 3987// The input file for this test is re_tests, the standard regular 3988// expression test data distributed with the Perl source code. 3989// 3990// Here is Perl's description of the test data file: 3991// 3992// # The tests are in a separate file 't/op/re_tests'. 3993// # Each line in that file is a separate test. 3994// # There are five columns, separated by tabs. 3995// # 3996// # Column 1 contains the pattern, optionally enclosed in C<''>. 3997// # Modifiers can be put after the closing C<'>. 3998// # 3999// # Column 2 contains the string to be matched. 4000// # 4001// # Column 3 contains the expected result: 4002// # y expect a match 4003// # n expect no match 4004// # c expect an error 4005// # B test exposes a known bug in Perl, should be skipped 4006// # b test exposes a known bug in Perl, should be skipped if noamp 4007// # 4008// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>. 4009// # 4010// # Column 4 contains a string, usually C<$&>. 4011// # 4012// # Column 5 contains the expected result of double-quote 4013// # interpolating that string after the match, or start of error message. 4014// # 4015// # Column 6, if present, contains a reason why the test is skipped. 4016// # This is printed with "skipped", for harness to pick up. 4017// # 4018// # \n in the tests are interpolated, as are variables of the form ${\w+}. 
4019// # 4020// # If you want to add a regular expression test that can't be expressed 4021// # in this format, don't add it here: put it in op/pat.t instead. 4022// 4023// For ICU, if field 3 contains an 'i', the test will be skipped. 4024// The test exposes is some known incompatibility between ICU and Perl regexps. 4025// (The i is in addition to whatever was there before.) 4026// 4027//------------------------------------------------------------------------------- 4028void RegexTest::PerlTests() { 4029 char tdd[2048]; 4030 const char *srcPath; 4031 UErrorCode status = U_ZERO_ERROR; 4032 UParseError pe; 4033 4034 // 4035 // Open and read the test data file. 4036 // 4037 srcPath=getPath(tdd, "re_tests.txt"); 4038 if(srcPath==NULL) { 4039 return; /* something went wrong, error already output */ 4040 } 4041 4042 int32_t len; 4043 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4044 if (U_FAILURE(status)) { 4045 return; /* something went wrong, error already output */ 4046 } 4047 4048 // 4049 // Put the test data into a UnicodeString 4050 // 4051 UnicodeString testDataString(FALSE, testData, len); 4052 4053 // 4054 // Regex to break the input file into lines, and strip the new lines. 4055 // One line per match, capture group one is the desired data. 4056 // 4057 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4058 if (U_FAILURE(status)) { 4059 dataerrln("RegexPattern::compile() error"); 4060 return; 4061 } 4062 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4063 4064 // 4065 // Regex to split a test file line into fields. 4066 // There are six fields, separated by tabs. 4067 // 4068 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4069 4070 // 4071 // Regex to identify test patterns with flag settings, and to separate them. 
4072 // Test patterns with flags look like 'pattern'i 4073 // Test patterns without flags are not quoted: pattern 4074 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4075 // 4076 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4077 RegexMatcher* flagMat = flagPat->matcher(status); 4078 4079 // 4080 // The Perl tests reference several perl-isms, which are evaluated/substituted 4081 // in the test data. Not being perl, this must be done explicitly. Here 4082 // are string constants and REs for these constructs. 4083 // 4084 UnicodeString nulnulSrc("${nulnul}"); 4085 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4086 nulnul = nulnul.unescape(); 4087 4088 UnicodeString ffffSrc("${ffff}"); 4089 UnicodeString ffff("\\uffff", -1, US_INV); 4090 ffff = ffff.unescape(); 4091 4092 // regexp for $-[0], $+[2], etc. 4093 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4094 RegexMatcher *groupsMat = groupsPat->matcher(status); 4095 4096 // regexp for $0, $1, $2, etc. 4097 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4098 RegexMatcher *cgMat = cgPat->matcher(status); 4099 4100 4101 // 4102 // Main Loop for the Perl Tests, runs once per line from the 4103 // test data file. 4104 // 4105 int32_t lineNum = 0; 4106 int32_t skippedUnimplementedCount = 0; 4107 while (lineMat->find()) { 4108 lineNum++; 4109 4110 // 4111 // Get a line, break it into its fields, do the Perl 4112 // variable substitutions. 
4113 // 4114 UnicodeString line = lineMat->group(1, status); 4115 UnicodeString fields[7]; 4116 fieldPat->split(line, fields, 7, status); 4117 4118 flagMat->reset(fields[0]); 4119 flagMat->matches(status); 4120 UnicodeString pattern = flagMat->group(2, status); 4121 pattern.findAndReplace("${bang}", "!"); 4122 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4123 pattern.findAndReplace(ffffSrc, ffff); 4124 4125 // 4126 // Identify patterns that include match flag settings, 4127 // split off the flags, remove the extra quotes. 4128 // 4129 UnicodeString flagStr = flagMat->group(3, status); 4130 if (U_FAILURE(status)) { 4131 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4132 return; 4133 } 4134 int32_t flags = 0; 4135 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4136 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4137 const UChar UChar_m = 0x6d; 4138 const UChar UChar_x = 0x78; 4139 const UChar UChar_y = 0x79; 4140 if (flagStr.indexOf(UChar_i) != -1) { 4141 flags |= UREGEX_CASE_INSENSITIVE; 4142 } 4143 if (flagStr.indexOf(UChar_m) != -1) { 4144 flags |= UREGEX_MULTILINE; 4145 } 4146 if (flagStr.indexOf(UChar_x) != -1) { 4147 flags |= UREGEX_COMMENTS; 4148 } 4149 4150 // 4151 // Compile the test pattern. 4152 // 4153 status = U_ZERO_ERROR; 4154 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status); 4155 if (status == U_REGEX_UNIMPLEMENTED) { 4156 // 4157 // Test of a feature that is planned for ICU, but not yet implemented. 4158 // skip the test. 4159 skippedUnimplementedCount++; 4160 delete testPat; 4161 status = U_ZERO_ERROR; 4162 continue; 4163 } 4164 4165 if (U_FAILURE(status)) { 4166 // Some tests are supposed to generate errors. 4167 // Only report an error for tests that are supposed to succeed. 
4168 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4169 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4170 { 4171 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4172 } 4173 status = U_ZERO_ERROR; 4174 delete testPat; 4175 continue; 4176 } 4177 4178 if (fields[2].indexOf(UChar_i) >= 0) { 4179 // ICU should skip this test. 4180 delete testPat; 4181 continue; 4182 } 4183 4184 if (fields[2].indexOf(UChar_c) >= 0) { 4185 // This pattern should have caused a compilation error, but didn't/ 4186 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4187 delete testPat; 4188 continue; 4189 } 4190 4191 // 4192 // replace the Perl variables that appear in some of the 4193 // match data strings. 4194 // 4195 UnicodeString matchString = fields[1]; 4196 matchString.findAndReplace(nulnulSrc, nulnul); 4197 matchString.findAndReplace(ffffSrc, ffff); 4198 4199 // Replace any \n in the match string with an actual new-line char. 4200 // Don't do full unescape, as this unescapes more than Perl does, which 4201 // causes other spurious failures in the tests. 4202 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4203 4204 4205 4206 // 4207 // Run the test, check for expected match/don't match result. 4208 // 4209 RegexMatcher *testMat = testPat->matcher(matchString, status); 4210 UBool found = testMat->find(); 4211 UBool expected = FALSE; 4212 if (fields[2].indexOf(UChar_y) >=0) { 4213 expected = TRUE; 4214 } 4215 if (expected != found) { 4216 errln("line %d: Expected %smatch, got %smatch", 4217 lineNum, expected?"":"no ", found?"":"no " ); 4218 continue; 4219 } 4220 4221 // Don't try to check expected results if there is no match. 
4222 // (Some have stuff in the expected fields) 4223 if (!found) { 4224 delete testMat; 4225 delete testPat; 4226 continue; 4227 } 4228 4229 // 4230 // Interpret the Perl expression from the fourth field of the data file, 4231 // building up an ICU string from the results of the ICU match. 4232 // The Perl expression will contain references to the results of 4233 // a regex match, including the matched string, capture group strings, 4234 // group starting and ending indicies, etc. 4235 // 4236 UnicodeString resultString; 4237 UnicodeString perlExpr = fields[3]; 4238#if SUPPORT_MUTATING_INPUT_STRING 4239 groupsMat->reset(perlExpr); 4240 cgMat->reset(perlExpr); 4241#endif 4242 4243 while (perlExpr.length() > 0) { 4244#if !SUPPORT_MUTATING_INPUT_STRING 4245 // Perferred usage. Reset after any modification to input string. 4246 groupsMat->reset(perlExpr); 4247 cgMat->reset(perlExpr); 4248#endif 4249 4250 if (perlExpr.startsWith("$&")) { 4251 resultString.append(testMat->group(status)); 4252 perlExpr.remove(0, 2); 4253 } 4254 4255 else if (groupsMat->lookingAt(status)) { 4256 // $-[0] $+[2] etc. 4257 UnicodeString digitString = groupsMat->group(2, status); 4258 int32_t t = 0; 4259 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4260 UnicodeString plusOrMinus = groupsMat->group(1, status); 4261 int32_t matchPosition; 4262 if (plusOrMinus.compare("+") == 0) { 4263 matchPosition = testMat->end(groupNum, status); 4264 } else { 4265 matchPosition = testMat->start(groupNum, status); 4266 } 4267 if (matchPosition != -1) { 4268 ICU_Utility::appendNumber(resultString, matchPosition); 4269 } 4270 perlExpr.remove(0, groupsMat->end(status)); 4271 } 4272 4273 else if (cgMat->lookingAt(status)) { 4274 // $1, $2, $3, etc. 
4275 UnicodeString digitString = cgMat->group(1, status); 4276 int32_t t = 0; 4277 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4278 if (U_SUCCESS(status)) { 4279 resultString.append(testMat->group(groupNum, status)); 4280 status = U_ZERO_ERROR; 4281 } 4282 perlExpr.remove(0, cgMat->end(status)); 4283 } 4284 4285 else if (perlExpr.startsWith("@-")) { 4286 int32_t i; 4287 for (i=0; i<=testMat->groupCount(); i++) { 4288 if (i>0) { 4289 resultString.append(" "); 4290 } 4291 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4292 } 4293 perlExpr.remove(0, 2); 4294 } 4295 4296 else if (perlExpr.startsWith("@+")) { 4297 int32_t i; 4298 for (i=0; i<=testMat->groupCount(); i++) { 4299 if (i>0) { 4300 resultString.append(" "); 4301 } 4302 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4303 } 4304 perlExpr.remove(0, 2); 4305 } 4306 4307 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal. 4308 // or as an escaped sequence (e.g. \n) 4309 if (perlExpr.length() > 1) { 4310 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4311 } 4312 UChar c = perlExpr.charAt(0); 4313 switch (c) { 4314 case 'n': c = '\n'; break; 4315 // add any other escape sequences that show up in the test expected results. 4316 } 4317 resultString.append(c); 4318 perlExpr.remove(0, 1); 4319 } 4320 4321 else { 4322 // Any characters from the perl expression that we don't explicitly 4323 // recognize before here are assumed to be literals and copied 4324 // as-is to the expected results. 
4325 resultString.append(perlExpr.charAt(0)); 4326 perlExpr.remove(0, 1); 4327 } 4328 4329 if (U_FAILURE(status)) { 4330 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4331 break; 4332 } 4333 } 4334 4335 // 4336 // Expected Results Compare 4337 // 4338 UnicodeString expectedS(fields[4]); 4339 expectedS.findAndReplace(nulnulSrc, nulnul); 4340 expectedS.findAndReplace(ffffSrc, ffff); 4341 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4342 4343 4344 if (expectedS.compare(resultString) != 0) { 4345 err("Line %d: Incorrect perl expression results.", lineNum); 4346 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4347 } 4348 4349 delete testMat; 4350 delete testPat; 4351 } 4352 4353 // 4354 // All done. Clean up allocated stuff. 4355 // 4356 delete cgMat; 4357 delete cgPat; 4358 4359 delete groupsMat; 4360 delete groupsPat; 4361 4362 delete flagMat; 4363 delete flagPat; 4364 4365 delete lineMat; 4366 delete linePat; 4367 4368 delete fieldPat; 4369 delete [] testData; 4370 4371 4372 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount); 4373 4374} 4375 4376 4377//------------------------------------------------------------------------------- 4378// 4379// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts 4380// (instead of using UnicodeStrings) to test the alternate engine. 4381// The input file for this test is re_tests, the standard regular 4382// expression test data distributed with the Perl source code. 4383// See PerlTests() for more information. 
4384// 4385//------------------------------------------------------------------------------- 4386void RegexTest::PerlTestsUTF8() { 4387 char tdd[2048]; 4388 const char *srcPath; 4389 UErrorCode status = U_ZERO_ERROR; 4390 UParseError pe; 4391 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status)); 4392 UText patternText = UTEXT_INITIALIZER; 4393 char *patternChars = NULL; 4394 int32_t patternLength; 4395 int32_t patternCapacity = 0; 4396 UText inputText = UTEXT_INITIALIZER; 4397 char *inputChars = NULL; 4398 int32_t inputLength; 4399 int32_t inputCapacity = 0; 4400 4401 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); 4402 4403 // 4404 // Open and read the test data file. 4405 // 4406 srcPath=getPath(tdd, "re_tests.txt"); 4407 if(srcPath==NULL) { 4408 return; /* something went wrong, error already output */ 4409 } 4410 4411 int32_t len; 4412 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status); 4413 if (U_FAILURE(status)) { 4414 return; /* something went wrong, error already output */ 4415 } 4416 4417 // 4418 // Put the test data into a UnicodeString 4419 // 4420 UnicodeString testDataString(FALSE, testData, len); 4421 4422 // 4423 // Regex to break the input file into lines, and strip the new lines. 4424 // One line per match, capture group one is the desired data. 4425 // 4426 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status); 4427 if (U_FAILURE(status)) { 4428 dataerrln("RegexPattern::compile() error"); 4429 return; 4430 } 4431 RegexMatcher* lineMat = linePat->matcher(testDataString, status); 4432 4433 // 4434 // Regex to split a test file line into fields. 4435 // There are six fields, separated by tabs. 4436 // 4437 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status); 4438 4439 // 4440 // Regex to identify test patterns with flag settings, and to separate them. 
4441 // Test patterns with flags look like 'pattern'i 4442 // Test patterns without flags are not quoted: pattern 4443 // Coming out, capture group 2 is the pattern, capture group 3 is the flags. 4444 // 4445 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status); 4446 RegexMatcher* flagMat = flagPat->matcher(status); 4447 4448 // 4449 // The Perl tests reference several perl-isms, which are evaluated/substituted 4450 // in the test data. Not being perl, this must be done explicitly. Here 4451 // are string constants and REs for these constructs. 4452 // 4453 UnicodeString nulnulSrc("${nulnul}"); 4454 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV); 4455 nulnul = nulnul.unescape(); 4456 4457 UnicodeString ffffSrc("${ffff}"); 4458 UnicodeString ffff("\\uffff", -1, US_INV); 4459 ffff = ffff.unescape(); 4460 4461 // regexp for $-[0], $+[2], etc. 4462 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status); 4463 RegexMatcher *groupsMat = groupsPat->matcher(status); 4464 4465 // regexp for $0, $1, $2, etc. 4466 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status); 4467 RegexMatcher *cgMat = cgPat->matcher(status); 4468 4469 4470 // 4471 // Main Loop for the Perl Tests, runs once per line from the 4472 // test data file. 4473 // 4474 int32_t lineNum = 0; 4475 int32_t skippedUnimplementedCount = 0; 4476 while (lineMat->find()) { 4477 lineNum++; 4478 4479 // 4480 // Get a line, break it into its fields, do the Perl 4481 // variable substitutions. 
4482 // 4483 UnicodeString line = lineMat->group(1, status); 4484 UnicodeString fields[7]; 4485 fieldPat->split(line, fields, 7, status); 4486 4487 flagMat->reset(fields[0]); 4488 flagMat->matches(status); 4489 UnicodeString pattern = flagMat->group(2, status); 4490 pattern.findAndReplace("${bang}", "!"); 4491 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000")); 4492 pattern.findAndReplace(ffffSrc, ffff); 4493 4494 // 4495 // Identify patterns that include match flag settings, 4496 // split off the flags, remove the extra quotes. 4497 // 4498 UnicodeString flagStr = flagMat->group(3, status); 4499 if (U_FAILURE(status)) { 4500 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 4501 return; 4502 } 4503 int32_t flags = 0; 4504 const UChar UChar_c = 0x63; // Char constants for the flag letters. 4505 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C) 4506 const UChar UChar_m = 0x6d; 4507 const UChar UChar_x = 0x78; 4508 const UChar UChar_y = 0x79; 4509 if (flagStr.indexOf(UChar_i) != -1) { 4510 flags |= UREGEX_CASE_INSENSITIVE; 4511 } 4512 if (flagStr.indexOf(UChar_m) != -1) { 4513 flags |= UREGEX_MULTILINE; 4514 } 4515 if (flagStr.indexOf(UChar_x) != -1) { 4516 flags |= UREGEX_COMMENTS; 4517 } 4518 4519 // 4520 // Put the pattern in a UTF-8 UText 4521 // 4522 status = U_ZERO_ERROR; 4523 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4524 if (status == U_BUFFER_OVERFLOW_ERROR) { 4525 status = U_ZERO_ERROR; 4526 delete[] patternChars; 4527 patternCapacity = patternLength + 1; 4528 patternChars = new char[patternCapacity]; 4529 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status); 4530 } 4531 utext_openUTF8(&patternText, patternChars, patternLength, &status); 4532 4533 // 4534 // Compile the test pattern. 
4535 // 4536 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status); 4537 if (status == U_REGEX_UNIMPLEMENTED) { 4538 // 4539 // Test of a feature that is planned for ICU, but not yet implemented. 4540 // skip the test. 4541 skippedUnimplementedCount++; 4542 delete testPat; 4543 status = U_ZERO_ERROR; 4544 continue; 4545 } 4546 4547 if (U_FAILURE(status)) { 4548 // Some tests are supposed to generate errors. 4549 // Only report an error for tests that are supposed to succeed. 4550 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND 4551 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility 4552 { 4553 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status)); 4554 } 4555 status = U_ZERO_ERROR; 4556 delete testPat; 4557 continue; 4558 } 4559 4560 if (fields[2].indexOf(UChar_i) >= 0) { 4561 // ICU should skip this test. 4562 delete testPat; 4563 continue; 4564 } 4565 4566 if (fields[2].indexOf(UChar_c) >= 0) { 4567 // This pattern should have caused a compilation error, but didn't/ 4568 errln("line %d: Expected a pattern compile error, got success.", lineNum); 4569 delete testPat; 4570 continue; 4571 } 4572 4573 4574 // 4575 // replace the Perl variables that appear in some of the 4576 // match data strings. 4577 // 4578 UnicodeString matchString = fields[1]; 4579 matchString.findAndReplace(nulnulSrc, nulnul); 4580 matchString.findAndReplace(ffffSrc, ffff); 4581 4582 // Replace any \n in the match string with an actual new-line char. 4583 // Don't do full unescape, as this unescapes more than Perl does, which 4584 // causes other spurious failures in the tests. 
4585 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4586 4587 // 4588 // Put the input in a UTF-8 UText 4589 // 4590 status = U_ZERO_ERROR; 4591 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4592 if (status == U_BUFFER_OVERFLOW_ERROR) { 4593 status = U_ZERO_ERROR; 4594 delete[] inputChars; 4595 inputCapacity = inputLength + 1; 4596 inputChars = new char[inputCapacity]; 4597 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status); 4598 } 4599 utext_openUTF8(&inputText, inputChars, inputLength, &status); 4600 4601 // 4602 // Run the test, check for expected match/don't match result. 4603 // 4604 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText); 4605 UBool found = testMat->find(); 4606 UBool expected = FALSE; 4607 if (fields[2].indexOf(UChar_y) >=0) { 4608 expected = TRUE; 4609 } 4610 if (expected != found) { 4611 errln("line %d: Expected %smatch, got %smatch", 4612 lineNum, expected?"":"no ", found?"":"no " ); 4613 continue; 4614 } 4615 4616 // Don't try to check expected results if there is no match. 4617 // (Some have stuff in the expected fields) 4618 if (!found) { 4619 delete testMat; 4620 delete testPat; 4621 continue; 4622 } 4623 4624 // 4625 // Interpret the Perl expression from the fourth field of the data file, 4626 // building up an ICU string from the results of the ICU match. 4627 // The Perl expression will contain references to the results of 4628 // a regex match, including the matched string, capture group strings, 4629 // group starting and ending indicies, etc. 
4630 // 4631 UnicodeString resultString; 4632 UnicodeString perlExpr = fields[3]; 4633 4634 while (perlExpr.length() > 0) { 4635 groupsMat->reset(perlExpr); 4636 cgMat->reset(perlExpr); 4637 4638 if (perlExpr.startsWith("$&")) { 4639 resultString.append(testMat->group(status)); 4640 perlExpr.remove(0, 2); 4641 } 4642 4643 else if (groupsMat->lookingAt(status)) { 4644 // $-[0] $+[2] etc. 4645 UnicodeString digitString = groupsMat->group(2, status); 4646 int32_t t = 0; 4647 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4648 UnicodeString plusOrMinus = groupsMat->group(1, status); 4649 int32_t matchPosition; 4650 if (plusOrMinus.compare("+") == 0) { 4651 matchPosition = testMat->end(groupNum, status); 4652 } else { 4653 matchPosition = testMat->start(groupNum, status); 4654 } 4655 if (matchPosition != -1) { 4656 ICU_Utility::appendNumber(resultString, matchPosition); 4657 } 4658 perlExpr.remove(0, groupsMat->end(status)); 4659 } 4660 4661 else if (cgMat->lookingAt(status)) { 4662 // $1, $2, $3, etc. 4663 UnicodeString digitString = cgMat->group(1, status); 4664 int32_t t = 0; 4665 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10); 4666 if (U_SUCCESS(status)) { 4667 resultString.append(testMat->group(groupNum, status)); 4668 status = U_ZERO_ERROR; 4669 } 4670 perlExpr.remove(0, cgMat->end(status)); 4671 } 4672 4673 else if (perlExpr.startsWith("@-")) { 4674 int32_t i; 4675 for (i=0; i<=testMat->groupCount(); i++) { 4676 if (i>0) { 4677 resultString.append(" "); 4678 } 4679 ICU_Utility::appendNumber(resultString, testMat->start(i, status)); 4680 } 4681 perlExpr.remove(0, 2); 4682 } 4683 4684 else if (perlExpr.startsWith("@+")) { 4685 int32_t i; 4686 for (i=0; i<=testMat->groupCount(); i++) { 4687 if (i>0) { 4688 resultString.append(" "); 4689 } 4690 ICU_Utility::appendNumber(resultString, testMat->end(i, status)); 4691 } 4692 perlExpr.remove(0, 2); 4693 } 4694 4695 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. 
Take following char as a literal. 4696 // or as an escaped sequence (e.g. \n) 4697 if (perlExpr.length() > 1) { 4698 perlExpr.remove(0, 1); // Remove the '\', but only if not last char. 4699 } 4700 UChar c = perlExpr.charAt(0); 4701 switch (c) { 4702 case 'n': c = '\n'; break; 4703 // add any other escape sequences that show up in the test expected results. 4704 } 4705 resultString.append(c); 4706 perlExpr.remove(0, 1); 4707 } 4708 4709 else { 4710 // Any characters from the perl expression that we don't explicitly 4711 // recognize before here are assumed to be literals and copied 4712 // as-is to the expected results. 4713 resultString.append(perlExpr.charAt(0)); 4714 perlExpr.remove(0, 1); 4715 } 4716 4717 if (U_FAILURE(status)) { 4718 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status)); 4719 break; 4720 } 4721 } 4722 4723 // 4724 // Expected Results Compare 4725 // 4726 UnicodeString expectedS(fields[4]); 4727 expectedS.findAndReplace(nulnulSrc, nulnul); 4728 expectedS.findAndReplace(ffffSrc, ffff); 4729 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n"); 4730 4731 4732 if (expectedS.compare(resultString) != 0) { 4733 err("Line %d: Incorrect perl expression results.", lineNum); 4734 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\""); 4735 } 4736 4737 delete testMat; 4738 delete testPat; 4739 } 4740 4741 // 4742 // All done. Clean up allocated stuff. 
//
    // Release the matcher/pattern pairs, the test data buffer, the two UTexts
    // and the character buffers used by the enclosing test function.
    delete cgMat;
    delete cgPat;

    delete groupsMat;
    delete groupsPat;

    delete flagMat;
    delete flagPat;

    delete lineMat;
    delete linePat;

    delete fieldPat;
    delete [] testData;

    utext_close(&patternText);
    utext_close(&inputText);

    delete [] patternChars;
    delete [] inputChars;


    logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);

}


//--------------------------------------------------------------
//
//  Bug6149   Verify limits to heap expansion for backtrack stack.
//             Use this pattern,
//                 "(a?){1,8000000}"
//             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
//                   This test is likely to be fragile, as further optimizations stop
//                   more cases of pointless looping in the match engine.
//
//             The test matches the pattern against "xyz" and expects matches()
//             to report U_REGEX_STACK_OVERFLOW and to leave the result FALSE.
//
//---------------------------------------------------------------
void RegexTest::Bug6149() {
    UnicodeString pattern("(a?){1,8000000}");
    UnicodeString s("xyz");
    uint32_t flags = 0;
    UErrorCode status = U_ZERO_ERROR;

    RegexMatcher matcher(pattern, s, flags, status);
    UBool result = false;
    // REGEX_ASSERT_FAIL is defined outside this section; presumably it evaluates
    // the expression and then compares 'status' with the given error code.
    REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
    // An aborted match must not be reported as a successful one.
    REGEX_ASSERT(result == FALSE);
 }


//
//  Callbacks()    Test the callback function.
//                 When set, callbacks occur periodically during matching operations,
//                 giving the application code the ability to abort the operation
//                 before its normal completion.
4799// 4800 4801struct callBackContext { 4802 RegexTest *test; 4803 int32_t maxCalls; 4804 int32_t numCalls; 4805 int32_t lastSteps; 4806 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}; 4807}; 4808 4809U_CDECL_BEGIN 4810static UBool U_CALLCONV 4811testCallBackFn(const void *context, int32_t steps) { 4812 callBackContext *info = (callBackContext *)context; 4813 if (info->lastSteps+1 != steps) { 4814 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps); 4815 } 4816 info->lastSteps = steps; 4817 info->numCalls++; 4818 return (info->numCalls < info->maxCalls); 4819} 4820U_CDECL_END 4821 4822void RegexTest::Callbacks() { 4823 { 4824 // Getter returns NULLs if no callback has been set 4825 4826 // The variables that the getter will fill in. 4827 // Init to non-null values so that the action of the getter can be seen. 4828 const void *returnedContext = &returnedContext; 4829 URegexMatchCallback *returnedFn = &testCallBackFn; 4830 4831 UErrorCode status = U_ZERO_ERROR; 4832 RegexMatcher matcher("x", 0, status); 4833 REGEX_CHECK_STATUS; 4834 matcher.getMatchCallback(returnedFn, returnedContext, status); 4835 REGEX_CHECK_STATUS; 4836 REGEX_ASSERT(returnedFn == NULL); 4837 REGEX_ASSERT(returnedContext == NULL); 4838 } 4839 4840 { 4841 // Set and Get work 4842 callBackContext cbInfo = {this, 0, 0, 0}; 4843 const void *returnedContext; 4844 URegexMatchCallback *returnedFn; 4845 UErrorCode status = U_ZERO_ERROR; 4846 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long. 
4847 REGEX_CHECK_STATUS; 4848 matcher.setMatchCallback(testCallBackFn, &cbInfo, status); 4849 REGEX_CHECK_STATUS; 4850 matcher.getMatchCallback(returnedFn, returnedContext, status); 4851 REGEX_CHECK_STATUS; 4852 REGEX_ASSERT(returnedFn == testCallBackFn); 4853 REGEX_ASSERT(returnedContext == &cbInfo); 4854 4855 // A short-running match shouldn't invoke the callback 4856 status = U_ZERO_ERROR; 4857 cbInfo.reset(1); 4858 UnicodeString s = "xxx"; 4859 matcher.reset(s); 4860 REGEX_ASSERT(matcher.matches(status)); 4861 REGEX_CHECK_STATUS; 4862 REGEX_ASSERT(cbInfo.numCalls == 0); 4863 4864 // A medium-length match that runs long enough to invoke the 4865 // callback, but not so long that the callback aborts it. 4866 status = U_ZERO_ERROR; 4867 cbInfo.reset(4); 4868 s = "aaaaaaaaaaaaaaaaaaab"; 4869 matcher.reset(s); 4870 REGEX_ASSERT(matcher.matches(status)==FALSE); 4871 REGEX_CHECK_STATUS; 4872 REGEX_ASSERT(cbInfo.numCalls > 0); 4873 4874 // A longer running match that the callback function will abort. 4875 status = U_ZERO_ERROR; 4876 cbInfo.reset(4); 4877 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4878 matcher.reset(s); 4879 REGEX_ASSERT(matcher.matches(status)==FALSE); 4880 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4881 REGEX_ASSERT(cbInfo.numCalls == 4); 4882 4883 // A longer running find that the callback function will abort. 4884 status = U_ZERO_ERROR; 4885 cbInfo.reset(4); 4886 s = "aaaaaaaaaaaaaaaaaaaaaaab"; 4887 matcher.reset(s); 4888 REGEX_ASSERT(matcher.find(status)==FALSE); 4889 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4890 REGEX_ASSERT(cbInfo.numCalls == 4); 4891 } 4892 4893 4894} 4895 4896 4897// 4898// FindProgressCallbacks() Test the find "progress" callback function. 4899// When set, the find progress callback will be invoked during a find operations 4900// after each return from a match attempt, giving the application the opportunity 4901// to terminate a long-running find operation before it's normal completion. 
4902// 4903 4904struct progressCallBackContext { 4905 RegexTest *test; 4906 int64_t lastIndex; 4907 int32_t maxCalls; 4908 int32_t numCalls; 4909 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}; 4910}; 4911 4912// call-back function for find(). 4913// Return TRUE to continue the find(). 4914// Return FALSE to stop the find(). 4915U_CDECL_BEGIN 4916static UBool U_CALLCONV 4917testProgressCallBackFn(const void *context, int64_t matchIndex) { 4918 progressCallBackContext *info = (progressCallBackContext *)context; 4919 info->numCalls++; 4920 info->lastIndex = matchIndex; 4921// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls); 4922 return (info->numCalls < info->maxCalls); 4923} 4924U_CDECL_END 4925 4926void RegexTest::FindProgressCallbacks() { 4927 { 4928 // Getter returns NULLs if no callback has been set 4929 4930 // The variables that the getter will fill in. 4931 // Init to non-null values so that the action of the getter can be seen. 
4932 const void *returnedContext = &returnedContext; 4933 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn; 4934 4935 UErrorCode status = U_ZERO_ERROR; 4936 RegexMatcher matcher("x", 0, status); 4937 REGEX_CHECK_STATUS; 4938 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4939 REGEX_CHECK_STATUS; 4940 REGEX_ASSERT(returnedFn == NULL); 4941 REGEX_ASSERT(returnedContext == NULL); 4942 } 4943 4944 { 4945 // Set and Get work 4946 progressCallBackContext cbInfo = {this, 0, 0, 0}; 4947 const void *returnedContext; 4948 URegexFindProgressCallback *returnedFn; 4949 UErrorCode status = U_ZERO_ERROR; 4950 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status); 4951 REGEX_CHECK_STATUS; 4952 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status); 4953 REGEX_CHECK_STATUS; 4954 matcher.getFindProgressCallback(returnedFn, returnedContext, status); 4955 REGEX_CHECK_STATUS; 4956 REGEX_ASSERT(returnedFn == testProgressCallBackFn); 4957 REGEX_ASSERT(returnedContext == &cbInfo); 4958 4959 // A find that matches on the initial position does NOT invoke the callback. 4960 status = U_ZERO_ERROR; 4961 cbInfo.reset(100); 4962 UnicodeString s = "aaxxx"; 4963 matcher.reset(s); 4964#if 0 4965 matcher.setTrace(TRUE); 4966#endif 4967 REGEX_ASSERT(matcher.find(0, status)); 4968 REGEX_CHECK_STATUS; 4969 REGEX_ASSERT(cbInfo.numCalls == 0); 4970 4971 // A medium running find() that causes matcher.find() to invoke our callback for each index, 4972 // but not so many times that we interrupt the operation. 
4973 status = U_ZERO_ERROR; 4974 s = "aaaaaaaaaaaaaaaaaaab"; 4975 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string 4976 matcher.reset(s); 4977 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4978 REGEX_CHECK_STATUS; 4979 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25); 4980 4981 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point. 4982 status = U_ZERO_ERROR; 4983 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab"; 4984 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string 4985 matcher.reset(s1); 4986 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4987 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4988 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5); 4989 4990 // Now a match that will succeed, but after an interruption 4991 status = U_ZERO_ERROR; 4992 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx"; 4993 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string 4994 matcher.reset(s2); 4995 REGEX_ASSERT(matcher.find(0, status)==FALSE); 4996 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER); 4997 // Now retry the match from where left off 4998 cbInfo.maxCalls = 100; // No callback limit 4999 status = U_ZERO_ERROR; 5000 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status)); 5001 REGEX_CHECK_STATUS; 5002 } 5003 5004 5005} 5006 5007 5008//--------------------------------------------------------------------------- 5009// 5010// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable 5011// UTexts. The pure-C implementation of UText 5012// has no mutable backing stores, but we can 5013// use UnicodeString here to test the functionality. 
5014// 5015//--------------------------------------------------------------------------- 5016void RegexTest::PreAllocatedUTextCAPI () { 5017 UErrorCode status = U_ZERO_ERROR; 5018 URegularExpression *re; 5019 UText patternText = UTEXT_INITIALIZER; 5020 UnicodeString buffer; 5021 UText bufferText = UTEXT_INITIALIZER; 5022 5023 utext_openUnicodeString(&bufferText, &buffer, &status); 5024 5025 /* 5026 * getText() and getUText() 5027 */ 5028 { 5029 UText text1 = UTEXT_INITIALIZER; 5030 UText text2 = UTEXT_INITIALIZER; 5031 UChar text2Chars[20]; 5032 UText *resultText; 5033 5034 status = U_ZERO_ERROR; 5035 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status); 5036 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status); 5037 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2); 5038 utext_openUChars(&text2, text2Chars, -1, &status); 5039 5040 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status); 5041 re = uregex_openUText(&patternText, 0, NULL, &status); 5042 5043 /* First set a UText */ 5044 uregex_setUText(re, &text1, &status); 5045 resultText = uregex_getUText(re, &bufferText, &status); 5046 REGEX_CHECK_STATUS; 5047 REGEX_ASSERT(resultText == &bufferText); 5048 utext_setNativeIndex(resultText, 0); 5049 utext_setNativeIndex(&text1, 0); 5050 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 5051 5052 resultText = uregex_getUText(re, &bufferText, &status); 5053 REGEX_CHECK_STATUS; 5054 REGEX_ASSERT(resultText == &bufferText); 5055 utext_setNativeIndex(resultText, 0); 5056 utext_setNativeIndex(&text1, 0); 5057 REGEX_ASSERT(testUTextEqual(resultText, &text1)); 5058 5059 /* Then set a UChar * */ 5060 uregex_setText(re, text2Chars, 7, &status); 5061 resultText = uregex_getUText(re, &bufferText, &status); 5062 REGEX_CHECK_STATUS; 5063 REGEX_ASSERT(resultText == &bufferText); 5064 utext_setNativeIndex(resultText, 0); 5065 utext_setNativeIndex(&text2, 0); 5066 REGEX_ASSERT(testUTextEqual(resultText, &text2)); 5067 5068 uregex_close(re); 5069 
utext_close(&text1); 5070 utext_close(&text2); 5071 } 5072 5073 /* 5074 * group() 5075 */ 5076 { 5077 UChar text1[80]; 5078 UText *actual; 5079 UBool result; 5080 int64_t length = 0; 5081 5082 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1)); 5083 // 012345678901234567890123456789012345678901234567 5084 // 0 1 2 3 4 5085 5086 status = U_ZERO_ERROR; 5087 re = uregex_openC("abc(.*?)def", 0, NULL, &status); 5088 REGEX_CHECK_STATUS; 5089 5090 uregex_setText(re, text1, -1, &status); 5091 result = uregex_find(re, 0, &status); 5092 REGEX_ASSERT(result==TRUE); 5093 5094 /* Capture Group 0, the full match. Should succeed. "abc interior def" */ 5095 status = U_ZERO_ERROR; 5096 actual = uregex_groupUText(re, 0, &bufferText, &length, &status); 5097 REGEX_CHECK_STATUS; 5098 REGEX_ASSERT(actual == &bufferText); 5099 REGEX_ASSERT(utext_getNativeIndex(actual) == 6); 5100 REGEX_ASSERT(length == 16); 5101 REGEX_ASSERT(utext_nativeLength(actual) == 47); 5102 5103 /* Capture group #1. Should succeed, matching " interior ". */ 5104 status = U_ZERO_ERROR; 5105 actual = uregex_groupUText(re, 1, &bufferText, &length, &status); 5106 REGEX_CHECK_STATUS; 5107 REGEX_ASSERT(actual == &bufferText); 5108 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior " 5109 REGEX_ASSERT(length == 10); 5110 REGEX_ASSERT(utext_nativeLength(actual) == 47); 5111 5112 /* Capture group out of range. Error. 
*/ 5113 status = U_ZERO_ERROR; 5114 actual = uregex_groupUText(re, 2, &bufferText, &length, &status); 5115 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5116 REGEX_ASSERT(actual == &bufferText); 5117 uregex_close(re); 5118 5119 } 5120 5121 /* 5122 * replaceFirst() 5123 */ 5124 { 5125 UChar text1[80]; 5126 UChar text2[80]; 5127 UText replText = UTEXT_INITIALIZER; 5128 UText *result; 5129 status = U_ZERO_ERROR; 5130 utext_openUnicodeString(&bufferText, &buffer, &status); 5131 5132 status = U_ZERO_ERROR; 5133 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1)); 5134 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2); 5135 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5136 5137 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5138 REGEX_CHECK_STATUS; 5139 5140 /* Normal case, with match */ 5141 uregex_setText(re, text1, -1, &status); 5142 REGEX_CHECK_STATUS; 5143 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5144 REGEX_CHECK_STATUS; 5145 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5146 REGEX_CHECK_STATUS; 5147 REGEX_ASSERT(result == &bufferText); 5148 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result); 5149 5150 /* No match. Text should copy to output with no changes. 
*/ 5151 uregex_setText(re, text2, -1, &status); 5152 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5153 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5154 REGEX_CHECK_STATUS; 5155 REGEX_ASSERT(result == &bufferText); 5156 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5157 5158 /* Unicode escapes */ 5159 uregex_setText(re, text1, -1, &status); 5160 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status); 5161 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5162 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status); 5163 REGEX_CHECK_STATUS; 5164 REGEX_ASSERT(result == &bufferText); 5165 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result); 5166 5167 uregex_close(re); 5168 utext_close(&replText); 5169 } 5170 5171 5172 /* 5173 * replaceAll() 5174 */ 5175 { 5176 UChar text1[80]; 5177 UChar text2[80]; 5178 UText replText = UTEXT_INITIALIZER; 5179 UText *result; 5180 5181 status = U_ZERO_ERROR; 5182 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2); 5183 u_uastrncpy(text2, "No match here.", sizeof(text2)/2); 5184 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status); 5185 5186 re = uregex_openC("x(.*?)x", 0, NULL, &status); 5187 REGEX_CHECK_STATUS; 5188 5189 /* Normal case, with match */ 5190 uregex_setText(re, text1, -1, &status); 5191 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5192 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5193 REGEX_CHECK_STATUS; 5194 REGEX_ASSERT(result == &bufferText); 5195 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result); 5196 5197 /* No match. Text should copy to output with no changes. 
*/ 5198 uregex_setText(re, text2, -1, &status); 5199 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status); 5200 result = uregex_replaceAllUText(re, &replText, &bufferText, &status); 5201 REGEX_CHECK_STATUS; 5202 REGEX_ASSERT(result == &bufferText); 5203 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result); 5204 5205 uregex_close(re); 5206 utext_close(&replText); 5207 } 5208 5209 5210 /* 5211 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts, 5212 * so we don't need to test it here. 5213 */ 5214 5215 utext_close(&bufferText); 5216 utext_close(&patternText); 5217} 5218 5219 5220//-------------------------------------------------------------- 5221// 5222// NamedCapture Check basic named capture group functionality 5223// 5224//-------------------------------------------------------------- 5225void RegexTest::NamedCapture() { 5226 UErrorCode status = U_ZERO_ERROR; 5227 RegexPattern *pat = RegexPattern::compile(UnicodeString( 5228 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status); 5229 REGEX_CHECK_STATUS; 5230 int32_t group = pat->groupNumberFromName("five", -1, status); 5231 REGEX_CHECK_STATUS; 5232 REGEX_ASSERT(5 == group); 5233 group = pat->groupNumberFromName("three", -1, status); 5234 REGEX_CHECK_STATUS; 5235 REGEX_ASSERT(3 == group); 5236 5237 status = U_ZERO_ERROR; 5238 group = pat->groupNumberFromName(UnicodeString("six"), status); 5239 REGEX_CHECK_STATUS; 5240 REGEX_ASSERT(6 == group); 5241 5242 status = U_ZERO_ERROR; 5243 group = pat->groupNumberFromName(UnicodeString("nosuch"), status); 5244 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5245 5246 status = U_ZERO_ERROR; 5247 5248 // After copying a pattern, named capture should still work in the copy. 5249 RegexPattern *copiedPat = new RegexPattern(*pat); 5250 REGEX_ASSERT(*copiedPat == *pat); 5251 delete pat; pat = NULL; // Delete original, copy should have no references back to it. 
5252 5253 group = copiedPat->groupNumberFromName("five", -1, status); 5254 REGEX_CHECK_STATUS; 5255 REGEX_ASSERT(5 == group); 5256 group = copiedPat->groupNumberFromName("three", -1, status); 5257 REGEX_CHECK_STATUS; 5258 REGEX_ASSERT(3 == group); 5259 delete copiedPat; 5260 5261 // ReplaceAll with named capture group. 5262 status = U_ZERO_ERROR; 5263 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>"); 5264 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status); 5265 REGEX_CHECK_STATUS; 5266 // m.pattern().dumpPattern(); 5267 UnicodeString replacedText = m->replaceAll("'${mid}'", status); 5268 REGEX_CHECK_STATUS; 5269 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText); 5270 delete m; 5271 5272 // ReplaceAll, allowed capture group numbers. 5273 text = UnicodeString("abcmxyz"); 5274 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status); 5275 REGEX_CHECK_STATUS; 5276 5277 status = U_ZERO_ERROR; 5278 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed. 5279 REGEX_CHECK_STATUS; 5280 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText); 5281 5282 status = U_ZERO_ERROR; 5283 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number. 5284 REGEX_CHECK_STATUS; 5285 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); 5286 5287 status = U_ZERO_ERROR; 5288 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name. 5289 REGEX_CHECK_STATUS; 5290 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText); 5291 5292 status = U_ZERO_ERROR; 5293 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2. 
5294 REGEX_CHECK_STATUS; 5295 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText); 5296 5297 status = U_ZERO_ERROR; 5298 replacedText = m->replaceAll(UnicodeString("<$3>"), status); 5299 REGEX_CHECK_STATUS; 5300 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText); 5301 5302 status = U_ZERO_ERROR; 5303 replacedText = m->replaceAll(UnicodeString("<$4>"), status); 5304 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5305 5306 status = U_ZERO_ERROR; 5307 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0, 5308 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through. 5309 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText); 5310 5311 status = U_ZERO_ERROR; 5312 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits 5313 REGEX_CHECK_STATUS; // that push group num out of range. 5314 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1. 5315 5316 status = U_ZERO_ERROR; 5317 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status); 5318 REGEX_CHECK_STATUS; 5319 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText); 5320 5321 status = U_ZERO_ERROR; 5322 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status); 5323 REGEX_CHECK_STATUS; 5324 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText); 5325 5326 status = U_ZERO_ERROR; 5327 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status); 5328 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5329 5330 status = U_ZERO_ERROR; 5331 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status); 5332 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5333 5334 status = U_ZERO_ERROR; 5335 replacedText = m->replaceAll(UnicodeString("<${one"), status); 5336 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5337 5338 status = U_ZERO_ERROR; 5339 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status); 5340 
REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5341 5342 delete m; 5343 5344 // Repeat the above replaceAll() tests using the plain C API, which 5345 // has a separate implementation internally. 5346 // TODO: factor out the test data. 5347 5348 status = U_ZERO_ERROR; 5349 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status); 5350 REGEX_CHECK_STATUS; 5351 text = UnicodeString("abcmxyz"); 5352 uregex_setText(re, text.getBuffer(), text.length(), &status); 5353 REGEX_CHECK_STATUS; 5354 5355 UChar resultBuf[100]; 5356 int32_t resultLength; 5357 UnicodeString repl; 5358 5359 status = U_ZERO_ERROR; 5360 repl = UnicodeString("<$0>"); 5361 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5362 REGEX_CHECK_STATUS; 5363 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength)); 5364 5365 status = U_ZERO_ERROR; 5366 repl = UnicodeString("<$1>"); 5367 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5368 REGEX_CHECK_STATUS; 5369 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength)); 5370 5371 status = U_ZERO_ERROR; 5372 repl = UnicodeString("<${one}>"); 5373 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5374 REGEX_CHECK_STATUS; 5375 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength)); 5376 5377 status = U_ZERO_ERROR; 5378 repl = UnicodeString("<$2>"); 5379 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5380 REGEX_CHECK_STATUS; 5381 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength)); 5382 5383 status = U_ZERO_ERROR; 5384 repl = UnicodeString("<$3>"); 5385 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), 
&status); 5386 REGEX_CHECK_STATUS; 5387 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength)); 5388 5389 status = U_ZERO_ERROR; 5390 repl = UnicodeString("<$4>"); 5391 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5392 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR); 5393 5394 status = U_ZERO_ERROR; 5395 repl = UnicodeString("<$04>"); 5396 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5397 REGEX_CHECK_STATUS; 5398 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength)); 5399 5400 status = U_ZERO_ERROR; 5401 repl = UnicodeString("<$000016>"); 5402 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5403 REGEX_CHECK_STATUS; 5404 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength)); 5405 5406 status = U_ZERO_ERROR; 5407 repl = UnicodeString("<$3$2$1${one}>"); 5408 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5409 REGEX_CHECK_STATUS; 5410 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength)); 5411 5412 status = U_ZERO_ERROR; 5413 repl = UnicodeString("$3$2$1${one}"); 5414 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5415 REGEX_CHECK_STATUS; 5416 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength)); 5417 5418 status = U_ZERO_ERROR; 5419 repl = UnicodeString("<${noSuchName}>"); 5420 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5421 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5422 5423 status = U_ZERO_ERROR; 5424 repl = UnicodeString("<${invalid-name}>"); 5425 resultLength = uregex_replaceAll(re, 
repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5426 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5427 5428 status = U_ZERO_ERROR; 5429 repl = UnicodeString("<${one"); 5430 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5431 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5432 5433 status = U_ZERO_ERROR; 5434 repl = UnicodeString("$not a capture group"); 5435 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status); 5436 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME); 5437 5438 uregex_close(re); 5439} 5440 5441//-------------------------------------------------------------- 5442// 5443// NamedCaptureLimits Patterns with huge numbers of named capture groups. 5444// The point is not so much what the exact limit is, 5445// but that a largish number doesn't hit bad non-linear performance, 5446// and that exceeding the limit fails cleanly. 5447// 5448//-------------------------------------------------------------- 5449void RegexTest::NamedCaptureLimits() { 5450 if (quick) { 5451 logln("Skipping test. Runs in exhuastive mode only."); 5452 return; 5453 } 5454 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully. 5455 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile. 
5456 char nnbuf[100]; 5457 UnicodeString pattern; 5458 int32_t nn; 5459 5460 for (nn=1; nn<goodLimit; nn++) { 5461 sprintf(nnbuf, "(?<nn%d>)", nn); 5462 pattern.append(UnicodeString(nnbuf, -1, US_INV)); 5463 } 5464 UErrorCode status = U_ZERO_ERROR; 5465 RegexPattern *pat = RegexPattern::compile(pattern, 0, status); 5466 REGEX_CHECK_STATUS; 5467 for (nn=1; nn<goodLimit; nn++) { 5468 sprintf(nnbuf, "nn%d", nn); 5469 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status); 5470 REGEX_ASSERT(nn == groupNum); 5471 if (nn != groupNum) { 5472 break; 5473 } 5474 } 5475 delete pat; 5476 5477 pattern.remove(); 5478 for (nn=1; nn<failLimit; nn++) { 5479 sprintf(nnbuf, "(?<nn%d>)", nn); 5480 pattern.append(UnicodeString(nnbuf, -1, US_INV)); 5481 } 5482 status = U_ZERO_ERROR; 5483 pat = RegexPattern::compile(pattern, 0, status); 5484 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG); 5485 delete pat; 5486} 5487 5488 5489//-------------------------------------------------------------- 5490// 5491// Bug7651 Regex pattern that exceeds default operator stack depth in matcher. 5492// 5493//--------------------------------------------------------------- 5494void RegexTest::Bug7651() { 5495 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)"); 5496 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData. 5497 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation. 
5498 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)"); 5499 UnicodeString s("#ff @abcd This is test"); 5500 RegexPattern *REPattern = NULL; 5501 RegexMatcher *REMatcher = NULL; 5502 UErrorCode status = U_ZERO_ERROR; 5503 UParseError pe; 5504 5505 REPattern = RegexPattern::compile(pattern1, 0, pe, status); 5506 REGEX_CHECK_STATUS; 5507 REMatcher = REPattern->matcher(s, status); 5508 REGEX_CHECK_STATUS; 5509 REGEX_ASSERT(REMatcher->find()); 5510 REGEX_ASSERT(REMatcher->start(status) == 0); 5511 delete REPattern; 5512 delete REMatcher; 5513 status = U_ZERO_ERROR; 5514 5515 REPattern = RegexPattern::compile(pattern2, 0, pe, status); 5516 REGEX_CHECK_STATUS; 5517 REMatcher = REPattern->matcher(s, status); 5518 REGEX_CHECK_STATUS; 5519 REGEX_ASSERT(REMatcher->find()); 5520 REGEX_ASSERT(REMatcher->start(status) == 0); 5521 delete REPattern; 5522 delete REMatcher; 5523 status = U_ZERO_ERROR; 5524 } 5525 5526void RegexTest::Bug7740() { 5527 UErrorCode status = U_ZERO_ERROR; 5528 UnicodeString pattern = "(a)"; 5529 UnicodeString text = "abcdef"; 5530 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status); 5531 REGEX_CHECK_STATUS; 5532 REGEX_ASSERT(m->lookingAt(status)); 5533 REGEX_CHECK_STATUS; 5534 status = U_ILLEGAL_ARGUMENT_ERROR; 5535 UnicodeString s = m->group(1, status); // Bug 7740: segfault here. 5536 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5537 REGEX_ASSERT(s == ""); 5538 delete m; 5539} 5540 5541// Bug 8479: was crashing whith a Bogus UnicodeString as input. 
5542 5543void RegexTest::Bug8479() { 5544 UErrorCode status = U_ZERO_ERROR; 5545 5546 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status); 5547 REGEX_CHECK_STATUS; 5548 if (U_SUCCESS(status)) 5549 { 5550 UnicodeString str; 5551 str.setToBogus(); 5552 pMatcher->reset(str); 5553 status = U_ZERO_ERROR; 5554 pMatcher->matches(status); 5555 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR); 5556 delete pMatcher; 5557 } 5558} 5559 5560 5561// Bug 7029 5562void RegexTest::Bug7029() { 5563 UErrorCode status = U_ZERO_ERROR; 5564 5565 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status); 5566 UnicodeString text = "abc.def"; 5567 UnicodeString splits[10]; 5568 REGEX_CHECK_STATUS; 5569 int32_t numFields = pMatcher->split(text, splits, 10, status); 5570 REGEX_CHECK_STATUS; 5571 REGEX_ASSERT(numFields == 8); 5572 delete pMatcher; 5573} 5574 5575// Bug 9283 5576// This test is checking for the existance of any supplemental characters that case-fold 5577// to a bmp character. 5578// 5579// At the time of this writing there are none. If any should appear in a subsequent release 5580// of Unicode, the code in regular expressions compilation that determines the longest 5581// posssible match for a literal string will need to be enhanced. 5582// 5583// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength() 5584// for details on what to do in case of a failure of this test. 
5585// 5586void RegexTest::Bug9283() { 5587#if !UCONFIG_NO_NORMALIZATION 5588 UErrorCode status = U_ZERO_ERROR; 5589 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status); 5590 REGEX_CHECK_STATUS; 5591 int32_t index; 5592 UChar32 c; 5593 for (index=0; ; index++) { 5594 c = supplementalsWithCaseFolding.charAt(index); 5595 if (c == -1) { 5596 break; 5597 } 5598 UnicodeString cf = UnicodeString(c).foldCase(); 5599 REGEX_ASSERT(cf.length() >= 2); 5600 } 5601#endif /* #if !UCONFIG_NO_NORMALIZATION */ 5602} 5603 5604 5605void RegexTest::CheckInvBufSize() { 5606 if(inv_next>=INV_BUFSIZ) { 5607 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n", 5608 __FILE__, INV_BUFSIZ, inv_next); 5609 } else { 5610 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next); 5611 } 5612} 5613 5614 5615void RegexTest::Bug10459() { 5616 UErrorCode status = U_ZERO_ERROR; 5617 UnicodeString patternString("(txt)"); 5618 UnicodeString txtString("txt"); 5619 5620 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status); 5621 REGEX_CHECK_STATUS; 5622 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status); 5623 REGEX_CHECK_STATUS; 5624 5625 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status); 5626 REGEX_CHECK_STATUS; 5627 5628 uregex_setUText(icu_re, utext_txt, &status); 5629 REGEX_CHECK_STATUS; 5630 5631 // The bug was that calling uregex_group() before doing a matching operation 5632 // was causing a segfault. Only for Regular Expressions created from UText. 5633 // It should set an U_REGEX_INVALID_STATE. 
5634 5635 UChar buf[100]; 5636 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status); 5637 REGEX_ASSERT(status == U_REGEX_INVALID_STATE); 5638 REGEX_ASSERT(len == 0); 5639 5640 uregex_close(icu_re); 5641 utext_close(utext_pat); 5642 utext_close(utext_txt); 5643} 5644 5645void RegexTest::TestCaseInsensitiveStarters() { 5646 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't 5647 // become stale because of new Unicode characters. 5648 // If it is stale, rerun the generation tool 5649 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing 5650 // and replace the embedded data in i18n/regexcmp.cpp 5651 5652 for (UChar32 cp=0; cp<=0x10ffff; cp++) { 5653 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) { 5654 continue; 5655 } 5656 UnicodeSet s(cp, cp); 5657 s.closeOver(USET_CASE_INSENSITIVE); 5658 UnicodeSetIterator setIter(s); 5659 while (setIter.next()) { 5660 if (!setIter.isString()) { 5661 continue; 5662 } 5663 const UnicodeString &str = setIter.getString(); 5664 UChar32 firstChar = str.char32At(0); 5665 UnicodeSet starters; 5666 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters); 5667 if (!starters.contains(cp)) { 5668 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar); 5669 return; 5670 } 5671 } 5672 } 5673} 5674 5675 5676void RegexTest::TestBug11049() { 5677 // Original bug report: pattern with match start consisting of one of several individual characters, 5678 // and the text being matched ending with a supplementary character. find() would read past the 5679 // end of the input text when searching for potential match starting points. 5680 5681 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will 5682 // detect the bad read. 
5683 5684 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__); 5685 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__); 5686 5687 // Test again with a pattern starting with a single character, 5688 // which takes a different code path than starting with an OR expression, 5689 // but with similar logic. 5690 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__); 5691 TestCase11049("C", "string matches at end C", TRUE, __LINE__); 5692} 5693 5694// Run a single test case from TestBug11049(). Internal function. 5695void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) { 5696 UErrorCode status = U_ZERO_ERROR; 5697 UnicodeString patternString = UnicodeString(pattern).unescape(); 5698 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status)); 5699 5700 UnicodeString dataString = UnicodeString(data).unescape(); 5701 UChar *exactBuffer = new UChar[dataString.length()]; 5702 dataString.extract(exactBuffer, dataString.length(), status); 5703 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status); 5704 5705 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status)); 5706 REGEX_CHECK_STATUS; 5707 matcher->reset(ut); 5708 UBool result = matcher->find(); 5709 if (result != expectMatch) { 5710 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"", 5711 __FILE__, lineNumber, expectMatch, result, pattern, data); 5712 } 5713 5714 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see 5715 // off-by-one on find() with match at the last code point. 5716 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8 5717 // because string.unescape() will only shrink it. 
5718 char * utf8Buffer = new char[uprv_strlen(data)+1]; 5719 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status); 5720 REGEX_CHECK_STATUS; 5721 ut = utext_openUTF8(ut, utf8Buffer, -1, &status); 5722 REGEX_CHECK_STATUS; 5723 matcher->reset(ut); 5724 result = matcher->find(); 5725 if (result != expectMatch) { 5726 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"", 5727 __FILE__, lineNumber, expectMatch, result, pattern, data); 5728 } 5729 delete [] utf8Buffer; 5730 5731 utext_close(ut); 5732 delete [] exactBuffer; 5733} 5734 5735 5736void RegexTest::TestBug11371() { 5737 if (quick) { 5738 logln("Skipping test. Runs in exhuastive mode only."); 5739 return; 5740 } 5741 UErrorCode status = U_ZERO_ERROR; 5742 UnicodeString patternString; 5743 5744 for (int i=0; i<8000000; i++) { 5745 patternString.append(UnicodeString("()")); 5746 } 5747 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status)); 5748 if (status != U_REGEX_PATTERN_TOO_BIG) { 5749 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5750 __FILE__, __LINE__, u_errorName(status)); 5751 } 5752 5753 status = U_ZERO_ERROR; 5754 patternString = "("; 5755 for (int i=0; i<20000000; i++) { 5756 patternString.append(UnicodeString("A++")); 5757 } 5758 patternString.append(UnicodeString("){0}B++")); 5759 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status)); 5760 if (status != U_REGEX_PATTERN_TOO_BIG) { 5761 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5762 __FILE__, __LINE__, u_errorName(status)); 5763 } 5764 5765 // Pattern with too much string data, such that string indexes overflow operand data field size 5766 // in compiled instruction. 
5767 status = U_ZERO_ERROR; 5768 patternString = ""; 5769 while (patternString.length() < 0x00ffffff) { 5770 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n")); 5771 } 5772 patternString.append(UnicodeString("X? trailing string")); 5773 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status)); 5774 if (status != U_REGEX_PATTERN_TOO_BIG) { 5775 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.", 5776 __FILE__, __LINE__, u_errorName(status)); 5777 } 5778} 5779 5780void RegexTest::TestBug11480() { 5781 // C API, get capture group of a group that does not participate in the match. 5782 // (Returns a zero length string, with nul termination, 5783 // indistinguishable from a group with a zero lenght match.) 5784 5785 UErrorCode status = U_ZERO_ERROR; 5786 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status); 5787 REGEX_CHECK_STATUS; 5788 UnicodeString text = UNICODE_STRING_SIMPLE("A"); 5789 uregex_setText(re, text.getBuffer(), text.length(), &status); 5790 REGEX_CHECK_STATUS; 5791 REGEX_ASSERT(uregex_lookingAt(re, 0, &status)); 5792 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13}; 5793 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status); 5794 REGEX_ASSERT(length == 0); 5795 REGEX_ASSERT(buf[0] == 13); 5796 REGEX_ASSERT(buf[1] == 0); 5797 REGEX_ASSERT(buf[2] == 13); 5798 uregex_close(re); 5799} 5800 5801 5802#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ 5803