1/* 2************************************************************************** 3* Copyright (C) 2002-2010 International Business Machines Corporation * 4* and others. All rights reserved. * 5************************************************************************** 6*/ 7// 8// file: rematch.cpp 9// 10// Contains the implementation of class RegexMatcher, 11// which is one of the main API classes for the ICU regular expression package. 12// 13 14#include "unicode/utypes.h" 15#if !UCONFIG_NO_REGULAR_EXPRESSIONS 16 17#include "unicode/regex.h" 18#include "unicode/uniset.h" 19#include "unicode/uchar.h" 20#include "unicode/ustring.h" 21#include "unicode/rbbi.h" 22#include "uassert.h" 23#include "cmemory.h" 24#include "uvector.h" 25#include "uvectr32.h" 26#include "uvectr64.h" 27#include "regeximp.h" 28#include "regexst.h" 29#include "regextxt.h" 30#include "ucase.h" 31 32// #include <malloc.h> // Needed for heapcheck testing 33 34 35// Find progress callback 36// ---------------------- 37// Macro to inline test & call to ReportFindProgress(). Eliminates unnecessary function call. 38// 39#define REGEXFINDPROGRESS_INTERRUPT(pos, status) \ 40 (fFindProgressCallbackFn != NULL) && (ReportFindProgress(pos, status) == FALSE) 41 42 43// Smart Backtracking 44// ------------------ 45// When a failure would go back to a LOOP_C instruction, 46// strings, characters, and setrefs scan backwards for a valid start 47// character themselves, pop the stack, and save state, emulating the 48// LOOP_C's effect but assured that the next character of input is a 49// possible matching character. 50// 51// Good idea in theory; unfortunately it only helps out a few specific 52// cases and slows the engine down a little in the rest. 53 54//#define REGEX_SMART_BACKTRACKING 1 55 56U_NAMESPACE_BEGIN 57 58// Default limit for the size of the back track stack, to avoid system 59// failures causedby heap exhaustion. Units are in 32 bit words, not bytes. 60// This value puts ICU's limits higher than most other regexp implementations, 61// which use recursion rather than the heap, and take more storage per 62// backtrack point. 63// 64static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000; 65 66// Time limit counter constant. 67// Time limits for expression evaluation are in terms of quanta of work by 68// the engine, each of which is 10,000 state saves. 69// This constant determines that state saves per tick number. 70static const int32_t TIMER_INITIAL_VALUE = 10000; 71 72//----------------------------------------------------------------------------- 73// 74// Constructor and Destructor 75// 76//----------------------------------------------------------------------------- 77RegexMatcher::RegexMatcher(const RegexPattern *pat) { 78 fDeferredStatus = U_ZERO_ERROR; 79 init(fDeferredStatus); 80 if (U_FAILURE(fDeferredStatus)) { 81 return; 82 } 83 if (pat==NULL) { 84 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR; 85 return; 86 } 87 fPattern = pat; 88 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus); 89} 90 91 92 93RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 94 uint32_t flags, UErrorCode &status) { 95 init(status); 96 if (U_FAILURE(status)) { 97 return; 98 } 99 UParseError pe; 100 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 101 fPattern = fPatternOwned; 102 103 UText inputText = UTEXT_INITIALIZER; 104 utext_openConstUnicodeString(&inputText, &input, &status); 105 init2(&inputText, status); 106 utext_close(&inputText); 107 108 fInputUniStrMaybeMutable = TRUE; 109} 110 111 112RegexMatcher::RegexMatcher(UText *regexp, UText *input, 113 uint32_t flags, UErrorCode &status) { 114 init(status); 115 if (U_FAILURE(status)) { 116 return; 117 } 118 UParseError pe; 119 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 120 if (U_FAILURE(status)) { 121 return; 122 } 123 124 fPattern = fPatternOwned; 125 init2(input, status); 126} 127 128 129RegexMatcher::RegexMatcher(const UnicodeString ®exp, 130 uint32_t flags, UErrorCode &status) { 131 init(status); 132 if (U_FAILURE(status)) { 133 return; 134 } 135 UParseError pe; 136 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 137 if (U_FAILURE(status)) { 138 return; 139 } 140 fPattern = fPatternOwned; 141 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 142} 143 144RegexMatcher::RegexMatcher(UText *regexp, 145 uint32_t flags, UErrorCode &status) { 146 init(status); 147 if (U_FAILURE(status)) { 148 return; 149 } 150 UParseError pe; 151 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status); 152 if (U_FAILURE(status)) { 153 return; 154 } 155 156 fPattern = fPatternOwned; 157 init2(RegexStaticSets::gStaticSets->fEmptyText, status); 158} 159 160 161 162 163RegexMatcher::~RegexMatcher() { 164 delete fStack; 165 if (fData != fSmallData) { 166 uprv_free(fData); 167 fData = NULL; 168 } 169 if (fPatternOwned) { 170 delete fPatternOwned; 171 fPatternOwned = NULL; 172 fPattern = NULL; 173 } 174 175 if (fInput) { 176 delete fInput; 177 } 178 if (fInputText) { 179 utext_close(fInputText); 180 } 181 if (fAltInputText) { 182 utext_close(fAltInputText); 183 } 184 185 #if UCONFIG_NO_BREAK_ITERATION==0 186 delete fWordBreakItr; 187 #endif 188} 189 190// 191// init() common initialization for use by all constructors. 192// Initialize all fields, get the object into a consistent state. 193// This must be done even when the initial status shows an error, 194// so that the object is initialized sufficiently well for the destructor 195// to run safely. 196// 197void RegexMatcher::init(UErrorCode &status) { 198 fPattern = NULL; 199 fPatternOwned = NULL; 200 fFrameSize = 0; 201 fRegionStart = 0; 202 fRegionLimit = 0; 203 fAnchorStart = 0; 204 fAnchorLimit = 0; 205 fLookStart = 0; 206 fLookLimit = 0; 207 fActiveStart = 0; 208 fActiveLimit = 0; 209 fTransparentBounds = FALSE; 210 fAnchoringBounds = TRUE; 211 fMatch = FALSE; 212 fMatchStart = 0; 213 fMatchEnd = 0; 214 fLastMatchEnd = -1; 215 fAppendPosition = 0; 216 fHitEnd = FALSE; 217 fRequireEnd = FALSE; 218 fStack = NULL; 219 fFrame = NULL; 220 fTimeLimit = 0; 221 fTime = 0; 222 fTickCounter = 0; 223 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY; 224 fCallbackFn = NULL; 225 fCallbackContext = NULL; 226 fFindProgressCallbackFn = NULL; 227 fFindProgressCallbackContext = NULL; 228 fTraceDebug = FALSE; 229 fDeferredStatus = status; 230 fData = fSmallData; 231 fWordBreakItr = NULL; 232 233 fStack = new UVector64(status); 234 fInputText = NULL; 235 fAltInputText = NULL; 236 fInput = NULL; 237 fInputLength = 0; 238 fInputUniStrMaybeMutable = FALSE; 239 240 if (U_FAILURE(status)) { 241 fDeferredStatus = status; 242 } 243} 244 245// 246// init2() Common initialization for use by RegexMatcher constructors, part 2. 247// This handles the common setup to be done after the Pattern is available. 248// 249void RegexMatcher::init2(UText *input, UErrorCode &status) { 250 if (U_FAILURE(status)) { 251 fDeferredStatus = status; 252 return; 253 } 254 255 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) { 256 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t)); 257 if (fData == NULL) { 258 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 259 return; 260 } 261 } 262 263 reset(input); 264 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status); 265 if (U_FAILURE(status)) { 266 fDeferredStatus = status; 267 return; 268 } 269} 270 271 272static const UChar BACKSLASH = 0x5c; 273static const UChar DOLLARSIGN = 0x24; 274//-------------------------------------------------------------------------------- 275// 276// appendReplacement 277// 278//-------------------------------------------------------------------------------- 279RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest, 280 const UnicodeString &replacement, 281 UErrorCode &status) { 282 UText replacementText = UTEXT_INITIALIZER; 283 284 utext_openConstUnicodeString(&replacementText, &replacement, &status); 285 if (U_SUCCESS(status)) { 286 UText resultText = UTEXT_INITIALIZER; 287 utext_openUnicodeString(&resultText, &dest, &status); 288 289 if (U_SUCCESS(status)) { 290 appendReplacement(&resultText, &replacementText, status); 291 utext_close(&resultText); 292 } 293 utext_close(&replacementText); 294 } 295 296 return *this; 297} 298 299// 300// appendReplacement, UText mode 301// 302RegexMatcher &RegexMatcher::appendReplacement(UText *dest, 303 UText *replacement, 304 UErrorCode &status) { 305 if (U_FAILURE(status)) { 306 return *this; 307 } 308 if (U_FAILURE(fDeferredStatus)) { 309 status = fDeferredStatus; 310 return *this; 311 } 312 if (fMatch == FALSE) { 313 status = U_REGEX_INVALID_STATE; 314 return *this; 315 } 316 317 // Copy input string from the end of previous match to start of current match 318 int64_t destLen = utext_nativeLength(dest); 319 if (fMatchStart > fAppendPosition) { 320 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 321 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 322 (int32_t)(fMatchStart-fAppendPosition), &status); 323 } else { 324 int32_t len16; 325 if (UTEXT_USES_U16(fInputText)) { 326 len16 = (int32_t)(fMatchStart-fAppendPosition); 327 } else { 328 UErrorCode lengthStatus = U_ZERO_ERROR; 329 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus); 330 } 331 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 332 if (inputChars == NULL) { 333 status = U_MEMORY_ALLOCATION_ERROR; 334 return *this; 335 } 336 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status); 337 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status); 338 uprv_free(inputChars); 339 } 340 } 341 fAppendPosition = fMatchEnd; 342 343 344 // scan the replacement text, looking for substitutions ($n) and \escapes. 345 // TODO: optimize this loop by efficiently scanning for '$' or '\', 346 // move entire ranges not containing substitutions. 347 UTEXT_SETNATIVEINDEX(replacement, 0); 348 UChar32 c = UTEXT_NEXT32(replacement); 349 while (c != U_SENTINEL) { 350 if (c == BACKSLASH) { 351 // Backslash Escape. Copy the following char out without further checks. 352 // Note: Surrogate pairs don't need any special handling 353 // The second half wont be a '$' or a '\', and 354 // will move to the dest normally on the next 355 // loop iteration. 356 c = UTEXT_CURRENT32(replacement); 357 if (c == U_SENTINEL) { 358 break; 359 } 360 361 if (c==0x55/*U*/ || c==0x75/*u*/) { 362 // We have a \udddd or \Udddddddd escape sequence. 363 int32_t offset = 0; 364 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement); 365 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 366 if (escapedChar != (UChar32)0xFFFFFFFF) { 367 if (U_IS_BMP(escapedChar)) { 368 UChar c16 = (UChar)escapedChar; 369 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 370 } else { 371 UChar surrogate[2]; 372 surrogate[0] = U16_LEAD(escapedChar); 373 surrogate[1] = U16_TRAIL(escapedChar); 374 if (U_SUCCESS(status)) { 375 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 376 } 377 } 378 // TODO: Report errors for mal-formed \u escapes? 379 // As this is, the original sequence is output, which may be OK. 380 if (context.lastOffset == offset) { 381 UTEXT_PREVIOUS32(replacement); 382 } else if (context.lastOffset != offset-1) { 383 utext_moveIndex32(replacement, offset - context.lastOffset - 1); 384 } 385 } 386 } else { 387 UTEXT_NEXT32(replacement); 388 // Plain backslash escape. Just put out the escaped character. 389 if (U_IS_BMP(c)) { 390 UChar c16 = (UChar)c; 391 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 392 } else { 393 UChar surrogate[2]; 394 surrogate[0] = U16_LEAD(c); 395 surrogate[1] = U16_TRAIL(c); 396 if (U_SUCCESS(status)) { 397 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 398 } 399 } 400 } 401 } else if (c != DOLLARSIGN) { 402 // Normal char, not a $. Copy it out without further checks. 403 if (U_IS_BMP(c)) { 404 UChar c16 = (UChar)c; 405 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 406 } else { 407 UChar surrogate[2]; 408 surrogate[0] = U16_LEAD(c); 409 surrogate[1] = U16_TRAIL(c); 410 if (U_SUCCESS(status)) { 411 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status); 412 } 413 } 414 } else { 415 // We've got a $. Pick up a capture group number if one follows. 416 // Consume at most the number of digits necessary for the largest capture 417 // number that is valid for this pattern. 418 419 int32_t numDigits = 0; 420 int32_t groupNum = 0; 421 UChar32 digitC; 422 for (;;) { 423 digitC = UTEXT_CURRENT32(replacement); 424 if (digitC == U_SENTINEL) { 425 break; 426 } 427 if (u_isdigit(digitC) == FALSE) { 428 break; 429 } 430 UTEXT_NEXT32(replacement); 431 groupNum=groupNum*10 + u_charDigitValue(digitC); 432 numDigits++; 433 if (numDigits >= fPattern->fMaxCaptureDigits) { 434 break; 435 } 436 } 437 438 439 if (numDigits == 0) { 440 // The $ didn't introduce a group number at all. 441 // Treat it as just part of the substitution text. 442 UChar c16 = DOLLARSIGN; 443 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status); 444 } else { 445 // Finally, append the capture group data to the destination. 446 destLen += appendGroup(groupNum, dest, status); 447 if (U_FAILURE(status)) { 448 // Can fail if group number is out of range. 449 break; 450 } 451 } 452 } 453 454 if (U_FAILURE(status)) { 455 break; 456 } else { 457 c = UTEXT_NEXT32(replacement); 458 } 459 } 460 461 return *this; 462} 463 464 465 466//-------------------------------------------------------------------------------- 467// 468// appendTail Intended to be used in conjunction with appendReplacement() 469// To the destination string, append everything following 470// the last match position from the input string. 471// 472// Note: Match ranges do not affect appendTail or appendReplacement 473// 474//-------------------------------------------------------------------------------- 475UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) { 476 UErrorCode status = U_ZERO_ERROR; 477 UText resultText = UTEXT_INITIALIZER; 478 utext_openUnicodeString(&resultText, &dest, &status); 479 480 if (U_SUCCESS(status)) { 481 appendTail(&resultText, status); 482 utext_close(&resultText); 483 } 484 485 return dest; 486} 487 488// 489// appendTail, UText mode 490// 491UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) { 492 UBool bailOut = FALSE; 493 if (U_FAILURE(status)) { 494 bailOut = TRUE; 495 } 496 if (U_FAILURE(fDeferredStatus)) { 497 status = fDeferredStatus; 498 bailOut = TRUE; 499 } 500 501 if (bailOut) { 502 // dest must not be NULL 503 if (dest) { 504 utext_replace(dest, utext_nativeLength(dest), utext_nativeLength(dest), NULL, 0, &status); 505 return dest; 506 } 507 } 508 509 if (fInputLength > fAppendPosition) { 510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 511 int64_t destLen = utext_nativeLength(dest); 512 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition, 513 (int32_t)(fInputLength-fAppendPosition), &status); 514 } else { 515 int32_t len16; 516 if (UTEXT_USES_U16(fInputText)) { 517 len16 = (int32_t)(fInputLength-fAppendPosition); 518 } else { 519 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status); 520 status = U_ZERO_ERROR; // buffer overflow 521 } 522 523 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16)); 524 if (inputChars == NULL) { 525 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; 526 } else { 527 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated 528 int64_t destLen = utext_nativeLength(dest); 529 utext_replace(dest, destLen, destLen, inputChars, len16, &status); 530 uprv_free(inputChars); 531 } 532 } 533 } 534 return dest; 535} 536 537 538 539//-------------------------------------------------------------------------------- 540// 541// end 542// 543//-------------------------------------------------------------------------------- 544int32_t RegexMatcher::end(UErrorCode &err) const { 545 return end(0, err); 546} 547 548int64_t RegexMatcher::end64(UErrorCode &err) const { 549 return end64(0, err); 550} 551 552int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const { 553 if (U_FAILURE(err)) { 554 return -1; 555 } 556 if (fMatch == FALSE) { 557 err = U_REGEX_INVALID_STATE; 558 return -1; 559 } 560 if (group < 0 || group > fPattern->fGroupMap->size()) { 561 err = U_INDEX_OUTOFBOUNDS_ERROR; 562 return -1; 563 } 564 int64_t e = -1; 565 if (group == 0) { 566 e = fMatchEnd; 567 } else { 568 // Get the position within the stack frame of the variables for 569 // this capture group. 570 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 571 U_ASSERT(groupOffset < fPattern->fFrameSize); 572 U_ASSERT(groupOffset >= 0); 573 e = fFrame->fExtra[groupOffset + 1]; 574 } 575 576 return e; 577} 578 579int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const { 580 return (int32_t)end64(group, err); 581} 582 583 584//-------------------------------------------------------------------------------- 585// 586// find() 587// 588//-------------------------------------------------------------------------------- 589UBool RegexMatcher::find() { 590 // Start at the position of the last match end. (Will be zero if the 591 // matcher has been reset.) 592 // 593 if (U_FAILURE(fDeferredStatus)) { 594 return FALSE; 595 } 596 597 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 598 return findUsingChunk(); 599 } 600 601 int64_t startPos = fMatchEnd; 602 if (startPos==0) { 603 startPos = fActiveStart; 604 } 605 606 if (fMatch) { 607 // Save the position of any previous successful match. 608 fLastMatchEnd = fMatchEnd; 609 610 if (fMatchStart == fMatchEnd) { 611 // Previous match had zero length. Move start position up one position 612 // to avoid sending find() into a loop on zero-length matches. 613 if (startPos >= fActiveLimit) { 614 fMatch = FALSE; 615 fHitEnd = TRUE; 616 return FALSE; 617 } 618 UTEXT_SETNATIVEINDEX(fInputText, startPos); 619 UTEXT_NEXT32(fInputText); 620 startPos = UTEXT_GETNATIVEINDEX(fInputText); 621 } 622 } else { 623 if (fLastMatchEnd >= 0) { 624 // A previous find() failed to match. Don't try again. 625 // (without this test, a pattern with a zero-length match 626 // could match again at the end of an input string.) 627 fHitEnd = TRUE; 628 return FALSE; 629 } 630 } 631 632 633 // Compute the position in the input string beyond which a match can not begin, because 634 // the minimum length match would extend past the end of the input. 635 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 636 // Be aware of possible overflows if making changes here. 637 int64_t testStartLimit; 638 if (UTEXT_USES_U16(fInputText)) { 639 testStartLimit = fActiveLimit - fPattern->fMinMatchLen; 640 if (startPos > testStartLimit) { 641 fMatch = FALSE; 642 fHitEnd = TRUE; 643 return FALSE; 644 } 645 } else { 646 // For now, let the matcher discover that it can't match on its own 647 // We don't know how long the match len is in native characters 648 testStartLimit = fActiveLimit; 649 } 650 651 UChar32 c; 652 U_ASSERT(startPos >= 0); 653 654 switch (fPattern->fStartType) { 655 case START_NO_INFO: 656 // No optimization was found. 657 // Try a match at each input position. 658 for (;;) { 659 MatchAt(startPos, FALSE, fDeferredStatus); 660 if (U_FAILURE(fDeferredStatus)) { 661 return FALSE; 662 } 663 if (fMatch) { 664 return TRUE; 665 } 666 if (startPos >= testStartLimit) { 667 fHitEnd = TRUE; 668 return FALSE; 669 } 670 UTEXT_SETNATIVEINDEX(fInputText, startPos); 671 UTEXT_NEXT32(fInputText); 672 startPos = UTEXT_GETNATIVEINDEX(fInputText); 673 // Note that it's perfectly OK for a pattern to have a zero-length 674 // match at the end of a string, so we must make sure that the loop 675 // runs with startPos == testStartLimit the last time through. 676 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 677 return FALSE; 678 } 679 U_ASSERT(FALSE); 680 681 case START_START: 682 // Matches are only possible at the start of the input string 683 // (pattern begins with ^ or \A) 684 if (startPos > fActiveStart) { 685 fMatch = FALSE; 686 return FALSE; 687 } 688 MatchAt(startPos, FALSE, fDeferredStatus); 689 if (U_FAILURE(fDeferredStatus)) { 690 return FALSE; 691 } 692 return fMatch; 693 694 695 case START_SET: 696 { 697 // Match may start on any char from a pre-computed set. 698 U_ASSERT(fPattern->fMinMatchLen > 0); 699 int64_t pos; 700 UTEXT_SETNATIVEINDEX(fInputText, startPos); 701 for (;;) { 702 c = UTEXT_NEXT32(fInputText); 703 pos = UTEXT_GETNATIVEINDEX(fInputText); 704 // c will be -1 (U_SENTINEL) at end of text, in which case we 705 // skip this next block (so we don't have a negative array index) 706 // and handle end of text in the following block. 707 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) || 708 (c>=256 && fPattern->fInitialChars->contains(c)))) { 709 MatchAt(startPos, FALSE, fDeferredStatus); 710 if (U_FAILURE(fDeferredStatus)) { 711 return FALSE; 712 } 713 if (fMatch) { 714 return TRUE; 715 } 716 UTEXT_SETNATIVEINDEX(fInputText, pos); 717 } 718 if (startPos >= testStartLimit) { 719 fMatch = FALSE; 720 fHitEnd = TRUE; 721 return FALSE; 722 } 723 startPos = pos; 724 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 725 return FALSE; 726 } 727 } 728 U_ASSERT(FALSE); 729 730 case START_STRING: 731 case START_CHAR: 732 { 733 // Match starts on exactly one char. 734 U_ASSERT(fPattern->fMinMatchLen > 0); 735 UChar32 theChar = fPattern->fInitialChar; 736 int64_t pos; 737 UTEXT_SETNATIVEINDEX(fInputText, startPos); 738 for (;;) { 739 c = UTEXT_NEXT32(fInputText); 740 pos = UTEXT_GETNATIVEINDEX(fInputText); 741 if (c == theChar) { 742 MatchAt(startPos, FALSE, fDeferredStatus); 743 if (U_FAILURE(fDeferredStatus)) { 744 return FALSE; 745 } 746 if (fMatch) { 747 return TRUE; 748 } 749 UTEXT_SETNATIVEINDEX(fInputText, pos); 750 } 751 if (startPos >= testStartLimit) { 752 fMatch = FALSE; 753 fHitEnd = TRUE; 754 return FALSE; 755 } 756 startPos = pos; 757 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 758 return FALSE; 759 } 760 } 761 U_ASSERT(FALSE); 762 763 case START_LINE: 764 { 765 UChar32 c; 766 if (startPos == fAnchorStart) { 767 MatchAt(startPos, FALSE, fDeferredStatus); 768 if (U_FAILURE(fDeferredStatus)) { 769 return FALSE; 770 } 771 if (fMatch) { 772 return TRUE; 773 } 774 UTEXT_SETNATIVEINDEX(fInputText, startPos); 775 c = UTEXT_NEXT32(fInputText); 776 startPos = UTEXT_GETNATIVEINDEX(fInputText); 777 } else { 778 UTEXT_SETNATIVEINDEX(fInputText, startPos); 779 c = UTEXT_PREVIOUS32(fInputText); 780 UTEXT_SETNATIVEINDEX(fInputText, startPos); 781 } 782 783 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 784 for (;;) { 785 if (c == 0x0a) { 786 MatchAt(startPos, FALSE, fDeferredStatus); 787 if (U_FAILURE(fDeferredStatus)) { 788 return FALSE; 789 } 790 if (fMatch) { 791 return TRUE; 792 } 793 UTEXT_SETNATIVEINDEX(fInputText, startPos); 794 } 795 if (startPos >= testStartLimit) { 796 fMatch = FALSE; 797 fHitEnd = TRUE; 798 return FALSE; 799 } 800 c = UTEXT_NEXT32(fInputText); 801 startPos = UTEXT_GETNATIVEINDEX(fInputText); 802 // Note that it's perfectly OK for a pattern to have a zero-length 803 // match at the end of a string, so we must make sure that the loop 804 // runs with startPos == testStartLimit the last time through. 805 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 806 return FALSE; 807 } 808 } else { 809 for (;;) { 810 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 811 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 812 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 813 UTEXT_NEXT32(fInputText); 814 startPos = UTEXT_GETNATIVEINDEX(fInputText); 815 } 816 MatchAt(startPos, FALSE, fDeferredStatus); 817 if (U_FAILURE(fDeferredStatus)) { 818 return FALSE; 819 } 820 if (fMatch) { 821 return TRUE; 822 } 823 UTEXT_SETNATIVEINDEX(fInputText, startPos); 824 } 825 if (startPos >= testStartLimit) { 826 fMatch = FALSE; 827 fHitEnd = TRUE; 828 return FALSE; 829 } 830 c = UTEXT_NEXT32(fInputText); 831 startPos = UTEXT_GETNATIVEINDEX(fInputText); 832 // Note that it's perfectly OK for a pattern to have a zero-length 833 // match at the end of a string, so we must make sure that the loop 834 // runs with startPos == testStartLimit the last time through. 835 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 836 return FALSE; 837 } 838 } 839 } 840 841 default: 842 U_ASSERT(FALSE); 843 } 844 845 U_ASSERT(FALSE); 846 return FALSE; 847} 848 849 850 851UBool RegexMatcher::find(int64_t start, UErrorCode &status) { 852 if (U_FAILURE(status)) { 853 return FALSE; 854 } 855 if (U_FAILURE(fDeferredStatus)) { 856 status = fDeferredStatus; 857 return FALSE; 858 } 859 this->reset(); // Note: Reset() is specified by Java Matcher documentation. 860 // This will reset the region to be the full input length. 861 if (start < 0) { 862 status = U_INDEX_OUTOFBOUNDS_ERROR; 863 return FALSE; 864 } 865 866 int64_t nativeStart = start; 867 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 868 status = U_INDEX_OUTOFBOUNDS_ERROR; 869 return FALSE; 870 } 871 fMatchEnd = nativeStart; 872 return find(); 873} 874 875 876//-------------------------------------------------------------------------------- 877// 878// findUsingChunk() -- like find(), but with the advance knowledge that the 879// entire string is available in the UText's chunk buffer. 880// 881//-------------------------------------------------------------------------------- 882UBool RegexMatcher::findUsingChunk() { 883 // Start at the position of the last match end. (Will be zero if the 884 // matcher has been reset. 885 // 886 887 int32_t startPos = (int32_t)fMatchEnd; 888 if (startPos==0) { 889 startPos = (int32_t)fActiveStart; 890 } 891 892 const UChar *inputBuf = fInputText->chunkContents; 893 894 if (fMatch) { 895 // Save the position of any previous successful match. 896 fLastMatchEnd = fMatchEnd; 897 898 if (fMatchStart == fMatchEnd) { 899 // Previous match had zero length. Move start position up one position 900 // to avoid sending find() into a loop on zero-length matches. 901 if (startPos >= fActiveLimit) { 902 fMatch = FALSE; 903 fHitEnd = TRUE; 904 return FALSE; 905 } 906 U16_FWD_1(inputBuf, startPos, fInputLength); 907 } 908 } else { 909 if (fLastMatchEnd >= 0) { 910 // A previous find() failed to match. Don't try again. 911 // (without this test, a pattern with a zero-length match 912 // could match again at the end of an input string.) 913 fHitEnd = TRUE; 914 return FALSE; 915 } 916 } 917 918 919 // Compute the position in the input string beyond which a match can not begin, because 920 // the minimum length match would extend past the end of the input. 921 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int. 922 // Be aware of possible overflows if making changes here. 923 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen); 924 if (startPos > testLen) { 925 fMatch = FALSE; 926 fHitEnd = TRUE; 927 return FALSE; 928 } 929 930 UChar32 c; 931 U_ASSERT(startPos >= 0); 932 933 switch (fPattern->fStartType) { 934 case START_NO_INFO: 935 // No optimization was found. 936 // Try a match at each input position. 937 for (;;) { 938 MatchChunkAt(startPos, FALSE, fDeferredStatus); 939 if (U_FAILURE(fDeferredStatus)) { 940 return FALSE; 941 } 942 if (fMatch) { 943 return TRUE; 944 } 945 if (startPos >= testLen) { 946 fHitEnd = TRUE; 947 return FALSE; 948 } 949 U16_FWD_1(inputBuf, startPos, fActiveLimit); 950 // Note that it's perfectly OK for a pattern to have a zero-length 951 // match at the end of a string, so we must make sure that the loop 952 // runs with startPos == testLen the last time through. 953 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 954 return FALSE; 955 } 956 U_ASSERT(FALSE); 957 958 case START_START: 959 // Matches are only possible at the start of the input string 960 // (pattern begins with ^ or \A) 961 if (startPos > fActiveStart) { 962 fMatch = FALSE; 963 return FALSE; 964 } 965 MatchChunkAt(startPos, FALSE, fDeferredStatus); 966 if (U_FAILURE(fDeferredStatus)) { 967 return FALSE; 968 } 969 return fMatch; 970 971 972 case START_SET: 973 { 974 // Match may start on any char from a pre-computed set. 975 U_ASSERT(fPattern->fMinMatchLen > 0); 976 for (;;) { 977 int32_t pos = startPos; 978 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 979 if ((c<256 && fPattern->fInitialChars8->contains(c)) || 980 (c>=256 && fPattern->fInitialChars->contains(c))) { 981 MatchChunkAt(pos, FALSE, fDeferredStatus); 982 if (U_FAILURE(fDeferredStatus)) { 983 return FALSE; 984 } 985 if (fMatch) { 986 return TRUE; 987 } 988 } 989 if (pos >= testLen) { 990 fMatch = FALSE; 991 fHitEnd = TRUE; 992 return FALSE; 993 } 994 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 995 return FALSE; 996 } 997 } 998 U_ASSERT(FALSE); 999 1000 case START_STRING: 1001 case START_CHAR: 1002 { 1003 // Match starts on exactly one char. 1004 U_ASSERT(fPattern->fMinMatchLen > 0); 1005 UChar32 theChar = fPattern->fInitialChar; 1006 for (;;) { 1007 int32_t pos = startPos; 1008 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++]; 1009 if (c == theChar) { 1010 MatchChunkAt(pos, FALSE, fDeferredStatus); 1011 if (U_FAILURE(fDeferredStatus)) { 1012 return FALSE; 1013 } 1014 if (fMatch) { 1015 return TRUE; 1016 } 1017 } 1018 if (pos >= testLen) { 1019 fMatch = FALSE; 1020 fHitEnd = TRUE; 1021 return FALSE; 1022 } 1023 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1024 return FALSE; 1025 } 1026 } 1027 U_ASSERT(FALSE); 1028 1029 case START_LINE: 1030 { 1031 UChar32 c; 1032 if (startPos == fAnchorStart) { 1033 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1034 if (U_FAILURE(fDeferredStatus)) { 1035 return FALSE; 1036 } 1037 if (fMatch) { 1038 return TRUE; 1039 } 1040 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1041 } 1042 1043 if (fPattern->fFlags & UREGEX_UNIX_LINES) { 1044 for (;;) { 1045 c = inputBuf[startPos-1]; 1046 if (c == 0x0a) { 1047 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1048 if (U_FAILURE(fDeferredStatus)) { 1049 return FALSE; 1050 } 1051 if (fMatch) { 1052 return TRUE; 1053 } 1054 } 1055 if (startPos >= testLen) { 1056 fMatch = FALSE; 1057 fHitEnd = TRUE; 1058 return FALSE; 1059 } 1060 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1061 // Note that it's perfectly OK for a pattern to have a zero-length 1062 // match at the end of a string, so we must make sure that the loop 1063 // runs with startPos == testLen the last time through. 1064 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1065 return FALSE; 1066 } 1067 } else { 1068 for (;;) { 1069 c = inputBuf[startPos-1]; 1070 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 1071 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029 )) { 1072 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) { 1073 startPos++; 1074 } 1075 MatchChunkAt(startPos, FALSE, fDeferredStatus); 1076 if (U_FAILURE(fDeferredStatus)) { 1077 return FALSE; 1078 } 1079 if (fMatch) { 1080 return TRUE; 1081 } 1082 } 1083 if (startPos >= testLen) { 1084 fMatch = FALSE; 1085 fHitEnd = TRUE; 1086 return FALSE; 1087 } 1088 U16_FWD_1(inputBuf, startPos, fActiveLimit); 1089 // Note that it's perfectly OK for a pattern to have a zero-length 1090 // match at the end of a string, so we must make sure that the loop 1091 // runs with startPos == testLen the last time through. 1092 if (REGEXFINDPROGRESS_INTERRUPT(startPos, fDeferredStatus)) 1093 return FALSE; 1094 } 1095 } 1096 } 1097 1098 default: 1099 U_ASSERT(FALSE); 1100 } 1101 1102 U_ASSERT(FALSE); 1103 return FALSE; 1104} 1105 1106 1107 1108//-------------------------------------------------------------------------------- 1109// 1110// group() 1111// 1112//-------------------------------------------------------------------------------- 1113UnicodeString RegexMatcher::group(UErrorCode &status) const { 1114 return group(0, status); 1115} 1116 1117// Return immutable shallow clone 1118UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const { 1119 return group(0, dest, group_len, status); 1120} 1121 1122// Return immutable shallow clone 1123UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const { 1124 group_len = 0; 1125 UBool bailOut = FALSE; 1126 if (U_FAILURE(status)) { 1127 return dest; 1128 } 1129 if (U_FAILURE(fDeferredStatus)) { 1130 status = fDeferredStatus; 1131 bailOut = TRUE; 1132 } 1133 if (fMatch == FALSE) { 1134 status = U_REGEX_INVALID_STATE; 1135 bailOut = TRUE; 1136 } 1137 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1138 status = U_INDEX_OUTOFBOUNDS_ERROR; 1139 bailOut = TRUE; 1140 } 1141 1142 if (bailOut) { 1143 return (dest) ? dest : utext_openUChars(NULL, NULL, 0, &status); 1144 } 1145 1146 int64_t s, e; 1147 if (groupNum == 0) { 1148 s = fMatchStart; 1149 e = fMatchEnd; 1150 } else { 1151 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1152 U_ASSERT(groupOffset < fPattern->fFrameSize); 1153 U_ASSERT(groupOffset >= 0); 1154 s = fFrame->fExtra[groupOffset]; 1155 e = fFrame->fExtra[groupOffset+1]; 1156 } 1157 1158 if (s < 0) { 1159 // A capture group wasn't part of the match 1160 return utext_clone(dest, fInputText, FALSE, TRUE, &status); 1161 } 1162 U_ASSERT(s <= e); 1163 group_len = e - s; 1164 1165 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status); 1166 if (dest) 1167 UTEXT_SETNATIVEINDEX(dest, s); 1168 return dest; 1169} 1170 1171UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const { 1172 UnicodeString result; 1173 if (U_FAILURE(status)) { 1174 return result; 1175 } 1176 UText resultText = UTEXT_INITIALIZER; 1177 utext_openUnicodeString(&resultText, &result, &status); 1178 group(groupNum, &resultText, status); 1179 utext_close(&resultText); 1180 return result; 1181} 1182 1183 1184// Return deep (mutable) clone 1185// Technology Preview (as an API), but note that the UnicodeString API is implemented 1186// using this function. 1187UText *RegexMatcher::group(int32_t groupNum, UText *dest, UErrorCode &status) const { 1188 UBool bailOut = FALSE; 1189 if (U_FAILURE(status)) { 1190 return dest; 1191 } 1192 if (U_FAILURE(fDeferredStatus)) { 1193 status = fDeferredStatus; 1194 bailOut = TRUE; 1195 } 1196 1197 if (fMatch == FALSE) { 1198 status = U_REGEX_INVALID_STATE; 1199 bailOut = TRUE; 1200 } 1201 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1202 status = U_INDEX_OUTOFBOUNDS_ERROR; 1203 bailOut = TRUE; 1204 } 1205 1206 if (bailOut) { 1207 if (dest) { 1208 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1209 return dest; 1210 } else { 1211 return utext_openUChars(NULL, NULL, 0, &status); 1212 } 1213 } 1214 1215 int64_t s, e; 1216 if (groupNum == 0) { 1217 s = fMatchStart; 1218 e = fMatchEnd; 1219 } else { 1220 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1221 U_ASSERT(groupOffset < fPattern->fFrameSize); 1222 U_ASSERT(groupOffset >= 0); 1223 s = fFrame->fExtra[groupOffset]; 1224 e = fFrame->fExtra[groupOffset+1]; 1225 } 1226 1227 if (s < 0) { 1228 // A capture group wasn't part of the match 1229 if (dest) { 1230 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1231 return dest; 1232 } else { 1233 return utext_openUChars(NULL, NULL, 0, &status); 1234 } 1235 } 1236 U_ASSERT(s <= e); 1237 1238 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1239 U_ASSERT(e <= fInputLength); 1240 if (dest) { 1241 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents+s, (int32_t)(e-s), &status); 1242 } else { 1243 UText groupText = UTEXT_INITIALIZER; 1244 utext_openUChars(&groupText, fInputText->chunkContents+s, e-s, &status); 1245 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1246 utext_close(&groupText); 1247 } 1248 } else { 1249 int32_t len16; 1250 if (UTEXT_USES_U16(fInputText)) { 1251 len16 = (int32_t)(e-s); 1252 } else { 1253 UErrorCode lengthStatus = U_ZERO_ERROR; 1254 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1255 } 1256 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1257 if (groupChars == NULL) { 1258 status = U_MEMORY_ALLOCATION_ERROR; 1259 return dest; 1260 } 1261 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1262 1263 if (dest) { 1264 utext_replace(dest, 0, utext_nativeLength(dest), groupChars, len16, &status); 1265 } else { 1266 UText groupText = UTEXT_INITIALIZER; 1267 utext_openUChars(&groupText, groupChars, len16, &status); 1268 dest = utext_clone(NULL, &groupText, TRUE, FALSE, &status); 1269 utext_close(&groupText); 1270 } 1271 1272 uprv_free(groupChars); 1273 } 1274 return dest; 1275} 1276 1277//-------------------------------------------------------------------------------- 1278// 1279// appendGroup() -- currently internal only, appends a group to a UText rather 1280// than replacing its contents 1281// 1282//-------------------------------------------------------------------------------- 1283 1284int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const { 1285 if (U_FAILURE(status)) { 1286 return 0; 1287 } 1288 if (U_FAILURE(fDeferredStatus)) { 1289 status = fDeferredStatus; 1290 return 0; 1291 } 1292 int64_t destLen = utext_nativeLength(dest); 1293 1294 if (fMatch == FALSE) { 1295 status = U_REGEX_INVALID_STATE; 1296 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1297 } 1298 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) { 1299 status = U_INDEX_OUTOFBOUNDS_ERROR; 1300 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1301 } 1302 1303 int64_t s, e; 1304 if (groupNum == 0) { 1305 s = fMatchStart; 1306 e = fMatchEnd; 1307 } else { 1308 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1); 1309 U_ASSERT(groupOffset < fPattern->fFrameSize); 1310 U_ASSERT(groupOffset >= 0); 1311 s = fFrame->fExtra[groupOffset]; 1312 e = fFrame->fExtra[groupOffset+1]; 1313 } 1314 1315 if (s < 0) { 1316 // A capture group wasn't part of the match 1317 return utext_replace(dest, destLen, destLen, NULL, 0, &status); 1318 } 1319 U_ASSERT(s <= e); 1320 1321 int64_t deltaLen; 1322 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1323 U_ASSERT(e <= fInputLength); 1324 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status); 1325 } else { 1326 int32_t len16; 1327 if (UTEXT_USES_U16(fInputText)) { 1328 len16 = (int32_t)(e-s); 1329 } else { 1330 UErrorCode lengthStatus = U_ZERO_ERROR; 1331 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus); 1332 } 1333 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1)); 1334 if (groupChars == NULL) { 1335 status = U_MEMORY_ALLOCATION_ERROR; 1336 return 0; 1337 } 1338 utext_extract(fInputText, s, e, groupChars, len16+1, &status); 1339 1340 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status); 1341 uprv_free(groupChars); 1342 } 1343 return deltaLen; 1344} 1345 1346 1347 1348//-------------------------------------------------------------------------------- 1349// 1350// groupCount() 1351// 1352//-------------------------------------------------------------------------------- 1353int32_t RegexMatcher::groupCount() const { 1354 return fPattern->fGroupMap->size(); 1355} 1356 1357 1358 1359//-------------------------------------------------------------------------------- 1360// 1361// hasAnchoringBounds() 1362// 1363//-------------------------------------------------------------------------------- 1364UBool RegexMatcher::hasAnchoringBounds() const { 1365 return fAnchoringBounds; 1366} 1367 1368 1369//-------------------------------------------------------------------------------- 1370// 1371// hasTransparentBounds() 1372// 1373//-------------------------------------------------------------------------------- 1374UBool RegexMatcher::hasTransparentBounds() const { 1375 return fTransparentBounds; 1376} 1377 1378 1379 1380//-------------------------------------------------------------------------------- 1381// 1382// hitEnd() 1383// 1384//-------------------------------------------------------------------------------- 1385UBool RegexMatcher::hitEnd() const { 1386 return fHitEnd; 1387} 1388 1389 1390//-------------------------------------------------------------------------------- 1391// 1392// input() 1393// 1394//-------------------------------------------------------------------------------- 1395const UnicodeString &RegexMatcher::input() const { 1396 if (!fInput) { 1397 UErrorCode status = U_ZERO_ERROR; 1398 int32_t len16; 1399 if (UTEXT_USES_U16(fInputText)) { 1400 len16 = (int32_t)fInputLength; 1401 } else { 1402 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status); 1403 status = U_ZERO_ERROR; // overflow, length status 1404 } 1405 UnicodeString *result = new UnicodeString(len16, 0, 0); 1406 1407 UChar *inputChars = result->getBuffer(len16); 1408 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning 1409 result->releaseBuffer(len16); 1410 1411 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator= 1412 } 1413 1414 return *fInput; 1415} 1416 1417//-------------------------------------------------------------------------------- 1418// 1419// inputText() 1420// 1421//-------------------------------------------------------------------------------- 1422UText *RegexMatcher::inputText() const { 1423 return fInputText; 1424} 1425 1426 1427//-------------------------------------------------------------------------------- 1428// 1429// getInput() -- like inputText(), but makes a clone or copies into another UText 1430// 1431//-------------------------------------------------------------------------------- 1432UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const { 1433 UBool bailOut = FALSE; 1434 if (U_FAILURE(status)) { 1435 return dest; 1436 } 1437 if (U_FAILURE(fDeferredStatus)) { 1438 status = fDeferredStatus; 1439 bailOut = TRUE; 1440 } 1441 1442 if (bailOut) { 1443 if (dest) { 1444 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, &status); 1445 return dest; 1446 } else { 1447 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1448 } 1449 } 1450 1451 if (dest) { 1452 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1453 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status); 1454 } else { 1455 int32_t input16Len; 1456 if (UTEXT_USES_U16(fInputText)) { 1457 input16Len = (int32_t)fInputLength; 1458 } else { 1459 UErrorCode lengthStatus = U_ZERO_ERROR; 1460 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error 1461 } 1462 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len)); 1463 if (inputChars == NULL) { 1464 return dest; 1465 } 1466 1467 status = U_ZERO_ERROR; 1468 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning 1469 status = U_ZERO_ERROR; 1470 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status); 1471 1472 uprv_free(inputChars); 1473 } 1474 return dest; 1475 } else { 1476 return utext_clone(NULL, fInputText, FALSE, TRUE, &status); 1477 } 1478} 1479 1480 1481static UBool compat_SyncMutableUTextContents(UText *ut); 1482static UBool compat_SyncMutableUTextContents(UText *ut) { 1483 UBool retVal = FALSE; 1484 1485 // In the following test, we're really only interested in whether the UText should switch 1486 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents 1487 // will still point to the correct data. 1488 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) { 1489 UnicodeString *us=(UnicodeString *)ut->context; 1490 1491 // Update to the latest length. 1492 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit). 1493 int32_t newLength = us->length(); 1494 1495 // Update the chunk description. 1496 // The buffer may have switched between stack- and heap-based. 1497 ut->chunkContents = us->getBuffer(); 1498 ut->chunkLength = newLength; 1499 ut->chunkNativeLimit = newLength; 1500 ut->nativeIndexingLimit = newLength; 1501 retVal = TRUE; 1502 } 1503 1504 return retVal; 1505} 1506 1507//-------------------------------------------------------------------------------- 1508// 1509// lookingAt() 1510// 1511//-------------------------------------------------------------------------------- 1512UBool RegexMatcher::lookingAt(UErrorCode &status) { 1513 if (U_FAILURE(status)) { 1514 return FALSE; 1515 } 1516 if (U_FAILURE(fDeferredStatus)) { 1517 status = fDeferredStatus; 1518 return FALSE; 1519 } 1520 1521 if (fInputUniStrMaybeMutable) { 1522 if (compat_SyncMutableUTextContents(fInputText)) { 1523 fInputLength = utext_nativeLength(fInputText); 1524 reset(); 1525 } 1526 } 1527 else { 1528 resetPreserveRegion(); 1529 } 1530 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1531 MatchChunkAt((int32_t)fActiveStart, FALSE, status); 1532 } else { 1533 MatchAt(fActiveStart, FALSE, status); 1534 } 1535 return fMatch; 1536} 1537 1538 1539UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) { 1540 if (U_FAILURE(status)) { 1541 return FALSE; 1542 } 1543 if (U_FAILURE(fDeferredStatus)) { 1544 status = fDeferredStatus; 1545 return FALSE; 1546 } 1547 reset(); 1548 1549 if (start < 0) { 1550 status = U_INDEX_OUTOFBOUNDS_ERROR; 1551 return FALSE; 1552 } 1553 1554 if (fInputUniStrMaybeMutable) { 1555 if (compat_SyncMutableUTextContents(fInputText)) { 1556 fInputLength = utext_nativeLength(fInputText); 1557 reset(); 1558 } 1559 } 1560 1561 int64_t nativeStart; 1562 nativeStart = start; 1563 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1564 status = U_INDEX_OUTOFBOUNDS_ERROR; 1565 return FALSE; 1566 } 1567 1568 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1569 MatchChunkAt((int32_t)nativeStart, FALSE, status); 1570 } else { 1571 MatchAt(nativeStart, FALSE, status); 1572 } 1573 return fMatch; 1574} 1575 1576 1577 1578//-------------------------------------------------------------------------------- 1579// 1580// matches() 1581// 1582//-------------------------------------------------------------------------------- 1583UBool RegexMatcher::matches(UErrorCode &status) { 1584 if (U_FAILURE(status)) { 1585 return FALSE; 1586 } 1587 if (U_FAILURE(fDeferredStatus)) { 1588 status = fDeferredStatus; 1589 return FALSE; 1590 } 1591 1592 if (fInputUniStrMaybeMutable) { 1593 if (compat_SyncMutableUTextContents(fInputText)) { 1594 fInputLength = utext_nativeLength(fInputText); 1595 reset(); 1596 } 1597 } 1598 else { 1599 resetPreserveRegion(); 1600 } 1601 1602 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1603 MatchChunkAt((int32_t)fActiveStart, TRUE, status); 1604 } else { 1605 MatchAt(fActiveStart, TRUE, status); 1606 } 1607 return fMatch; 1608} 1609 1610 1611UBool RegexMatcher::matches(int64_t start, UErrorCode &status) { 1612 if (U_FAILURE(status)) { 1613 return FALSE; 1614 } 1615 if (U_FAILURE(fDeferredStatus)) { 1616 status = fDeferredStatus; 1617 return FALSE; 1618 } 1619 reset(); 1620 1621 if (start < 0) { 1622 status = U_INDEX_OUTOFBOUNDS_ERROR; 1623 return FALSE; 1624 } 1625 1626 if (fInputUniStrMaybeMutable) { 1627 if (compat_SyncMutableUTextContents(fInputText)) { 1628 fInputLength = utext_nativeLength(fInputText); 1629 reset(); 1630 } 1631 } 1632 1633 int64_t nativeStart; 1634 nativeStart = start; 1635 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) { 1636 status = U_INDEX_OUTOFBOUNDS_ERROR; 1637 return FALSE; 1638 } 1639 1640 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) { 1641 MatchChunkAt((int32_t)nativeStart, TRUE, status); 1642 } else { 1643 MatchAt(nativeStart, TRUE, status); 1644 } 1645 return fMatch; 1646} 1647 1648 1649 1650//-------------------------------------------------------------------------------- 1651// 1652// pattern 1653// 1654//-------------------------------------------------------------------------------- 1655const RegexPattern &RegexMatcher::pattern() const { 1656 return *fPattern; 1657} 1658 1659 1660 1661//-------------------------------------------------------------------------------- 1662// 1663// region 1664// 1665//-------------------------------------------------------------------------------- 1666RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) { 1667 if (U_FAILURE(status)) { 1668 return *this; 1669 } 1670 1671 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) { 1672 status = U_ILLEGAL_ARGUMENT_ERROR; 1673 } 1674 1675 int64_t nativeStart = regionStart; 1676 int64_t nativeLimit = regionLimit; 1677 if (nativeStart > fInputLength || nativeLimit > fInputLength) { 1678 status = U_ILLEGAL_ARGUMENT_ERROR; 1679 } 1680 1681 if (startIndex == -1) 1682 this->reset(); 1683 else 1684 resetPreserveRegion(); 1685 1686 fRegionStart = nativeStart; 1687 fRegionLimit = nativeLimit; 1688 fActiveStart = nativeStart; 1689 fActiveLimit = nativeLimit; 1690 1691 if (startIndex != -1) { 1692 if (startIndex < fActiveStart || startIndex > fActiveLimit) { 1693 status = U_INDEX_OUTOFBOUNDS_ERROR; 1694 } 1695 fMatchEnd = startIndex; 1696 } 1697 1698 if (!fTransparentBounds) { 1699 fLookStart = nativeStart; 1700 fLookLimit = nativeLimit; 1701 } 1702 if (fAnchoringBounds) { 1703 fAnchorStart = nativeStart; 1704 fAnchorLimit = nativeLimit; 1705 } 1706 return *this; 1707} 1708 1709RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) { 1710 return region(start, limit, -1, status); 1711} 1712 1713//-------------------------------------------------------------------------------- 1714// 1715// regionEnd 1716// 1717//-------------------------------------------------------------------------------- 1718int32_t RegexMatcher::regionEnd() const { 1719 return (int32_t)fRegionLimit; 1720} 1721 1722int64_t RegexMatcher::regionEnd64() const { 1723 return fRegionLimit; 1724} 1725 1726//-------------------------------------------------------------------------------- 1727// 1728// regionStart 1729// 1730//-------------------------------------------------------------------------------- 1731int32_t RegexMatcher::regionStart() const { 1732 return (int32_t)fRegionStart; 1733} 1734 1735int64_t RegexMatcher::regionStart64() const { 1736 return fRegionStart; 1737} 1738 1739 1740//-------------------------------------------------------------------------------- 1741// 1742// replaceAll 1743// 1744//-------------------------------------------------------------------------------- 1745UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) { 1746 UText replacementText = UTEXT_INITIALIZER; 1747 UText resultText = UTEXT_INITIALIZER; 1748 UnicodeString resultString; 1749 if (U_FAILURE(status)) { 1750 return resultString; 1751 } 1752 1753 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1754 utext_openUnicodeString(&resultText, &resultString, &status); 1755 1756 replaceAll(&replacementText, &resultText, status); 1757 1758 utext_close(&resultText); 1759 utext_close(&replacementText); 1760 1761 return resultString; 1762} 1763 1764 1765// 1766// replaceAll, UText mode 1767// 1768UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) { 1769 if (U_FAILURE(status)) { 1770 return dest; 1771 } 1772 if (U_FAILURE(fDeferredStatus)) { 1773 status = fDeferredStatus; 1774 return dest; 1775 } 1776 1777 if (dest == NULL) { 1778 UnicodeString emptyString; 1779 UText empty = UTEXT_INITIALIZER; 1780 1781 utext_openUnicodeString(&empty, &emptyString, &status); 1782 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1783 utext_close(&empty); 1784 } 1785 1786 if (U_SUCCESS(status)) { 1787 reset(); 1788 while (find()) { 1789 appendReplacement(dest, replacement, status); 1790 if (U_FAILURE(status)) { 1791 break; 1792 } 1793 } 1794 appendTail(dest, status); 1795 } 1796 1797 return dest; 1798} 1799 1800 1801//-------------------------------------------------------------------------------- 1802// 1803// replaceFirst 1804// 1805//-------------------------------------------------------------------------------- 1806UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) { 1807 UText replacementText = UTEXT_INITIALIZER; 1808 UText resultText = UTEXT_INITIALIZER; 1809 UnicodeString resultString; 1810 1811 utext_openConstUnicodeString(&replacementText, &replacement, &status); 1812 utext_openUnicodeString(&resultText, &resultString, &status); 1813 1814 replaceFirst(&replacementText, &resultText, status); 1815 1816 utext_close(&resultText); 1817 utext_close(&replacementText); 1818 1819 return resultString; 1820} 1821 1822// 1823// replaceFirst, UText mode 1824// 1825UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) { 1826 if (U_FAILURE(status)) { 1827 return dest; 1828 } 1829 if (U_FAILURE(fDeferredStatus)) { 1830 status = fDeferredStatus; 1831 return dest; 1832 } 1833 1834 reset(); 1835 if (!find()) { 1836 return getInput(dest, status); 1837 } 1838 1839 if (dest == NULL) { 1840 UnicodeString emptyString; 1841 UText empty = UTEXT_INITIALIZER; 1842 1843 utext_openUnicodeString(&empty, &emptyString, &status); 1844 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status); 1845 utext_close(&empty); 1846 } 1847 1848 appendReplacement(dest, replacement, status); 1849 appendTail(dest, status); 1850 1851 return dest; 1852} 1853 1854 1855//-------------------------------------------------------------------------------- 1856// 1857// requireEnd 1858// 1859//-------------------------------------------------------------------------------- 1860UBool RegexMatcher::requireEnd() const { 1861 return fRequireEnd; 1862} 1863 1864 1865//-------------------------------------------------------------------------------- 1866// 1867// reset 1868// 1869//-------------------------------------------------------------------------------- 1870RegexMatcher &RegexMatcher::reset() { 1871 fRegionStart = 0; 1872 fRegionLimit = fInputLength; 1873 fActiveStart = 0; 1874 fActiveLimit = fInputLength; 1875 fAnchorStart = 0; 1876 fAnchorLimit = fInputLength; 1877 fLookStart = 0; 1878 fLookLimit = fInputLength; 1879 resetPreserveRegion(); 1880 return *this; 1881} 1882 1883 1884 1885void RegexMatcher::resetPreserveRegion() { 1886 fMatchStart = 0; 1887 fMatchEnd = 0; 1888 fLastMatchEnd = -1; 1889 fAppendPosition = 0; 1890 fMatch = FALSE; 1891 fHitEnd = FALSE; 1892 fRequireEnd = FALSE; 1893 fTime = 0; 1894 fTickCounter = TIMER_INITIAL_VALUE; 1895 //resetStack(); // more expensive than it looks... 1896} 1897 1898 1899RegexMatcher &RegexMatcher::reset(const UnicodeString &input) { 1900 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus); 1901 if (fPattern->fNeedsAltInput) { 1902 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1903 } 1904 fInputLength = utext_nativeLength(fInputText); 1905 1906 reset(); 1907 delete fInput; 1908 fInput = NULL; 1909 1910 // Do the following for any UnicodeString. 1911 // This is for compatibility for those clients who modify the input string "live" during regex operations. 1912 fInputUniStrMaybeMutable = TRUE; 1913 1914 if (fWordBreakItr != NULL) { 1915#if UCONFIG_NO_BREAK_ITERATION==0 1916 UErrorCode status = U_ZERO_ERROR; 1917 fWordBreakItr->setText(fInputText, status); 1918#endif 1919 } 1920 return *this; 1921} 1922 1923 1924RegexMatcher &RegexMatcher::reset(UText *input) { 1925 if (fInputText != input) { 1926 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus); 1927 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus); 1928 fInputLength = utext_nativeLength(fInputText); 1929 1930 delete fInput; 1931 fInput = NULL; 1932 1933 if (fWordBreakItr != NULL) { 1934#if UCONFIG_NO_BREAK_ITERATION==0 1935 UErrorCode status = U_ZERO_ERROR; 1936 fWordBreakItr->setText(input, status); 1937#endif 1938 } 1939 } 1940 reset(); 1941 fInputUniStrMaybeMutable = FALSE; 1942 1943 return *this; 1944} 1945 1946/*RegexMatcher &RegexMatcher::reset(const UChar *) { 1947 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR; 1948 return *this; 1949}*/ 1950 1951RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) { 1952 if (U_FAILURE(status)) { 1953 return *this; 1954 } 1955 reset(); // Reset also resets the region to be the entire string. 1956 1957 if (position < 0 || position > fActiveLimit) { 1958 status = U_INDEX_OUTOFBOUNDS_ERROR; 1959 return *this; 1960 } 1961 fMatchEnd = position; 1962 return *this; 1963} 1964 1965 1966// BEGIN android-added 1967// Removed this function after Android upgrad to ICU4.8. 1968//-------------------------------------------------------------------------------- 1969// 1970// refresh 1971// 1972//-------------------------------------------------------------------------------- 1973RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) { 1974 if (U_FAILURE(status)) { 1975 return *this; 1976 } 1977 if (input == NULL) { 1978 status = U_ILLEGAL_ARGUMENT_ERROR; 1979 return *this; 1980 } 1981 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) { 1982 status = U_ILLEGAL_ARGUMENT_ERROR; 1983 return *this; 1984 } 1985 int64_t pos = utext_getNativeIndex(fInputText); 1986 // Shallow read-only clone of the new UText into the existing input UText 1987 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status); 1988 if (U_FAILURE(status)) { 1989 return *this; 1990 } 1991 utext_setNativeIndex(fInputText, pos); 1992 1993 if (fAltInputText != NULL) { 1994 pos = utext_getNativeIndex(fAltInputText); 1995 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status); 1996 if (U_FAILURE(status)) { 1997 return *this; 1998 } 1999 utext_setNativeIndex(fAltInputText, pos); 2000 } 2001 return *this; 2002} 2003// END android-added 2004 2005 2006//-------------------------------------------------------------------------------- 2007// 2008// setTrace 2009// 2010//-------------------------------------------------------------------------------- 2011void RegexMatcher::setTrace(UBool state) { 2012 fTraceDebug = state; 2013} 2014 2015 2016 2017//--------------------------------------------------------------------- 2018// 2019// split 2020// 2021//--------------------------------------------------------------------- 2022int32_t RegexMatcher::split(const UnicodeString &input, 2023 UnicodeString dest[], 2024 int32_t destCapacity, 2025 UErrorCode &status) 2026{ 2027 UText inputText = UTEXT_INITIALIZER; 2028 utext_openConstUnicodeString(&inputText, &input, &status); 2029 if (U_FAILURE(status)) { 2030 return 0; 2031 } 2032 2033 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity); 2034 if (destText == NULL) { 2035 status = U_MEMORY_ALLOCATION_ERROR; 2036 return 0; 2037 } 2038 int32_t i; 2039 for (i = 0; i < destCapacity; i++) { 2040 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status); 2041 } 2042 2043 int32_t fieldCount = split(&inputText, destText, destCapacity, status); 2044 2045 for (i = 0; i < destCapacity; i++) { 2046 utext_close(destText[i]); 2047 } 2048 2049 uprv_free(destText); 2050 utext_close(&inputText); 2051 return fieldCount; 2052} 2053 2054// 2055// split, UText mode 2056// 2057int32_t RegexMatcher::split(UText *input, 2058 UText *dest[], 2059 int32_t destCapacity, 2060 UErrorCode &status) 2061{ 2062 // 2063 // Check arguements for validity 2064 // 2065 if (U_FAILURE(status)) { 2066 return 0; 2067 }; 2068 2069 if (destCapacity < 1) { 2070 status = U_ILLEGAL_ARGUMENT_ERROR; 2071 return 0; 2072 } 2073 2074 // 2075 // Reset for the input text 2076 // 2077 reset(input); 2078 int64_t nextOutputStringStart = 0; 2079 if (fActiveLimit == 0) { 2080 return 0; 2081 } 2082 2083 // 2084 // Loop through the input text, searching for the delimiter pattern 2085 // 2086 int32_t i; 2087 int32_t numCaptureGroups = fPattern->fGroupMap->size(); 2088 for (i=0; ; i++) { 2089 if (i>=destCapacity-1) { 2090 // There is one or zero output string left. 2091 // Fill the last output string with whatever is left from the input, then exit the loop. 2092 // ( i will be == destCapacity if we filled the output array while processing 2093 // capture groups of the delimiter expression, in which case we will discard the 2094 // last capture group saved in favor of the unprocessed remainder of the 2095 // input string.) 2096 i = destCapacity-1; 2097 if (fActiveLimit > nextOutputStringStart) { 2098 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2099 if (dest[i]) { 2100 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2101 input->chunkContents+nextOutputStringStart, 2102 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2103 } else { 2104 UText remainingText = UTEXT_INITIALIZER; 2105 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2106 fActiveLimit-nextOutputStringStart, &status); 2107 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2108 utext_close(&remainingText); 2109 } 2110 } else { 2111 UErrorCode lengthStatus = U_ZERO_ERROR; 2112 int32_t remaining16Length = 2113 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2114 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2115 if (remainingChars == NULL) { 2116 status = U_MEMORY_ALLOCATION_ERROR; 2117 break; 2118 } 2119 2120 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2121 if (dest[i]) { 2122 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2123 } else { 2124 UText remainingText = UTEXT_INITIALIZER; 2125 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2126 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2127 utext_close(&remainingText); 2128 } 2129 2130 uprv_free(remainingChars); 2131 } 2132 } 2133 break; 2134 } 2135 if (find()) { 2136 // We found another delimiter. Move everything from where we started looking 2137 // up until the start of the delimiter into the next output string. 2138 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2139 if (dest[i]) { 2140 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2141 input->chunkContents+nextOutputStringStart, 2142 (int32_t)(fMatchStart-nextOutputStringStart), &status); 2143 } else { 2144 UText remainingText = UTEXT_INITIALIZER; 2145 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2146 fMatchStart-nextOutputStringStart, &status); 2147 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2148 utext_close(&remainingText); 2149 } 2150 } else { 2151 UErrorCode lengthStatus = U_ZERO_ERROR; 2152 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus); 2153 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2154 if (remainingChars == NULL) { 2155 status = U_MEMORY_ALLOCATION_ERROR; 2156 break; 2157 } 2158 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status); 2159 if (dest[i]) { 2160 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2161 } else { 2162 UText remainingText = UTEXT_INITIALIZER; 2163 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2164 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2165 utext_close(&remainingText); 2166 } 2167 2168 uprv_free(remainingChars); 2169 } 2170 nextOutputStringStart = fMatchEnd; 2171 2172 // If the delimiter pattern has capturing parentheses, the captured 2173 // text goes out into the next n destination strings. 2174 int32_t groupNum; 2175 UBool lastGroupWasNullUText = FALSE; 2176 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) { 2177 if (i==destCapacity-1) { 2178 break; 2179 } 2180 i++; 2181 lastGroupWasNullUText = (dest[i] == NULL ? TRUE : FALSE); 2182 dest[i] = group(groupNum, dest[i], status); 2183 } 2184 2185 if (nextOutputStringStart == fActiveLimit) { 2186 // The delimiter was at the end of the string. We're done. 2187 break; 2188 } else if (i == destCapacity-1) { 2189 // We're out of capture groups, and the rest of the string is more important 2190 if (lastGroupWasNullUText) { 2191 utext_close(dest[i]); 2192 dest[i] = NULL; 2193 } 2194 } 2195 2196 } 2197 else 2198 { 2199 // We ran off the end of the input while looking for the next delimiter. 2200 // All the remaining text goes into the current output string. 2201 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) { 2202 if (dest[i]) { 2203 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), 2204 input->chunkContents+nextOutputStringStart, 2205 (int32_t)(fActiveLimit-nextOutputStringStart), &status); 2206 } else { 2207 UText remainingText = UTEXT_INITIALIZER; 2208 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart, 2209 fActiveLimit-nextOutputStringStart, &status); 2210 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2211 utext_close(&remainingText); 2212 } 2213 } else { 2214 UErrorCode lengthStatus = U_ZERO_ERROR; 2215 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus); 2216 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1)); 2217 if (remainingChars == NULL) { 2218 status = U_MEMORY_ALLOCATION_ERROR; 2219 break; 2220 } 2221 2222 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status); 2223 if (dest[i]) { 2224 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status); 2225 } else { 2226 UText remainingText = UTEXT_INITIALIZER; 2227 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status); 2228 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status); 2229 utext_close(&remainingText); 2230 } 2231 2232 uprv_free(remainingChars); 2233 } 2234 break; 2235 } 2236 if (U_FAILURE(status)) { 2237 break; 2238 } 2239 } // end of for loop 2240 return i+1; 2241} 2242 2243 2244//-------------------------------------------------------------------------------- 2245// 2246// start 2247// 2248//-------------------------------------------------------------------------------- 2249int32_t RegexMatcher::start(UErrorCode &status) const { 2250 return start(0, status); 2251} 2252 2253int64_t RegexMatcher::start64(UErrorCode &status) const { 2254 return start64(0, status); 2255} 2256 2257//-------------------------------------------------------------------------------- 2258// 2259// start(int32_t group, UErrorCode &status) 2260// 2261//-------------------------------------------------------------------------------- 2262 2263int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const { 2264 if (U_FAILURE(status)) { 2265 return -1; 2266 } 2267 if (U_FAILURE(fDeferredStatus)) { 2268 status = fDeferredStatus; 2269 return -1; 2270 } 2271 if (fMatch == FALSE) { 2272 status = U_REGEX_INVALID_STATE; 2273 return -1; 2274 } 2275 if (group < 0 || group > fPattern->fGroupMap->size()) { 2276 status = U_INDEX_OUTOFBOUNDS_ERROR; 2277 return -1; 2278 } 2279 int64_t s; 2280 if (group == 0) { 2281 s = fMatchStart; 2282 } else { 2283 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1); 2284 U_ASSERT(groupOffset < fPattern->fFrameSize); 2285 U_ASSERT(groupOffset >= 0); 2286 s = fFrame->fExtra[groupOffset]; 2287 } 2288 2289 return s; 2290} 2291 2292 2293int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const { 2294 return (int32_t)start64(group, status); 2295} 2296 2297//-------------------------------------------------------------------------------- 2298// 2299// useAnchoringBounds 2300// 2301//-------------------------------------------------------------------------------- 2302RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) { 2303 fAnchoringBounds = b; 2304 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0); 2305 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength); 2306 return *this; 2307} 2308 2309 2310//-------------------------------------------------------------------------------- 2311// 2312// useTransparentBounds 2313// 2314//-------------------------------------------------------------------------------- 2315RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) { 2316 fTransparentBounds = b; 2317 fLookStart = (fTransparentBounds ? 0 : fRegionStart); 2318 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit); 2319 return *this; 2320} 2321 2322//-------------------------------------------------------------------------------- 2323// 2324// setTimeLimit 2325// 2326//-------------------------------------------------------------------------------- 2327void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) { 2328 if (U_FAILURE(status)) { 2329 return; 2330 } 2331 if (U_FAILURE(fDeferredStatus)) { 2332 status = fDeferredStatus; 2333 return; 2334 } 2335 if (limit < 0) { 2336 status = U_ILLEGAL_ARGUMENT_ERROR; 2337 return; 2338 } 2339 fTimeLimit = limit; 2340} 2341 2342 2343//-------------------------------------------------------------------------------- 2344// 2345// getTimeLimit 2346// 2347//-------------------------------------------------------------------------------- 2348int32_t RegexMatcher::getTimeLimit() const { 2349 return fTimeLimit; 2350} 2351 2352 2353//-------------------------------------------------------------------------------- 2354// 2355// setStackLimit 2356// 2357//-------------------------------------------------------------------------------- 2358void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) { 2359 if (U_FAILURE(status)) { 2360 return; 2361 } 2362 if (U_FAILURE(fDeferredStatus)) { 2363 status = fDeferredStatus; 2364 return; 2365 } 2366 if (limit < 0) { 2367 status = U_ILLEGAL_ARGUMENT_ERROR; 2368 return; 2369 } 2370 2371 // Reset the matcher. This is needed here in case there is a current match 2372 // whose final stack frame (containing the match results, pointed to by fFrame) 2373 // would be lost by resizing to a smaller stack size. 2374 reset(); 2375 2376 if (limit == 0) { 2377 // Unlimited stack expansion 2378 fStack->setMaxCapacity(0); 2379 } else { 2380 // Change the units of the limit from bytes to ints, and bump the size up 2381 // to be big enough to hold at least one stack frame for the pattern, 2382 // if it isn't there already. 2383 int32_t adjustedLimit = limit / sizeof(int32_t); 2384 if (adjustedLimit < fPattern->fFrameSize) { 2385 adjustedLimit = fPattern->fFrameSize; 2386 } 2387 fStack->setMaxCapacity(adjustedLimit); 2388 } 2389 fStackLimit = limit; 2390} 2391 2392 2393//-------------------------------------------------------------------------------- 2394// 2395// getStackLimit 2396// 2397//-------------------------------------------------------------------------------- 2398int32_t RegexMatcher::getStackLimit() const { 2399 return fStackLimit; 2400} 2401 2402 2403//-------------------------------------------------------------------------------- 2404// 2405// setMatchCallback 2406// 2407//-------------------------------------------------------------------------------- 2408void RegexMatcher::setMatchCallback(URegexMatchCallback *callback, 2409 const void *context, 2410 UErrorCode &status) { 2411 if (U_FAILURE(status)) { 2412 return; 2413 } 2414 fCallbackFn = callback; 2415 fCallbackContext = context; 2416} 2417 2418 2419//-------------------------------------------------------------------------------- 2420// 2421// getMatchCallback 2422// 2423//-------------------------------------------------------------------------------- 2424void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback, 2425 const void *&context, 2426 UErrorCode &status) { 2427 if (U_FAILURE(status)) { 2428 return; 2429 } 2430 callback = fCallbackFn; 2431 context = fCallbackContext; 2432} 2433 2434 2435//-------------------------------------------------------------------------------- 2436// 2437// setMatchCallback 2438// 2439//-------------------------------------------------------------------------------- 2440void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback, 2441 const void *context, 2442 UErrorCode &status) { 2443 if (U_FAILURE(status)) { 2444 return; 2445 } 2446 fFindProgressCallbackFn = callback; 2447 fFindProgressCallbackContext = context; 2448} 2449 2450 2451//-------------------------------------------------------------------------------- 2452// 2453// getMatchCallback 2454// 2455//-------------------------------------------------------------------------------- 2456void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback, 2457 const void *&context, 2458 UErrorCode &status) { 2459 if (U_FAILURE(status)) { 2460 return; 2461 } 2462 callback = fFindProgressCallbackFn; 2463 context = fFindProgressCallbackContext; 2464} 2465 2466 2467//================================================================================ 2468// 2469// Code following this point in this file is the internal 2470// Match Engine Implementation. 2471// 2472//================================================================================ 2473 2474 2475//-------------------------------------------------------------------------------- 2476// 2477// resetStack 2478// Discard any previous contents of the state save stack, and initialize a 2479// new stack frame to all -1. The -1s are needed for capture group limits, 2480// where they indicate that a group has not yet matched anything. 2481//-------------------------------------------------------------------------------- 2482REStackFrame *RegexMatcher::resetStack() { 2483 // Discard any previous contents of the state save stack, and initialize a 2484 // new stack frame with all -1 data. The -1s are needed for capture group limits, 2485 // where they indicate that a group has not yet matched anything. 2486 fStack->removeAllElements(); 2487 2488 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus); 2489 int32_t i; 2490 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) { 2491 iFrame->fExtra[i] = -1; 2492 } 2493 return iFrame; 2494} 2495 2496 2497 2498//-------------------------------------------------------------------------------- 2499// 2500// isWordBoundary 2501// in perl, "xab..cd..", \b is true at positions 0,3,5,7 2502// For us, 2503// If the current char is a combining mark, 2504// \b is FALSE. 2505// Else Scan backwards to the first non-combining char. 2506// We are at a boundary if the this char and the original chars are 2507// opposite in membership in \w set 2508// 2509// parameters: pos - the current position in the input buffer 2510// 2511// TODO: double-check edge cases at region boundaries. 2512// 2513//-------------------------------------------------------------------------------- 2514UBool RegexMatcher::isWordBoundary(int64_t pos) { 2515 UBool isBoundary = FALSE; 2516 UBool cIsWord = FALSE; 2517 2518 if (pos >= fLookLimit) { 2519 fHitEnd = TRUE; 2520 } else { 2521 // Determine whether char c at current position is a member of the word set of chars. 2522 // If we're off the end of the string, behave as though we're not at a word char. 2523 UTEXT_SETNATIVEINDEX(fInputText, pos); 2524 UChar32 c = UTEXT_CURRENT32(fInputText); 2525 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2526 // Current char is a combining one. Not a boundary. 2527 return FALSE; 2528 } 2529 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2530 } 2531 2532 // Back up until we come to a non-combining char, determine whether 2533 // that char is a word char. 2534 UBool prevCIsWord = FALSE; 2535 for (;;) { 2536 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) { 2537 break; 2538 } 2539 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText); 2540 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2541 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2542 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2543 break; 2544 } 2545 } 2546 isBoundary = cIsWord ^ prevCIsWord; 2547 return isBoundary; 2548} 2549 2550UBool RegexMatcher::isChunkWordBoundary(int32_t pos) { 2551 UBool isBoundary = FALSE; 2552 UBool cIsWord = FALSE; 2553 2554 const UChar *inputBuf = fInputText->chunkContents; 2555 2556 if (pos >= fLookLimit) { 2557 fHitEnd = TRUE; 2558 } else { 2559 // Determine whether char c at current position is a member of the word set of chars. 2560 // If we're off the end of the string, behave as though we're not at a word char. 2561 UChar32 c; 2562 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c); 2563 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) { 2564 // Current char is a combining one. Not a boundary. 2565 return FALSE; 2566 } 2567 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c); 2568 } 2569 2570 // Back up until we come to a non-combining char, determine whether 2571 // that char is a word char. 2572 UBool prevCIsWord = FALSE; 2573 for (;;) { 2574 if (pos <= fLookStart) { 2575 break; 2576 } 2577 UChar32 prevChar; 2578 U16_PREV(inputBuf, fLookStart, pos, prevChar); 2579 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND) 2580 || u_charType(prevChar) == U_FORMAT_CHAR)) { 2581 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar); 2582 break; 2583 } 2584 } 2585 isBoundary = cIsWord ^ prevCIsWord; 2586 return isBoundary; 2587} 2588 2589//-------------------------------------------------------------------------------- 2590// 2591// isUWordBoundary 2592// 2593// Test for a word boundary using RBBI word break. 2594// 2595// parameters: pos - the current position in the input buffer 2596// 2597//-------------------------------------------------------------------------------- 2598UBool RegexMatcher::isUWordBoundary(int64_t pos) { 2599 UBool returnVal = FALSE; 2600#if UCONFIG_NO_BREAK_ITERATION==0 2601 2602 // If we haven't yet created a break iterator for this matcher, do it now. 2603 if (fWordBreakItr == NULL) { 2604 fWordBreakItr = 2605 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus); 2606 if (U_FAILURE(fDeferredStatus)) { 2607 return FALSE; 2608 } 2609 fWordBreakItr->setText(fInputText, fDeferredStatus); 2610 } 2611 2612 if (pos >= fLookLimit) { 2613 fHitEnd = TRUE; 2614 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real" 2615 // words are not boundaries. All non-word chars stand by themselves, 2616 // with word boundaries on both sides. 2617 } else { 2618 if (!UTEXT_USES_U16(fInputText)) { 2619 // !!!: Would like a better way to do this! 2620 UErrorCode status = U_ZERO_ERROR; 2621 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status); 2622 } 2623 returnVal = fWordBreakItr->isBoundary((int32_t)pos); 2624 } 2625#endif 2626 return returnVal; 2627} 2628 2629//-------------------------------------------------------------------------------- 2630// 2631// IncrementTime This function is called once each TIMER_INITIAL_VALUE state 2632// saves. Increment the "time" counter, and call the 2633// user callback function if there is one installed. 2634// 2635// If the match operation needs to be aborted, either for a time-out 2636// or because the user callback asked for it, just set an error status. 2637// The engine will pick that up and stop in its outer loop. 2638// 2639//-------------------------------------------------------------------------------- 2640void RegexMatcher::IncrementTime(UErrorCode &status) { 2641 fTickCounter = TIMER_INITIAL_VALUE; 2642 fTime++; 2643 if (fCallbackFn != NULL) { 2644 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) { 2645 status = U_REGEX_STOPPED_BY_CALLER; 2646 return; 2647 } 2648 } 2649 if (fTimeLimit > 0 && fTime >= fTimeLimit) { 2650 status = U_REGEX_TIME_OUT; 2651 } 2652} 2653 2654//-------------------------------------------------------------------------------- 2655// 2656// ReportFindProgress This function is called once for each advance in the target 2657// string from the find() function, and calls the user progress callback 2658// function if there is one installed. 2659// 2660// NOTE: 2661// 2662// If the match operation needs to be aborted because the user 2663// callback asked for it, just set an error status. 2664// The engine will pick that up and stop in its outer loop. 2665// 2666//-------------------------------------------------------------------------------- 2667UBool RegexMatcher::ReportFindProgress(int64_t matchIndex, UErrorCode &status) { 2668 if (fFindProgressCallbackFn != NULL) { 2669 if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, matchIndex) == FALSE) { 2670 status = U_ZERO_ERROR /*U_REGEX_STOPPED_BY_CALLER*/; 2671 return FALSE; 2672 } 2673 } 2674 return TRUE; 2675} 2676 2677//-------------------------------------------------------------------------------- 2678// 2679// StateSave 2680// Make a new stack frame, initialized as a copy of the current stack frame. 2681// Set the pattern index in the original stack frame from the operand value 2682// in the opcode. Execution of the engine continues with the state in 2683// the newly created stack frame 2684// 2685// Note that reserveBlock() may grow the stack, resulting in the 2686// whole thing being relocated in memory. 2687// 2688// Parameters: 2689// fp The top frame pointer when called. At return, a new 2690// fame will be present 2691// savePatIdx An index into the compiled pattern. Goes into the original 2692// (not new) frame. If execution ever back-tracks out of the 2693// new frame, this will be where we continue from in the pattern. 2694// Return 2695// The new frame pointer. 2696// 2697//-------------------------------------------------------------------------------- 2698inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) { 2699 // push storage for a new frame. 2700 int64_t *newFP = fStack->reserveBlock(fFrameSize, status); 2701 if (newFP == NULL) { 2702 // Failure on attempted stack expansion. 2703 // Stack function set some other error code, change it to a more 2704 // specific one for regular expressions. 2705 status = U_REGEX_STACK_OVERFLOW; 2706 // We need to return a writable stack frame, so just return the 2707 // previous frame. The match operation will stop quickly 2708 // because of the error status, after which the frame will never 2709 // be looked at again. 2710 return fp; 2711 } 2712 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack. 2713 2714 // New stack frame = copy of old top frame. 2715 int64_t *source = (int64_t *)fp; 2716 int64_t *dest = newFP; 2717 for (;;) { 2718 *dest++ = *source++; 2719 if (source == newFP) { 2720 break; 2721 } 2722 } 2723 2724 fTickCounter--; 2725 if (fTickCounter <= 0) { 2726 IncrementTime(status); // Re-initializes fTickCounter 2727 } 2728 fp->fPatIdx = savePatIdx; 2729 return (REStackFrame *)newFP; 2730} 2731 2732 2733//-------------------------------------------------------------------------------- 2734// 2735// MatchAt This is the actual matching engine. 2736// 2737// startIdx: begin matching a this index. 2738// toEnd: if true, match must extend to end of the input region 2739// 2740//-------------------------------------------------------------------------------- 2741void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) { 2742 UBool isMatch = FALSE; // True if the we have a match. 2743 2744 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards 2745 2746 int32_t op; // Operation from the compiled pattern, split into 2747 int32_t opType; // the opcode 2748 int32_t opValue; // and the operand value. 2749 2750 #ifdef REGEX_RUN_DEBUG 2751 if (fTraceDebug) 2752 { 2753 printf("MatchAt(startIdx=%ld)\n", startIdx); 2754 printf("Original Pattern: "); 2755 UChar32 c = utext_next32From(fPattern->fPattern, 0); 2756 while (c != U_SENTINEL) { 2757 if (c<32 || c>256) { 2758 c = '.'; 2759 } 2760 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 2761 2762 c = UTEXT_NEXT32(fPattern->fPattern); 2763 } 2764 printf("\n"); 2765 printf("Input String: "); 2766 c = utext_next32From(fInputText, 0); 2767 while (c != U_SENTINEL) { 2768 if (c<32 || c>256) { 2769 c = '.'; 2770 } 2771 printf("%c", c); 2772 2773 c = UTEXT_NEXT32(fInputText); 2774 } 2775 printf("\n"); 2776 printf("\n"); 2777 } 2778 #endif 2779 2780 if (U_FAILURE(status)) { 2781 return; 2782 } 2783 2784 // Cache frequently referenced items from the compiled pattern 2785 // 2786 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 2787 2788 const UChar *litText = fPattern->fLiteralText.getBuffer(); 2789 UVector *sets = fPattern->fSets; 2790 2791 fFrameSize = fPattern->fFrameSize; 2792 REStackFrame *fp = resetStack(); 2793 2794 fp->fPatIdx = 0; 2795 fp->fInputIdx = startIdx; 2796 2797 // Zero out the pattern's static data 2798 int32_t i; 2799 for (i = 0; i<fPattern->fDataSize; i++) { 2800 fData[i] = 0; 2801 } 2802 2803 // 2804 // Main loop for interpreting the compiled pattern. 2805 // One iteration of the loop per pattern operation performed. 2806 // 2807 for (;;) { 2808#if 0 2809 if (_heapchk() != _HEAPOK) { 2810 fprintf(stderr, "Heap Trouble\n"); 2811 } 2812#endif 2813 2814 op = (int32_t)pat[fp->fPatIdx]; 2815 opType = URX_TYPE(op); 2816 opValue = URX_VAL(op); 2817 #ifdef REGEX_RUN_DEBUG 2818 if (fTraceDebug) { 2819 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2820 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, 2821 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 2822 fPattern->dumpOp(fp->fPatIdx); 2823 } 2824 #endif 2825 fp->fPatIdx++; 2826 2827 switch (opType) { 2828 2829 2830 case URX_NOP: 2831 break; 2832 2833 2834 case URX_BACKTRACK: 2835 // Force a backtrack. In some circumstances, the pattern compiler 2836 // will notice that the pattern can't possibly match anything, and will 2837 // emit one of these at that point. 2838 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2839 break; 2840 2841 2842 case URX_ONECHAR: 2843 if (fp->fInputIdx < fActiveLimit) { 2844 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2845 UChar32 c = UTEXT_NEXT32(fInputText); 2846 if (c == opValue) { 2847 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2848 break; 2849 } 2850 } else { 2851 fHitEnd = TRUE; 2852 } 2853 2854 #ifdef REGEX_SMART_BACKTRACKING 2855 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 2856 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 2857 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 2858 UBool success = FALSE; 2859 UChar32 c = UTEXT_PREVIOUS32(fInputText); 2860 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { 2861 if (c == opValue) { 2862 success = TRUE; 2863 break; 2864 } else if (c == U_SENTINEL) { 2865 break; 2866 } 2867 c = UTEXT_PREVIOUS32(fInputText); 2868 } 2869 if (success) { 2870 fHitEnd = FALSE; 2871 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2872 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2873 if (fp->fInputIdx > backSearchIndex) { 2874 fp = StateSave(fp, fp->fPatIdx, status); 2875 } 2876 fp->fPatIdx++; // Skip the LOOP_C, we just did that 2877 break; 2878 } 2879 } 2880 } 2881 #endif 2882 2883 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2884 break; 2885 2886 2887 case URX_STRING: 2888 { 2889 // Test input against a literal string. 2890 // Strings require two slots in the compiled pattern, one for the 2891 // offset to the string text, and one for the length. 2892 int32_t stringStartIdx = opValue; 2893 int32_t stringLen; 2894 2895 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 2896 fp->fPatIdx++; 2897 opType = URX_TYPE(op); 2898 stringLen = URX_VAL(op); 2899 U_ASSERT(opType == URX_STRING_LEN); 2900 U_ASSERT(stringLen >= 2); 2901 2902 const UChar *patternChars = litText+stringStartIdx; 2903 const UChar *patternEnd = patternChars+stringLen; 2904 2905 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2906 UChar32 c; 2907 UBool success = TRUE; 2908 2909 while (patternChars < patternEnd && success) { 2910 c = UTEXT_NEXT32(fInputText); 2911 2912 if (c != U_SENTINEL && UTEXT_GETNATIVEINDEX(fInputText) <= fActiveLimit) { 2913 if (U_IS_BMP(c)) { 2914 success = (*patternChars == c); 2915 patternChars += 1; 2916 } else if (patternChars+1 < patternEnd) { 2917 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 2918 patternChars += 2; 2919 } 2920 } else { 2921 success = FALSE; 2922 fHitEnd = TRUE; // TODO: See ticket 6074 2923 } 2924 } 2925 2926 if (success) { 2927 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2928 } else { 2929 #ifdef REGEX_SMART_BACKTRACKING 2930 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 2931 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 2932 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 2933 // Reset to last start point 2934 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 2935 patternChars = litText+stringStartIdx; 2936 2937 // Search backwards for a possible start 2938 do { 2939 c = UTEXT_PREVIOUS32(fInputText); 2940 if (c == U_SENTINEL) { 2941 break; 2942 } else if ((U_IS_BMP(c) && *patternChars == c) || 2943 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 2944 success = TRUE; 2945 break; 2946 } 2947 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 2948 2949 // And try again 2950 if (success) { 2951 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2952 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 2953 if (fp->fInputIdx > backSearchIndex) { 2954 fp = StateSave(fp, fp->fPatIdx, status); 2955 } 2956 fp->fPatIdx++; // Skip the LOOP_C, we just did that 2957 break; 2958 } 2959 } 2960 } 2961 #endif 2962 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2963 } 2964 } 2965 break; 2966 2967 2968 case URX_STATE_SAVE: 2969 fp = StateSave(fp, opValue, status); 2970 break; 2971 2972 2973 case URX_END: 2974 // The match loop will exit via this path on a successful match, 2975 // when we reach the end of the pattern. 2976 if (toEnd && fp->fInputIdx != fActiveLimit) { 2977 // The pattern matched, but not to the end of input. Try some more. 2978 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 2979 break; 2980 } 2981 isMatch = TRUE; 2982 goto breakFromLoop; 2983 2984 // Start and End Capture stack frame variables are laid out out like this: 2985 // fp->fExtra[opValue] - The start of a completed capture group 2986 // opValue+1 - The end of a completed capture group 2987 // opValue+2 - the start of a capture group whose end 2988 // has not yet been reached (and might not ever be). 2989 case URX_START_CAPTURE: 2990 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2991 fp->fExtra[opValue+2] = fp->fInputIdx; 2992 break; 2993 2994 2995 case URX_END_CAPTURE: 2996 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 2997 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 2998 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 2999 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 3000 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 3001 break; 3002 3003 3004 case URX_DOLLAR: // $, test for End of line 3005 // or for position before new line at end of input 3006 { 3007 if (fp->fInputIdx >= fAnchorLimit) { 3008 // We really are at the end of input. Success. 3009 fHitEnd = TRUE; 3010 fRequireEnd = TRUE; 3011 break; 3012 } 3013 3014 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3015 3016 // If we are positioned just before a new-line that is located at the 3017 // end of input, succeed. 3018 UChar32 c = UTEXT_NEXT32(fInputText); 3019 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 3020 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 3021 // If not in the middle of a CR/LF sequence 3022 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && (UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) { 3023 // At new-line at end of input. Success 3024 fHitEnd = TRUE; 3025 fRequireEnd = TRUE; 3026 3027 break; 3028 } 3029 } 3030 } else { 3031 UChar32 nextC = UTEXT_NEXT32(fInputText); 3032 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) { 3033 fHitEnd = TRUE; 3034 fRequireEnd = TRUE; 3035 break; // At CR/LF at end of input. Success 3036 } 3037 } 3038 3039 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3040 } 3041 break; 3042 3043 3044 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 3045 if (fp->fInputIdx >= fAnchorLimit) { 3046 // Off the end of input. Success. 3047 fHitEnd = TRUE; 3048 fRequireEnd = TRUE; 3049 break; 3050 } else { 3051 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3052 UChar32 c = UTEXT_NEXT32(fInputText); 3053 // Either at the last character of input, or off the end. 3054 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) { 3055 fHitEnd = TRUE; 3056 fRequireEnd = TRUE; 3057 break; 3058 } 3059 } 3060 3061 // Not at end of input. Back-track out. 3062 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3063 break; 3064 3065 3066 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 3067 { 3068 if (fp->fInputIdx >= fAnchorLimit) { 3069 // We really are at the end of input. Success. 3070 fHitEnd = TRUE; 3071 fRequireEnd = TRUE; 3072 break; 3073 } 3074 // If we are positioned just before a new-line, succeed. 3075 // It makes no difference where the new-line is within the input. 3076 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3077 UChar32 c = UTEXT_CURRENT32(fInputText); 3078 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 3079 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 3080 // In multi-line mode, hitting a new-line just before the end of input does not 3081 // set the hitEnd or requireEnd flags 3082 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) { 3083 break; 3084 } 3085 } 3086 // not at a new line. Fail. 3087 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3088 } 3089 break; 3090 3091 3092 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 3093 { 3094 if (fp->fInputIdx >= fAnchorLimit) { 3095 // We really are at the end of input. Success. 3096 fHitEnd = TRUE; 3097 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 3098 break; // adding a new-line would not lose the match. 3099 } 3100 // If we are not positioned just before a new-line, the test fails; backtrack out. 3101 // It makes no difference where the new-line is within the input. 3102 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3103 if (UTEXT_CURRENT32(fInputText) != 0x0a) { 3104 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3105 } 3106 } 3107 break; 3108 3109 3110 case URX_CARET: // ^, test for start of line 3111 if (fp->fInputIdx != fAnchorStart) { 3112 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3113 } 3114 break; 3115 3116 3117 case URX_CARET_M: // ^, test for start of line in mulit-line mode 3118 { 3119 if (fp->fInputIdx == fAnchorStart) { 3120 // We are at the start input. Success. 3121 break; 3122 } 3123 // Check whether character just before the current pos is a new-line 3124 // unless we are at the end of input 3125 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3126 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3127 if ((fp->fInputIdx < fAnchorLimit) && 3128 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3129 // It's a new-line. ^ is true. Success. 3130 // TODO: what should be done with positions between a CR and LF? 3131 break; 3132 } 3133 // Not at the start of a line. Fail. 3134 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3135 } 3136 break; 3137 3138 3139 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 3140 { 3141 U_ASSERT(fp->fInputIdx >= fAnchorStart); 3142 if (fp->fInputIdx <= fAnchorStart) { 3143 // We are at the start input. Success. 3144 break; 3145 } 3146 // Check whether character just before the current pos is a new-line 3147 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 3148 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3149 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3150 if (c != 0x0a) { 3151 // Not at the start of a line. Back-track out. 3152 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3153 } 3154 } 3155 break; 3156 3157 case URX_BACKSLASH_B: // Test for word boundaries 3158 { 3159 UBool success = isWordBoundary(fp->fInputIdx); 3160 success ^= (opValue != 0); // flip sense for \B 3161 if (!success) { 3162 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3163 } 3164 } 3165 break; 3166 3167 3168 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 3169 { 3170 UBool success = isUWordBoundary(fp->fInputIdx); 3171 success ^= (opValue != 0); // flip sense for \B 3172 if (!success) { 3173 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3174 } 3175 } 3176 break; 3177 3178 3179 case URX_BACKSLASH_D: // Test for decimal digit 3180 { 3181 if (fp->fInputIdx >= fActiveLimit) { 3182 fHitEnd = TRUE; 3183 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3184 break; 3185 } 3186 3187 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3188 3189 UChar32 c = UTEXT_NEXT32(fInputText); 3190 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 3191 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 3192 success ^= (opValue != 0); // flip sense for \D 3193 if (success) { 3194 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3195 } else { 3196 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3197 } 3198 } 3199 break; 3200 3201 3202 case URX_BACKSLASH_G: // Test for position at end of previous match 3203 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 3204 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3205 } 3206 break; 3207 3208 3209 case URX_BACKSLASH_X: 3210 // Match a Grapheme, as defined by Unicode TR 29. 3211 // Differs slightly from Perl, which consumes combining marks independently 3212 // of context. 3213 { 3214 3215 // Fail if at end of input 3216 if (fp->fInputIdx >= fActiveLimit) { 3217 fHitEnd = TRUE; 3218 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3219 break; 3220 } 3221 3222 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3223 3224 // Examine (and consume) the current char. 3225 // Dispatch into a little state machine, based on the char. 3226 UChar32 c; 3227 c = UTEXT_NEXT32(fInputText); 3228 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3229 UnicodeSet **sets = fPattern->fStaticSets; 3230 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 3231 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 3232 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3233 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3234 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3235 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3236 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3237 goto GC_Extend; 3238 3239 3240 3241GC_L: 3242 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3243 c = UTEXT_NEXT32(fInputText); 3244 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3245 if (sets[URX_GC_L]->contains(c)) goto GC_L; 3246 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 3247 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 3248 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3249 UTEXT_PREVIOUS32(fInputText); 3250 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3251 goto GC_Extend; 3252 3253GC_V: 3254 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3255 c = UTEXT_NEXT32(fInputText); 3256 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3257 if (sets[URX_GC_V]->contains(c)) goto GC_V; 3258 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3259 UTEXT_PREVIOUS32(fInputText); 3260 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3261 goto GC_Extend; 3262 3263GC_T: 3264 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 3265 c = UTEXT_NEXT32(fInputText); 3266 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3267 if (sets[URX_GC_T]->contains(c)) goto GC_T; 3268 UTEXT_PREVIOUS32(fInputText); 3269 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3270 goto GC_Extend; 3271 3272GC_Extend: 3273 // Combining characters are consumed here 3274 for (;;) { 3275 if (fp->fInputIdx >= fActiveLimit) { 3276 break; 3277 } 3278 c = UTEXT_CURRENT32(fInputText); 3279 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 3280 break; 3281 } 3282 UTEXT_NEXT32(fInputText); 3283 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3284 } 3285 goto GC_Done; 3286 3287GC_Control: 3288 // Most control chars stand alone (don't combine with combining chars), 3289 // except for that CR/LF sequence is a single grapheme cluster. 3290 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) { 3291 c = UTEXT_NEXT32(fInputText); 3292 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3293 } 3294 3295GC_Done: 3296 if (fp->fInputIdx >= fActiveLimit) { 3297 fHitEnd = TRUE; 3298 } 3299 break; 3300 } 3301 3302 3303 3304 3305 case URX_BACKSLASH_Z: // Test for end of Input 3306 if (fp->fInputIdx < fAnchorLimit) { 3307 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3308 } else { 3309 fHitEnd = TRUE; 3310 fRequireEnd = TRUE; 3311 } 3312 break; 3313 3314 3315 3316 case URX_STATIC_SETREF: 3317 { 3318 // Test input character against one of the predefined sets 3319 // (Word Characters, for example) 3320 // The high bit of the op value is a flag for the match polarity. 3321 // 0: success if input char is in set. 3322 // 1: success if input char is not in set. 3323 if (fp->fInputIdx >= fActiveLimit) { 3324 fHitEnd = TRUE; 3325 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3326 break; 3327 } 3328 3329 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 3330 opValue &= ~URX_NEG_SET; 3331 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3332 3333 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3334 UChar32 c = UTEXT_NEXT32(fInputText); 3335 if (c < 256) { 3336 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3337 if (s8->contains(c)) { 3338 success = !success; 3339 } 3340 } else { 3341 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3342 if (s->contains(c)) { 3343 success = !success; 3344 } 3345 } 3346 if (success) { 3347 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3348 } else { 3349 // the character wasn't in the set. 3350 #ifdef REGEX_SMART_BACKTRACKING 3351 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3352 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3353 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3354 // Try to find it, backwards 3355 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3356 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset 3357 do { 3358 c = UTEXT_PREVIOUS32(fInputText); 3359 if (c == U_SENTINEL) { 3360 break; 3361 } else if (c < 256) { 3362 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3363 if (s8->contains(c)) { 3364 success = !success; 3365 } 3366 } else { 3367 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3368 if (s->contains(c)) { 3369 success = !success; 3370 } 3371 } 3372 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex && !success); 3373 3374 if (success && c != U_SENTINEL) { 3375 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3376 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3377 if (fp->fInputIdx > backSearchIndex) { 3378 fp = StateSave(fp, fp->fPatIdx, status); 3379 } 3380 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3381 break; 3382 } 3383 } 3384 } 3385 #endif 3386 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3387 } 3388 } 3389 break; 3390 3391 3392 case URX_STAT_SETREF_N: 3393 { 3394 // Test input character for NOT being a member of one of 3395 // the predefined sets (Word Characters, for example) 3396 if (fp->fInputIdx >= fActiveLimit) { 3397 fHitEnd = TRUE; 3398 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3399 break; 3400 } 3401 3402 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 3403 3404 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3405 3406 UChar32 c = UTEXT_NEXT32(fInputText); 3407 if (c < 256) { 3408 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3409 if (s8->contains(c) == FALSE) { 3410 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3411 break; 3412 } 3413 } else { 3414 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3415 if (s->contains(c) == FALSE) { 3416 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3417 break; 3418 } 3419 } 3420 // the character wasn't in the set. 3421 #ifdef REGEX_SMART_BACKTRACKING 3422 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3423 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3424 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3425 // Try to find it, backwards 3426 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3427 UBool success = FALSE; 3428 do { 3429 c = UTEXT_PREVIOUS32(fInputText); 3430 if (c == U_SENTINEL) { 3431 break; 3432 } else if (c < 256) { 3433 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 3434 if (s8->contains(c) == FALSE) { 3435 success = TRUE; 3436 break; 3437 } 3438 } else { 3439 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 3440 if (s->contains(c) == FALSE) { 3441 success = TRUE; 3442 break; 3443 } 3444 } 3445 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3446 3447 if (success) { 3448 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3449 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3450 if (fp->fInputIdx > backSearchIndex) { 3451 fp = StateSave(fp, fp->fPatIdx, status); 3452 } 3453 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3454 break; 3455 } 3456 } 3457 } 3458 #endif 3459 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3460 } 3461 break; 3462 3463 3464 case URX_SETREF: 3465 if (fp->fInputIdx >= fActiveLimit) { 3466 fHitEnd = TRUE; 3467 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3468 break; 3469 } else { 3470 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3471 3472 // There is input left. Pick up one char and test it for set membership. 3473 UChar32 c = UTEXT_NEXT32(fInputText); 3474 U_ASSERT(opValue > 0 && opValue < sets->size()); 3475 if (c<256) { 3476 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3477 if (s8->contains(c)) { 3478 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3479 break; 3480 } 3481 } else { 3482 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3483 if (s->contains(c)) { 3484 // The character is in the set. A Match. 3485 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3486 break; 3487 } 3488 } 3489 3490 // the character wasn't in the set. 3491 #ifdef REGEX_SMART_BACKTRACKING 3492 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3493 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3494 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3495 // Try to find it, backwards 3496 UTEXT_PREVIOUS32(fInputText); // skip the first character we tried 3497 UBool success = FALSE; 3498 do { 3499 c = UTEXT_PREVIOUS32(fInputText); 3500 if (c == U_SENTINEL) { 3501 break; 3502 } else if (c < 256) { 3503 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 3504 if (s8->contains(c)) { 3505 success = TRUE; 3506 break; 3507 } 3508 } else { 3509 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 3510 if (s->contains(c)) { 3511 success = TRUE; 3512 break; 3513 } 3514 } 3515 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 3516 3517 if (success) { 3518 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3519 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3520 if (fp->fInputIdx > backSearchIndex) { 3521 fp = StateSave(fp, fp->fPatIdx, status); 3522 } 3523 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3524 break; 3525 } 3526 } 3527 } 3528 #endif 3529 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3530 } 3531 break; 3532 3533 3534 case URX_DOTANY: 3535 { 3536 // . matches anything, but stops at end-of-line. 3537 if (fp->fInputIdx >= fActiveLimit) { 3538 // At end of input. Match failed. Backtrack out. 3539 fHitEnd = TRUE; 3540 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3541 break; 3542 } 3543 3544 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3545 3546 // There is input left. Advance over one char, unless we've hit end-of-line 3547 UChar32 c = UTEXT_NEXT32(fInputText); 3548 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 3549 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 3550 // End of line in normal mode. . does not match. 3551 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3552 break; 3553 } 3554 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3555 } 3556 break; 3557 3558 3559 case URX_DOTANY_ALL: 3560 { 3561 // ., in dot-matches-all (including new lines) mode 3562 if (fp->fInputIdx >= fActiveLimit) { 3563 // At end of input. Match failed. Backtrack out. 3564 fHitEnd = TRUE; 3565 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3566 break; 3567 } 3568 3569 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3570 3571 // There is input left. Advance over one char, except if we are 3572 // at a cr/lf, advance over both of them. 3573 UChar32 c; 3574 c = UTEXT_NEXT32(fInputText); 3575 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3576 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 3577 // In the case of a CR/LF, we need to advance over both. 3578 UChar32 nextc = UTEXT_CURRENT32(fInputText); 3579 if (nextc == 0x0a) { 3580 UTEXT_NEXT32(fInputText); 3581 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3582 } 3583 } 3584 } 3585 break; 3586 3587 3588 case URX_DOTANY_UNIX: 3589 { 3590 // '.' operator, matches all, but stops at end-of-line. 3591 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 3592 if (fp->fInputIdx >= fActiveLimit) { 3593 // At end of input. Match failed. Backtrack out. 3594 fHitEnd = TRUE; 3595 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3596 break; 3597 } 3598 3599 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3600 3601 // There is input left. Advance over one char, unless we've hit end-of-line 3602 UChar32 c = UTEXT_NEXT32(fInputText); 3603 if (c == 0x0a) { 3604 // End of line in normal mode. '.' does not match the \n 3605 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3606 } else { 3607 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3608 } 3609 } 3610 break; 3611 3612 3613 case URX_JMP: 3614 fp->fPatIdx = opValue; 3615 break; 3616 3617 case URX_FAIL: 3618 isMatch = FALSE; 3619 goto breakFromLoop; 3620 3621 case URX_JMP_SAV: 3622 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 3623 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3624 fp->fPatIdx = opValue; // Then JMP. 3625 break; 3626 3627 case URX_JMP_SAV_X: 3628 // This opcode is used with (x)+, when x can match a zero length string. 3629 // Same as JMP_SAV, except conditional on the match having made forward progress. 3630 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 3631 // data address of the input position at the start of the loop. 3632 { 3633 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 3634 int32_t stoOp = (int32_t)pat[opValue-1]; 3635 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 3636 int32_t frameLoc = URX_VAL(stoOp); 3637 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 3638 int64_t prevInputIdx = fp->fExtra[frameLoc]; 3639 U_ASSERT(prevInputIdx <= fp->fInputIdx); 3640 if (prevInputIdx < fp->fInputIdx) { 3641 // The match did make progress. Repeat the loop. 3642 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 3643 fp->fPatIdx = opValue; 3644 fp->fExtra[frameLoc] = fp->fInputIdx; 3645 } 3646 // If the input position did not advance, we do nothing here, 3647 // execution will fall out of the loop. 3648 } 3649 break; 3650 3651 case URX_CTR_INIT: 3652 { 3653 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3654 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3655 3656 // Pick up the three extra operands that CTR_INIT has, and 3657 // skip the pattern location counter past 3658 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3659 fp->fPatIdx += 3; 3660 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3661 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3662 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3663 U_ASSERT(minCount>=0); 3664 U_ASSERT(maxCount>=minCount || maxCount==-1); 3665 U_ASSERT(loopLoc>fp->fPatIdx); 3666 3667 if (minCount == 0) { 3668 fp = StateSave(fp, loopLoc+1, status); 3669 } 3670 if (maxCount == 0) { 3671 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3672 } 3673 } 3674 break; 3675 3676 case URX_CTR_LOOP: 3677 { 3678 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3679 int32_t initOp = (int32_t)pat[opValue]; 3680 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 3681 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3682 int32_t minCount = (int32_t)pat[opValue+2]; 3683 int32_t maxCount = (int32_t)pat[opValue+3]; 3684 // Increment the counter. Note: we DIDN'T worry about counter 3685 // overflow, since the data comes from UnicodeStrings, which 3686 // stores its length in an int32_t. Do we have to think about 3687 // this now that we're using UText? Probably not, since the length 3688 // in UChar32s is still an int32_t. 3689 (*pCounter)++; 3690 U_ASSERT(*pCounter > 0); 3691 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 3692 U_ASSERT(*pCounter == maxCount || maxCount == -1); 3693 break; 3694 } 3695 if (*pCounter >= minCount) { 3696 fp = StateSave(fp, fp->fPatIdx, status); 3697 } 3698 fp->fPatIdx = opValue + 4; // Loop back. 3699 } 3700 break; 3701 3702 case URX_CTR_INIT_NG: 3703 { 3704 // Initialize a non-greedy loop 3705 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 3706 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 3707 3708 // Pick up the three extra operands that CTR_INIT has, and 3709 // skip the pattern location counter past 3710 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3711 fp->fPatIdx += 3; 3712 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 3713 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 3714 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 3715 U_ASSERT(minCount>=0); 3716 U_ASSERT(maxCount>=minCount || maxCount==-1); 3717 U_ASSERT(loopLoc>fp->fPatIdx); 3718 3719 if (minCount == 0) { 3720 if (maxCount != 0) { 3721 fp = StateSave(fp, fp->fPatIdx, status); 3722 } 3723 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 3724 } 3725 } 3726 break; 3727 3728 case URX_CTR_LOOP_NG: 3729 { 3730 // Non-greedy {min, max} loops 3731 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 3732 int32_t initOp = (int32_t)pat[opValue]; 3733 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 3734 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 3735 int32_t minCount = (int32_t)pat[opValue+2]; 3736 int32_t maxCount = (int32_t)pat[opValue+3]; 3737 // Increment the counter. Note: we DIDN'T worry about counter 3738 // overflow, since the data comes from UnicodeStrings, which 3739 // stores its length in an int32_t. Do we have to think about 3740 // this now that we're using UText? Probably not, since the length 3741 // in UChar32s is still an int32_t. 3742 (*pCounter)++; 3743 U_ASSERT(*pCounter > 0); 3744 3745 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 3746 // The loop has matched the maximum permitted number of times. 3747 // Break out of here with no action. Matching will 3748 // continue with the following pattern. 3749 U_ASSERT(*pCounter == maxCount || maxCount == -1); 3750 break; 3751 } 3752 3753 if (*pCounter < minCount) { 3754 // We haven't met the minimum number of matches yet. 3755 // Loop back for another one. 3756 fp->fPatIdx = opValue + 4; // Loop back. 3757 } else { 3758 // We do have the minimum number of matches. 3759 // Fall into the following pattern, but first do 3760 // a state save to the top of the loop, so that a failure 3761 // in the following pattern will try another iteration of the loop. 3762 fp = StateSave(fp, opValue + 4, status); 3763 } 3764 } 3765 break; 3766 3767 case URX_STO_SP: 3768 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3769 fData[opValue] = fStack->size(); 3770 break; 3771 3772 case URX_LD_SP: 3773 { 3774 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 3775 int32_t newStackSize = (int32_t)fData[opValue]; 3776 U_ASSERT(newStackSize <= fStack->size()); 3777 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3778 if (newFP == (int64_t *)fp) { 3779 break; 3780 } 3781 int32_t i; 3782 for (i=0; i<fFrameSize; i++) { 3783 newFP[i] = ((int64_t *)fp)[i]; 3784 } 3785 fp = (REStackFrame *)newFP; 3786 fStack->setSize(newStackSize); 3787 } 3788 break; 3789 3790 case URX_BACKREF: 3791 case URX_BACKREF_I: 3792 { 3793 U_ASSERT(opValue < fFrameSize); 3794 int64_t groupStartIdx = fp->fExtra[opValue]; 3795 int64_t groupEndIdx = fp->fExtra[opValue+1]; 3796 U_ASSERT(groupStartIdx <= groupEndIdx); 3797 if (groupStartIdx < 0) { 3798 // This capture group has not participated in the match thus far, 3799 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3800 } 3801 3802 if (groupEndIdx == groupStartIdx) { 3803 // The capture group match was of an empty string. 3804 // Verified by testing: Perl matches succeed in this case, so 3805 // we do too. 3806 break; 3807 } 3808 3809 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx); 3810 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3811 3812 UBool haveMatch = (opType == URX_BACKREF ? 3813 (0 == utext_compareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1)) : 3814 (0 == utext_caseCompareNativeLimit(fAltInputText, groupEndIdx, fInputText, -1, U_FOLD_CASE_DEFAULT, &status))); 3815 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3816 3817 if (fp->fInputIdx > fActiveLimit) { 3818 fHitEnd = TRUE; 3819 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3820 } else if (!haveMatch) { 3821 if (fp->fInputIdx == fActiveLimit) { 3822 fHitEnd = TRUE; 3823 } 3824 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 3825 } 3826 } 3827 break; 3828 3829 case URX_STO_INP_LOC: 3830 { 3831 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 3832 fp->fExtra[opValue] = fp->fInputIdx; 3833 } 3834 break; 3835 3836 case URX_JMPX: 3837 { 3838 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 3839 fp->fPatIdx += 1; 3840 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 3841 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 3842 int64_t savedInputIdx = fp->fExtra[dataLoc]; 3843 U_ASSERT(savedInputIdx <= fp->fInputIdx); 3844 if (savedInputIdx < fp->fInputIdx) { 3845 fp->fPatIdx = opValue; // JMP 3846 } else { 3847 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 3848 } 3849 } 3850 break; 3851 3852 case URX_LA_START: 3853 { 3854 // Entering a lookahead block. 3855 // Save Stack Ptr, Input Pos. 3856 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3857 fData[opValue] = fStack->size(); 3858 fData[opValue+1] = fp->fInputIdx; 3859 fActiveStart = fLookStart; // Set the match region change for 3860 fActiveLimit = fLookLimit; // transparent bounds. 3861 } 3862 break; 3863 3864 case URX_LA_END: 3865 { 3866 // Leaving a look-ahead block. 3867 // restore Stack Ptr, Input Pos to positions they had on entry to block. 3868 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 3869 int32_t stackSize = fStack->size(); 3870 int32_t newStackSize =(int32_t)fData[opValue]; 3871 U_ASSERT(stackSize >= newStackSize); 3872 if (stackSize > newStackSize) { 3873 // Copy the current top frame back to the new (cut back) top frame. 3874 // This makes the capture groups from within the look-ahead 3875 // expression available. 3876 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 3877 int32_t i; 3878 for (i=0; i<fFrameSize; i++) { 3879 newFP[i] = ((int64_t *)fp)[i]; 3880 } 3881 fp = (REStackFrame *)newFP; 3882 fStack->setSize(newStackSize); 3883 } 3884 fp->fInputIdx = fData[opValue+1]; 3885 3886 // Restore the active region bounds in the input string; they may have 3887 // been changed because of transparent bounds on a Region. 3888 fActiveStart = fRegionStart; 3889 fActiveLimit = fRegionLimit; 3890 } 3891 break; 3892 3893 case URX_ONECHAR_I: 3894 if (fp->fInputIdx < fActiveLimit) { 3895 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3896 3897 UChar32 c = UTEXT_NEXT32(fInputText); 3898 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3899 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3900 break; 3901 } 3902 } else { 3903 fHitEnd = TRUE; 3904 } 3905 3906 #ifdef REGEX_SMART_BACKTRACKING 3907 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 3908 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 3909 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 3910 UBool success = FALSE; 3911 UChar32 c = UTEXT_PREVIOUS32(fInputText); 3912 while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex) { 3913 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 3914 success = TRUE; 3915 break; 3916 } else if (c == U_SENTINEL) { 3917 break; 3918 } 3919 c = UTEXT_PREVIOUS32(fInputText); 3920 } 3921 if (success) { 3922 fHitEnd = FALSE; 3923 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3924 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3925 if (fp->fInputIdx > backSearchIndex) { 3926 fp = StateSave(fp, fp->fPatIdx, status); 3927 } 3928 fp->fPatIdx++; // Skip the LOOP_C, we just did that 3929 break; 3930 } 3931 } 3932 } 3933 #endif 3934 3935 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 3936 break; 3937 3938 case URX_STRING_I: 3939 { 3940 // Test input against a literal string. 3941 // Strings require two slots in the compiled pattern, one for the 3942 // offset to the string text, and one for the length. 3943 const UCaseProps *csp = ucase_getSingleton(); 3944 { 3945 int32_t stringStartIdx, stringLen; 3946 stringStartIdx = opValue; 3947 3948 op = (int32_t)pat[fp->fPatIdx]; 3949 fp->fPatIdx++; 3950 opType = URX_TYPE(op); 3951 opValue = URX_VAL(op); 3952 U_ASSERT(opType == URX_STRING_LEN); 3953 stringLen = opValue; 3954 3955 const UChar *patternChars = litText+stringStartIdx; 3956 const UChar *patternEnd = patternChars+stringLen; 3957 3958 const UChar *foldChars = NULL; 3959 int32_t foldOffset, foldLength; 3960 UChar32 c; 3961 3962 foldOffset = foldLength = 0; 3963 UBool success = TRUE; 3964 3965 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 3966 while (patternChars < patternEnd && success) { 3967 if(foldOffset < foldLength) { 3968 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3969 } else { 3970 c = UTEXT_NEXT32(fInputText); 3971 if (c != U_SENTINEL) { 3972 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 3973 if(foldLength >= 0) { 3974 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 3975 foldOffset = 0; 3976 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 3977 } else { 3978 c = foldLength; 3979 foldLength = foldOffset; // to avoid reading chars from the folding buffer 3980 } 3981 } 3982 } 3983 3984 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 3985 } 3986 3987 success = FALSE; 3988 if (c != U_SENTINEL && (fp->fInputIdx <= fActiveLimit)) { 3989 if (U_IS_BMP(c)) { 3990 success = (*patternChars == c); 3991 patternChars += 1; 3992 } else if (patternChars+1 < patternEnd) { 3993 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 3994 patternChars += 2; 3995 } 3996 } else { 3997 fHitEnd = TRUE; // TODO: See ticket 6074 3998 } 3999 } 4000 4001 if (!success) { 4002 #ifdef REGEX_SMART_BACKTRACKING 4003 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 4004 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4005 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4006 // Reset to last start point 4007 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4008 patternChars = litText+stringStartIdx; 4009 4010 // Search backwards for a possible start 4011 do { 4012 c = UTEXT_PREVIOUS32(fInputText); 4013 if (c == U_SENTINEL) { 4014 break; 4015 } else { 4016 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 4017 if(foldLength >= 0) { 4018 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 4019 foldOffset = 0; 4020 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 4021 } else { 4022 c = foldLength; 4023 foldLength = foldOffset; // to avoid reading chars from the folding buffer 4024 } 4025 } 4026 4027 if ((U_IS_BMP(c) && *patternChars == c) || 4028 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 4029 success = TRUE; 4030 break; 4031 } 4032 } 4033 } while (UTEXT_GETNATIVEINDEX(fInputText) >= backSearchIndex); 4034 4035 // And try again 4036 if (success) { 4037 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4038 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4039 if (fp->fInputIdx > backSearchIndex) { 4040 fp = StateSave(fp, fp->fPatIdx, status); 4041 } 4042 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4043 break; 4044 } 4045 } 4046 } 4047 #endif 4048 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4049 } 4050 } 4051 } 4052 break; 4053 4054 case URX_LB_START: 4055 { 4056 // Entering a look-behind block. 4057 // Save Stack Ptr, Input Pos. 4058 // TODO: implement transparent bounds. Ticket #6067 4059 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4060 fData[opValue] = fStack->size(); 4061 fData[opValue+1] = fp->fInputIdx; 4062 // Init the variable containing the start index for attempted matches. 4063 fData[opValue+2] = -1; 4064 // Save input string length, then reset to pin any matches to end at 4065 // the current position. 4066 fData[opValue+3] = fActiveLimit; 4067 fActiveLimit = fp->fInputIdx; 4068 } 4069 break; 4070 4071 4072 case URX_LB_CONT: 4073 { 4074 // Positive Look-Behind, at top of loop checking for matches of LB expression 4075 // at all possible input starting positions. 4076 4077 // Fetch the min and max possible match lengths. They are the operands 4078 // of this op in the pattern. 4079 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 4080 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 4081 U_ASSERT(minML <= maxML); 4082 U_ASSERT(minML >= 0); 4083 4084 // Fetch (from data) the last input index where a match was attempted. 4085 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4086 int64_t *lbStartIdx = &fData[opValue+2]; 4087 if (*lbStartIdx < 0) { 4088 // First time through loop. 4089 *lbStartIdx = fp->fInputIdx - minML; 4090 } else { 4091 // 2nd through nth time through the loop. 4092 // Back up start position for match by one. 4093 if (*lbStartIdx == 0) { 4094 (*lbStartIdx)--; 4095 } else { 4096 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 4097 UTEXT_PREVIOUS32(fInputText); 4098 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4099 } 4100 } 4101 4102 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 4103 // We have tried all potential match starting points without 4104 // getting a match. Backtrack out, and out of the 4105 // Look Behind altogether. 4106 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4107 int64_t restoreInputLen = fData[opValue+3]; 4108 U_ASSERT(restoreInputLen >= fActiveLimit); 4109 U_ASSERT(restoreInputLen <= fInputLength); 4110 fActiveLimit = restoreInputLen; 4111 break; 4112 } 4113 4114 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4115 // (successful match will fall off the end of the loop.) 4116 fp = StateSave(fp, fp->fPatIdx-3, status); 4117 fp->fInputIdx = *lbStartIdx; 4118 } 4119 break; 4120 4121 case URX_LB_END: 4122 // End of a look-behind block, after a successful match. 4123 { 4124 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4125 if (fp->fInputIdx != fActiveLimit) { 4126 // The look-behind expression matched, but the match did not 4127 // extend all the way to the point that we are looking behind from. 4128 // FAIL out of here, which will take us back to the LB_CONT, which 4129 // will retry the match starting at another position or fail 4130 // the look-behind altogether, whichever is appropriate. 4131 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4132 break; 4133 } 4134 4135 // Look-behind match is good. Restore the orignal input string length, 4136 // which had been truncated to pin the end of the lookbehind match to the 4137 // position being looked-behind. 4138 int64_t originalInputLen = fData[opValue+3]; 4139 U_ASSERT(originalInputLen >= fActiveLimit); 4140 U_ASSERT(originalInputLen <= fInputLength); 4141 fActiveLimit = originalInputLen; 4142 } 4143 break; 4144 4145 4146 case URX_LBN_CONT: 4147 { 4148 // Negative Look-Behind, at top of loop checking for matches of LB expression 4149 // at all possible input starting positions. 4150 4151 // Fetch the extra parameters of this op. 4152 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 4153 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 4154 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 4155 continueLoc = URX_VAL(continueLoc); 4156 U_ASSERT(minML <= maxML); 4157 U_ASSERT(minML >= 0); 4158 U_ASSERT(continueLoc > fp->fPatIdx); 4159 4160 // Fetch (from data) the last input index where a match was attempted. 4161 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4162 int64_t *lbStartIdx = &fData[opValue+2]; 4163 if (*lbStartIdx < 0) { 4164 // First time through loop. 4165 *lbStartIdx = fp->fInputIdx - minML; 4166 } else { 4167 // 2nd through nth time through the loop. 4168 // Back up start position for match by one. 4169 if (*lbStartIdx == 0) { 4170 (*lbStartIdx)--; 4171 } else { 4172 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx); 4173 UTEXT_PREVIOUS32(fInputText); 4174 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText); 4175 } 4176 } 4177 4178 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 4179 // We have tried all potential match starting points without 4180 // getting a match, which means that the negative lookbehind as 4181 // a whole has succeeded. Jump forward to the continue location 4182 int64_t restoreInputLen = fData[opValue+3]; 4183 U_ASSERT(restoreInputLen >= fActiveLimit); 4184 U_ASSERT(restoreInputLen <= fInputLength); 4185 fActiveLimit = restoreInputLen; 4186 fp->fPatIdx = continueLoc; 4187 break; 4188 } 4189 4190 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 4191 // (successful match will cause a FAIL out of the loop altogether.) 4192 fp = StateSave(fp, fp->fPatIdx-4, status); 4193 fp->fInputIdx = *lbStartIdx; 4194 } 4195 break; 4196 4197 case URX_LBN_END: 4198 // End of a negative look-behind block, after a successful match. 4199 { 4200 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4201 if (fp->fInputIdx != fActiveLimit) { 4202 // The look-behind expression matched, but the match did not 4203 // extend all the way to the point that we are looking behind from. 4204 // FAIL out of here, which will take us back to the LB_CONT, which 4205 // will retry the match starting at another position or succeed 4206 // the look-behind altogether, whichever is appropriate. 4207 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4208 break; 4209 } 4210 4211 // Look-behind expression matched, which means look-behind test as 4212 // a whole Fails 4213 4214 // Restore the orignal input string length, which had been truncated 4215 // inorder to pin the end of the lookbehind match 4216 // to the position being looked-behind. 4217 int64_t originalInputLen = fData[opValue+3]; 4218 U_ASSERT(originalInputLen >= fActiveLimit); 4219 U_ASSERT(originalInputLen <= fInputLength); 4220 fActiveLimit = originalInputLen; 4221 4222 // Restore original stack position, discarding any state saved 4223 // by the successful pattern match. 4224 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 4225 int32_t newStackSize = (int32_t)fData[opValue]; 4226 U_ASSERT(fStack->size() > newStackSize); 4227 fStack->setSize(newStackSize); 4228 4229 // FAIL, which will take control back to someplace 4230 // prior to entering the look-behind test. 4231 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4232 } 4233 break; 4234 4235 4236 case URX_LOOP_SR_I: 4237 // Loop Initialization for the optimized implementation of 4238 // [some character set]* 4239 // This op scans through all matching input. 4240 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4241 { 4242 U_ASSERT(opValue > 0 && opValue < sets->size()); 4243 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 4244 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 4245 4246 // Loop through input, until either the input is exhausted or 4247 // we reach a character that is not a member of the set. 4248 int64_t ix = fp->fInputIdx; 4249 UTEXT_SETNATIVEINDEX(fInputText, ix); 4250 for (;;) { 4251 if (ix >= fActiveLimit) { 4252 fHitEnd = TRUE; 4253 break; 4254 } 4255 UChar32 c = UTEXT_NEXT32(fInputText); 4256 if (c<256) { 4257 if (s8->contains(c) == FALSE) { 4258 break; 4259 } 4260 } else { 4261 if (s->contains(c) == FALSE) { 4262 break; 4263 } 4264 } 4265 ix = UTEXT_GETNATIVEINDEX(fInputText); 4266 } 4267 4268 // If there were no matching characters, skip over the loop altogether. 4269 // The loop doesn't run at all, a * op always succeeds. 4270 if (ix == fp->fInputIdx) { 4271 fp->fPatIdx++; // skip the URX_LOOP_C op. 4272 break; 4273 } 4274 4275 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4276 // must follow. It's operand is the stack location 4277 // that holds the starting input index for the match of this [set]* 4278 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4279 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4280 int32_t stackLoc = URX_VAL(loopcOp); 4281 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4282 fp->fExtra[stackLoc] = fp->fInputIdx; 4283 #ifdef REGEX_SMART_BACKTRACKING 4284 backSearchIndex = fp->fInputIdx; 4285 #endif 4286 fp->fInputIdx = ix; 4287 4288 // Save State to the URX_LOOP_C op that follows this one, 4289 // so that match failures in the following code will return to there. 4290 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4291 fp = StateSave(fp, fp->fPatIdx, status); 4292 fp->fPatIdx++; 4293 } 4294 break; 4295 4296 4297 case URX_LOOP_DOT_I: 4298 // Loop Initialization for the optimized implementation of .* 4299 // This op scans through all remaining input. 4300 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 4301 { 4302 // Loop through input until the input is exhausted (we reach an end-of-line) 4303 // In DOTALL mode, we can just go straight to the end of the input. 4304 int64_t ix; 4305 if ((opValue & 1) == 1) { 4306 // Dot-matches-All mode. Jump straight to the end of the string. 4307 ix = fActiveLimit; 4308 fHitEnd = TRUE; 4309 } else { 4310 // NOT DOT ALL mode. Line endings do not match '.' 4311 // Scan forward until a line ending or end of input. 4312 ix = fp->fInputIdx; 4313 UTEXT_SETNATIVEINDEX(fInputText, ix); 4314 for (;;) { 4315 if (ix >= fActiveLimit) { 4316 fHitEnd = TRUE; 4317 break; 4318 } 4319 UChar32 c = UTEXT_NEXT32(fInputText); 4320 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 4321 if ((c == 0x0a) || // 0x0a is newline in both modes. 4322 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 4323 (c<=0x0d && c>=0x0a)) || c==0x85 ||c==0x2028 || c==0x2029) { 4324 // char is a line ending. Exit the scanning loop. 4325 break; 4326 } 4327 } 4328 ix = UTEXT_GETNATIVEINDEX(fInputText); 4329 } 4330 } 4331 4332 // If there were no matching characters, skip over the loop altogether. 4333 // The loop doesn't run at all, a * op always succeeds. 4334 if (ix == fp->fInputIdx) { 4335 fp->fPatIdx++; // skip the URX_LOOP_C op. 4336 break; 4337 } 4338 4339 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 4340 // must follow. It's operand is the stack location 4341 // that holds the starting input index for the match of this .* 4342 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 4343 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 4344 int32_t stackLoc = URX_VAL(loopcOp); 4345 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 4346 fp->fExtra[stackLoc] = fp->fInputIdx; 4347 #ifdef REGEX_SMART_BACKTRACKING 4348 backSearchIndex = fp->fInputIdx; 4349 #endif 4350 fp->fInputIdx = ix; 4351 4352 // Save State to the URX_LOOP_C op that follows this one, 4353 // so that match failures in the following code will return to there. 4354 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 4355 fp = StateSave(fp, fp->fPatIdx, status); 4356 fp->fPatIdx++; 4357 } 4358 break; 4359 4360 4361 case URX_LOOP_C: 4362 { 4363 U_ASSERT(opValue>=0 && opValue<fFrameSize); 4364 backSearchIndex = fp->fExtra[opValue]; 4365 U_ASSERT(backSearchIndex <= fp->fInputIdx); 4366 if (backSearchIndex == fp->fInputIdx) { 4367 // We've backed up the input idx to the point that the loop started. 4368 // The loop is done. Leave here without saving state. 4369 // Subsequent failures won't come back here. 4370 break; 4371 } 4372 // Set up for the next iteration of the loop, with input index 4373 // backed up by one from the last time through, 4374 // and a state save to this instruction in case the following code fails again. 4375 // (We're going backwards because this loop emulates stack unwinding, not 4376 // the initial scan forward.) 4377 U_ASSERT(fp->fInputIdx > 0); 4378 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4379 UChar32 prevC = UTEXT_PREVIOUS32(fInputText); 4380 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4381 4382 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText); 4383 if (prevC == 0x0a && 4384 fp->fInputIdx > backSearchIndex && 4385 twoPrevC == 0x0d) { 4386 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 4387 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 4388 // .*, stepping back over CRLF pair. 4389 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText); 4390 } 4391 } 4392 4393 4394 fp = StateSave(fp, fp->fPatIdx-1, status); 4395 } 4396 break; 4397 4398 4399 4400 default: 4401 // Trouble. The compiled pattern contains an entry with an 4402 // unrecognized type tag. 4403 U_ASSERT(FALSE); 4404 } 4405 4406 if (U_FAILURE(status)) { 4407 isMatch = FALSE; 4408 break; 4409 } 4410 } 4411 4412breakFromLoop: 4413 fMatch = isMatch; 4414 if (isMatch) { 4415 fLastMatchEnd = fMatchEnd; 4416 fMatchStart = startIdx; 4417 fMatchEnd = fp->fInputIdx; 4418 if (fTraceDebug) { 4419 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); 4420 } 4421 } 4422 else 4423 { 4424 if (fTraceDebug) { 4425 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 4426 } 4427 } 4428 4429 fFrame = fp; // The active stack frame when the engine stopped. 4430 // Contains the capture group results that we need to 4431 // access later. 4432 return; 4433} 4434 4435 4436//-------------------------------------------------------------------------------- 4437// 4438// MatchChunkAt This is the actual matching engine. Like MatchAt, but with the 4439// assumption that the entire string is available in the UText's 4440// chunk buffer. For now, that means we can use int32_t indexes, 4441// except for anything that needs to be saved (like group starts 4442// and ends). 4443// 4444// startIdx: begin matching a this index. 4445// toEnd: if true, match must extend to end of the input region 4446// 4447//-------------------------------------------------------------------------------- 4448void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) { 4449 UBool isMatch = FALSE; // True if the we have a match. 4450 4451 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards 4452 4453 int32_t op; // Operation from the compiled pattern, split into 4454 int32_t opType; // the opcode 4455 int32_t opValue; // and the operand value. 4456 4457#ifdef REGEX_RUN_DEBUG 4458 if (fTraceDebug) 4459 { 4460 printf("MatchAt(startIdx=%ld)\n", startIdx); 4461 printf("Original Pattern: "); 4462 UChar32 c = utext_next32From(fPattern->fPattern, 0); 4463 while (c != U_SENTINEL) { 4464 if (c<32 || c>256) { 4465 c = '.'; 4466 } 4467 REGEX_DUMP_DEBUG_PRINTF(("%c", c)); 4468 4469 c = UTEXT_NEXT32(fPattern->fPattern); 4470 } 4471 printf("\n"); 4472 printf("Input String: "); 4473 c = utext_next32From(fInputText, 0); 4474 while (c != U_SENTINEL) { 4475 if (c<32 || c>256) { 4476 c = '.'; 4477 } 4478 printf("%c", c); 4479 4480 c = UTEXT_NEXT32(fInputText); 4481 } 4482 printf("\n"); 4483 printf("\n"); 4484 } 4485#endif 4486 4487 if (U_FAILURE(status)) { 4488 return; 4489 } 4490 4491 // Cache frequently referenced items from the compiled pattern 4492 // 4493 int64_t *pat = fPattern->fCompiledPat->getBuffer(); 4494 4495 const UChar *litText = fPattern->fLiteralText.getBuffer(); 4496 UVector *sets = fPattern->fSets; 4497 4498 const UChar *inputBuf = fInputText->chunkContents; 4499 4500 fFrameSize = fPattern->fFrameSize; 4501 REStackFrame *fp = resetStack(); 4502 4503 fp->fPatIdx = 0; 4504 fp->fInputIdx = startIdx; 4505 4506 // Zero out the pattern's static data 4507 int32_t i; 4508 for (i = 0; i<fPattern->fDataSize; i++) { 4509 fData[i] = 0; 4510 } 4511 4512 // 4513 // Main loop for interpreting the compiled pattern. 4514 // One iteration of the loop per pattern operation performed. 4515 // 4516 for (;;) { 4517#if 0 4518 if (_heapchk() != _HEAPOK) { 4519 fprintf(stderr, "Heap Trouble\n"); 4520 } 4521#endif 4522 4523 op = (int32_t)pat[fp->fPatIdx]; 4524 opType = URX_TYPE(op); 4525 opValue = URX_VAL(op); 4526#ifdef REGEX_RUN_DEBUG 4527 if (fTraceDebug) { 4528 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx); 4529 printf("inputIdx=%d inputChar=%x sp=%3d activeLimit=%d ", fp->fInputIdx, 4530 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit); 4531 fPattern->dumpOp(fp->fPatIdx); 4532 } 4533#endif 4534 fp->fPatIdx++; 4535 4536 switch (opType) { 4537 4538 4539 case URX_NOP: 4540 break; 4541 4542 4543 case URX_BACKTRACK: 4544 // Force a backtrack. In some circumstances, the pattern compiler 4545 // will notice that the pattern can't possibly match anything, and will 4546 // emit one of these at that point. 4547 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4548 break; 4549 4550 4551 case URX_ONECHAR: 4552 if (fp->fInputIdx < fActiveLimit) { 4553 UChar32 c; 4554 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4555 if (c == opValue) { 4556 break; 4557 } 4558 } else { 4559 fHitEnd = TRUE; 4560 } 4561 4562 #ifdef REGEX_SMART_BACKTRACKING 4563 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 4564 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4565 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4566 int64_t reverseIndex = fp->fInputIdx; 4567 UChar32 c; 4568 do { 4569 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4570 if (c == opValue) { 4571 break; 4572 } 4573 } while (reverseIndex > backSearchIndex); 4574 if (c == opValue) { 4575 fHitEnd = FALSE; 4576 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4577 fp->fInputIdx = reverseIndex; 4578 if (fp->fInputIdx > backSearchIndex) { 4579 fp = StateSave(fp, fp->fPatIdx, status); 4580 } 4581 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4582 break; 4583 } 4584 } 4585 } 4586 #endif 4587 4588 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4589 break; 4590 4591 4592 case URX_STRING: 4593 { 4594 // Test input against a literal string. 4595 // Strings require two slots in the compiled pattern, one for the 4596 // offset to the string text, and one for the length. 4597 int32_t stringStartIdx = opValue; 4598 int32_t stringLen; 4599 4600 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand 4601 fp->fPatIdx++; 4602 opType = URX_TYPE(op); 4603 stringLen = URX_VAL(op); 4604 U_ASSERT(opType == URX_STRING_LEN); 4605 U_ASSERT(stringLen >= 2); 4606 4607 if (fp->fInputIdx + stringLen > fActiveLimit) { 4608 // No match. String is longer than the remaining input text. 4609 fHitEnd = TRUE; // TODO: See ticket 6074 4610 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4611 break; 4612 } 4613 4614 const UChar * pInp = inputBuf + fp->fInputIdx; 4615 const UChar * pPat = litText+stringStartIdx; 4616 const UChar * pEnd = pInp + stringLen; 4617 UBool success = FALSE; 4618 for(;;) { 4619 if (*pInp == *pPat) { 4620 pInp++; 4621 pPat++; 4622 if (pInp == pEnd) { 4623 // Successful Match. 4624 success = TRUE; 4625 break; 4626 } 4627 } else { 4628 // Match failed. 4629 break; 4630 } 4631 } 4632 4633 if (success) { 4634 fp->fInputIdx += stringLen; 4635 } else { 4636 #ifdef REGEX_SMART_BACKTRACKING 4637 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 4638 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 4639 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 4640 // Reset to last start point 4641 int64_t reverseIndex = fp->fInputIdx; 4642 UChar32 c; 4643 pPat = litText+stringStartIdx; 4644 4645 // Search backwards for a possible start 4646 do { 4647 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 4648 if ((U_IS_BMP(c) && *pPat == c) || 4649 (*pPat == U16_LEAD(c) && *(pPat+1) == U16_TRAIL(c))) { 4650 success = TRUE; 4651 break; 4652 } 4653 } while (reverseIndex > backSearchIndex); 4654 4655 // And try again 4656 if (success) { 4657 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4658 fp->fInputIdx = reverseIndex; 4659 if (fp->fInputIdx > backSearchIndex) { 4660 fp = StateSave(fp, fp->fPatIdx, status); 4661 } 4662 fp->fPatIdx++; // Skip the LOOP_C, we just did that 4663 break; 4664 } 4665 } 4666 } 4667 #endif 4668 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4669 } 4670 } 4671 break; 4672 4673 4674 case URX_STATE_SAVE: 4675 fp = StateSave(fp, opValue, status); 4676 break; 4677 4678 4679 case URX_END: 4680 // The match loop will exit via this path on a successful match, 4681 // when we reach the end of the pattern. 4682 if (toEnd && fp->fInputIdx != fActiveLimit) { 4683 // The pattern matched, but not to the end of input. Try some more. 4684 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4685 break; 4686 } 4687 isMatch = TRUE; 4688 goto breakFromLoop; 4689 4690 // Start and End Capture stack frame variables are laid out out like this: 4691 // fp->fExtra[opValue] - The start of a completed capture group 4692 // opValue+1 - The end of a completed capture group 4693 // opValue+2 - the start of a capture group whose end 4694 // has not yet been reached (and might not ever be). 4695 case URX_START_CAPTURE: 4696 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4697 fp->fExtra[opValue+2] = fp->fInputIdx; 4698 break; 4699 4700 4701 case URX_END_CAPTURE: 4702 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3); 4703 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set. 4704 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real. 4705 fp->fExtra[opValue+1] = fp->fInputIdx; // End position 4706 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]); 4707 break; 4708 4709 4710 case URX_DOLLAR: // $, test for End of line 4711 // or for position before new line at end of input 4712 if (fp->fInputIdx < fAnchorLimit-2) { 4713 // We are no where near the end of input. Fail. 4714 // This is the common case. Keep it first. 4715 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4716 break; 4717 } 4718 if (fp->fInputIdx >= fAnchorLimit) { 4719 // We really are at the end of input. Success. 4720 fHitEnd = TRUE; 4721 fRequireEnd = TRUE; 4722 break; 4723 } 4724 4725 // If we are positioned just before a new-line that is located at the 4726 // end of input, succeed. 4727 if (fp->fInputIdx == fAnchorLimit-1) { 4728 UChar32 c; 4729 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c); 4730 4731 if ((c>=0x0a && c<=0x0d) || c==0x85 || c==0x2028 || c==0x2029) { 4732 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4733 // At new-line at end of input. Success 4734 fHitEnd = TRUE; 4735 fRequireEnd = TRUE; 4736 break; 4737 } 4738 } 4739 } else if (fp->fInputIdx == fAnchorLimit-2 && 4740 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) { 4741 fHitEnd = TRUE; 4742 fRequireEnd = TRUE; 4743 break; // At CR/LF at end of input. Success 4744 } 4745 4746 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4747 4748 break; 4749 4750 4751 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode. 4752 if (fp->fInputIdx >= fAnchorLimit-1) { 4753 // Either at the last character of input, or off the end. 4754 if (fp->fInputIdx == fAnchorLimit-1) { 4755 // At last char of input. Success if it's a new line. 4756 if (inputBuf[fp->fInputIdx] == 0x0a) { 4757 fHitEnd = TRUE; 4758 fRequireEnd = TRUE; 4759 break; 4760 } 4761 } else { 4762 // Off the end of input. Success. 4763 fHitEnd = TRUE; 4764 fRequireEnd = TRUE; 4765 break; 4766 } 4767 } 4768 4769 // Not at end of input. Back-track out. 4770 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4771 break; 4772 4773 4774 case URX_DOLLAR_M: // $, test for End of line in multi-line mode 4775 { 4776 if (fp->fInputIdx >= fAnchorLimit) { 4777 // We really are at the end of input. Success. 4778 fHitEnd = TRUE; 4779 fRequireEnd = TRUE; 4780 break; 4781 } 4782 // If we are positioned just before a new-line, succeed. 4783 // It makes no difference where the new-line is within the input. 4784 UChar32 c = inputBuf[fp->fInputIdx]; 4785 if ((c>=0x0a && c<=0x0d) || c==0x85 ||c==0x2028 || c==0x2029) { 4786 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence 4787 // In multi-line mode, hitting a new-line just before the end of input does not 4788 // set the hitEnd or requireEnd flags 4789 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) { 4790 break; 4791 } 4792 } 4793 // not at a new line. Fail. 4794 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4795 } 4796 break; 4797 4798 4799 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode 4800 { 4801 if (fp->fInputIdx >= fAnchorLimit) { 4802 // We really are at the end of input. Success. 4803 fHitEnd = TRUE; 4804 fRequireEnd = TRUE; // Java set requireEnd in this case, even though 4805 break; // adding a new-line would not lose the match. 4806 } 4807 // If we are not positioned just before a new-line, the test fails; backtrack out. 4808 // It makes no difference where the new-line is within the input. 4809 if (inputBuf[fp->fInputIdx] != 0x0a) { 4810 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4811 } 4812 } 4813 break; 4814 4815 4816 case URX_CARET: // ^, test for start of line 4817 if (fp->fInputIdx != fAnchorStart) { 4818 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4819 } 4820 break; 4821 4822 4823 case URX_CARET_M: // ^, test for start of line in mulit-line mode 4824 { 4825 if (fp->fInputIdx == fAnchorStart) { 4826 // We are at the start input. Success. 4827 break; 4828 } 4829 // Check whether character just before the current pos is a new-line 4830 // unless we are at the end of input 4831 UChar c = inputBuf[fp->fInputIdx - 1]; 4832 if ((fp->fInputIdx < fAnchorLimit) && 4833 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 4834 // It's a new-line. ^ is true. Success. 4835 // TODO: what should be done with positions between a CR and LF? 4836 break; 4837 } 4838 // Not at the start of a line. Fail. 4839 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4840 } 4841 break; 4842 4843 4844 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode 4845 { 4846 U_ASSERT(fp->fInputIdx >= fAnchorStart); 4847 if (fp->fInputIdx <= fAnchorStart) { 4848 // We are at the start input. Success. 4849 break; 4850 } 4851 // Check whether character just before the current pos is a new-line 4852 U_ASSERT(fp->fInputIdx <= fAnchorLimit); 4853 UChar c = inputBuf[fp->fInputIdx - 1]; 4854 if (c != 0x0a) { 4855 // Not at the start of a line. Back-track out. 4856 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4857 } 4858 } 4859 break; 4860 4861 case URX_BACKSLASH_B: // Test for word boundaries 4862 { 4863 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx); 4864 success ^= (opValue != 0); // flip sense for \B 4865 if (!success) { 4866 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4867 } 4868 } 4869 break; 4870 4871 4872 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style 4873 { 4874 UBool success = isUWordBoundary(fp->fInputIdx); 4875 success ^= (opValue != 0); // flip sense for \B 4876 if (!success) { 4877 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4878 } 4879 } 4880 break; 4881 4882 4883 case URX_BACKSLASH_D: // Test for decimal digit 4884 { 4885 if (fp->fInputIdx >= fActiveLimit) { 4886 fHitEnd = TRUE; 4887 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4888 break; 4889 } 4890 4891 UChar32 c; 4892 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4893 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster. 4894 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER); 4895 success ^= (opValue != 0); // flip sense for \D 4896 if (!success) { 4897 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4898 } 4899 } 4900 break; 4901 4902 4903 case URX_BACKSLASH_G: // Test for position at end of previous match 4904 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) { 4905 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4906 } 4907 break; 4908 4909 4910 case URX_BACKSLASH_X: 4911 // Match a Grapheme, as defined by Unicode TR 29. 4912 // Differs slightly from Perl, which consumes combining marks independently 4913 // of context. 4914 { 4915 4916 // Fail if at end of input 4917 if (fp->fInputIdx >= fActiveLimit) { 4918 fHitEnd = TRUE; 4919 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4920 break; 4921 } 4922 4923 // Examine (and consume) the current char. 4924 // Dispatch into a little state machine, based on the char. 4925 UChar32 c; 4926 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4927 UnicodeSet **sets = fPattern->fStaticSets; 4928 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend; 4929 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control; 4930 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4931 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4932 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4933 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4934 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4935 goto GC_Extend; 4936 4937 4938 4939GC_L: 4940 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4941 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4942 if (sets[URX_GC_L]->contains(c)) goto GC_L; 4943 if (sets[URX_GC_LV]->contains(c)) goto GC_V; 4944 if (sets[URX_GC_LVT]->contains(c)) goto GC_T; 4945 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4946 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4947 goto GC_Extend; 4948 4949GC_V: 4950 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4951 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4952 if (sets[URX_GC_V]->contains(c)) goto GC_V; 4953 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4954 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4955 goto GC_Extend; 4956 4957GC_T: 4958 if (fp->fInputIdx >= fActiveLimit) goto GC_Done; 4959 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4960 if (sets[URX_GC_T]->contains(c)) goto GC_T; 4961 U16_PREV(inputBuf, 0, fp->fInputIdx, c); 4962 goto GC_Extend; 4963 4964GC_Extend: 4965 // Combining characters are consumed here 4966 for (;;) { 4967 if (fp->fInputIdx >= fActiveLimit) { 4968 break; 4969 } 4970 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 4971 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) { 4972 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 4973 break; 4974 } 4975 } 4976 goto GC_Done; 4977 4978GC_Control: 4979 // Most control chars stand alone (don't combine with combining chars), 4980 // except for that CR/LF sequence is a single grapheme cluster. 4981 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) { 4982 fp->fInputIdx++; 4983 } 4984 4985GC_Done: 4986 if (fp->fInputIdx >= fActiveLimit) { 4987 fHitEnd = TRUE; 4988 } 4989 break; 4990 } 4991 4992 4993 4994 4995 case URX_BACKSLASH_Z: // Test for end of Input 4996 if (fp->fInputIdx < fAnchorLimit) { 4997 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 4998 } else { 4999 fHitEnd = TRUE; 5000 fRequireEnd = TRUE; 5001 } 5002 break; 5003 5004 5005 5006 case URX_STATIC_SETREF: 5007 { 5008 // Test input character against one of the predefined sets 5009 // (Word Characters, for example) 5010 // The high bit of the op value is a flag for the match polarity. 5011 // 0: success if input char is in set. 5012 // 1: success if input char is not in set. 5013 if (fp->fInputIdx >= fActiveLimit) { 5014 fHitEnd = TRUE; 5015 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5016 break; 5017 } 5018 5019 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET); 5020 opValue &= ~URX_NEG_SET; 5021 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 5022 5023 UChar32 c; 5024 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5025 if (c < 256) { 5026 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5027 if (s8->contains(c)) { 5028 success = !success; 5029 } 5030 } else { 5031 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5032 if (s->contains(c)) { 5033 success = !success; 5034 } 5035 } 5036 if (!success) { 5037 #ifdef REGEX_SMART_BACKTRACKING 5038 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5039 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5040 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5041 // Try to find it, backwards 5042 int64_t reverseIndex = fp->fInputIdx; 5043 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5044 success = ((opValue & URX_NEG_SET) == URX_NEG_SET); // reset 5045 do { 5046 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5047 if (c < 256) { 5048 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5049 if (s8->contains(c)) { 5050 success = !success; 5051 } 5052 } else { 5053 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5054 if (s->contains(c)) { 5055 success = !success; 5056 } 5057 } 5058 } while (reverseIndex > backSearchIndex && !success); 5059 5060 if (success) { 5061 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5062 fp->fInputIdx = reverseIndex; 5063 if (fp->fInputIdx > backSearchIndex) { 5064 fp = StateSave(fp, fp->fPatIdx, status); 5065 } 5066 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5067 break; 5068 } 5069 } 5070 } 5071 #endif 5072 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5073 } 5074 } 5075 break; 5076 5077 5078 case URX_STAT_SETREF_N: 5079 { 5080 // Test input character for NOT being a member of one of 5081 // the predefined sets (Word Characters, for example) 5082 if (fp->fInputIdx >= fActiveLimit) { 5083 fHitEnd = TRUE; 5084 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5085 break; 5086 } 5087 5088 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET); 5089 5090 UChar32 c; 5091 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5092 if (c < 256) { 5093 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5094 if (s8->contains(c) == FALSE) { 5095 break; 5096 } 5097 } else { 5098 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5099 if (s->contains(c) == FALSE) { 5100 break; 5101 } 5102 } 5103 5104 #ifdef REGEX_SMART_BACKTRACKING 5105 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5106 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5107 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5108 // Try to find it, backwards 5109 int64_t reverseIndex = fp->fInputIdx; 5110 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5111 UBool success = FALSE; 5112 do { 5113 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5114 if (c < 256) { 5115 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue]; 5116 if (s8->contains(c) == FALSE) { 5117 success = TRUE; 5118 break; 5119 } 5120 } else { 5121 const UnicodeSet *s = fPattern->fStaticSets[opValue]; 5122 if (s->contains(c) == FALSE) { 5123 success = TRUE; 5124 break; 5125 } 5126 } 5127 } while (reverseIndex > backSearchIndex); 5128 5129 if (success) { 5130 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5131 fp->fInputIdx = reverseIndex; 5132 if (fp->fInputIdx > backSearchIndex) { 5133 fp = StateSave(fp, fp->fPatIdx, status); 5134 } 5135 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5136 break; 5137 } 5138 } 5139 } 5140 #endif 5141 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5142 } 5143 break; 5144 5145 5146 case URX_SETREF: 5147 { 5148 if (fp->fInputIdx >= fActiveLimit) { 5149 fHitEnd = TRUE; 5150 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5151 break; 5152 } 5153 5154 U_ASSERT(opValue > 0 && opValue < sets->size()); 5155 5156 // There is input left. Pick up one char and test it for set membership. 5157 UChar32 c; 5158 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5159 if (c<256) { 5160 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5161 if (s8->contains(c)) { 5162 // The character is in the set. A Match. 5163 break; 5164 } 5165 } else { 5166 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5167 if (s->contains(c)) { 5168 // The character is in the set. A Match. 5169 break; 5170 } 5171 } 5172 5173 // the character wasn't in the set. 5174 #ifdef REGEX_SMART_BACKTRACKING 5175 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5176 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5177 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5178 // Try to find it, backwards 5179 int64_t reverseIndex = fp->fInputIdx; 5180 U16_BACK_1(inputBuf, backSearchIndex, reverseIndex); // skip the first character we tried 5181 UBool success = FALSE; 5182 do { 5183 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5184 if (c < 256) { 5185 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5186 if (s8->contains(c)) { 5187 success = TRUE; 5188 break; 5189 } 5190 } else { 5191 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5192 if (s->contains(c)) { 5193 success = TRUE; 5194 break; 5195 } 5196 } 5197 } while (reverseIndex > backSearchIndex); 5198 5199 if (success) { 5200 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5201 fp->fInputIdx = reverseIndex; 5202 if (fp->fInputIdx > reverseIndex) { 5203 fp = StateSave(fp, fp->fPatIdx, status); 5204 } 5205 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5206 break; 5207 } 5208 } 5209 } 5210 #endif 5211 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5212 } 5213 break; 5214 5215 5216 case URX_DOTANY: 5217 { 5218 // . matches anything, but stops at end-of-line. 5219 if (fp->fInputIdx >= fActiveLimit) { 5220 // At end of input. Match failed. Backtrack out. 5221 fHitEnd = TRUE; 5222 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5223 break; 5224 } 5225 5226 // There is input left. Advance over one char, unless we've hit end-of-line 5227 UChar32 c; 5228 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5229 if (((c & 0x7f) <= 0x29) && // First quickly bypass as many chars as possible 5230 ((c<=0x0d && c>=0x0a) || c==0x85 ||c==0x2028 || c==0x2029)) { 5231 // End of line in normal mode. . does not match. 5232 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5233 break; 5234 } 5235 } 5236 break; 5237 5238 5239 case URX_DOTANY_ALL: 5240 { 5241 // . in dot-matches-all (including new lines) mode 5242 if (fp->fInputIdx >= fActiveLimit) { 5243 // At end of input. Match failed. Backtrack out. 5244 fHitEnd = TRUE; 5245 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5246 break; 5247 } 5248 5249 // There is input left. Advance over one char, except if we are 5250 // at a cr/lf, advance over both of them. 5251 UChar32 c; 5252 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5253 if (c==0x0d && fp->fInputIdx < fActiveLimit) { 5254 // In the case of a CR/LF, we need to advance over both. 5255 if (inputBuf[fp->fInputIdx] == 0x0a) { 5256 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit); 5257 } 5258 } 5259 } 5260 break; 5261 5262 5263 case URX_DOTANY_UNIX: 5264 { 5265 // '.' operator, matches all, but stops at end-of-line. 5266 // UNIX_LINES mode, so 0x0a is the only recognized line ending. 5267 if (fp->fInputIdx >= fActiveLimit) { 5268 // At end of input. Match failed. Backtrack out. 5269 fHitEnd = TRUE; 5270 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5271 break; 5272 } 5273 5274 // There is input left. Advance over one char, unless we've hit end-of-line 5275 UChar32 c; 5276 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5277 if (c == 0x0a) { 5278 // End of line in normal mode. '.' does not match the \n 5279 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5280 } 5281 } 5282 break; 5283 5284 5285 case URX_JMP: 5286 fp->fPatIdx = opValue; 5287 break; 5288 5289 case URX_FAIL: 5290 isMatch = FALSE; 5291 goto breakFromLoop; 5292 5293 case URX_JMP_SAV: 5294 U_ASSERT(opValue < fPattern->fCompiledPat->size()); 5295 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5296 fp->fPatIdx = opValue; // Then JMP. 5297 break; 5298 5299 case URX_JMP_SAV_X: 5300 // This opcode is used with (x)+, when x can match a zero length string. 5301 // Same as JMP_SAV, except conditional on the match having made forward progress. 5302 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the 5303 // data address of the input position at the start of the loop. 5304 { 5305 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size()); 5306 int32_t stoOp = (int32_t)pat[opValue-1]; 5307 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC); 5308 int32_t frameLoc = URX_VAL(stoOp); 5309 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize); 5310 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc]; 5311 U_ASSERT(prevInputIdx <= fp->fInputIdx); 5312 if (prevInputIdx < fp->fInputIdx) { 5313 // The match did make progress. Repeat the loop. 5314 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current 5315 fp->fPatIdx = opValue; 5316 fp->fExtra[frameLoc] = fp->fInputIdx; 5317 } 5318 // If the input position did not advance, we do nothing here, 5319 // execution will fall out of the loop. 5320 } 5321 break; 5322 5323 case URX_CTR_INIT: 5324 { 5325 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5326 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5327 5328 // Pick up the three extra operands that CTR_INIT has, and 5329 // skip the pattern location counter past 5330 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5331 fp->fPatIdx += 3; 5332 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5333 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5334 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5335 U_ASSERT(minCount>=0); 5336 U_ASSERT(maxCount>=minCount || maxCount==-1); 5337 U_ASSERT(loopLoc>fp->fPatIdx); 5338 5339 if (minCount == 0) { 5340 fp = StateSave(fp, loopLoc+1, status); 5341 } 5342 if (maxCount == 0) { 5343 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5344 } 5345 } 5346 break; 5347 5348 case URX_CTR_LOOP: 5349 { 5350 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5351 int32_t initOp = (int32_t)pat[opValue]; 5352 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT); 5353 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5354 int32_t minCount = (int32_t)pat[opValue+2]; 5355 int32_t maxCount = (int32_t)pat[opValue+3]; 5356 // Increment the counter. Note: we DIDN'T worry about counter 5357 // overflow, since the data comes from UnicodeStrings, which 5358 // stores its length in an int32_t. Do we have to think about 5359 // this now that we're using UText? Probably not, since the length 5360 // in UChar32s is still an int32_t. 5361 (*pCounter)++; 5362 U_ASSERT(*pCounter > 0); 5363 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 5364 U_ASSERT(*pCounter == maxCount || maxCount == -1); 5365 break; 5366 } 5367 if (*pCounter >= minCount) { 5368 fp = StateSave(fp, fp->fPatIdx, status); 5369 } 5370 fp->fPatIdx = opValue + 4; // Loop back. 5371 } 5372 break; 5373 5374 case URX_CTR_INIT_NG: 5375 { 5376 // Initialize a non-greedy loop 5377 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2); 5378 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero 5379 5380 // Pick up the three extra operands that CTR_INIT has, and 5381 // skip the pattern location counter past 5382 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5383 fp->fPatIdx += 3; 5384 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]); 5385 int32_t minCount = (int32_t)pat[instrOperandLoc+1]; 5386 int32_t maxCount = (int32_t)pat[instrOperandLoc+2]; 5387 U_ASSERT(minCount>=0); 5388 U_ASSERT(maxCount>=minCount || maxCount==-1); 5389 U_ASSERT(loopLoc>fp->fPatIdx); 5390 5391 if (minCount == 0) { 5392 if (maxCount != 0) { 5393 fp = StateSave(fp, fp->fPatIdx, status); 5394 } 5395 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block 5396 } 5397 } 5398 break; 5399 5400 case URX_CTR_LOOP_NG: 5401 { 5402 // Non-greedy {min, max} loops 5403 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2); 5404 int32_t initOp = (int32_t)pat[opValue]; 5405 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG); 5406 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)]; 5407 int32_t minCount = (int32_t)pat[opValue+2]; 5408 int32_t maxCount = (int32_t)pat[opValue+3]; 5409 // Increment the counter. Note: we DIDN'T worry about counter 5410 // overflow, since the data comes from UnicodeStrings, which 5411 // stores its length in an int32_t. Do we have to think about 5412 // this now that we're using UText? Probably not, since the length 5413 // in UChar32s is still an int32_t. 5414 (*pCounter)++; 5415 U_ASSERT(*pCounter > 0); 5416 5417 if ((uint64_t)*pCounter >= (uint32_t)maxCount) { 5418 // The loop has matched the maximum permitted number of times. 5419 // Break out of here with no action. Matching will 5420 // continue with the following pattern. 5421 U_ASSERT(*pCounter == maxCount || maxCount == -1); 5422 break; 5423 } 5424 5425 if (*pCounter < minCount) { 5426 // We haven't met the minimum number of matches yet. 5427 // Loop back for another one. 5428 fp->fPatIdx = opValue + 4; // Loop back. 5429 } else { 5430 // We do have the minimum number of matches. 5431 // Fall into the following pattern, but first do 5432 // a state save to the top of the loop, so that a failure 5433 // in the following pattern will try another iteration of the loop. 5434 fp = StateSave(fp, opValue + 4, status); 5435 } 5436 } 5437 break; 5438 5439 case URX_STO_SP: 5440 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5441 fData[opValue] = fStack->size(); 5442 break; 5443 5444 case URX_LD_SP: 5445 { 5446 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize); 5447 int32_t newStackSize = (int32_t)fData[opValue]; 5448 U_ASSERT(newStackSize <= fStack->size()); 5449 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5450 if (newFP == (int64_t *)fp) { 5451 break; 5452 } 5453 int32_t i; 5454 for (i=0; i<fFrameSize; i++) { 5455 newFP[i] = ((int64_t *)fp)[i]; 5456 } 5457 fp = (REStackFrame *)newFP; 5458 fStack->setSize(newStackSize); 5459 } 5460 break; 5461 5462 case URX_BACKREF: 5463 case URX_BACKREF_I: 5464 { 5465 U_ASSERT(opValue < fFrameSize); 5466 int64_t groupStartIdx = fp->fExtra[opValue]; 5467 int64_t groupEndIdx = fp->fExtra[opValue+1]; 5468 U_ASSERT(groupStartIdx <= groupEndIdx); 5469 int64_t len = groupEndIdx-groupStartIdx; 5470 if (groupStartIdx < 0) { 5471 // This capture group has not participated in the match thus far, 5472 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5473 } 5474 5475 if (len == 0) { 5476 // The capture group match was of an empty string. 5477 // Verified by testing: Perl matches succeed in this case, so 5478 // we do too. 5479 break; 5480 } 5481 5482 UBool haveMatch = FALSE; 5483 if (fp->fInputIdx + len <= fActiveLimit) { 5484 if (opType == URX_BACKREF) { 5485 if (u_strncmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, (int32_t)len) == 0) { 5486 haveMatch = TRUE; 5487 } 5488 } else { 5489 if (u_strncasecmp(inputBuf+groupStartIdx, inputBuf+fp->fInputIdx, 5490 (int32_t)len, U_FOLD_CASE_DEFAULT) == 0) { 5491 haveMatch = TRUE; 5492 } 5493 } 5494 } else { 5495 // TODO: probably need to do a partial string comparison, and only 5496 // set HitEnd if the available input matched. Ticket #6074 5497 fHitEnd = TRUE; 5498 } 5499 if (haveMatch) { 5500 fp->fInputIdx += len; // Match. Advance current input position. 5501 } else { 5502 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match. 5503 } 5504 } 5505 break; 5506 5507 case URX_STO_INP_LOC: 5508 { 5509 U_ASSERT(opValue >= 0 && opValue < fFrameSize); 5510 fp->fExtra[opValue] = fp->fInputIdx; 5511 } 5512 break; 5513 5514 case URX_JMPX: 5515 { 5516 int32_t instrOperandLoc = (int32_t)fp->fPatIdx; 5517 fp->fPatIdx += 1; 5518 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]); 5519 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize); 5520 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc]; 5521 U_ASSERT(savedInputIdx <= fp->fInputIdx); 5522 if (savedInputIdx < fp->fInputIdx) { 5523 fp->fPatIdx = opValue; // JMP 5524 } else { 5525 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop. 5526 } 5527 } 5528 break; 5529 5530 case URX_LA_START: 5531 { 5532 // Entering a lookahead block. 5533 // Save Stack Ptr, Input Pos. 5534 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5535 fData[opValue] = fStack->size(); 5536 fData[opValue+1] = fp->fInputIdx; 5537 fActiveStart = fLookStart; // Set the match region change for 5538 fActiveLimit = fLookLimit; // transparent bounds. 5539 } 5540 break; 5541 5542 case URX_LA_END: 5543 { 5544 // Leaving a look-ahead block. 5545 // restore Stack Ptr, Input Pos to positions they had on entry to block. 5546 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5547 int32_t stackSize = fStack->size(); 5548 int32_t newStackSize = (int32_t)fData[opValue]; 5549 U_ASSERT(stackSize >= newStackSize); 5550 if (stackSize > newStackSize) { 5551 // Copy the current top frame back to the new (cut back) top frame. 5552 // This makes the capture groups from within the look-ahead 5553 // expression available. 5554 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize; 5555 int32_t i; 5556 for (i=0; i<fFrameSize; i++) { 5557 newFP[i] = ((int64_t *)fp)[i]; 5558 } 5559 fp = (REStackFrame *)newFP; 5560 fStack->setSize(newStackSize); 5561 } 5562 fp->fInputIdx = fData[opValue+1]; 5563 5564 // Restore the active region bounds in the input string; they may have 5565 // been changed because of transparent bounds on a Region. 5566 fActiveStart = fRegionStart; 5567 fActiveLimit = fRegionLimit; 5568 } 5569 break; 5570 5571 case URX_ONECHAR_I: 5572 if (fp->fInputIdx < fActiveLimit) { 5573 UChar32 c; 5574 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5575 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5576 break; 5577 } 5578 } else { 5579 fHitEnd = TRUE; 5580 } 5581 5582 #ifdef REGEX_SMART_BACKTRACKING 5583 if (fp->fInputIdx > backSearchIndex && fStack->size() > fFrameSize) { 5584 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5585 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5586 UBool success = FALSE; 5587 int64_t reverseIndex = fp->fInputIdx; 5588 UChar32 c; 5589 while (reverseIndex > backSearchIndex) { 5590 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5591 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) { 5592 success = TRUE; 5593 break; 5594 } else if (c == U_SENTINEL) { 5595 break; 5596 } 5597 } 5598 if (success) { 5599 fHitEnd = FALSE; 5600 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5601 fp->fInputIdx = reverseIndex; 5602 if (fp->fInputIdx > backSearchIndex) { 5603 fp = StateSave(fp, fp->fPatIdx, status); 5604 } 5605 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5606 break; 5607 } 5608 } 5609 } 5610 #endif 5611 5612 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5613 break; 5614 5615 case URX_STRING_I: 5616 { 5617 // Test input against a literal string. 5618 // Strings require two slots in the compiled pattern, one for the 5619 // offset to the string text, and one for the length. 5620 const UCaseProps *csp = ucase_getSingleton(); 5621 { 5622 int32_t stringStartIdx, stringLen; 5623 stringStartIdx = opValue; 5624 5625 op = (int32_t)pat[fp->fPatIdx]; 5626 fp->fPatIdx++; 5627 opType = URX_TYPE(op); 5628 opValue = URX_VAL(op); 5629 U_ASSERT(opType == URX_STRING_LEN); 5630 stringLen = opValue; 5631 5632 const UChar *patternChars = litText+stringStartIdx; 5633 const UChar *patternEnd = patternChars+stringLen; 5634 5635 const UChar *foldChars = NULL; 5636 int32_t foldOffset, foldLength; 5637 UChar32 c; 5638 // BEGIN android-changed 5639 // For ICU ticket#8824 5640 UBool c_is_valid = FALSE; 5641 5642 #ifdef REGEX_SMART_BACKTRACKING 5643 int32_t originalInputIdx = fp->fInputIdx; 5644 #endif 5645 UBool success = TRUE; 5646 5647 foldOffset = foldLength = 0; 5648 5649 while (patternChars < patternEnd && success) { 5650 if (fp->fInputIdx < fActiveLimit) { // don't read past end of string 5651 if(foldOffset < foldLength) { 5652 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5653 c_is_valid = TRUE; 5654 } else { 5655 // test pre-condition of U16_NEXT: i < length 5656 U_ASSERT(fp->fInputIdx < fActiveLimit); 5657 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c); 5658 c_is_valid = TRUE; 5659 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 5660 if(foldLength >= 0) { 5661 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 5662 foldOffset = 0; 5663 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5664 } else { 5665 c = foldLength; 5666 foldLength = foldOffset; // to avoid reading chars from the folding buffer 5667 } 5668 } 5669 } 5670 } else { 5671 c_is_valid = FALSE; 5672 } 5673 5674 if (fp->fInputIdx <= fActiveLimit && c_is_valid) { 5675 if (U_IS_BMP(c)) { 5676 success = (*patternChars == c); 5677 patternChars += 1; 5678 } else if (patternChars+1 < patternEnd) { 5679 success = (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c)); 5680 patternChars += 2; 5681 } 5682 } else { 5683 success = FALSE; 5684 fHitEnd = TRUE; // TODO: See ticket 6074 5685 } 5686 } 5687 // END android-changed 5688 5689 if (!success) { 5690 #ifdef REGEX_SMART_BACKTRACKING 5691 if (fp->fInputIdx > backSearchIndex && fStack->size()) { 5692 REStackFrame *prevFrame = (REStackFrame *)fStack->peekFrame(fFrameSize); 5693 if (URX_LOOP_C == URX_TYPE(pat[prevFrame->fPatIdx]) && fp->fInputIdx <= prevFrame->fInputIdx) { 5694 // Reset to last start point 5695 int64_t reverseIndex = originalInputIdx; 5696 patternChars = litText+stringStartIdx; 5697 5698 // Search backwards for a possible start 5699 do { 5700 U16_PREV(inputBuf, backSearchIndex, reverseIndex, c); 5701 foldLength = ucase_toFullFolding(csp, c, &foldChars, U_FOLD_CASE_DEFAULT); 5702 if(foldLength >= 0) { 5703 if(foldLength <= UCASE_MAX_STRING_LENGTH) { // !!!: Does not correctly handle chars that fold to 0-length strings 5704 foldOffset = 0; 5705 U16_NEXT_UNSAFE(foldChars, foldOffset, c); 5706 } else { 5707 c = foldLength; 5708 foldLength = foldOffset; // to avoid reading chars from the folding buffer 5709 } 5710 } 5711 5712 if ((U_IS_BMP(c) && *patternChars == c) || 5713 (*patternChars == U16_LEAD(c) && *(patternChars+1) == U16_TRAIL(c))) { 5714 success = TRUE; 5715 break; 5716 } 5717 } while (reverseIndex > backSearchIndex); 5718 5719 // And try again 5720 if (success) { 5721 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5722 fp->fInputIdx = reverseIndex; 5723 if (fp->fInputIdx > backSearchIndex) { 5724 fp = StateSave(fp, fp->fPatIdx, status); 5725 } 5726 fp->fPatIdx++; // Skip the LOOP_C, we just did that 5727 break; 5728 } 5729 } 5730 } 5731 #endif 5732 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5733 } 5734 } 5735 } 5736 break; 5737 5738 case URX_LB_START: 5739 { 5740 // Entering a look-behind block. 5741 // Save Stack Ptr, Input Pos. 5742 // TODO: implement transparent bounds. Ticket #6067 5743 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5744 fData[opValue] = fStack->size(); 5745 fData[opValue+1] = fp->fInputIdx; 5746 // Init the variable containing the start index for attempted matches. 5747 fData[opValue+2] = -1; 5748 // Save input string length, then reset to pin any matches to end at 5749 // the current position. 5750 fData[opValue+3] = fActiveLimit; 5751 fActiveLimit = fp->fInputIdx; 5752 } 5753 break; 5754 5755 5756 case URX_LB_CONT: 5757 { 5758 // Positive Look-Behind, at top of loop checking for matches of LB expression 5759 // at all possible input starting positions. 5760 5761 // Fetch the min and max possible match lengths. They are the operands 5762 // of this op in the pattern. 5763 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5764 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5765 U_ASSERT(minML <= maxML); 5766 U_ASSERT(minML >= 0); 5767 5768 // Fetch (from data) the last input index where a match was attempted. 5769 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5770 int64_t *lbStartIdx = &fData[opValue+2]; 5771 if (*lbStartIdx < 0) { 5772 // First time through loop. 5773 *lbStartIdx = fp->fInputIdx - minML; 5774 } else { 5775 // 2nd through nth time through the loop. 5776 // Back up start position for match by one. 5777 if (*lbStartIdx == 0) { 5778 (*lbStartIdx)--; 5779 } else { 5780 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5781 } 5782 } 5783 5784 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5785 // We have tried all potential match starting points without 5786 // getting a match. Backtrack out, and out of the 5787 // Look Behind altogether. 5788 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5789 int64_t restoreInputLen = fData[opValue+3]; 5790 U_ASSERT(restoreInputLen >= fActiveLimit); 5791 U_ASSERT(restoreInputLen <= fInputLength); 5792 fActiveLimit = restoreInputLen; 5793 break; 5794 } 5795 5796 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5797 // (successful match will fall off the end of the loop.) 5798 fp = StateSave(fp, fp->fPatIdx-3, status); 5799 fp->fInputIdx = *lbStartIdx; 5800 } 5801 break; 5802 5803 case URX_LB_END: 5804 // End of a look-behind block, after a successful match. 5805 { 5806 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5807 if (fp->fInputIdx != fActiveLimit) { 5808 // The look-behind expression matched, but the match did not 5809 // extend all the way to the point that we are looking behind from. 5810 // FAIL out of here, which will take us back to the LB_CONT, which 5811 // will retry the match starting at another position or fail 5812 // the look-behind altogether, whichever is appropriate. 5813 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5814 break; 5815 } 5816 5817 // Look-behind match is good. Restore the orignal input string length, 5818 // which had been truncated to pin the end of the lookbehind match to the 5819 // position being looked-behind. 5820 int64_t originalInputLen = fData[opValue+3]; 5821 U_ASSERT(originalInputLen >= fActiveLimit); 5822 U_ASSERT(originalInputLen <= fInputLength); 5823 fActiveLimit = originalInputLen; 5824 } 5825 break; 5826 5827 5828 case URX_LBN_CONT: 5829 { 5830 // Negative Look-Behind, at top of loop checking for matches of LB expression 5831 // at all possible input starting positions. 5832 5833 // Fetch the extra parameters of this op. 5834 int32_t minML = (int32_t)pat[fp->fPatIdx++]; 5835 int32_t maxML = (int32_t)pat[fp->fPatIdx++]; 5836 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++]; 5837 continueLoc = URX_VAL(continueLoc); 5838 U_ASSERT(minML <= maxML); 5839 U_ASSERT(minML >= 0); 5840 U_ASSERT(continueLoc > fp->fPatIdx); 5841 5842 // Fetch (from data) the last input index where a match was attempted. 5843 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5844 int64_t *lbStartIdx = &fData[opValue+2]; 5845 if (*lbStartIdx < 0) { 5846 // First time through loop. 5847 *lbStartIdx = fp->fInputIdx - minML; 5848 } else { 5849 // 2nd through nth time through the loop. 5850 // Back up start position for match by one. 5851 if (*lbStartIdx == 0) { 5852 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0. 5853 } else { 5854 U16_BACK_1(inputBuf, 0, *lbStartIdx); 5855 } 5856 } 5857 5858 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) { 5859 // We have tried all potential match starting points without 5860 // getting a match, which means that the negative lookbehind as 5861 // a whole has succeeded. Jump forward to the continue location 5862 int64_t restoreInputLen = fData[opValue+3]; 5863 U_ASSERT(restoreInputLen >= fActiveLimit); 5864 U_ASSERT(restoreInputLen <= fInputLength); 5865 fActiveLimit = restoreInputLen; 5866 fp->fPatIdx = continueLoc; 5867 break; 5868 } 5869 5870 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop. 5871 // (successful match will cause a FAIL out of the loop altogether.) 5872 fp = StateSave(fp, fp->fPatIdx-4, status); 5873 fp->fInputIdx = *lbStartIdx; 5874 } 5875 break; 5876 5877 case URX_LBN_END: 5878 // End of a negative look-behind block, after a successful match. 5879 { 5880 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5881 if (fp->fInputIdx != fActiveLimit) { 5882 // The look-behind expression matched, but the match did not 5883 // extend all the way to the point that we are looking behind from. 5884 // FAIL out of here, which will take us back to the LB_CONT, which 5885 // will retry the match starting at another position or succeed 5886 // the look-behind altogether, whichever is appropriate. 5887 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5888 break; 5889 } 5890 5891 // Look-behind expression matched, which means look-behind test as 5892 // a whole Fails 5893 5894 // Restore the orignal input string length, which had been truncated 5895 // inorder to pin the end of the lookbehind match 5896 // to the position being looked-behind. 5897 int64_t originalInputLen = fData[opValue+3]; 5898 U_ASSERT(originalInputLen >= fActiveLimit); 5899 U_ASSERT(originalInputLen <= fInputLength); 5900 fActiveLimit = originalInputLen; 5901 5902 // Restore original stack position, discarding any state saved 5903 // by the successful pattern match. 5904 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize); 5905 int32_t newStackSize = (int32_t)fData[opValue]; 5906 U_ASSERT(fStack->size() > newStackSize); 5907 fStack->setSize(newStackSize); 5908 5909 // FAIL, which will take control back to someplace 5910 // prior to entering the look-behind test. 5911 fp = (REStackFrame *)fStack->popFrame(fFrameSize); 5912 } 5913 break; 5914 5915 5916 case URX_LOOP_SR_I: 5917 // Loop Initialization for the optimized implementation of 5918 // [some character set]* 5919 // This op scans through all matching input. 5920 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5921 { 5922 U_ASSERT(opValue > 0 && opValue < sets->size()); 5923 Regex8BitSet *s8 = &fPattern->fSets8[opValue]; 5924 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue); 5925 5926 // Loop through input, until either the input is exhausted or 5927 // we reach a character that is not a member of the set. 5928 int32_t ix = (int32_t)fp->fInputIdx; 5929 for (;;) { 5930 if (ix >= fActiveLimit) { 5931 fHitEnd = TRUE; 5932 break; 5933 } 5934 UChar32 c; 5935 U16_NEXT(inputBuf, ix, fActiveLimit, c); 5936 if (c<256) { 5937 if (s8->contains(c) == FALSE) { 5938 U16_BACK_1(inputBuf, 0, ix); 5939 break; 5940 } 5941 } else { 5942 if (s->contains(c) == FALSE) { 5943 U16_BACK_1(inputBuf, 0, ix); 5944 break; 5945 } 5946 } 5947 } 5948 5949 // If there were no matching characters, skip over the loop altogether. 5950 // The loop doesn't run at all, a * op always succeeds. 5951 if (ix == fp->fInputIdx) { 5952 fp->fPatIdx++; // skip the URX_LOOP_C op. 5953 break; 5954 } 5955 5956 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 5957 // must follow. It's operand is the stack location 5958 // that holds the starting input index for the match of this [set]* 5959 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 5960 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 5961 int32_t stackLoc = URX_VAL(loopcOp); 5962 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 5963 fp->fExtra[stackLoc] = fp->fInputIdx; 5964 #ifdef REGEX_SMART_BACKTRACKING 5965 backSearchIndex = fp->fInputIdx; 5966 #endif 5967 fp->fInputIdx = ix; 5968 5969 // Save State to the URX_LOOP_C op that follows this one, 5970 // so that match failures in the following code will return to there. 5971 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 5972 fp = StateSave(fp, fp->fPatIdx, status); 5973 fp->fPatIdx++; 5974 } 5975 break; 5976 5977 5978 case URX_LOOP_DOT_I: 5979 // Loop Initialization for the optimized implementation of .* 5980 // This op scans through all remaining input. 5981 // The following LOOP_C op emulates stack unwinding if the following pattern fails. 5982 { 5983 // Loop through input until the input is exhausted (we reach an end-of-line) 5984 // In DOTALL mode, we can just go straight to the end of the input. 5985 int32_t ix; 5986 if ((opValue & 1) == 1) { 5987 // Dot-matches-All mode. Jump straight to the end of the string. 5988 ix = (int32_t)fActiveLimit; 5989 fHitEnd = TRUE; 5990 } else { 5991 // NOT DOT ALL mode. Line endings do not match '.' 5992 // Scan forward until a line ending or end of input. 5993 ix = (int32_t)fp->fInputIdx; 5994 for (;;) { 5995 if (ix >= fActiveLimit) { 5996 fHitEnd = TRUE; 5997 break; 5998 } 5999 UChar32 c; 6000 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++] 6001 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s 6002 if ((c == 0x0a) || // 0x0a is newline in both modes. 6003 (((opValue & 2) == 0) && // IF not UNIX_LINES mode 6004 ((c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029))) { 6005 // char is a line ending. Put the input pos back to the 6006 // line ending char, and exit the scanning loop. 6007 U16_BACK_1(inputBuf, 0, ix); 6008 break; 6009 } 6010 } 6011 } 6012 } 6013 6014 // If there were no matching characters, skip over the loop altogether. 6015 // The loop doesn't run at all, a * op always succeeds. 6016 if (ix == fp->fInputIdx) { 6017 fp->fPatIdx++; // skip the URX_LOOP_C op. 6018 break; 6019 } 6020 6021 // Peek ahead in the compiled pattern, to the URX_LOOP_C that 6022 // must follow. It's operand is the stack location 6023 // that holds the starting input index for the match of this .* 6024 int32_t loopcOp = (int32_t)pat[fp->fPatIdx]; 6025 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C); 6026 int32_t stackLoc = URX_VAL(loopcOp); 6027 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize); 6028 fp->fExtra[stackLoc] = fp->fInputIdx; 6029 #ifdef REGEX_SMART_BACKTRACKING 6030 backSearchIndex = fp->fInputIdx; 6031 #endif 6032 fp->fInputIdx = ix; 6033 6034 // Save State to the URX_LOOP_C op that follows this one, 6035 // so that match failures in the following code will return to there. 6036 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here. 6037 fp = StateSave(fp, fp->fPatIdx, status); 6038 fp->fPatIdx++; 6039 } 6040 break; 6041 6042 6043 case URX_LOOP_C: 6044 { 6045 U_ASSERT(opValue>=0 && opValue<fFrameSize); 6046 backSearchIndex = (int32_t)fp->fExtra[opValue]; 6047 U_ASSERT(backSearchIndex <= fp->fInputIdx); 6048 if (backSearchIndex == fp->fInputIdx) { 6049 // We've backed up the input idx to the point that the loop started. 6050 // The loop is done. Leave here without saving state. 6051 // Subsequent failures won't come back here. 6052 break; 6053 } 6054 // Set up for the next iteration of the loop, with input index 6055 // backed up by one from the last time through, 6056 // and a state save to this instruction in case the following code fails again. 6057 // (We're going backwards because this loop emulates stack unwinding, not 6058 // the initial scan forward.) 6059 U_ASSERT(fp->fInputIdx > 0); 6060 UChar32 prevC; 6061 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit? 6062 6063 if (prevC == 0x0a && 6064 fp->fInputIdx > backSearchIndex && 6065 inputBuf[fp->fInputIdx-1] == 0x0d) { 6066 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2]; 6067 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) { 6068 // .*, stepping back over CRLF pair. 6069 U16_BACK_1(inputBuf, 0, fp->fInputIdx); 6070 } 6071 } 6072 6073 6074 fp = StateSave(fp, fp->fPatIdx-1, status); 6075 } 6076 break; 6077 6078 6079 6080 default: 6081 // Trouble. The compiled pattern contains an entry with an 6082 // unrecognized type tag. 6083 U_ASSERT(FALSE); 6084 } 6085 6086 if (U_FAILURE(status)) { 6087 isMatch = FALSE; 6088 break; 6089 } 6090 } 6091 6092breakFromLoop: 6093 fMatch = isMatch; 6094 if (isMatch) { 6095 fLastMatchEnd = fMatchEnd; 6096 fMatchStart = startIdx; 6097 fMatchEnd = fp->fInputIdx; 6098 if (fTraceDebug) { 6099 REGEX_RUN_DEBUG_PRINTF(("Match. start=%d end=%d\n\n", fMatchStart, fMatchEnd)); 6100 } 6101 } 6102 else 6103 { 6104 if (fTraceDebug) { 6105 REGEX_RUN_DEBUG_PRINTF(("No match\n\n")); 6106 } 6107 } 6108 6109 fFrame = fp; // The active stack frame when the engine stopped. 6110 // Contains the capture group results that we need to 6111 // access later. 6112 6113 return; 6114} 6115 6116 6117UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher) 6118 6119U_NAMESPACE_END 6120 6121#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 6122 6123