1// 2// file: regexcmp.cpp 3// 4// Copyright (C) 2002-2013 International Business Machines Corporation and others. 5// All Rights Reserved. 6// 7// This file contains the ICU regular expression compiler, which is responsible 8// for processing a regular expression pattern into the compiled form that 9// is used by the match finding engine. 10// 11 12#include "unicode/utypes.h" 13 14#if !UCONFIG_NO_REGULAR_EXPRESSIONS 15 16#include "unicode/ustring.h" 17#include "unicode/unistr.h" 18#include "unicode/uniset.h" 19#include "unicode/uchar.h" 20#include "unicode/uchriter.h" 21#include "unicode/parsepos.h" 22#include "unicode/parseerr.h" 23#include "unicode/regex.h" 24#include "unicode/utf.h" 25#include "unicode/utf16.h" 26#include "patternprops.h" 27#include "putilimp.h" 28#include "cmemory.h" 29#include "cstring.h" 30#include "uvectr32.h" 31#include "uvectr64.h" 32#include "uassert.h" 33#include "ucln_in.h" 34#include "uinvchar.h" 35 36#include "regeximp.h" 37#include "regexcst.h" // Contains state table for the regex pattern parser. 38 // generated by a Perl script. 39#include "regexcmp.h" 40#include "regexst.h" 41#include "regextxt.h" 42 43 44 45U_NAMESPACE_BEGIN 46 47 48//------------------------------------------------------------------------------ 49// 50// Constructor. 51// 52//------------------------------------------------------------------------------ 53RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) : 54 fParenStack(status), fSetStack(status), fSetOpStack(status) 55{ 56 // Lazy init of all shared global sets (needed for init()'s empty text) 57 RegexStaticSets::initGlobals(&status); 58 59 fStatus = &status; 60 61 fRXPat = rxp; 62 fScanIndex = 0; 63 fLastChar = -1; 64 fPeekChar = -1; 65 fLineNum = 1; 66 fCharNum = 0; 67 fQuoteMode = FALSE; 68 fInBackslashQuote = FALSE; 69 fModeFlags = fRXPat->fFlags | 0x80000000; 70 fEOLComments = TRUE; 71 72 fMatchOpenParen = -1; 73 fMatchCloseParen = -1; 74 75 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) { 76 status = rxp->fDeferredStatus; 77 } 78} 79 80static const UChar chAmp = 0x26; // '&' 81static const UChar chDash = 0x2d; // '-' 82 83 84//------------------------------------------------------------------------------ 85// 86// Destructor 87// 88//------------------------------------------------------------------------------ 89RegexCompile::~RegexCompile() { 90} 91 92static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) { 93 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec)); 94} 95 96//------------------------------------------------------------------------------ 97// 98// Compile regex pattern. The state machine for rexexp pattern parsing is here. 99// The state tables are hand-written in the file regexcst.txt, 100// and converted to the form used here by a perl 101// script regexcst.pl 102// 103//------------------------------------------------------------------------------ 104void RegexCompile::compile( 105 const UnicodeString &pat, // Source pat to be compiled. 106 UParseError &pp, // Error position info 107 UErrorCode &e) // Error Code 108{ 109 fRXPat->fPatternString = new UnicodeString(pat); 110 UText patternText = UTEXT_INITIALIZER; 111 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e); 112 113 if (U_SUCCESS(e)) { 114 compile(&patternText, pp, e); 115 utext_close(&patternText); 116 } 117} 118 119// 120// compile, UText mode 121// All the work is actually done here. 122// 123void RegexCompile::compile( 124 UText *pat, // Source pat to be compiled. 125 UParseError &pp, // Error position info 126 UErrorCode &e) // Error Code 127{ 128 fStatus = &e; 129 fParseErr = &pp; 130 fStackPtr = 0; 131 fStack[fStackPtr] = 0; 132 133 if (U_FAILURE(*fStatus)) { 134 return; 135 } 136 137 // There should be no pattern stuff in the RegexPattern object. They can not be reused. 138 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0); 139 140 // Prepare the RegexPattern object to receive the compiled pattern. 141 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus); 142 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets; 143 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8; 144 145 146 // Initialize the pattern scanning state machine 147 fPatternLength = utext_nativeLength(pat); 148 uint16_t state = 1; 149 const RegexTableEl *tableEl; 150 151 // UREGEX_LITERAL force entire pattern to be treated as a literal string. 152 if (fModeFlags & UREGEX_LITERAL) { 153 fQuoteMode = TRUE; 154 } 155 156 nextChar(fC); // Fetch the first char from the pattern string. 157 158 // 159 // Main loop for the regex pattern parsing state machine. 160 // Runs once per state transition. 161 // Each time through optionally performs, depending on the state table, 162 // - an advance to the the next pattern char 163 // - an action to be performed. 164 // - pushing or popping a state to/from the local state return stack. 165 // file regexcst.txt is the source for the state table. The logic behind 166 // recongizing the pattern syntax is there, not here. 167 // 168 for (;;) { 169 // Bail out if anything has gone wrong. 170 // Regex pattern parsing stops on the first error encountered. 171 if (U_FAILURE(*fStatus)) { 172 break; 173 } 174 175 U_ASSERT(state != 0); 176 177 // Find the state table element that matches the input char from the pattern, or the 178 // class of the input character. Start with the first table row for this 179 // state, then linearly scan forward until we find a row that matches the 180 // character. The last row for each state always matches all characters, so 181 // the search will stop there, if not before. 182 // 183 tableEl = &gRuleParseStateTable[state]; 184 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ", 185 fC.fChar, fLineNum, fCharNum, RegexStateNames[state])); 186 187 for (;;) { // loop through table rows belonging to this state, looking for one 188 // that matches the current input char. 189 REGEX_SCAN_DEBUG_PRINTF((".")); 190 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) { 191 // Table row specified an individual character, not a set, and 192 // the input character is not quoted, and 193 // the input character matched it. 194 break; 195 } 196 if (tableEl->fCharClass == 255) { 197 // Table row specified default, match anything character class. 198 break; 199 } 200 if (tableEl->fCharClass == 254 && fC.fQuoted) { 201 // Table row specified "quoted" and the char was quoted. 202 break; 203 } 204 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) { 205 // Table row specified eof and we hit eof on the input. 206 break; 207 } 208 209 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class && 210 fC.fQuoted == FALSE && // char is not escaped && 211 fC.fChar != (UChar32)-1) { // char is not EOF 212 U_ASSERT(tableEl->fCharClass <= 137); 213 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) { 214 // Table row specified a character class, or set of characters, 215 // and the current char matches it. 216 break; 217 } 218 } 219 220 // No match on this row, advance to the next row for this state, 221 tableEl++; 222 } 223 REGEX_SCAN_DEBUG_PRINTF(("\n")); 224 225 // 226 // We've found the row of the state table that matches the current input 227 // character from the rules string. 228 // Perform any action specified by this row in the state table. 229 if (doParseActions(tableEl->fAction) == FALSE) { 230 // Break out of the state machine loop if the 231 // the action signalled some kind of error, or 232 // the action was to exit, occurs on normal end-of-rules-input. 233 break; 234 } 235 236 if (tableEl->fPushState != 0) { 237 fStackPtr++; 238 if (fStackPtr >= kStackSize) { 239 error(U_REGEX_INTERNAL_ERROR); 240 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n")); 241 fStackPtr--; 242 } 243 fStack[fStackPtr] = tableEl->fPushState; 244 } 245 246 // 247 // NextChar. This is where characters are actually fetched from the pattern. 248 // Happens under control of the 'n' tag in the state table. 249 // 250 if (tableEl->fNextChar) { 251 nextChar(fC); 252 } 253 254 // Get the next state from the table entry, or from the 255 // state stack if the next state was specified as "pop". 256 if (tableEl->fNextState != 255) { 257 state = tableEl->fNextState; 258 } else { 259 state = fStack[fStackPtr]; 260 fStackPtr--; 261 if (fStackPtr < 0) { 262 // state stack underflow 263 // This will occur if the user pattern has mis-matched parentheses, 264 // with extra close parens. 265 // 266 fStackPtr++; 267 error(U_REGEX_MISMATCHED_PAREN); 268 } 269 } 270 271 } 272 273 if (U_FAILURE(*fStatus)) { 274 // Bail out if the pattern had errors. 275 // Set stack cleanup: a successful compile would have left it empty, 276 // but errors can leave temporary sets hanging around. 277 while (!fSetStack.empty()) { 278 delete (UnicodeSet *)fSetStack.pop(); 279 } 280 return; 281 } 282 283 // 284 // The pattern has now been read and processed, and the compiled code generated. 285 // 286 287 // 288 // Compute the number of digits requried for the largest capture group number. 289 // 290 fRXPat->fMaxCaptureDigits = 1; 291 int32_t n = 10; 292 int32_t groupCount = fRXPat->fGroupMap->size(); 293 while (n <= groupCount) { 294 fRXPat->fMaxCaptureDigits++; 295 n *= 10; 296 } 297 298 // 299 // The pattern's fFrameSize so far has accumulated the requirements for 300 // storage for capture parentheses, counters, etc. that are encountered 301 // in the pattern. Add space for the two variables that are always 302 // present in the saved state: the input string position (int64_t) and 303 // the position in the compiled pattern. 304 // 305 fRXPat->fFrameSize+=RESTACKFRAME_HDRCOUNT; 306 307 // 308 // Optimization pass 1: NOPs, back-references, and case-folding 309 // 310 stripNOPs(); 311 312 // 313 // Get bounds for the minimum and maximum length of a string that this 314 // pattern can match. Used to avoid looking for matches in strings that 315 // are too short. 316 // 317 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1); 318 319 // 320 // Optimization pass 2: match start type 321 // 322 matchStartType(); 323 324 // 325 // Set up fast latin-1 range sets 326 // 327 int32_t numSets = fRXPat->fSets->size(); 328 fRXPat->fSets8 = new Regex8BitSet[numSets]; 329 // Null pointer check. 330 if (fRXPat->fSets8 == NULL) { 331 e = *fStatus = U_MEMORY_ALLOCATION_ERROR; 332 return; 333 } 334 int32_t i; 335 for (i=0; i<numSets; i++) { 336 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i); 337 fRXPat->fSets8[i].init(s); 338 } 339 340} 341 342 343 344 345 346//------------------------------------------------------------------------------ 347// 348// doParseAction Do some action during regex pattern parsing. 349// Called by the parse state machine. 350// 351// Generation of the match engine PCode happens here, or 352// in functions called from the parse actions defined here. 353// 354// 355//------------------------------------------------------------------------------ 356UBool RegexCompile::doParseActions(int32_t action) 357{ 358 UBool returnVal = TRUE; 359 360 switch ((Regex_PatternParseAction)action) { 361 362 case doPatStart: 363 // Start of pattern compiles to: 364 //0 SAVE 2 Fall back to position of FAIL 365 //1 jmp 3 366 //2 FAIL Stop if we ever reach here. 367 //3 NOP Dummy, so start of pattern looks the same as 368 // the start of an ( grouping. 369 //4 NOP Resreved, will be replaced by a save if there are 370 // OR | operators at the top level 371 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_STATE_SAVE, 2), *fStatus); 372 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_JMP, 3), *fStatus); 373 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_FAIL, 0), *fStatus); 374 375 // Standard open nonCapture paren action emits the two NOPs and 376 // sets up the paren stack frame. 377 doParseActions(doOpenNonCaptureParen); 378 break; 379 380 case doPatFinish: 381 // We've scanned to the end of the pattern 382 // The end of pattern compiles to: 383 // URX_END 384 // which will stop the runtime match engine. 385 // Encountering end of pattern also behaves like a close paren, 386 // and forces fixups of the State Save at the beginning of the compiled pattern 387 // and of any OR operations at the top level. 388 // 389 handleCloseParen(); 390 if (fParenStack.size() > 0) { 391 // Missing close paren in pattern. 392 error(U_REGEX_MISMATCHED_PAREN); 393 } 394 395 // add the END operation to the compiled pattern. 396 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_END, 0), *fStatus); 397 398 // Terminate the pattern compilation state machine. 399 returnVal = FALSE; 400 break; 401 402 403 404 case doOrOperator: 405 // Scanning a '|', as in (A|B) 406 { 407 // Generate code for any pending literals preceding the '|' 408 fixLiterals(FALSE); 409 410 // Insert a SAVE operation at the start of the pattern section preceding 411 // this OR at this level. This SAVE will branch the match forward 412 // to the right hand side of the OR in the event that the left hand 413 // side fails to match and backtracks. Locate the position for the 414 // save from the location on the top of the parentheses stack. 415 int32_t savePosition = fParenStack.popi(); 416 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition); 417 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location 418 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1); 419 fRXPat->fCompiledPat->setElementAt(op, savePosition); 420 421 // Append an JMP operation into the compiled pattern. The operand for 422 // the JMP will eventually be the location following the ')' for the 423 // group. This will be patched in later, when the ')' is encountered. 424 op = URX_BUILD(URX_JMP, 0); 425 fRXPat->fCompiledPat->addElement(op, *fStatus); 426 427 // Push the position of the newly added JMP op onto the parentheses stack. 428 // This registers if for fixup when this block's close paren is encountered. 429 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 430 431 // Append a NOP to the compiled pattern. This is the slot reserved 432 // for a SAVE in the event that there is yet another '|' following 433 // this one. 434 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 435 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); 436 } 437 break; 438 439 440 case doOpenCaptureParen: 441 // Open Paren. 442 // Compile to a 443 // - NOP, which later may be replaced by a save-state if the 444 // parenthesized group gets a * quantifier, followed by 445 // - START_CAPTURE n where n is stack frame offset to the capture group variables. 446 // - NOP, which may later be replaced by a save-state if there 447 // is an '|' alternation within the parens. 448 // 449 // Each capture group gets three slots in the save stack frame: 450 // 0: Capture Group start position (in input string being matched.) 451 // 1: Capture Group end position. 452 // 2: Start of Match-in-progress. 453 // The first two locations are for a completed capture group, and are 454 // referred to by back references and the like. 455 // The third location stores the capture start position when an START_CAPTURE is 456 // encountered. This will be promoted to a completed capture when (and if) the corresponding 457 // END_CAPTURE is encountered. 458 { 459 fixLiterals(); 460 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 461 int32_t varsLoc = fRXPat->fFrameSize; // Reserve three slots in match stack frame. 462 fRXPat->fFrameSize += 3; 463 int32_t cop = URX_BUILD(URX_START_CAPTURE, varsLoc); 464 fRXPat->fCompiledPat->addElement(cop, *fStatus); 465 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 466 467 // On the Parentheses stack, start a new frame and add the postions 468 // of the two NOPs. Depending on what follows in the pattern, the 469 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 470 // address of the end of the parenthesized group. 471 fParenStack.push(fModeFlags, *fStatus); // Match mode state 472 fParenStack.push(capturing, *fStatus); // Frame type. 473 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location 474 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 475 476 // Save the mapping from group number to stack frame variable position. 477 fRXPat->fGroupMap->addElement(varsLoc, *fStatus); 478 } 479 break; 480 481 case doOpenNonCaptureParen: 482 // Open non-caputuring (grouping only) Paren. 483 // Compile to a 484 // - NOP, which later may be replaced by a save-state if the 485 // parenthesized group gets a * quantifier, followed by 486 // - NOP, which may later be replaced by a save-state if there 487 // is an '|' alternation within the parens. 488 { 489 fixLiterals(); 490 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 491 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 492 493 // On the Parentheses stack, start a new frame and add the postions 494 // of the two NOPs. 495 fParenStack.push(fModeFlags, *fStatus); // Match mode state 496 fParenStack.push(plain, *fStatus); // Begin a new frame. 497 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 498 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc 499 } 500 break; 501 502 503 case doOpenAtomicParen: 504 // Open Atomic Paren. (?> 505 // Compile to a 506 // - NOP, which later may be replaced if the parenthesized group 507 // has a quantifier, followed by 508 // - STO_SP save state stack position, so it can be restored at the ")" 509 // - NOP, which may later be replaced by a save-state if there 510 // is an '|' alternation within the parens. 511 { 512 fixLiterals(); 513 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 514 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 515 fRXPat->fDataSize += 1; // state stack ptr. 516 int32_t stoOp = URX_BUILD(URX_STO_SP, varLoc); 517 fRXPat->fCompiledPat->addElement(stoOp, *fStatus); 518 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 519 520 // On the Parentheses stack, start a new frame and add the postions 521 // of the two NOPs. Depending on what follows in the pattern, the 522 // NOPs may be changed to SAVE_STATE or JMP ops, with a target 523 // address of the end of the parenthesized group. 524 fParenStack.push(fModeFlags, *fStatus); // Match mode state 525 fParenStack.push(atomic, *fStatus); // Frame type. 526 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP 527 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 528 } 529 break; 530 531 532 case doOpenLookAhead: 533 // Positive Look-ahead (?= stuff ) 534 // 535 // Note: Addition of transparent input regions, with the need to 536 // restore the original regions when failing out of a lookahead 537 // block, complicated this sequence. Some conbined opcodes 538 // might make sense - or might not, lookahead aren't that common. 539 // 540 // Caution: min match length optimization knows about this 541 // sequence; don't change without making updates there too. 542 // 543 // Compiles to 544 // 1 START_LA dataLoc Saves SP, Input Pos 545 // 2. STATE_SAVE 4 on failure of lookahead, goto 4 546 // 3 JMP 6 continue ... 547 // 548 // 4. LA_END Look Ahead failed. Restore regions. 549 // 5. BACKTRACK and back track again. 550 // 551 // 6. NOP reserved for use by quantifiers on the block. 552 // Look-ahead can't have quantifiers, but paren stack 553 // compile time conventions require the slot anyhow. 554 // 7. NOP may be replaced if there is are '|' ops in the block. 555 // 8. code for parenthesized stuff. 556 // 9. LA_END 557 // 558 // Two data slots are reserved, for saving the stack ptr and the input position. 559 { 560 fixLiterals(); 561 int32_t dataLoc = fRXPat->fDataSize; 562 fRXPat->fDataSize += 2; 563 int32_t op = URX_BUILD(URX_LA_START, dataLoc); 564 fRXPat->fCompiledPat->addElement(op, *fStatus); 565 566 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2); 567 fRXPat->fCompiledPat->addElement(op, *fStatus); 568 569 op = URX_BUILD(URX_JMP, fRXPat->fCompiledPat->size()+ 3); 570 fRXPat->fCompiledPat->addElement(op, *fStatus); 571 572 op = URX_BUILD(URX_LA_END, dataLoc); 573 fRXPat->fCompiledPat->addElement(op, *fStatus); 574 575 op = URX_BUILD(URX_BACKTRACK, 0); 576 fRXPat->fCompiledPat->addElement(op, *fStatus); 577 578 op = URX_BUILD(URX_NOP, 0); 579 fRXPat->fCompiledPat->addElement(op, *fStatus); 580 fRXPat->fCompiledPat->addElement(op, *fStatus); 581 582 // On the Parentheses stack, start a new frame and add the postions 583 // of the NOPs. 584 fParenStack.push(fModeFlags, *fStatus); // Match mode state 585 fParenStack.push(lookAhead, *fStatus); // Frame type. 586 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 587 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 588 } 589 break; 590 591 case doOpenLookAheadNeg: 592 // Negated Lookahead. (?! stuff ) 593 // Compiles to 594 // 1. START_LA dataloc 595 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state, 596 // // which continues with the match. 597 // 3. NOP // Std. Open Paren sequence, for possible '|' 598 // 4. code for parenthesized stuff. 599 // 5. END_LA // Cut back stack, remove saved state from step 2. 600 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails. 601 // 7. END_LA // Restore match region, in case look-ahead was using 602 // an alternate (transparent) region. 603 { 604 fixLiterals(); 605 int32_t dataLoc = fRXPat->fDataSize; 606 fRXPat->fDataSize += 2; 607 int32_t op = URX_BUILD(URX_LA_START, dataLoc); 608 fRXPat->fCompiledPat->addElement(op, *fStatus); 609 610 op = URX_BUILD(URX_STATE_SAVE, 0); // dest address will be patched later. 611 fRXPat->fCompiledPat->addElement(op, *fStatus); 612 613 op = URX_BUILD(URX_NOP, 0); 614 fRXPat->fCompiledPat->addElement(op, *fStatus); 615 616 // On the Parentheses stack, start a new frame and add the postions 617 // of the StateSave and NOP. 618 fParenStack.push(fModeFlags, *fStatus); // Match mode state 619 fParenStack.push(negLookAhead, *fStatus); // Frame type 620 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location 621 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location 622 623 // Instructions #5 - #7 will be added when the ')' is encountered. 624 } 625 break; 626 627 case doOpenLookBehind: 628 { 629 // Compile a (?<= look-behind open paren. 630 // 631 // Compiles to 632 // 0 URX_LB_START dataLoc 633 // 1 URX_LB_CONT dataLoc 634 // 2 MinMatchLen 635 // 3 MaxMatchLen 636 // 4 URX_NOP Standard '(' boilerplate. 637 // 5 URX_NOP Reserved slot for use with '|' ops within (block). 638 // 6 <code for LookBehind expression> 639 // 7 URX_LB_END dataLoc # Check match len, restore input len 640 // 8 URX_LA_END dataLoc # Restore stack, input pos 641 // 642 // Allocate a block of matcher data, to contain (when running a match) 643 // 0: Stack ptr on entry 644 // 1: Input Index on entry 645 // 2: Start index of match current match attempt. 646 // 3: Original Input String len. 647 648 // Generate match code for any pending literals. 649 fixLiterals(); 650 651 // Allocate data space 652 int32_t dataLoc = fRXPat->fDataSize; 653 fRXPat->fDataSize += 4; 654 655 // Emit URX_LB_START 656 int32_t op = URX_BUILD(URX_LB_START, dataLoc); 657 fRXPat->fCompiledPat->addElement(op, *fStatus); 658 659 // Emit URX_LB_CONT 660 op = URX_BUILD(URX_LB_CONT, dataLoc); 661 fRXPat->fCompiledPat->addElement(op, *fStatus); 662 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 663 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 664 665 // Emit the NOP 666 op = URX_BUILD(URX_NOP, 0); 667 fRXPat->fCompiledPat->addElement(op, *fStatus); 668 fRXPat->fCompiledPat->addElement(op, *fStatus); 669 670 // On the Parentheses stack, start a new frame and add the postions 671 // of the URX_LB_CONT and the NOP. 672 fParenStack.push(fModeFlags, *fStatus); // Match mode state 673 fParenStack.push(lookBehind, *fStatus); // Frame type 674 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 675 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 676 677 // The final two instructions will be added when the ')' is encountered. 678 } 679 680 break; 681 682 case doOpenLookBehindNeg: 683 { 684 // Compile a (?<! negated look-behind open paren. 685 // 686 // Compiles to 687 // 0 URX_LB_START dataLoc # Save entry stack, input len 688 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions 689 // 2 MinMatchLen 690 // 3 MaxMatchLen 691 // 4 continueLoc (9) 692 // 5 URX_NOP Standard '(' boilerplate. 693 // 6 URX_NOP Reserved slot for use with '|' ops within (block). 694 // 7 <code for LookBehind expression> 695 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL 696 // 9 ... 697 // 698 // Allocate a block of matcher data, to contain (when running a match) 699 // 0: Stack ptr on entry 700 // 1: Input Index on entry 701 // 2: Start index of match current match attempt. 702 // 3: Original Input String len. 703 704 // Generate match code for any pending literals. 705 fixLiterals(); 706 707 // Allocate data space 708 int32_t dataLoc = fRXPat->fDataSize; 709 fRXPat->fDataSize += 4; 710 711 // Emit URX_LB_START 712 int32_t op = URX_BUILD(URX_LB_START, dataLoc); 713 fRXPat->fCompiledPat->addElement(op, *fStatus); 714 715 // Emit URX_LBN_CONT 716 op = URX_BUILD(URX_LBN_CONT, dataLoc); 717 fRXPat->fCompiledPat->addElement(op, *fStatus); 718 fRXPat->fCompiledPat->addElement(0, *fStatus); // MinMatchLength. To be filled later. 719 fRXPat->fCompiledPat->addElement(0, *fStatus); // MaxMatchLength. To be filled later. 720 fRXPat->fCompiledPat->addElement(0, *fStatus); // Continue Loc. To be filled later. 721 722 // Emit the NOP 723 op = URX_BUILD(URX_NOP, 0); 724 fRXPat->fCompiledPat->addElement(op, *fStatus); 725 fRXPat->fCompiledPat->addElement(op, *fStatus); 726 727 // On the Parentheses stack, start a new frame and add the postions 728 // of the URX_LB_CONT and the NOP. 729 fParenStack.push(fModeFlags, *fStatus); // Match mode state 730 fParenStack.push(lookBehindN, *fStatus); // Frame type 731 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location 732 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location 733 734 // The final two instructions will be added when the ')' is encountered. 735 } 736 break; 737 738 case doConditionalExpr: 739 // Conditionals such as (?(1)a:b) 740 case doPerlInline: 741 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them. 742 error(U_REGEX_UNIMPLEMENTED); 743 break; 744 745 746 case doCloseParen: 747 handleCloseParen(); 748 if (fParenStack.size() <= 0) { 749 // Extra close paren, or missing open paren. 750 error(U_REGEX_MISMATCHED_PAREN); 751 } 752 break; 753 754 case doNOP: 755 break; 756 757 758 case doBadOpenParenType: 759 case doRuleError: 760 error(U_REGEX_RULE_SYNTAX); 761 break; 762 763 764 case doMismatchedParenErr: 765 error(U_REGEX_MISMATCHED_PAREN); 766 break; 767 768 case doPlus: 769 // Normal '+' compiles to 770 // 1. stuff to be repeated (already built) 771 // 2. jmp-sav 1 772 // 3. ... 773 // 774 // Or, if the item to be repeated can match a zero length string, 775 // 1. STO_INP_LOC data-loc 776 // 2. body of stuff to be repeated 777 // 3. JMP_SAV_X 2 778 // 4. ... 779 780 // 781 // Or, if the item to be repeated is simple 782 // 1. Item to be repeated. 783 // 2. LOOP_SR_I set number (assuming repeated item is a set ref) 784 // 3. LOOP_C stack location 785 { 786 int32_t topLoc = blockTopLoc(FALSE); // location of item #1 787 int32_t frameLoc; 788 789 // Check for simple constructs, which may get special optimized code. 790 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 791 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 792 793 if (URX_TYPE(repeatedOp) == URX_SETREF) { 794 // Emit optimized code for [char set]+ 795 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 796 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 797 frameLoc = fRXPat->fFrameSize; 798 fRXPat->fFrameSize++; 799 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 800 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 801 break; 802 } 803 804 if (URX_TYPE(repeatedOp) == URX_DOTANY || 805 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 806 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 807 // Emit Optimized code for .+ operations. 808 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 809 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 810 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode. 811 loopOpI |= 1; 812 } 813 if (fModeFlags & UREGEX_UNIX_LINES) { 814 loopOpI |= 2; 815 } 816 fRXPat->fCompiledPat->addElement(loopOpI, *fStatus); 817 frameLoc = fRXPat->fFrameSize; 818 fRXPat->fFrameSize++; 819 int32_t loopOpC = URX_BUILD(URX_LOOP_C, frameLoc); 820 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 821 break; 822 } 823 824 } 825 826 // General case. 827 828 // Check for minimum match length of zero, which requires 829 // extra loop-breaking code. 830 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) { 831 // Zero length match is possible. 832 // Emit the code sequence that can handle it. 833 insertOp(topLoc); 834 frameLoc = fRXPat->fFrameSize; 835 fRXPat->fFrameSize++; 836 837 int32_t op = URX_BUILD(URX_STO_INP_LOC, frameLoc); 838 fRXPat->fCompiledPat->setElementAt(op, topLoc); 839 840 op = URX_BUILD(URX_JMP_SAV_X, topLoc+1); 841 fRXPat->fCompiledPat->addElement(op, *fStatus); 842 } else { 843 // Simpler code when the repeated body must match something non-empty 844 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, topLoc); 845 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 846 } 847 } 848 break; 849 850 case doNGPlus: 851 // Non-greedy '+?' compiles to 852 // 1. stuff to be repeated (already built) 853 // 2. state-save 1 854 // 3. ... 855 { 856 int32_t topLoc = blockTopLoc(FALSE); 857 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, topLoc); 858 fRXPat->fCompiledPat->addElement(saveStateOp, *fStatus); 859 } 860 break; 861 862 863 case doOpt: 864 // Normal (greedy) ? quantifier. 865 // Compiles to 866 // 1. state save 3 867 // 2. body of optional block 868 // 3. ... 869 // Insert the state save into the compiled pattern, and we're done. 870 { 871 int32_t saveStateLoc = blockTopLoc(TRUE); 872 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()); 873 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 874 } 875 break; 876 877 case doNGOpt: 878 // Non-greedy ?? quantifier 879 // compiles to 880 // 1. jmp 4 881 // 2. body of optional block 882 // 3 jmp 5 883 // 4. state save 2 884 // 5 ... 885 // This code is less than ideal, with two jmps instead of one, because we can only 886 // insert one instruction at the top of the block being iterated. 887 { 888 int32_t jmp1_loc = blockTopLoc(TRUE); 889 int32_t jmp2_loc = fRXPat->fCompiledPat->size(); 890 891 int32_t jmp1_op = URX_BUILD(URX_JMP, jmp2_loc+1); 892 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc); 893 894 int32_t jmp2_op = URX_BUILD(URX_JMP, jmp2_loc+2); 895 fRXPat->fCompiledPat->addElement(jmp2_op, *fStatus); 896 897 int32_t save_op = URX_BUILD(URX_STATE_SAVE, jmp1_loc+1); 898 fRXPat->fCompiledPat->addElement(save_op, *fStatus); 899 } 900 break; 901 902 903 case doStar: 904 // Normal (greedy) * quantifier. 905 // Compiles to 906 // 1. STATE_SAVE 4 907 // 2. body of stuff being iterated over 908 // 3. JMP_SAV 2 909 // 4. ... 910 // 911 // Or, if the body is a simple [Set], 912 // 1. LOOP_SR_I set number 913 // 2. LOOP_C stack location 914 // ... 915 // 916 // Or if this is a .* 917 // 1. LOOP_DOT_I (. matches all mode flag) 918 // 2. LOOP_C stack location 919 // 920 // Or, if the body can match a zero-length string, to inhibit infinite loops, 921 // 1. STATE_SAVE 5 922 // 2. STO_INP_LOC data-loc 923 // 3. body of stuff 924 // 4. JMP_SAV_X 2 925 // 5. ... 926 { 927 // location of item #1, the STATE_SAVE 928 int32_t topLoc = blockTopLoc(FALSE); 929 int32_t dataLoc = -1; 930 931 // Check for simple *, where the construct being repeated 932 // compiled to single opcode, and might be optimizable. 933 if (topLoc == fRXPat->fCompiledPat->size() - 1) { 934 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc); 935 936 if (URX_TYPE(repeatedOp) == URX_SETREF) { 937 // Emit optimized code for a [char set]* 938 int32_t loopOpI = URX_BUILD(URX_LOOP_SR_I, URX_VAL(repeatedOp)); 939 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 940 dataLoc = fRXPat->fFrameSize; 941 fRXPat->fFrameSize++; 942 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 943 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 944 break; 945 } 946 947 if (URX_TYPE(repeatedOp) == URX_DOTANY || 948 URX_TYPE(repeatedOp) == URX_DOTANY_ALL || 949 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) { 950 // Emit Optimized code for .* operations. 951 int32_t loopOpI = URX_BUILD(URX_LOOP_DOT_I, 0); 952 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) { 953 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode. 954 loopOpI |= 1; 955 } 956 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) { 957 loopOpI |= 2; 958 } 959 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc); 960 dataLoc = fRXPat->fFrameSize; 961 fRXPat->fFrameSize++; 962 int32_t loopOpC = URX_BUILD(URX_LOOP_C, dataLoc); 963 fRXPat->fCompiledPat->addElement(loopOpC, *fStatus); 964 break; 965 } 966 } 967 968 // Emit general case code for this * 969 // The optimizations did not apply. 970 971 int32_t saveStateLoc = blockTopLoc(TRUE); 972 int32_t jmpOp = URX_BUILD(URX_JMP_SAV, saveStateLoc+1); 973 974 // Check for minimum match length of zero, which requires 975 // extra loop-breaking code. 976 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) { 977 insertOp(saveStateLoc); 978 dataLoc = fRXPat->fFrameSize; 979 fRXPat->fFrameSize++; 980 981 int32_t op = URX_BUILD(URX_STO_INP_LOC, dataLoc); 982 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1); 983 jmpOp = URX_BUILD(URX_JMP_SAV_X, saveStateLoc+2); 984 } 985 986 // Locate the position in the compiled pattern where the match will continue 987 // after completing the *. (4 or 5 in the comment above) 988 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 989 990 // Put together the save state op store it into the compiled code. 991 int32_t saveStateOp = URX_BUILD(URX_STATE_SAVE, continueLoc); 992 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc); 993 994 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern. 995 fRXPat->fCompiledPat->addElement(jmpOp, *fStatus); 996 } 997 break; 998 999 case doNGStar: 1000 // Non-greedy *? quantifier 1001 // compiles to 1002 // 1. JMP 3 1003 // 2. body of stuff being iterated over 1004 // 3. STATE_SAVE 2 1005 // 4 ... 1006 { 1007 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1. 1008 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3. 1009 int32_t jmpOp = URX_BUILD(URX_JMP, saveLoc); 1010 int32_t stateSaveOp = URX_BUILD(URX_STATE_SAVE, jmpLoc+1); 1011 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc); 1012 fRXPat->fCompiledPat->addElement(stateSaveOp, *fStatus); 1013 } 1014 break; 1015 1016 1017 case doIntervalInit: 1018 // The '{' opening an interval quantifier was just scanned. 1019 // Init the counter varaiables that will accumulate the values as the digits 1020 // are scanned. 1021 fIntervalLow = 0; 1022 fIntervalUpper = -1; 1023 break; 1024 1025 case doIntevalLowerDigit: 1026 // Scanned a digit from the lower value of an {lower,upper} interval 1027 { 1028 int32_t digitValue = u_charDigitValue(fC.fChar); 1029 U_ASSERT(digitValue >= 0); 1030 fIntervalLow = fIntervalLow*10 + digitValue; 1031 if (fIntervalLow < 0) { 1032 error(U_REGEX_NUMBER_TOO_BIG); 1033 } 1034 } 1035 break; 1036 1037 case doIntervalUpperDigit: 1038 // Scanned a digit from the upper value of an {lower,upper} interval 1039 { 1040 if (fIntervalUpper < 0) { 1041 fIntervalUpper = 0; 1042 } 1043 int32_t digitValue = u_charDigitValue(fC.fChar); 1044 U_ASSERT(digitValue >= 0); 1045 fIntervalUpper = fIntervalUpper*10 + digitValue; 1046 if (fIntervalUpper < 0) { 1047 error(U_REGEX_NUMBER_TOO_BIG); 1048 } 1049 } 1050 break; 1051 1052 case doIntervalSame: 1053 // Scanned a single value interval like {27}. Upper = Lower. 1054 fIntervalUpper = fIntervalLow; 1055 break; 1056 1057 case doInterval: 1058 // Finished scanning a normal {lower,upper} interval. Generate the code for it. 1059 if (compileInlineInterval() == FALSE) { 1060 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1061 } 1062 break; 1063 1064 case doPossessiveInterval: 1065 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it. 1066 { 1067 // Remember the loc for the top of the block being looped over. 1068 // (Can not reserve a slot in the compiled pattern at this time, because 1069 // compileInterval needs to reserve also, and blockTopLoc can only reserve 1070 // once per block.) 1071 int32_t topLoc = blockTopLoc(FALSE); 1072 1073 // Produce normal looping code. 1074 compileInterval(URX_CTR_INIT, URX_CTR_LOOP); 1075 1076 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP 1077 // just as if the loop was inclosed in atomic parentheses. 1078 1079 // First the STO_SP before the start of the loop 1080 insertOp(topLoc); 1081 int32_t varLoc = fRXPat->fDataSize; // Reserve a data location for saving the 1082 fRXPat->fDataSize += 1; // state stack ptr. 1083 int32_t op = URX_BUILD(URX_STO_SP, varLoc); 1084 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1085 1086 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi(); 1087 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc); 1088 loopOp++; // point LoopOp after the just-inserted STO_SP 1089 fRXPat->fCompiledPat->push(loopOp, *fStatus); 1090 1091 // Then the LD_SP after the end of the loop 1092 op = URX_BUILD(URX_LD_SP, varLoc); 1093 fRXPat->fCompiledPat->addElement(op, *fStatus); 1094 } 1095 1096 break; 1097 1098 case doNGInterval: 1099 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it. 1100 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG); 1101 break; 1102 1103 case doIntervalError: 1104 error(U_REGEX_BAD_INTERVAL); 1105 break; 1106 1107 case doLiteralChar: 1108 // We've just scanned a "normal" character from the pattern, 1109 literalChar(fC.fChar); 1110 break; 1111 1112 1113 case doEscapedLiteralChar: 1114 // We've just scanned an backslashed escaped character with no 1115 // special meaning. It represents itself. 1116 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1117 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1118 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1119 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1120 } 1121 literalChar(fC.fChar); 1122 break; 1123 1124 1125 case doDotAny: 1126 // scanned a ".", match any single character. 1127 { 1128 fixLiterals(FALSE); 1129 int32_t op; 1130 if (fModeFlags & UREGEX_DOTALL) { 1131 op = URX_BUILD(URX_DOTANY_ALL, 0); 1132 } else if (fModeFlags & UREGEX_UNIX_LINES) { 1133 op = URX_BUILD(URX_DOTANY_UNIX, 0); 1134 } else { 1135 op = URX_BUILD(URX_DOTANY, 0); 1136 } 1137 fRXPat->fCompiledPat->addElement(op, *fStatus); 1138 } 1139 break; 1140 1141 case doCaret: 1142 { 1143 fixLiterals(FALSE); 1144 int32_t op = 0; 1145 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1146 op = URX_CARET; 1147 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1148 op = URX_CARET_M; 1149 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1150 op = URX_CARET; // Only testing true start of input. 1151 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1152 op = URX_CARET_M_UNIX; 1153 } 1154 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1155 } 1156 break; 1157 1158 case doDollar: 1159 { 1160 fixLiterals(FALSE); 1161 int32_t op = 0; 1162 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1163 op = URX_DOLLAR; 1164 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) { 1165 op = URX_DOLLAR_M; 1166 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1167 op = URX_DOLLAR_D; 1168 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) { 1169 op = URX_DOLLAR_MD; 1170 } 1171 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1172 } 1173 break; 1174 1175 case doBackslashA: 1176 fixLiterals(FALSE); 1177 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_CARET, 0), *fStatus); 1178 break; 1179 1180 case doBackslashB: 1181 { 1182 #if UCONFIG_NO_BREAK_ITERATION==1 1183 if (fModeFlags & UREGEX_UWORD) { 1184 error(U_UNSUPPORTED_ERROR); 1185 } 1186 #endif 1187 fixLiterals(FALSE); 1188 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1189 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 1), *fStatus); 1190 } 1191 break; 1192 1193 case doBackslashb: 1194 { 1195 #if UCONFIG_NO_BREAK_ITERATION==1 1196 if (fModeFlags & UREGEX_UWORD) { 1197 error(U_UNSUPPORTED_ERROR); 1198 } 1199 #endif 1200 fixLiterals(FALSE); 1201 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B; 1202 fRXPat->fCompiledPat->addElement(URX_BUILD(op, 0), *fStatus); 1203 } 1204 break; 1205 1206 case doBackslashD: 1207 fixLiterals(FALSE); 1208 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 1), *fStatus); 1209 break; 1210 1211 case doBackslashd: 1212 fixLiterals(FALSE); 1213 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_D, 0), *fStatus); 1214 break; 1215 1216 case doBackslashG: 1217 fixLiterals(FALSE); 1218 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_G, 0), *fStatus); 1219 break; 1220 1221 case doBackslashS: 1222 fixLiterals(FALSE); 1223 fRXPat->fCompiledPat->addElement( 1224 URX_BUILD(URX_STAT_SETREF_N, URX_ISSPACE_SET), *fStatus); 1225 break; 1226 1227 case doBackslashs: 1228 fixLiterals(FALSE); 1229 fRXPat->fCompiledPat->addElement( 1230 URX_BUILD(URX_STATIC_SETREF, URX_ISSPACE_SET), *fStatus); 1231 break; 1232 1233 case doBackslashW: 1234 fixLiterals(FALSE); 1235 fRXPat->fCompiledPat->addElement( 1236 URX_BUILD(URX_STAT_SETREF_N, URX_ISWORD_SET), *fStatus); 1237 break; 1238 1239 case doBackslashw: 1240 fixLiterals(FALSE); 1241 fRXPat->fCompiledPat->addElement( 1242 URX_BUILD(URX_STATIC_SETREF, URX_ISWORD_SET), *fStatus); 1243 break; 1244 1245 case doBackslashX: 1246 fixLiterals(FALSE); 1247 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_X, 0), *fStatus); 1248 break; 1249 1250 1251 case doBackslashZ: 1252 fixLiterals(FALSE); 1253 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_DOLLAR, 0), *fStatus); 1254 break; 1255 1256 case doBackslashz: 1257 fixLiterals(FALSE); 1258 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKSLASH_Z, 0), *fStatus); 1259 break; 1260 1261 case doEscapeError: 1262 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1263 break; 1264 1265 case doExit: 1266 fixLiterals(FALSE); 1267 returnVal = FALSE; 1268 break; 1269 1270 case doProperty: 1271 { 1272 fixLiterals(FALSE); 1273 UnicodeSet *theSet = scanProp(); 1274 compileSet(theSet); 1275 } 1276 break; 1277 1278 case doNamedChar: 1279 { 1280 UChar32 c = scanNamedChar(); 1281 literalChar(c); 1282 } 1283 break; 1284 1285 1286 case doBackRef: 1287 // BackReference. Somewhat unusual in that the front-end can not completely parse 1288 // the regular expression, because the number of digits to be consumed 1289 // depends on the number of capture groups that have been defined. So 1290 // we have to do it here instead. 1291 { 1292 int32_t numCaptureGroups = fRXPat->fGroupMap->size(); 1293 int32_t groupNum = 0; 1294 UChar32 c = fC.fChar; 1295 1296 for (;;) { 1297 // Loop once per digit, for max allowed number of digits in a back reference. 1298 int32_t digit = u_charDigitValue(c); 1299 groupNum = groupNum * 10 + digit; 1300 if (groupNum >= numCaptureGroups) { 1301 break; 1302 } 1303 c = peekCharLL(); 1304 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) { 1305 break; 1306 } 1307 nextCharLL(); 1308 } 1309 1310 // Scan of the back reference in the source regexp is complete. Now generate 1311 // the compiled code for it. 1312 // Because capture groups can be forward-referenced by back-references, 1313 // we fill the operand with the capture group number. At the end 1314 // of compilation, it will be changed to the variable's location. 1315 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence, 1316 // and shouldn't enter this code path at all. 1317 fixLiterals(FALSE); 1318 int32_t op; 1319 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1320 op = URX_BUILD(URX_BACKREF_I, groupNum); 1321 } else { 1322 op = URX_BUILD(URX_BACKREF, groupNum); 1323 } 1324 fRXPat->fCompiledPat->addElement(op, *fStatus); 1325 } 1326 break; 1327 1328 1329 case doPossessivePlus: 1330 // Possessive ++ quantifier. 1331 // Compiles to 1332 // 1. STO_SP 1333 // 2. body of stuff being iterated over 1334 // 3. STATE_SAVE 5 1335 // 4. JMP 2 1336 // 5. LD_SP 1337 // 6. ... 1338 // 1339 // Note: TODO: This is pretty inefficient. A mass of saved state is built up 1340 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056 1341 // 1342 { 1343 // Emit the STO_SP 1344 int32_t topLoc = blockTopLoc(TRUE); 1345 int32_t stoLoc = fRXPat->fDataSize; 1346 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1347 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1348 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1349 1350 // Emit the STATE_SAVE 1351 op = URX_BUILD(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2); 1352 fRXPat->fCompiledPat->addElement(op, *fStatus); 1353 1354 // Emit the JMP 1355 op = URX_BUILD(URX_JMP, topLoc+1); 1356 fRXPat->fCompiledPat->addElement(op, *fStatus); 1357 1358 // Emit the LD_SP 1359 op = URX_BUILD(URX_LD_SP, stoLoc); 1360 fRXPat->fCompiledPat->addElement(op, *fStatus); 1361 } 1362 break; 1363 1364 case doPossessiveStar: 1365 // Possessive *+ quantifier. 1366 // Compiles to 1367 // 1. STO_SP loc 1368 // 2. STATE_SAVE 5 1369 // 3. body of stuff being iterated over 1370 // 4. JMP 2 1371 // 5. LD_SP loc 1372 // 6 ... 1373 // TODO: do something to cut back the state stack each time through the loop. 1374 { 1375 // Reserve two slots at the top of the block. 1376 int32_t topLoc = blockTopLoc(TRUE); 1377 insertOp(topLoc); 1378 1379 // emit STO_SP loc 1380 int32_t stoLoc = fRXPat->fDataSize; 1381 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1382 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1383 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1384 1385 // Emit the SAVE_STATE 5 1386 int32_t L7 = fRXPat->fCompiledPat->size()+1; 1387 op = URX_BUILD(URX_STATE_SAVE, L7); 1388 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1389 1390 // Append the JMP operation. 1391 op = URX_BUILD(URX_JMP, topLoc+1); 1392 fRXPat->fCompiledPat->addElement(op, *fStatus); 1393 1394 // Emit the LD_SP loc 1395 op = URX_BUILD(URX_LD_SP, stoLoc); 1396 fRXPat->fCompiledPat->addElement(op, *fStatus); 1397 } 1398 break; 1399 1400 case doPossessiveOpt: 1401 // Possessive ?+ quantifier. 1402 // Compiles to 1403 // 1. STO_SP loc 1404 // 2. SAVE_STATE 5 1405 // 3. body of optional block 1406 // 4. LD_SP loc 1407 // 5. ... 1408 // 1409 { 1410 // Reserve two slots at the top of the block. 1411 int32_t topLoc = blockTopLoc(TRUE); 1412 insertOp(topLoc); 1413 1414 // Emit the STO_SP 1415 int32_t stoLoc = fRXPat->fDataSize; 1416 fRXPat->fDataSize++; // Reserve the data location for storing save stack ptr. 1417 int32_t op = URX_BUILD(URX_STO_SP, stoLoc); 1418 fRXPat->fCompiledPat->setElementAt(op, topLoc); 1419 1420 // Emit the SAVE_STATE 1421 int32_t continueLoc = fRXPat->fCompiledPat->size()+1; 1422 op = URX_BUILD(URX_STATE_SAVE, continueLoc); 1423 fRXPat->fCompiledPat->setElementAt(op, topLoc+1); 1424 1425 // Emit the LD_SP 1426 op = URX_BUILD(URX_LD_SP, stoLoc); 1427 fRXPat->fCompiledPat->addElement(op, *fStatus); 1428 } 1429 break; 1430 1431 1432 case doBeginMatchMode: 1433 fNewModeFlags = fModeFlags; 1434 fSetModeFlag = TRUE; 1435 break; 1436 1437 case doMatchMode: // (?i) and similar 1438 { 1439 int32_t bit = 0; 1440 switch (fC.fChar) { 1441 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break; 1442 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break; 1443 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break; 1444 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break; 1445 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break; 1446 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break; 1447 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break; 1448 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break; 1449 default: 1450 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out 1451 // by the scanner. 1452 } 1453 if (fSetModeFlag) { 1454 fNewModeFlags |= bit; 1455 } else { 1456 fNewModeFlags &= ~bit; 1457 } 1458 } 1459 break; 1460 1461 case doSetMatchMode: 1462 // Emit code to match any pending literals, using the not-yet changed match mode. 1463 fixLiterals(); 1464 1465 // We've got a (?i) or similar. The match mode is being changed, but 1466 // the change is not scoped to a parenthesized block. 1467 U_ASSERT(fNewModeFlags < 0); 1468 fModeFlags = fNewModeFlags; 1469 1470 break; 1471 1472 1473 case doMatchModeParen: 1474 // We've got a (?i: or similar. Begin a parenthesized block, save old 1475 // mode flags so they can be restored at the close of the block. 1476 // 1477 // Compile to a 1478 // - NOP, which later may be replaced by a save-state if the 1479 // parenthesized group gets a * quantifier, followed by 1480 // - NOP, which may later be replaced by a save-state if there 1481 // is an '|' alternation within the parens. 1482 { 1483 fixLiterals(FALSE); 1484 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1485 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_NOP, 0), *fStatus); 1486 1487 // On the Parentheses stack, start a new frame and add the postions 1488 // of the two NOPs (a normal non-capturing () frame, except for the 1489 // saving of the orignal mode flags.) 1490 fParenStack.push(fModeFlags, *fStatus); 1491 fParenStack.push(flags, *fStatus); // Frame Marker 1492 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP 1493 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP 1494 1495 // Set the current mode flags to the new values. 1496 U_ASSERT(fNewModeFlags < 0); 1497 fModeFlags = fNewModeFlags; 1498 } 1499 break; 1500 1501 case doBadModeFlag: 1502 error(U_REGEX_INVALID_FLAG); 1503 break; 1504 1505 case doSuppressComments: 1506 // We have just scanned a '(?'. We now need to prevent the character scanner from 1507 // treating a '#' as a to-the-end-of-line comment. 1508 // (This Perl compatibility just gets uglier and uglier to do...) 1509 fEOLComments = FALSE; 1510 break; 1511 1512 1513 case doSetAddAmp: 1514 { 1515 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1516 set->add(chAmp); 1517 } 1518 break; 1519 1520 case doSetAddDash: 1521 { 1522 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1523 set->add(chDash); 1524 } 1525 break; 1526 1527 case doSetBackslash_s: 1528 { 1529 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1530 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1531 break; 1532 } 1533 1534 case doSetBackslash_S: 1535 { 1536 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1537 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]); 1538 SSet.complement(); 1539 set->addAll(SSet); 1540 break; 1541 } 1542 1543 case doSetBackslash_d: 1544 { 1545 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1546 // TODO - make a static set, ticket 6058. 1547 addCategory(set, U_GC_ND_MASK, *fStatus); 1548 break; 1549 } 1550 1551 case doSetBackslash_D: 1552 { 1553 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1554 UnicodeSet digits; 1555 // TODO - make a static set, ticket 6058. 1556 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 1557 digits.complement(); 1558 set->addAll(digits); 1559 break; 1560 } 1561 1562 case doSetBackslash_w: 1563 { 1564 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1565 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1566 break; 1567 } 1568 1569 case doSetBackslash_W: 1570 { 1571 UnicodeSet *set = (UnicodeSet *)fSetStack.peek(); 1572 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]); 1573 SSet.complement(); 1574 set->addAll(SSet); 1575 break; 1576 } 1577 1578 case doSetBegin: 1579 fixLiterals(FALSE); 1580 fSetStack.push(new UnicodeSet(), *fStatus); 1581 fSetOpStack.push(setStart, *fStatus); 1582 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1583 fSetOpStack.push(setCaseClose, *fStatus); 1584 } 1585 break; 1586 1587 case doSetBeginDifference1: 1588 // We have scanned something like [[abc]-[ 1589 // Set up a new UnicodeSet for the set beginning with the just-scanned '[' 1590 // Push a Difference operator, which will cause the new set to be subtracted from what 1591 // went before once it is created. 1592 setPushOp(setDifference1); 1593 fSetOpStack.push(setStart, *fStatus); 1594 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1595 fSetOpStack.push(setCaseClose, *fStatus); 1596 } 1597 break; 1598 1599 case doSetBeginIntersection1: 1600 // We have scanned something like [[abc]&[ 1601 // Need both the '&' operator and the open '[' operator. 1602 setPushOp(setIntersection1); 1603 fSetOpStack.push(setStart, *fStatus); 1604 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1605 fSetOpStack.push(setCaseClose, *fStatus); 1606 } 1607 break; 1608 1609 case doSetBeginUnion: 1610 // We have scanned something like [[abc][ 1611 // Need to handle the union operation explicitly [[abc] | [ 1612 setPushOp(setUnion); 1613 fSetOpStack.push(setStart, *fStatus); 1614 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) { 1615 fSetOpStack.push(setCaseClose, *fStatus); 1616 } 1617 break; 1618 1619 case doSetDifference2: 1620 // We have scanned something like [abc-- 1621 // Consider this to unambiguously be a set difference operator. 1622 setPushOp(setDifference2); 1623 break; 1624 1625 case doSetEnd: 1626 // Have encountered the ']' that closes a set. 1627 // Force the evaluation of any pending operations within this set, 1628 // leave the completed set on the top of the set stack. 1629 setEval(setEnd); 1630 U_ASSERT(fSetOpStack.peeki()==setStart); 1631 fSetOpStack.popi(); 1632 break; 1633 1634 case doSetFinish: 1635 { 1636 // Finished a complete set expression, including all nested sets. 1637 // The close bracket has already triggered clearing out pending set operators, 1638 // the operator stack should be empty and the operand stack should have just 1639 // one entry, the result set. 1640 U_ASSERT(fSetOpStack.empty()); 1641 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop(); 1642 U_ASSERT(fSetStack.empty()); 1643 compileSet(theSet); 1644 break; 1645 } 1646 1647 case doSetIntersection2: 1648 // Have scanned something like [abc&& 1649 setPushOp(setIntersection2); 1650 break; 1651 1652 case doSetLiteral: 1653 // Union the just-scanned literal character into the set being built. 1654 // This operation is the highest precedence set operation, so we can always do 1655 // it immediately, without waiting to see what follows. It is necessary to perform 1656 // any pending '-' or '&' operation first, because these have the same precedence 1657 // as union-ing in a literal' 1658 { 1659 setEval(setUnion); 1660 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1661 s->add(fC.fChar); 1662 fLastSetLiteral = fC.fChar; 1663 break; 1664 } 1665 1666 case doSetLiteralEscaped: 1667 // A back-slash escaped literal character was encountered. 1668 // Processing is the same as with setLiteral, above, with the addition of 1669 // the optional check for errors on escaped ASCII letters. 1670 { 1671 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 && 1672 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z] 1673 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z] 1674 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 1675 } 1676 setEval(setUnion); 1677 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1678 s->add(fC.fChar); 1679 fLastSetLiteral = fC.fChar; 1680 break; 1681 } 1682 1683 case doSetNamedChar: 1684 // Scanning a \N{UNICODE CHARACTER NAME} 1685 // Aside from the source of the character, the processing is identical to doSetLiteral, 1686 // above. 1687 { 1688 UChar32 c = scanNamedChar(); 1689 setEval(setUnion); 1690 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1691 s->add(c); 1692 fLastSetLiteral = c; 1693 break; 1694 } 1695 1696 case doSetNamedRange: 1697 // We have scanned literal-\N{CHAR NAME}. Add the range to the set. 1698 // The left character is already in the set, and is saved in fLastSetLiteral. 1699 // The right side needs to be picked up, the scan is at the 'N'. 1700 // Lower Limit > Upper limit being an error matches both Java 1701 // and ICU UnicodeSet behavior. 1702 { 1703 UChar32 c = scanNamedChar(); 1704 if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) { 1705 error(U_REGEX_INVALID_RANGE); 1706 } 1707 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1708 s->add(fLastSetLiteral, c); 1709 fLastSetLiteral = c; 1710 break; 1711 } 1712 1713 1714 case doSetNegate: 1715 // Scanned a '^' at the start of a set. 1716 // Push the negation operator onto the set op stack. 1717 // A twist for case-insensitive matching: 1718 // the case closure operation must happen _before_ negation. 1719 // But the case closure operation will already be on the stack if it's required. 1720 // This requires checking for case closure, and swapping the stack order 1721 // if it is present. 1722 { 1723 int32_t tosOp = fSetOpStack.peeki(); 1724 if (tosOp == setCaseClose) { 1725 fSetOpStack.popi(); 1726 fSetOpStack.push(setNegation, *fStatus); 1727 fSetOpStack.push(setCaseClose, *fStatus); 1728 } else { 1729 fSetOpStack.push(setNegation, *fStatus); 1730 } 1731 } 1732 break; 1733 1734 case doSetNoCloseError: 1735 error(U_REGEX_MISSING_CLOSE_BRACKET); 1736 break; 1737 1738 case doSetOpError: 1739 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal. 1740 break; 1741 1742 case doSetPosixProp: 1743 { 1744 UnicodeSet *s = scanPosixProp(); 1745 if (s != NULL) { 1746 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1747 tos->addAll(*s); 1748 delete s; 1749 } // else error. scanProp() reported the error status already. 1750 } 1751 break; 1752 1753 case doSetProp: 1754 // Scanned a \p \P within [brackets]. 1755 { 1756 UnicodeSet *s = scanProp(); 1757 if (s != NULL) { 1758 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek(); 1759 tos->addAll(*s); 1760 delete s; 1761 } // else error. scanProp() reported the error status already. 1762 } 1763 break; 1764 1765 1766 case doSetRange: 1767 // We have scanned literal-literal. Add the range to the set. 1768 // The left character is already in the set, and is saved in fLastSetLiteral. 1769 // The right side is the current character. 1770 // Lower Limit > Upper limit being an error matches both Java 1771 // and ICU UnicodeSet behavior. 1772 { 1773 if (fLastSetLiteral > fC.fChar) { 1774 error(U_REGEX_INVALID_RANGE); 1775 } 1776 UnicodeSet *s = (UnicodeSet *)fSetStack.peek(); 1777 s->add(fLastSetLiteral, fC.fChar); 1778 break; 1779 } 1780 1781 default: 1782 U_ASSERT(FALSE); 1783 error(U_REGEX_INTERNAL_ERROR); 1784 break; 1785 } 1786 1787 if (U_FAILURE(*fStatus)) { 1788 returnVal = FALSE; 1789 } 1790 1791 return returnVal; 1792} 1793 1794 1795 1796//------------------------------------------------------------------------------ 1797// 1798// literalChar We've encountered a literal character from the pattern, 1799// or an escape sequence that reduces to a character. 1800// Add it to the string containing all literal chars/strings from 1801// the pattern. 1802// 1803//------------------------------------------------------------------------------ 1804void RegexCompile::literalChar(UChar32 c) { 1805 fLiteralChars.append(c); 1806} 1807 1808 1809//------------------------------------------------------------------------------ 1810// 1811// fixLiterals When compiling something that can follow a literal 1812// string in a pattern, emit the code to match the 1813// accumulated literal string. 1814// 1815// Optionally, split the last char of the string off into 1816// a single "ONE_CHAR" operation, so that quantifiers can 1817// apply to that char alone. Example: abc* 1818// The * must apply to the 'c' only. 1819// 1820//------------------------------------------------------------------------------ 1821void RegexCompile::fixLiterals(UBool split) { 1822 int32_t op = 0; // An op from/for the compiled pattern. 1823 1824 // If no literal characters have been scanned but not yet had code generated 1825 // for them, nothing needs to be done. 1826 if (fLiteralChars.length() == 0) { 1827 return; 1828 } 1829 1830 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1831 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1832 1833 // Split: We need to ensure that the last item in the compiled pattern 1834 // refers only to the last literal scanned in the pattern, so that 1835 // quantifiers (*, +, etc.) affect only it, and not a longer string. 1836 // Split before case folding for case insensitive matches. 1837 1838 if (split) { 1839 fLiteralChars.truncate(indexOfLastCodePoint); 1840 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string. 1841 // Note that the truncated literal string may be empty, in which case 1842 // nothing will be emitted. 1843 1844 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal. 1845 fixLiterals(FALSE); // Second recursive call, code for the final code point. 1846 return; 1847 } 1848 1849 // If we are doing case-insensitive matching, case fold the string. This may expand 1850 // the string, e.g. the German sharp-s turns into "ss" 1851 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1852 fLiteralChars.foldCase(); 1853 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1); 1854 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint); 1855 } 1856 1857 if (indexOfLastCodePoint == 0) { 1858 // Single character, emit a URX_ONECHAR op to match it. 1859 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) && 1860 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) { 1861 op = URX_BUILD(URX_ONECHAR_I, lastCodePoint); 1862 } else { 1863 op = URX_BUILD(URX_ONECHAR, lastCodePoint); 1864 } 1865 fRXPat->fCompiledPat->addElement(op, *fStatus); 1866 } else { 1867 // Two or more chars, emit a URX_STRING to match them. 1868 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 1869 op = URX_BUILD(URX_STRING_I, fRXPat->fLiteralText.length()); 1870 } else { 1871 // TODO here: add optimization to split case sensitive strings of length two 1872 // into two single char ops, for efficiency. 1873 op = URX_BUILD(URX_STRING, fRXPat->fLiteralText.length()); 1874 } 1875 fRXPat->fCompiledPat->addElement(op, *fStatus); 1876 op = URX_BUILD(URX_STRING_LEN, fLiteralChars.length()); 1877 fRXPat->fCompiledPat->addElement(op, *fStatus); 1878 1879 // Add this string into the accumulated strings of the compiled pattern. 1880 fRXPat->fLiteralText.append(fLiteralChars); 1881 } 1882 1883 fLiteralChars.remove(); 1884} 1885 1886 1887 1888 1889 1890 1891//------------------------------------------------------------------------------ 1892// 1893// insertOp() Insert a slot for a new opcode into the already 1894// compiled pattern code. 1895// 1896// Fill the slot with a NOP. Our caller will replace it 1897// with what they really wanted. 1898// 1899//------------------------------------------------------------------------------ 1900void RegexCompile::insertOp(int32_t where) { 1901 UVector64 *code = fRXPat->fCompiledPat; 1902 U_ASSERT(where>0 && where < code->size()); 1903 1904 int32_t nop = URX_BUILD(URX_NOP, 0); 1905 code->insertElementAt(nop, where, *fStatus); 1906 1907 // Walk through the pattern, looking for any ops with targets that 1908 // were moved down by the insert. Fix them. 1909 int32_t loc; 1910 for (loc=0; loc<code->size(); loc++) { 1911 int32_t op = (int32_t)code->elementAti(loc); 1912 int32_t opType = URX_TYPE(op); 1913 int32_t opValue = URX_VAL(op); 1914 if ((opType == URX_JMP || 1915 opType == URX_JMPX || 1916 opType == URX_STATE_SAVE || 1917 opType == URX_CTR_LOOP || 1918 opType == URX_CTR_LOOP_NG || 1919 opType == URX_JMP_SAV || 1920 opType == URX_JMP_SAV_X || 1921 opType == URX_RELOC_OPRND) && opValue > where) { 1922 // Target location for this opcode is after the insertion point and 1923 // needs to be incremented to adjust for the insertion. 1924 opValue++; 1925 op = URX_BUILD(opType, opValue); 1926 code->setElementAt(op, loc); 1927 } 1928 } 1929 1930 // Now fix up the parentheses stack. All positive values in it are locations in 1931 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.) 1932 for (loc=0; loc<fParenStack.size(); loc++) { 1933 int32_t x = fParenStack.elementAti(loc); 1934 U_ASSERT(x < code->size()); 1935 if (x>where) { 1936 x++; 1937 fParenStack.setElementAt(x, loc); 1938 } 1939 } 1940 1941 if (fMatchCloseParen > where) { 1942 fMatchCloseParen++; 1943 } 1944 if (fMatchOpenParen > where) { 1945 fMatchOpenParen++; 1946 } 1947} 1948 1949 1950 1951//------------------------------------------------------------------------------ 1952// 1953// blockTopLoc() Find or create a location in the compiled pattern 1954// at the start of the operation or block that has 1955// just been compiled. Needed when a quantifier (* or 1956// whatever) appears, and we need to add an operation 1957// at the start of the thing being quantified. 1958// 1959// (Parenthesized Blocks) have a slot with a NOP that 1960// is reserved for this purpose. .* or similar don't 1961// and a slot needs to be added. 1962// 1963// parameter reserveLoc : TRUE - ensure that there is space to add an opcode 1964// at the returned location. 1965// FALSE - just return the address, 1966// do not reserve a location there. 1967// 1968//------------------------------------------------------------------------------ 1969int32_t RegexCompile::blockTopLoc(UBool reserveLoc) { 1970 int32_t theLoc; 1971 fixLiterals(TRUE); // Emit code for any pending literals. 1972 // If last item was a string, emit separate op for the its last char. 1973 if (fRXPat->fCompiledPat->size() == fMatchCloseParen) 1974 { 1975 // The item just processed is a parenthesized block. 1976 theLoc = fMatchOpenParen; // A slot is already reserved for us. 1977 U_ASSERT(theLoc > 0); 1978 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP); 1979 } 1980 else { 1981 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference. 1982 // No slot for STATE_SAVE was pre-reserved in the compiled code. 1983 // We need to make space now. 1984 theLoc = fRXPat->fCompiledPat->size()-1; 1985 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc); 1986 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) { 1987 // Strings take two opcode, we want the position of the first one. 1988 // We can have a string at this point if a single character case-folded to two. 1989 theLoc--; 1990 } 1991 if (reserveLoc) { 1992 int32_t nop = URX_BUILD(URX_NOP, 0); 1993 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus); 1994 } 1995 } 1996 return theLoc; 1997} 1998 1999 2000 2001//------------------------------------------------------------------------------ 2002// 2003// handleCloseParen When compiling a close paren, we need to go back 2004// and fix up any JMP or SAVE operations within the 2005// parenthesized block that need to target the end 2006// of the block. The locations of these are kept on 2007// the paretheses stack. 2008// 2009// This function is called both when encountering a 2010// real ) and at the end of the pattern. 2011// 2012//------------------------------------------------------------------------------ 2013void RegexCompile::handleCloseParen() { 2014 int32_t patIdx; 2015 int32_t patOp; 2016 if (fParenStack.size() <= 0) { 2017 error(U_REGEX_MISMATCHED_PAREN); 2018 return; 2019 } 2020 2021 // Emit code for any pending literals. 2022 fixLiterals(FALSE); 2023 2024 // Fixup any operations within the just-closed parenthesized group 2025 // that need to reference the end of the (block). 2026 // (The first one popped from the stack is an unused slot for 2027 // alternation (OR) state save, but applying the fixup to it does no harm.) 2028 for (;;) { 2029 patIdx = fParenStack.popi(); 2030 if (patIdx < 0) { 2031 // value < 0 flags the start of the frame on the paren stack. 2032 break; 2033 } 2034 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size()); 2035 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx); 2036 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set. 2037 patOp |= fRXPat->fCompiledPat->size(); // Set it now. 2038 fRXPat->fCompiledPat->setElementAt(patOp, patIdx); 2039 fMatchOpenParen = patIdx; 2040 } 2041 2042 // At the close of any parenthesized block, restore the match mode flags to 2043 // the value they had at the open paren. Saved value is 2044 // at the top of the paren stack. 2045 fModeFlags = fParenStack.popi(); 2046 U_ASSERT(fModeFlags < 0); 2047 2048 // DO any additional fixups, depending on the specific kind of 2049 // parentesized grouping this is 2050 2051 switch (patIdx) { 2052 case plain: 2053 case flags: 2054 // No additional fixups required. 2055 // (Grouping-only parentheses) 2056 break; 2057 case capturing: 2058 // Capturing Parentheses. 2059 // Insert a End Capture op into the pattern. 2060 // The frame offset of the variables for this cg is obtained from the 2061 // start capture op and put it into the end-capture op. 2062 { 2063 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2064 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE); 2065 2066 int32_t frameVarLocation = URX_VAL(captureOp); 2067 int32_t endCaptureOp = URX_BUILD(URX_END_CAPTURE, frameVarLocation); 2068 fRXPat->fCompiledPat->addElement(endCaptureOp, *fStatus); 2069 } 2070 break; 2071 case atomic: 2072 // Atomic Parenthesis. 2073 // Insert a LD_SP operation to restore the state stack to the position 2074 // it was when the atomic parens were entered. 2075 { 2076 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1); 2077 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP); 2078 int32_t stoLoc = URX_VAL(stoOp); 2079 int32_t ldOp = URX_BUILD(URX_LD_SP, stoLoc); 2080 fRXPat->fCompiledPat->addElement(ldOp, *fStatus); 2081 } 2082 break; 2083 2084 case lookAhead: 2085 { 2086 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2087 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2088 int32_t dataLoc = URX_VAL(startOp); 2089 int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2090 fRXPat->fCompiledPat->addElement(op, *fStatus); 2091 } 2092 break; 2093 2094 case negLookAhead: 2095 { 2096 // See comment at doOpenLookAheadNeg 2097 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1); 2098 U_ASSERT(URX_TYPE(startOp) == URX_LA_START); 2099 int32_t dataLoc = URX_VAL(startOp); 2100 int32_t op = URX_BUILD(URX_LA_END, dataLoc); 2101 fRXPat->fCompiledPat->addElement(op, *fStatus); 2102 op = URX_BUILD(URX_BACKTRACK, 0); 2103 fRXPat->fCompiledPat->addElement(op, *fStatus); 2104 op = URX_BUILD(URX_LA_END, dataLoc); 2105 fRXPat->fCompiledPat->addElement(op, *fStatus); 2106 2107 // Patch the URX_SAVE near the top of the block. 2108 // The destination of the SAVE is the final LA_END that was just added. 2109 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen); 2110 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE); 2111 int32_t dest = fRXPat->fCompiledPat->size()-1; 2112 saveOp = URX_BUILD(URX_STATE_SAVE, dest); 2113 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen); 2114 } 2115 break; 2116 2117 case lookBehind: 2118 { 2119 // See comment at doOpenLookBehind. 2120 2121 // Append the URX_LB_END and URX_LA_END to the compiled pattern. 2122 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4); 2123 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2124 int32_t dataLoc = URX_VAL(startOp); 2125 int32_t op = URX_BUILD(URX_LB_END, dataLoc); 2126 fRXPat->fCompiledPat->addElement(op, *fStatus); 2127 op = URX_BUILD(URX_LA_END, dataLoc); 2128 fRXPat->fCompiledPat->addElement(op, *fStatus); 2129 2130 // Determine the min and max bounds for the length of the 2131 // string that the pattern can match. 2132 // An unbounded upper limit is an error. 2133 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2134 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2135 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2136 if (maxML == INT32_MAX) { 2137 error(U_REGEX_LOOK_BEHIND_LIMIT); 2138 break; 2139 } 2140 U_ASSERT(minML <= maxML); 2141 2142 // Insert the min and max match len bounds into the URX_LB_CONT op that 2143 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2144 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2); 2145 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1); 2146 2147 } 2148 break; 2149 2150 2151 2152 case lookBehindN: 2153 { 2154 // See comment at doOpenLookBehindNeg. 2155 2156 // Append the URX_LBN_END to the compiled pattern. 2157 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5); 2158 U_ASSERT(URX_TYPE(startOp) == URX_LB_START); 2159 int32_t dataLoc = URX_VAL(startOp); 2160 int32_t op = URX_BUILD(URX_LBN_END, dataLoc); 2161 fRXPat->fCompiledPat->addElement(op, *fStatus); 2162 2163 // Determine the min and max bounds for the length of the 2164 // string that the pattern can match. 2165 // An unbounded upper limit is an error. 2166 int32_t patEnd = fRXPat->fCompiledPat->size() - 1; 2167 int32_t minML = minMatchLength(fMatchOpenParen, patEnd); 2168 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd); 2169 if (maxML == INT32_MAX) { 2170 error(U_REGEX_LOOK_BEHIND_LIMIT); 2171 break; 2172 } 2173 U_ASSERT(minML <= maxML); 2174 2175 // Insert the min and max match len bounds into the URX_LB_CONT op that 2176 // appears at the top of the look-behind block, at location fMatchOpenParen+1 2177 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3); 2178 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2); 2179 2180 // Insert the pattern location to continue at after a successful match 2181 // as the last operand of the URX_LBN_CONT 2182 op = URX_BUILD(URX_RELOC_OPRND, fRXPat->fCompiledPat->size()); 2183 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1); 2184 } 2185 break; 2186 2187 2188 2189 default: 2190 U_ASSERT(FALSE); 2191 } 2192 2193 // remember the next location in the compiled pattern. 2194 // The compilation of Quantifiers will look at this to see whether its looping 2195 // over a parenthesized block or a single item 2196 fMatchCloseParen = fRXPat->fCompiledPat->size(); 2197} 2198 2199 2200 2201//------------------------------------------------------------------------------ 2202// 2203// compileSet Compile the pattern operations for a reference to a 2204// UnicodeSet. 2205// 2206//------------------------------------------------------------------------------ 2207void RegexCompile::compileSet(UnicodeSet *theSet) 2208{ 2209 if (theSet == NULL) { 2210 return; 2211 } 2212 // Remove any strings from the set. 2213 // There shoudn't be any, but just in case. 2214 // (Case Closure can add them; if we had a simple case closure avaialble that 2215 // ignored strings, that would be better.) 2216 theSet->removeAllStrings(); 2217 int32_t setSize = theSet->size(); 2218 2219 switch (setSize) { 2220 case 0: 2221 { 2222 // Set of no elements. Always fails to match. 2223 fRXPat->fCompiledPat->addElement(URX_BUILD(URX_BACKTRACK, 0), *fStatus); 2224 delete theSet; 2225 } 2226 break; 2227 2228 case 1: 2229 { 2230 // The set contains only a single code point. Put it into 2231 // the compiled pattern as a single char operation rather 2232 // than a set, and discard the set itself. 2233 literalChar(theSet->charAt(0)); 2234 delete theSet; 2235 } 2236 break; 2237 2238 default: 2239 { 2240 // The set contains two or more chars. (the normal case) 2241 // Put it into the compiled pattern as a set. 2242 int32_t setNumber = fRXPat->fSets->size(); 2243 fRXPat->fSets->addElement(theSet, *fStatus); 2244 int32_t setOp = URX_BUILD(URX_SETREF, setNumber); 2245 fRXPat->fCompiledPat->addElement(setOp, *fStatus); 2246 } 2247 } 2248} 2249 2250 2251//------------------------------------------------------------------------------ 2252// 2253// compileInterval Generate the code for a {min, max} style interval quantifier. 2254// Except for the specific opcodes used, the code is the same 2255// for all three types (greedy, non-greedy, possessive) of 2256// intervals. The opcodes are supplied as parameters. 2257// (There are two sets of opcodes - greedy & possessive use the 2258// same ones, while non-greedy has it's own.) 2259// 2260// The code for interval loops has this form: 2261// 0 CTR_INIT counter loc (in stack frame) 2262// 1 5 patt address of CTR_LOOP at bottom of block 2263// 2 min count 2264// 3 max count (-1 for unbounded) 2265// 4 ... block to be iterated over 2266// 5 CTR_LOOP 2267// 2268// In 2269//------------------------------------------------------------------------------ 2270void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp) 2271{ 2272 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes 2273 // four slots in the compiled code. Reserve them. 2274 int32_t topOfBlock = blockTopLoc(TRUE); 2275 insertOp(topOfBlock); 2276 insertOp(topOfBlock); 2277 insertOp(topOfBlock); 2278 2279 // The operands for the CTR_INIT opcode include the index in the matcher data 2280 // of the counter. Allocate it now. There are two data items 2281 // counterLoc --> Loop counter 2282 // +1 --> Input index (for breaking non-progressing loops) 2283 // (Only present if unbounded upper limit on loop) 2284 int32_t counterLoc = fRXPat->fFrameSize; 2285 fRXPat->fFrameSize++; 2286 if (fIntervalUpper < 0) { 2287 fRXPat->fFrameSize++; 2288 } 2289 2290 int32_t op = URX_BUILD(InitOp, counterLoc); 2291 fRXPat->fCompiledPat->setElementAt(op, topOfBlock); 2292 2293 // The second operand of CTR_INIT is the location following the end of the loop. 2294 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the 2295 // compilation of something later on causes the code to grow and the target 2296 // position to move. 2297 int32_t loopEnd = fRXPat->fCompiledPat->size(); 2298 op = URX_BUILD(URX_RELOC_OPRND, loopEnd); 2299 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1); 2300 2301 // Followed by the min and max counts. 2302 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2); 2303 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3); 2304 2305 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op. 2306 // Goes at end of the block being looped over, so just append to the code so far. 2307 op = URX_BUILD(LoopOp, topOfBlock); 2308 fRXPat->fCompiledPat->addElement(op, *fStatus); 2309 2310 if ((fIntervalLow & 0xff000000) != 0 || 2311 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) { 2312 error(U_REGEX_NUMBER_TOO_BIG); 2313 } 2314 2315 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) { 2316 error(U_REGEX_MAX_LT_MIN); 2317 } 2318} 2319 2320 2321 2322UBool RegexCompile::compileInlineInterval() { 2323 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) { 2324 // Too big to inline. Fail, which will cause looping code to be generated. 2325 // (Upper < Lower picks up unbounded upper and errors, both.) 2326 return FALSE; 2327 } 2328 2329 int32_t topOfBlock = blockTopLoc(FALSE); 2330 if (fIntervalUpper == 0) { 2331 // Pathological case. Attempt no matches, as if the block doesn't exist. 2332 fRXPat->fCompiledPat->setSize(topOfBlock); 2333 return TRUE; 2334 } 2335 2336 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) { 2337 // The thing being repeated is not a single op, but some 2338 // more complex block. Do it as a loop, not inlines. 2339 // Note that things "repeated" a max of once are handled as inline, because 2340 // the one copy of the code already generated is just fine. 2341 return FALSE; 2342 } 2343 2344 // Pick up the opcode that is to be repeated 2345 // 2346 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock); 2347 2348 // Compute the pattern location where the inline sequence 2349 // will end, and set up the state save op that will be needed. 2350 // 2351 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1 2352 + fIntervalUpper + (fIntervalUpper-fIntervalLow); 2353 int32_t saveOp = URX_BUILD(URX_STATE_SAVE, endOfSequenceLoc); 2354 if (fIntervalLow == 0) { 2355 insertOp(topOfBlock); 2356 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock); 2357 } 2358 2359 2360 2361 // Loop, emitting the op for the thing being repeated each time. 2362 // Loop starts at 1 because one instance of the op already exists in the pattern, 2363 // it was put there when it was originally encountered. 2364 int32_t i; 2365 for (i=1; i<fIntervalUpper; i++ ) { 2366 if (i == fIntervalLow) { 2367 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 2368 } 2369 if (i > fIntervalLow) { 2370 fRXPat->fCompiledPat->addElement(saveOp, *fStatus); 2371 } 2372 fRXPat->fCompiledPat->addElement(op, *fStatus); 2373 } 2374 return TRUE; 2375} 2376 2377 2378 2379//------------------------------------------------------------------------------ 2380// 2381// matchStartType Determine how a match can start. 2382// Used to optimize find() operations. 2383// 2384// Operation is very similar to minMatchLength(). Walk the compiled 2385// pattern, keeping an on-going minimum-match-length. For any 2386// op where the min match coming in is zero, add that ops possible 2387// starting matches to the possible starts for the overall pattern. 2388// 2389//------------------------------------------------------------------------------ 2390void RegexCompile::matchStartType() { 2391 if (U_FAILURE(*fStatus)) { 2392 return; 2393 } 2394 2395 2396 int32_t loc; // Location in the pattern of the current op being processed. 2397 int32_t op; // The op being processed 2398 int32_t opType; // The opcode type of the op 2399 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern 2400 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start. 2401 2402 UBool atStart = TRUE; // True if no part of the pattern yet encountered 2403 // could have advanced the position in a match. 2404 // (Maximum match length so far == 0) 2405 2406 // forwardedLength is a vector holding minimum-match-length values that 2407 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2408 // It must be one longer than the pattern being checked because some ops 2409 // will jmp to a end-of-block+1 location from within a block, and we must 2410 // count those when checking the block. 2411 int32_t end = fRXPat->fCompiledPat->size(); 2412 UVector32 forwardedLength(end+1, *fStatus); 2413 forwardedLength.setSize(end+1); 2414 for (loc=3; loc<end; loc++) { 2415 forwardedLength.setElementAt(INT32_MAX, loc); 2416 } 2417 2418 for (loc = 3; loc<end; loc++) { 2419 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2420 opType = URX_TYPE(op); 2421 2422 // The loop is advancing linearly through the pattern. 2423 // If the op we are now at was the destination of a branch in the pattern, 2424 // and that path has a shorter minimum length than the current accumulated value, 2425 // replace the current accumulated value. 2426 if (forwardedLength.elementAti(loc) < currentLen) { 2427 currentLen = forwardedLength.elementAti(loc); 2428 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2429 } 2430 2431 switch (opType) { 2432 // Ops that don't change the total length matched 2433 case URX_RESERVED_OP: 2434 case URX_END: 2435 case URX_FAIL: 2436 case URX_STRING_LEN: 2437 case URX_NOP: 2438 case URX_START_CAPTURE: 2439 case URX_END_CAPTURE: 2440 case URX_BACKSLASH_B: 2441 case URX_BACKSLASH_BU: 2442 case URX_BACKSLASH_G: 2443 case URX_BACKSLASH_Z: 2444 case URX_DOLLAR: 2445 case URX_DOLLAR_M: 2446 case URX_DOLLAR_D: 2447 case URX_DOLLAR_MD: 2448 case URX_RELOC_OPRND: 2449 case URX_STO_INP_LOC: 2450 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2451 case URX_BACKREF_I: 2452 2453 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2454 case URX_LD_SP: 2455 break; 2456 2457 case URX_CARET: 2458 if (atStart) { 2459 fRXPat->fStartType = START_START; 2460 } 2461 break; 2462 2463 case URX_CARET_M: 2464 case URX_CARET_M_UNIX: 2465 if (atStart) { 2466 fRXPat->fStartType = START_LINE; 2467 } 2468 break; 2469 2470 case URX_ONECHAR: 2471 if (currentLen == 0) { 2472 // This character could appear at the start of a match. 2473 // Add it to the set of possible starting characters. 2474 fRXPat->fInitialChars->add(URX_VAL(op)); 2475 numInitialStrings += 2; 2476 } 2477 currentLen++; 2478 atStart = FALSE; 2479 break; 2480 2481 2482 case URX_SETREF: 2483 if (currentLen == 0) { 2484 int32_t sn = URX_VAL(op); 2485 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2486 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2487 fRXPat->fInitialChars->addAll(*s); 2488 numInitialStrings += 2; 2489 } 2490 currentLen++; 2491 atStart = FALSE; 2492 break; 2493 2494 case URX_LOOP_SR_I: 2495 // [Set]*, like a SETREF, above, in what it can match, 2496 // but may not match at all, so currentLen is not incremented. 2497 if (currentLen == 0) { 2498 int32_t sn = URX_VAL(op); 2499 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size()); 2500 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn); 2501 fRXPat->fInitialChars->addAll(*s); 2502 numInitialStrings += 2; 2503 } 2504 atStart = FALSE; 2505 break; 2506 2507 case URX_LOOP_DOT_I: 2508 if (currentLen == 0) { 2509 // .* at the start of a pattern. 2510 // Any character can begin the match. 2511 fRXPat->fInitialChars->clear(); 2512 fRXPat->fInitialChars->complement(); 2513 numInitialStrings += 2; 2514 } 2515 atStart = FALSE; 2516 break; 2517 2518 2519 case URX_STATIC_SETREF: 2520 if (currentLen == 0) { 2521 int32_t sn = URX_VAL(op); 2522 U_ASSERT(sn>0 && sn<URX_LAST_SET); 2523 const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2524 fRXPat->fInitialChars->addAll(*s); 2525 numInitialStrings += 2; 2526 } 2527 currentLen++; 2528 atStart = FALSE; 2529 break; 2530 2531 2532 2533 case URX_STAT_SETREF_N: 2534 if (currentLen == 0) { 2535 int32_t sn = URX_VAL(op); 2536 const UnicodeSet *s = fRXPat->fStaticSets[sn]; 2537 UnicodeSet sc(*s); 2538 sc.complement(); 2539 fRXPat->fInitialChars->addAll(sc); 2540 numInitialStrings += 2; 2541 } 2542 currentLen++; 2543 atStart = FALSE; 2544 break; 2545 2546 2547 2548 case URX_BACKSLASH_D: 2549 // Digit Char 2550 if (currentLen == 0) { 2551 UnicodeSet s; 2552 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus); 2553 if (URX_VAL(op) != 0) { 2554 s.complement(); 2555 } 2556 fRXPat->fInitialChars->addAll(s); 2557 numInitialStrings += 2; 2558 } 2559 currentLen++; 2560 atStart = FALSE; 2561 break; 2562 2563 2564 case URX_ONECHAR_I: 2565 // Case Insensitive Single Character. 2566 if (currentLen == 0) { 2567 UChar32 c = URX_VAL(op); 2568 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) { 2569 2570 // Disable optimizations on first char of match. 2571 // TODO: Compute the set of chars that case fold to this char, or to 2572 // a string that begins with this char. 2573 // For simple case folding, this code worked: 2574 // UnicodeSet s(c, c); 2575 // s.closeOver(USET_CASE_INSENSITIVE); 2576 // fRXPat->fInitialChars->addAll(s); 2577 2578 fRXPat->fInitialChars->clear(); 2579 fRXPat->fInitialChars->complement(); 2580 } else { 2581 // Char has no case variants. Just add it as-is to the 2582 // set of possible starting chars. 2583 fRXPat->fInitialChars->add(c); 2584 } 2585 numInitialStrings += 2; 2586 } 2587 currentLen++; 2588 atStart = FALSE; 2589 break; 2590 2591 2592 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 2593 case URX_DOTANY_ALL: // . matches one or two. 2594 case URX_DOTANY: 2595 case URX_DOTANY_UNIX: 2596 if (currentLen == 0) { 2597 // These constructs are all bad news when they appear at the start 2598 // of a match. Any character can begin the match. 2599 fRXPat->fInitialChars->clear(); 2600 fRXPat->fInitialChars->complement(); 2601 numInitialStrings += 2; 2602 } 2603 currentLen++; 2604 atStart = FALSE; 2605 break; 2606 2607 2608 case URX_JMPX: 2609 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP. 2610 case URX_JMP: 2611 { 2612 int32_t jmpDest = URX_VAL(op); 2613 if (jmpDest < loc) { 2614 // Loop of some kind. Can safely ignore, the worst that will happen 2615 // is that we understate the true minimum length 2616 currentLen = forwardedLength.elementAti(loc+1); 2617 2618 } else { 2619 // Forward jump. Propagate the current min length to the target loc of the jump. 2620 U_ASSERT(jmpDest <= end+1); 2621 if (forwardedLength.elementAti(jmpDest) > currentLen) { 2622 forwardedLength.setElementAt(currentLen, jmpDest); 2623 } 2624 } 2625 } 2626 atStart = FALSE; 2627 break; 2628 2629 case URX_JMP_SAV: 2630 case URX_JMP_SAV_X: 2631 // Combo of state save to the next loc, + jmp backwards. 2632 // Net effect on min. length computation is nothing. 2633 atStart = FALSE; 2634 break; 2635 2636 case URX_BACKTRACK: 2637 // Fails are kind of like a branch, except that the min length was 2638 // propagated already, by the state save. 2639 currentLen = forwardedLength.elementAti(loc+1); 2640 atStart = FALSE; 2641 break; 2642 2643 2644 case URX_STATE_SAVE: 2645 { 2646 // State Save, for forward jumps, propagate the current minimum. 2647 // of the state save. 2648 int32_t jmpDest = URX_VAL(op); 2649 if (jmpDest > loc) { 2650 if (currentLen < forwardedLength.elementAti(jmpDest)) { 2651 forwardedLength.setElementAt(currentLen, jmpDest); 2652 } 2653 } 2654 } 2655 atStart = FALSE; 2656 break; 2657 2658 2659 2660 2661 case URX_STRING: 2662 { 2663 loc++; 2664 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2665 int32_t stringLen = URX_VAL(stringLenOp); 2666 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2667 U_ASSERT(stringLenOp >= 2); 2668 if (currentLen == 0) { 2669 // Add the starting character of this string to the set of possible starting 2670 // characters for this pattern. 2671 int32_t stringStartIdx = URX_VAL(op); 2672 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2673 fRXPat->fInitialChars->add(c); 2674 2675 // Remember this string. After the entire pattern has been checked, 2676 // if nothing else is identified that can start a match, we'll use it. 2677 numInitialStrings++; 2678 fRXPat->fInitialStringIdx = stringStartIdx; 2679 fRXPat->fInitialStringLen = stringLen; 2680 } 2681 2682 currentLen += stringLen; 2683 atStart = FALSE; 2684 } 2685 break; 2686 2687 case URX_STRING_I: 2688 { 2689 // Case-insensitive string. Unlike exact-match strings, we won't 2690 // attempt a string search for possible match positions. But we 2691 // do update the set of possible starting characters. 2692 loc++; 2693 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2694 int32_t stringLen = URX_VAL(stringLenOp); 2695 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN); 2696 U_ASSERT(stringLenOp >= 2); 2697 if (currentLen == 0) { 2698 // Add the starting character of this string to the set of possible starting 2699 // characters for this pattern. 2700 int32_t stringStartIdx = URX_VAL(op); 2701 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx); 2702 UnicodeSet s(c, c); 2703 2704 // TODO: compute correct set of starting chars for full case folding. 2705 // For the moment, say any char can start. 2706 // s.closeOver(USET_CASE_INSENSITIVE); 2707 s.clear(); 2708 s.complement(); 2709 2710 fRXPat->fInitialChars->addAll(s); 2711 numInitialStrings += 2; // Matching on an initial string not possible. 2712 } 2713 currentLen += stringLen; 2714 atStart = FALSE; 2715 } 2716 break; 2717 2718 case URX_CTR_INIT: 2719 case URX_CTR_INIT_NG: 2720 { 2721 // Loop Init Ops. These don't change the min length, but they are 4 word ops 2722 // so location must be updated accordingly. 2723 // Loop Init Ops. 2724 // If the min loop count == 0 2725 // move loc forwards to the end of the loop, skipping over the body. 2726 // If the min count is > 0, 2727 // continue normal processing of the body of the loop. 2728 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 2729 loopEndLoc = URX_VAL(loopEndLoc); 2730 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 2731 if (minLoopCount == 0) { 2732 // Min Loop Count of 0, treat like a forward branch and 2733 // move the current minimum length up to the target 2734 // (end of loop) location. 2735 U_ASSERT(loopEndLoc <= end+1); 2736 if (forwardedLength.elementAti(loopEndLoc) > currentLen) { 2737 forwardedLength.setElementAt(currentLen, loopEndLoc); 2738 } 2739 } 2740 loc+=3; // Skips over operands of CTR_INIT 2741 } 2742 atStart = FALSE; 2743 break; 2744 2745 2746 case URX_CTR_LOOP: 2747 case URX_CTR_LOOP_NG: 2748 // Loop ops. 2749 // The jump is conditional, backwards only. 2750 atStart = FALSE; 2751 break; 2752 2753 case URX_LOOP_C: 2754 // More loop ops. These state-save to themselves. 2755 // don't change the minimum match 2756 atStart = FALSE; 2757 break; 2758 2759 2760 case URX_LA_START: 2761 case URX_LB_START: 2762 { 2763 // Look-around. Scan forward until the matching look-ahead end, 2764 // without processing the look-around block. This is overly pessimistic. 2765 2766 // Keep track of the nesting depth of look-around blocks. Boilerplate code for 2767 // lookahead contains two LA_END instructions, so count goes up by two 2768 // for each LA_START. 2769 int32_t depth = (opType == URX_LA_START? 2: 1); 2770 for (;;) { 2771 loc++; 2772 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2773 if (URX_TYPE(op) == URX_LA_START) { 2774 depth+=2; 2775 } 2776 if (URX_TYPE(op) == URX_LB_START) { 2777 depth++; 2778 } 2779 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 2780 depth--; 2781 if (depth == 0) { 2782 break; 2783 } 2784 } 2785 if (URX_TYPE(op) == URX_STATE_SAVE) { 2786 // Need this because neg lookahead blocks will FAIL to outside 2787 // of the block. 2788 int32_t jmpDest = URX_VAL(op); 2789 if (jmpDest > loc) { 2790 if (currentLen < forwardedLength.elementAti(jmpDest)) { 2791 forwardedLength.setElementAt(currentLen, jmpDest); 2792 } 2793 } 2794 } 2795 U_ASSERT(loc <= end); 2796 } 2797 } 2798 break; 2799 2800 case URX_LA_END: 2801 case URX_LB_CONT: 2802 case URX_LB_END: 2803 case URX_LBN_CONT: 2804 case URX_LBN_END: 2805 U_ASSERT(FALSE); // Shouldn't get here. These ops should be 2806 // consumed by the scan in URX_LA_START and LB_START 2807 2808 break; 2809 2810 default: 2811 U_ASSERT(FALSE); 2812 } 2813 2814 } 2815 2816 2817 // We have finished walking through the ops. Check whether some forward jump 2818 // propagated a shorter length to location end+1. 2819 if (forwardedLength.elementAti(end+1) < currentLen) { 2820 currentLen = forwardedLength.elementAti(end+1); 2821 } 2822 2823 2824 fRXPat->fInitialChars8->init(fRXPat->fInitialChars); 2825 2826 2827 // Sort out what we should check for when looking for candidate match start positions. 2828 // In order of preference, 2829 // 1. Start of input text buffer. 2830 // 2. A literal string. 2831 // 3. Start of line in multi-line mode. 2832 // 4. A single literal character. 2833 // 5. A character from a set of characters. 2834 // 2835 if (fRXPat->fStartType == START_START) { 2836 // Match only at the start of an input text string. 2837 // start type is already set. We're done. 2838 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) { 2839 // Match beginning only with a literal string. 2840 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx); 2841 U_ASSERT(fRXPat->fInitialChars->contains(c)); 2842 fRXPat->fStartType = START_STRING; 2843 fRXPat->fInitialChar = c; 2844 } else if (fRXPat->fStartType == START_LINE) { 2845 // Match at start of line in Multi-Line mode. 2846 // Nothing to do here; everything is already set. 2847 } else if (fRXPat->fMinMatchLen == 0) { 2848 // Zero length match possible. We could start anywhere. 2849 fRXPat->fStartType = START_NO_INFO; 2850 } else if (fRXPat->fInitialChars->size() == 1) { 2851 // All matches begin with the same char. 2852 fRXPat->fStartType = START_CHAR; 2853 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0); 2854 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1); 2855 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE && 2856 fRXPat->fMinMatchLen > 0) { 2857 // Matches start with a set of character smaller than the set of all chars. 2858 fRXPat->fStartType = START_SET; 2859 } else { 2860 // Matches can start with anything 2861 fRXPat->fStartType = START_NO_INFO; 2862 } 2863 2864 return; 2865} 2866 2867 2868 2869//------------------------------------------------------------------------------ 2870// 2871// minMatchLength Calculate the length of the shortest string that could 2872// match the specified pattern. 2873// Length is in 16 bit code units, not code points. 2874// 2875// The calculated length may not be exact. The returned 2876// value may be shorter than the actual minimum; it must 2877// never be longer. 2878// 2879// start and end are the range of p-code operations to be 2880// examined. The endpoints are included in the range. 2881// 2882//------------------------------------------------------------------------------ 2883int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) { 2884 if (U_FAILURE(*fStatus)) { 2885 return 0; 2886 } 2887 2888 U_ASSERT(start <= end); 2889 U_ASSERT(end < fRXPat->fCompiledPat->size()); 2890 2891 2892 int32_t loc; 2893 int32_t op; 2894 int32_t opType; 2895 int32_t currentLen = 0; 2896 2897 2898 // forwardedLength is a vector holding minimum-match-length values that 2899 // are propagated forward in the pattern by JMP or STATE_SAVE operations. 2900 // It must be one longer than the pattern being checked because some ops 2901 // will jmp to a end-of-block+1 location from within a block, and we must 2902 // count those when checking the block. 2903 UVector32 forwardedLength(end+2, *fStatus); 2904 forwardedLength.setSize(end+2); 2905 for (loc=start; loc<=end+1; loc++) { 2906 forwardedLength.setElementAt(INT32_MAX, loc); 2907 } 2908 2909 for (loc = start; loc<=end; loc++) { 2910 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 2911 opType = URX_TYPE(op); 2912 2913 // The loop is advancing linearly through the pattern. 2914 // If the op we are now at was the destination of a branch in the pattern, 2915 // and that path has a shorter minimum length than the current accumulated value, 2916 // replace the current accumulated value. 2917 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some 2918 // no-match-possible cases. 2919 if (forwardedLength.elementAti(loc) < currentLen) { 2920 currentLen = forwardedLength.elementAti(loc); 2921 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 2922 } 2923 2924 switch (opType) { 2925 // Ops that don't change the total length matched 2926 case URX_RESERVED_OP: 2927 case URX_END: 2928 case URX_STRING_LEN: 2929 case URX_NOP: 2930 case URX_START_CAPTURE: 2931 case URX_END_CAPTURE: 2932 case URX_BACKSLASH_B: 2933 case URX_BACKSLASH_BU: 2934 case URX_BACKSLASH_G: 2935 case URX_BACKSLASH_Z: 2936 case URX_CARET: 2937 case URX_DOLLAR: 2938 case URX_DOLLAR_M: 2939 case URX_DOLLAR_D: 2940 case URX_DOLLAR_MD: 2941 case URX_RELOC_OPRND: 2942 case URX_STO_INP_LOC: 2943 case URX_CARET_M: 2944 case URX_CARET_M_UNIX: 2945 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 2946 case URX_BACKREF_I: 2947 2948 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 2949 case URX_LD_SP: 2950 2951 case URX_JMP_SAV: 2952 case URX_JMP_SAV_X: 2953 break; 2954 2955 2956 // Ops that match a minimum of one character (one or two 16 bit code units.) 2957 // 2958 case URX_ONECHAR: 2959 case URX_STATIC_SETREF: 2960 case URX_STAT_SETREF_N: 2961 case URX_SETREF: 2962 case URX_BACKSLASH_D: 2963 case URX_ONECHAR_I: 2964 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 2965 case URX_DOTANY_ALL: // . matches one or two. 2966 case URX_DOTANY: 2967 case URX_DOTANY_UNIX: 2968 currentLen++; 2969 break; 2970 2971 2972 case URX_JMPX: 2973 loc++; // URX_JMPX has an extra operand, ignored here, 2974 // otherwise processed identically to URX_JMP. 2975 case URX_JMP: 2976 { 2977 int32_t jmpDest = URX_VAL(op); 2978 if (jmpDest < loc) { 2979 // Loop of some kind. Can safely ignore, the worst that will happen 2980 // is that we understate the true minimum length 2981 currentLen = forwardedLength.elementAti(loc+1); 2982 } else { 2983 // Forward jump. Propagate the current min length to the target loc of the jump. 2984 U_ASSERT(jmpDest <= end+1); 2985 if (forwardedLength.elementAti(jmpDest) > currentLen) { 2986 forwardedLength.setElementAt(currentLen, jmpDest); 2987 } 2988 } 2989 } 2990 break; 2991 2992 case URX_BACKTRACK: 2993 { 2994 // Back-tracks are kind of like a branch, except that the min length was 2995 // propagated already, by the state save. 2996 currentLen = forwardedLength.elementAti(loc+1); 2997 } 2998 break; 2999 3000 3001 case URX_STATE_SAVE: 3002 { 3003 // State Save, for forward jumps, propagate the current minimum. 3004 // of the state save. 3005 int32_t jmpDest = URX_VAL(op); 3006 if (jmpDest > loc) { 3007 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3008 forwardedLength.setElementAt(currentLen, jmpDest); 3009 } 3010 } 3011 } 3012 break; 3013 3014 3015 case URX_STRING: 3016 { 3017 loc++; 3018 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3019 currentLen += URX_VAL(stringLenOp); 3020 } 3021 break; 3022 3023 3024 case URX_STRING_I: 3025 { 3026 loc++; 3027 // TODO: with full case folding, matching input text may be shorter than 3028 // the string we have here. More smarts could put some bounds on it. 3029 // Assume a min length of one for now. A min length of zero causes 3030 // optimization failures for a pattern like "string"+ 3031 // currentLen += URX_VAL(stringLenOp); 3032 currentLen += 1; 3033 } 3034 break; 3035 3036 case URX_CTR_INIT: 3037 case URX_CTR_INIT_NG: 3038 { 3039 // Loop Init Ops. 3040 // If the min loop count == 0 3041 // move loc forwards to the end of the loop, skipping over the body. 3042 // If the min count is > 0, 3043 // continue normal processing of the body of the loop. 3044 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1); 3045 loopEndLoc = URX_VAL(loopEndLoc); 3046 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2); 3047 if (minLoopCount == 0) { 3048 loc = loopEndLoc; 3049 } else { 3050 loc+=3; // Skips over operands of CTR_INIT 3051 } 3052 } 3053 break; 3054 3055 3056 case URX_CTR_LOOP: 3057 case URX_CTR_LOOP_NG: 3058 // Loop ops. 3059 // The jump is conditional, backwards only. 3060 break; 3061 3062 case URX_LOOP_SR_I: 3063 case URX_LOOP_DOT_I: 3064 case URX_LOOP_C: 3065 // More loop ops. These state-save to themselves. 3066 // don't change the minimum match - could match nothing at all. 3067 break; 3068 3069 3070 case URX_LA_START: 3071 case URX_LB_START: 3072 { 3073 // Look-around. Scan forward until the matching look-ahead end, 3074 // without processing the look-around block. This is overly pessimistic for look-ahead, 3075 // it assumes that the look-ahead match might be zero-length. 3076 // TODO: Positive lookahead could recursively do the block, then continue 3077 // with the longer of the block or the value coming in. Ticket 6060 3078 int32_t depth = (opType == URX_LA_START? 2: 1);; 3079 for (;;) { 3080 loc++; 3081 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3082 if (URX_TYPE(op) == URX_LA_START) { 3083 // The boilerplate for look-ahead includes two LA_END insturctions, 3084 // Depth will be decremented by each one when it is seen. 3085 depth += 2; 3086 } 3087 if (URX_TYPE(op) == URX_LB_START) { 3088 depth++; 3089 } 3090 if (URX_TYPE(op) == URX_LA_END) { 3091 depth--; 3092 if (depth == 0) { 3093 break; 3094 } 3095 } 3096 if (URX_TYPE(op)==URX_LBN_END) { 3097 depth--; 3098 if (depth == 0) { 3099 break; 3100 } 3101 } 3102 if (URX_TYPE(op) == URX_STATE_SAVE) { 3103 // Need this because neg lookahead blocks will FAIL to outside 3104 // of the block. 3105 int32_t jmpDest = URX_VAL(op); 3106 if (jmpDest > loc) { 3107 if (currentLen < forwardedLength.elementAti(jmpDest)) { 3108 forwardedLength.setElementAt(currentLen, jmpDest); 3109 } 3110 } 3111 } 3112 U_ASSERT(loc <= end); 3113 } 3114 } 3115 break; 3116 3117 case URX_LA_END: 3118 case URX_LB_CONT: 3119 case URX_LB_END: 3120 case URX_LBN_CONT: 3121 case URX_LBN_END: 3122 // Only come here if the matching URX_LA_START or URX_LB_START was not in the 3123 // range being sized, which happens when measuring size of look-behind blocks. 3124 break; 3125 3126 default: 3127 U_ASSERT(FALSE); 3128 } 3129 3130 } 3131 3132 // We have finished walking through the ops. Check whether some forward jump 3133 // propagated a shorter length to location end+1. 3134 if (forwardedLength.elementAti(end+1) < currentLen) { 3135 currentLen = forwardedLength.elementAti(end+1); 3136 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); 3137 } 3138 3139 return currentLen; 3140} 3141 3142// Increment with overflow check. 3143// val and delta will both be positive. 3144 3145static int32_t safeIncrement(int32_t val, int32_t delta) { 3146 if (INT32_MAX - val > delta) { 3147 return val + delta; 3148 } else { 3149 return INT32_MAX; 3150 } 3151} 3152 3153 3154//------------------------------------------------------------------------------ 3155// 3156// maxMatchLength Calculate the length of the longest string that could 3157// match the specified pattern. 3158// Length is in 16 bit code units, not code points. 3159// 3160// The calculated length may not be exact. The returned 3161// value may be longer than the actual maximum; it must 3162// never be shorter. 3163// 3164//------------------------------------------------------------------------------ 3165int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) { 3166 if (U_FAILURE(*fStatus)) { 3167 return 0; 3168 } 3169 U_ASSERT(start <= end); 3170 U_ASSERT(end < fRXPat->fCompiledPat->size()); 3171 3172 3173 int32_t loc; 3174 int32_t op; 3175 int32_t opType; 3176 int32_t currentLen = 0; 3177 UVector32 forwardedLength(end+1, *fStatus); 3178 forwardedLength.setSize(end+1); 3179 3180 for (loc=start; loc<=end; loc++) { 3181 forwardedLength.setElementAt(0, loc); 3182 } 3183 3184 for (loc = start; loc<=end; loc++) { 3185 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3186 opType = URX_TYPE(op); 3187 3188 // The loop is advancing linearly through the pattern. 3189 // If the op we are now at was the destination of a branch in the pattern, 3190 // and that path has a longer maximum length than the current accumulated value, 3191 // replace the current accumulated value. 3192 if (forwardedLength.elementAti(loc) > currentLen) { 3193 currentLen = forwardedLength.elementAti(loc); 3194 } 3195 3196 switch (opType) { 3197 // Ops that don't change the total length matched 3198 case URX_RESERVED_OP: 3199 case URX_END: 3200 case URX_STRING_LEN: 3201 case URX_NOP: 3202 case URX_START_CAPTURE: 3203 case URX_END_CAPTURE: 3204 case URX_BACKSLASH_B: 3205 case URX_BACKSLASH_BU: 3206 case URX_BACKSLASH_G: 3207 case URX_BACKSLASH_Z: 3208 case URX_CARET: 3209 case URX_DOLLAR: 3210 case URX_DOLLAR_M: 3211 case URX_DOLLAR_D: 3212 case URX_DOLLAR_MD: 3213 case URX_RELOC_OPRND: 3214 case URX_STO_INP_LOC: 3215 case URX_CARET_M: 3216 case URX_CARET_M_UNIX: 3217 3218 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match. 3219 case URX_LD_SP: 3220 3221 case URX_LB_END: 3222 case URX_LB_CONT: 3223 case URX_LBN_CONT: 3224 case URX_LBN_END: 3225 break; 3226 3227 3228 // Ops that increase that cause an unbounded increase in the length 3229 // of a matched string, or that increase it a hard to characterize way. 3230 // Call the max length unbounded, and stop further checking. 3231 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match 3232 case URX_BACKREF_I: 3233 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded. 3234 currentLen = INT32_MAX; 3235 break; 3236 3237 3238 // Ops that match a max of one character (possibly two 16 bit code units.) 3239 // 3240 case URX_STATIC_SETREF: 3241 case URX_STAT_SETREF_N: 3242 case URX_SETREF: 3243 case URX_BACKSLASH_D: 3244 case URX_ONECHAR_I: 3245 case URX_DOTANY_ALL: 3246 case URX_DOTANY: 3247 case URX_DOTANY_UNIX: 3248 currentLen = safeIncrement(currentLen, 2); 3249 break; 3250 3251 // Single literal character. Increase current max length by one or two, 3252 // depending on whether the char is in the supplementary range. 3253 case URX_ONECHAR: 3254 currentLen = safeIncrement(currentLen, 1); 3255 if (URX_VAL(op) > 0x10000) { 3256 currentLen = safeIncrement(currentLen, 1); 3257 } 3258 break; 3259 3260 // Jumps. 3261 // 3262 case URX_JMP: 3263 case URX_JMPX: 3264 case URX_JMP_SAV: 3265 case URX_JMP_SAV_X: 3266 { 3267 int32_t jmpDest = URX_VAL(op); 3268 if (jmpDest < loc) { 3269 // Loop of some kind. Max match length is unbounded. 3270 currentLen = INT32_MAX; 3271 } else { 3272 // Forward jump. Propagate the current min length to the target loc of the jump. 3273 if (forwardedLength.elementAti(jmpDest) < currentLen) { 3274 forwardedLength.setElementAt(currentLen, jmpDest); 3275 } 3276 currentLen = 0; 3277 } 3278 } 3279 break; 3280 3281 case URX_BACKTRACK: 3282 // back-tracks are kind of like a branch, except that the max length was 3283 // propagated already, by the state save. 3284 currentLen = forwardedLength.elementAti(loc+1); 3285 break; 3286 3287 3288 case URX_STATE_SAVE: 3289 { 3290 // State Save, for forward jumps, propagate the current minimum. 3291 // of the state save. 3292 // For backwards jumps, they create a loop, maximum 3293 // match length is unbounded. 3294 int32_t jmpDest = URX_VAL(op); 3295 if (jmpDest > loc) { 3296 if (currentLen > forwardedLength.elementAti(jmpDest)) { 3297 forwardedLength.setElementAt(currentLen, jmpDest); 3298 } 3299 } else { 3300 currentLen = INT32_MAX; 3301 } 3302 } 3303 break; 3304 3305 3306 3307 3308 case URX_STRING: 3309 { 3310 loc++; 3311 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3312 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3313 break; 3314 } 3315 3316 case URX_STRING_I: 3317 // TODO: This code assumes that any user string that matches will be no longer 3318 // than our compiled string, with case insensitive matching. 3319 // Our compiled string has been case-folded already. 3320 // 3321 // Any matching user string will have no more code points than our 3322 // compiled (folded) string. Folding may add code points, but 3323 // not remove them. 3324 // 3325 // There is a potential problem if a supplemental code point 3326 // case-folds to a BMP code point. In this case our compiled string 3327 // could be shorter (in code units) than a matching user string. 3328 // 3329 // At this time (Unicode 6.1) there are no such characters, and this case 3330 // is not being handled. A test, intltest regex/Bug9283, will fail if 3331 // any problematic characters are added to Unicode. 3332 // 3333 // If this happens, we can make a set of the BMP chars that the 3334 // troublesome supplementals fold to, scan our string, and bump the 3335 // currentLen one extra for each that is found. 3336 // 3337 { 3338 loc++; 3339 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3340 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp)); 3341 } 3342 break; 3343 3344 case URX_CTR_INIT: 3345 case URX_CTR_INIT_NG: 3346 // For Loops, recursively call this function on the pattern for the loop body, 3347 // then multiply the result by the maximum loop count. 3348 { 3349 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1)); 3350 if (loopEndLoc == loc+4) { 3351 // Loop has an empty body. No affect on max match length. 3352 // Continue processing with code after the loop end. 3353 loc = loopEndLoc; 3354 break; 3355 } 3356 3357 int32_t maxLoopCount = fRXPat->fCompiledPat->elementAti(loc+3); 3358 if (maxLoopCount == -1) { 3359 // Unbounded Loop. No upper bound on match length. 3360 currentLen = INT32_MAX; 3361 break; 3362 } 3363 3364 U_ASSERT(loopEndLoc >= loc+4); 3365 int32_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call. 3366 if (blockLen == INT32_MAX) { 3367 currentLen = blockLen; 3368 break; 3369 } 3370 currentLen += blockLen * maxLoopCount; 3371 loc = loopEndLoc; 3372 break; 3373 } 3374 3375 case URX_CTR_LOOP: 3376 case URX_CTR_LOOP_NG: 3377 // These opcodes will be skipped over by code for URX_CRT_INIT. 3378 // We shouldn't encounter them here. 3379 U_ASSERT(FALSE); 3380 break; 3381 3382 case URX_LOOP_SR_I: 3383 case URX_LOOP_DOT_I: 3384 case URX_LOOP_C: 3385 // For anything to do with loops, make the match length unbounded. 3386 currentLen = INT32_MAX; 3387 break; 3388 3389 3390 3391 case URX_LA_START: 3392 case URX_LA_END: 3393 // Look-ahead. Just ignore, treat the look-ahead block as if 3394 // it were normal pattern. Gives a too-long match length, 3395 // but good enough for now. 3396 break; 3397 3398 // End of look-ahead ops should always be consumed by the processing at 3399 // the URX_LA_START op. 3400 // U_ASSERT(FALSE); 3401 // break; 3402 3403 case URX_LB_START: 3404 { 3405 // Look-behind. Scan forward until the matching look-around end, 3406 // without processing the look-behind block. 3407 int32_t depth = 0; 3408 for (;;) { 3409 loc++; 3410 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3411 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) { 3412 depth++; 3413 } 3414 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) { 3415 if (depth == 0) { 3416 break; 3417 } 3418 depth--; 3419 } 3420 U_ASSERT(loc < end); 3421 } 3422 } 3423 break; 3424 3425 default: 3426 U_ASSERT(FALSE); 3427 } 3428 3429 3430 if (currentLen == INT32_MAX) { 3431 // The maximum length is unbounded. 3432 // Stop further processing of the pattern. 3433 break; 3434 } 3435 3436 } 3437 return currentLen; 3438 3439} 3440 3441 3442//------------------------------------------------------------------------------ 3443// 3444// stripNOPs Remove any NOP operations from the compiled pattern code. 3445// Extra NOPs are inserted for some constructs during the initial 3446// code generation to provide locations that may be patched later. 3447// Many end up unneeded, and are removed by this function. 3448// 3449// In order to minimize the number of passes through the pattern, 3450// back-reference fixup is also performed here (adjusting 3451// back-reference operands to point to the correct frame offsets). 3452// 3453//------------------------------------------------------------------------------ 3454void RegexCompile::stripNOPs() { 3455 3456 if (U_FAILURE(*fStatus)) { 3457 return; 3458 } 3459 3460 int32_t end = fRXPat->fCompiledPat->size(); 3461 UVector32 deltas(end, *fStatus); 3462 3463 // Make a first pass over the code, computing the amount that things 3464 // will be offset at each location in the original code. 3465 int32_t loc; 3466 int32_t d = 0; 3467 for (loc=0; loc<end; loc++) { 3468 deltas.addElement(d, *fStatus); 3469 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc); 3470 if (URX_TYPE(op) == URX_NOP) { 3471 d++; 3472 } 3473 } 3474 3475 UnicodeString caseStringBuffer; 3476 3477 // Make a second pass over the code, removing the NOPs by moving following 3478 // code up, and patching operands that refer to code locations that 3479 // are being moved. The array of offsets from the first step is used 3480 // to compute the new operand values. 3481 int32_t src; 3482 int32_t dst = 0; 3483 for (src=0; src<end; src++) { 3484 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src); 3485 int32_t opType = URX_TYPE(op); 3486 switch (opType) { 3487 case URX_NOP: 3488 break; 3489 3490 case URX_STATE_SAVE: 3491 case URX_JMP: 3492 case URX_CTR_LOOP: 3493 case URX_CTR_LOOP_NG: 3494 case URX_RELOC_OPRND: 3495 case URX_JMPX: 3496 case URX_JMP_SAV: 3497 case URX_JMP_SAV_X: 3498 // These are instructions with operands that refer to code locations. 3499 { 3500 int32_t operandAddress = URX_VAL(op); 3501 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size()); 3502 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress); 3503 op = URX_BUILD(opType, fixedOperandAddress); 3504 fRXPat->fCompiledPat->setElementAt(op, dst); 3505 dst++; 3506 break; 3507 } 3508 3509 case URX_BACKREF: 3510 case URX_BACKREF_I: 3511 { 3512 int32_t where = URX_VAL(op); 3513 if (where > fRXPat->fGroupMap->size()) { 3514 error(U_REGEX_INVALID_BACK_REF); 3515 break; 3516 } 3517 where = fRXPat->fGroupMap->elementAti(where-1); 3518 op = URX_BUILD(opType, where); 3519 fRXPat->fCompiledPat->setElementAt(op, dst); 3520 dst++; 3521 3522 fRXPat->fNeedsAltInput = TRUE; 3523 break; 3524 } 3525 case URX_RESERVED_OP: 3526 case URX_RESERVED_OP_N: 3527 case URX_BACKTRACK: 3528 case URX_END: 3529 case URX_ONECHAR: 3530 case URX_STRING: 3531 case URX_STRING_LEN: 3532 case URX_START_CAPTURE: 3533 case URX_END_CAPTURE: 3534 case URX_STATIC_SETREF: 3535 case URX_STAT_SETREF_N: 3536 case URX_SETREF: 3537 case URX_DOTANY: 3538 case URX_FAIL: 3539 case URX_BACKSLASH_B: 3540 case URX_BACKSLASH_BU: 3541 case URX_BACKSLASH_G: 3542 case URX_BACKSLASH_X: 3543 case URX_BACKSLASH_Z: 3544 case URX_DOTANY_ALL: 3545 case URX_BACKSLASH_D: 3546 case URX_CARET: 3547 case URX_DOLLAR: 3548 case URX_CTR_INIT: 3549 case URX_CTR_INIT_NG: 3550 case URX_DOTANY_UNIX: 3551 case URX_STO_SP: 3552 case URX_LD_SP: 3553 case URX_STO_INP_LOC: 3554 case URX_LA_START: 3555 case URX_LA_END: 3556 case URX_ONECHAR_I: 3557 case URX_STRING_I: 3558 case URX_DOLLAR_M: 3559 case URX_CARET_M: 3560 case URX_CARET_M_UNIX: 3561 case URX_LB_START: 3562 case URX_LB_CONT: 3563 case URX_LB_END: 3564 case URX_LBN_CONT: 3565 case URX_LBN_END: 3566 case URX_LOOP_SR_I: 3567 case URX_LOOP_DOT_I: 3568 case URX_LOOP_C: 3569 case URX_DOLLAR_D: 3570 case URX_DOLLAR_MD: 3571 // These instructions are unaltered by the relocation. 3572 fRXPat->fCompiledPat->setElementAt(op, dst); 3573 dst++; 3574 break; 3575 3576 default: 3577 // Some op is unaccounted for. 3578 U_ASSERT(FALSE); 3579 error(U_REGEX_INTERNAL_ERROR); 3580 } 3581 } 3582 3583 fRXPat->fCompiledPat->setSize(dst); 3584} 3585 3586 3587 3588 3589//------------------------------------------------------------------------------ 3590// 3591// Error Report a rule parse error. 3592// Only report it if no previous error has been recorded. 3593// 3594//------------------------------------------------------------------------------ 3595void RegexCompile::error(UErrorCode e) { 3596 if (U_SUCCESS(*fStatus)) { 3597 *fStatus = e; 3598 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public 3599 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are 3600 // int64_t. If the values of the latter are out of range for the former, 3601 // set them to the appropriate "field not supported" values. 3602 if (fLineNum > 0x7FFFFFFF) { 3603 fParseErr->line = 0; 3604 fParseErr->offset = -1; 3605 } else if (fCharNum > 0x7FFFFFFF) { 3606 fParseErr->line = (int32_t)fLineNum; 3607 fParseErr->offset = -1; 3608 } else { 3609 fParseErr->line = (int32_t)fLineNum; 3610 fParseErr->offset = (int32_t)fCharNum; 3611 } 3612 3613 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context 3614 3615 // Fill in the context. 3616 // Note: extractBetween() pins supplied indicies to the string bounds. 3617 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext)); 3618 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext)); 3619 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status); 3620 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status); 3621 } 3622} 3623 3624 3625// 3626// Assorted Unicode character constants. 3627// Numeric because there is no portable way to enter them as literals. 3628// (Think EBCDIC). 3629// 3630static const UChar chCR = 0x0d; // New lines, for terminating comments. 3631static const UChar chLF = 0x0a; // Line Feed 3632static const UChar chPound = 0x23; // '#', introduces a comment. 3633static const UChar chDigit0 = 0x30; // '0' 3634static const UChar chDigit7 = 0x37; // '9' 3635static const UChar chColon = 0x3A; // ':' 3636static const UChar chE = 0x45; // 'E' 3637static const UChar chQ = 0x51; // 'Q' 3638//static const UChar chN = 0x4E; // 'N' 3639static const UChar chP = 0x50; // 'P' 3640static const UChar chBackSlash = 0x5c; // '\' introduces a char escape 3641//static const UChar chLBracket = 0x5b; // '[' 3642static const UChar chRBracket = 0x5d; // ']' 3643static const UChar chUp = 0x5e; // '^' 3644static const UChar chLowerP = 0x70; 3645static const UChar chLBrace = 0x7b; // '{' 3646static const UChar chRBrace = 0x7d; // '}' 3647static const UChar chNEL = 0x85; // NEL newline variant 3648static const UChar chLS = 0x2028; // Unicode Line Separator 3649 3650 3651//------------------------------------------------------------------------------ 3652// 3653// nextCharLL Low Level Next Char from the regex pattern. 3654// Get a char from the string, keep track of input position 3655// for error reporting. 3656// 3657//------------------------------------------------------------------------------ 3658UChar32 RegexCompile::nextCharLL() { 3659 UChar32 ch; 3660 3661 if (fPeekChar != -1) { 3662 ch = fPeekChar; 3663 fPeekChar = -1; 3664 return ch; 3665 } 3666 3667 // assume we're already in the right place 3668 ch = UTEXT_NEXT32(fRXPat->fPattern); 3669 if (ch == U_SENTINEL) { 3670 return ch; 3671 } 3672 3673 if (ch == chCR || 3674 ch == chNEL || 3675 ch == chLS || 3676 (ch == chLF && fLastChar != chCR)) { 3677 // Character is starting a new line. Bump up the line number, and 3678 // reset the column to 0. 3679 fLineNum++; 3680 fCharNum=0; 3681 } 3682 else { 3683 // Character is not starting a new line. Except in the case of a 3684 // LF following a CR, increment the column position. 3685 if (ch != chLF) { 3686 fCharNum++; 3687 } 3688 } 3689 fLastChar = ch; 3690 return ch; 3691} 3692 3693//------------------------------------------------------------------------------ 3694// 3695// peekCharLL Low Level Character Scanning, sneak a peek at the next 3696// character without actually getting it. 3697// 3698//------------------------------------------------------------------------------ 3699UChar32 RegexCompile::peekCharLL() { 3700 if (fPeekChar == -1) { 3701 fPeekChar = nextCharLL(); 3702 } 3703 return fPeekChar; 3704} 3705 3706 3707//------------------------------------------------------------------------------ 3708// 3709// nextChar for pattern scanning. At this level, we handle stripping 3710// out comments and processing some backslash character escapes. 3711// The rest of the pattern grammar is handled at the next level up. 3712// 3713//------------------------------------------------------------------------------ 3714void RegexCompile::nextChar(RegexPatternChar &c) { 3715 3716 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3717 c.fChar = nextCharLL(); 3718 c.fQuoted = FALSE; 3719 3720 if (fQuoteMode) { 3721 c.fQuoted = TRUE; 3722 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) || 3723 c.fChar == (UChar32)-1) { 3724 fQuoteMode = FALSE; // Exit quote mode, 3725 nextCharLL(); // discard the E 3726 nextChar(c); // recurse to get the real next char 3727 } 3728 } 3729 else if (fInBackslashQuote) { 3730 // The current character immediately follows a '\' 3731 // Don't check for any further escapes, just return it as-is. 3732 // Don't set c.fQuoted, because that would prevent the state machine from 3733 // dispatching on the character. 3734 fInBackslashQuote = FALSE; 3735 } 3736 else 3737 { 3738 // We are not in a \Q quoted region \E of the source. 3739 // 3740 if (fModeFlags & UREGEX_COMMENTS) { 3741 // 3742 // We are in free-spacing and comments mode. 3743 // Scan through any white space and comments, until we 3744 // reach a significant character or the end of inut. 3745 for (;;) { 3746 if (c.fChar == (UChar32)-1) { 3747 break; // End of Input 3748 } 3749 if (c.fChar == chPound && fEOLComments == TRUE) { 3750 // Start of a comment. Consume the rest of it, until EOF or a new line 3751 for (;;) { 3752 c.fChar = nextCharLL(); 3753 if (c.fChar == (UChar32)-1 || // EOF 3754 c.fChar == chCR || 3755 c.fChar == chLF || 3756 c.fChar == chNEL || 3757 c.fChar == chLS) { 3758 break; 3759 } 3760 } 3761 } 3762 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061. 3763 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) { 3764 break; 3765 } 3766 c.fChar = nextCharLL(); 3767 } 3768 } 3769 3770 // 3771 // check for backslash escaped characters. 3772 // 3773 if (c.fChar == chBackSlash) { 3774 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3775 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) { 3776 // 3777 // A '\' sequence that is handled by ICU's standard unescapeAt function. 3778 // Includes \uxxxx, \n, \r, many others. 3779 // Return the single equivalent character. 3780 // 3781 nextCharLL(); // get & discard the peeked char. 3782 c.fQuoted = TRUE; 3783 3784 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) { 3785 int32_t endIndex = (int32_t)pos; 3786 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents); 3787 3788 if (endIndex == pos) { 3789 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3790 } 3791 fCharNum += endIndex - pos; 3792 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex); 3793 } else { 3794 int32_t offset = 0; 3795 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern); 3796 3797 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos); 3798 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context); 3799 3800 if (offset == 0) { 3801 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3802 } else if (context.lastOffset == offset) { 3803 UTEXT_PREVIOUS32(fRXPat->fPattern); 3804 } else if (context.lastOffset != offset-1) { 3805 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1); 3806 } 3807 fCharNum += offset; 3808 } 3809 } 3810 else if (peekCharLL() == chDigit0) { 3811 // Octal Escape, using Java Regexp Conventions 3812 // which are \0 followed by 1-3 octal digits. 3813 // Different from ICU Unescape handling of Octal, which does not 3814 // require the leading 0. 3815 // Java also has the convention of only consuming 2 octal digits if 3816 // the three digit number would be > 0xff 3817 // 3818 c.fChar = 0; 3819 nextCharLL(); // Consume the initial 0. 3820 int index; 3821 for (index=0; index<3; index++) { 3822 int32_t ch = peekCharLL(); 3823 if (ch<chDigit0 || ch>chDigit7) { 3824 if (index==0) { 3825 // \0 is not followed by any octal digits. 3826 error(U_REGEX_BAD_ESCAPE_SEQUENCE); 3827 } 3828 break; 3829 } 3830 c.fChar <<= 3; 3831 c.fChar += ch&7; 3832 if (c.fChar <= 255) { 3833 nextCharLL(); 3834 } else { 3835 // The last digit made the number too big. Forget we saw it. 3836 c.fChar >>= 3; 3837 } 3838 } 3839 c.fQuoted = TRUE; 3840 } 3841 else if (peekCharLL() == chQ) { 3842 // "\Q" enter quote mode, which will continue until "\E" 3843 fQuoteMode = TRUE; 3844 nextCharLL(); // discard the 'Q'. 3845 nextChar(c); // recurse to get the real next char. 3846 } 3847 else 3848 { 3849 // We are in a '\' escape that will be handled by the state table scanner. 3850 // Just return the backslash, but remember that the following char is to 3851 // be taken literally. 3852 fInBackslashQuote = TRUE; 3853 } 3854 } 3855 } 3856 3857 // re-enable # to end-of-line comments, in case they were disabled. 3858 // They are disabled by the parser upon seeing '(?', but this lasts for 3859 // the fetching of the next character only. 3860 fEOLComments = TRUE; 3861 3862 // putc(c.fChar, stdout); 3863} 3864 3865 3866 3867//------------------------------------------------------------------------------ 3868// 3869// scanNamedChar 3870 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern. 3871// 3872// The scan position will be at the 'N'. On return 3873// the scan position should be just after the '}' 3874// 3875// Return the UChar32 3876// 3877//------------------------------------------------------------------------------ 3878UChar32 RegexCompile::scanNamedChar() { 3879 if (U_FAILURE(*fStatus)) { 3880 return 0; 3881 } 3882 3883 nextChar(fC); 3884 if (fC.fChar != chLBrace) { 3885 error(U_REGEX_PROPERTY_SYNTAX); 3886 return 0; 3887 } 3888 3889 UnicodeString charName; 3890 for (;;) { 3891 nextChar(fC); 3892 if (fC.fChar == chRBrace) { 3893 break; 3894 } 3895 if (fC.fChar == -1) { 3896 error(U_REGEX_PROPERTY_SYNTAX); 3897 return 0; 3898 } 3899 charName.append(fC.fChar); 3900 } 3901 3902 char name[100]; 3903 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) || 3904 (uint32_t)charName.length()>=sizeof(name)) { 3905 // All Unicode character names have only invariant characters. 3906 // The API to get a character, given a name, accepts only char *, forcing us to convert, 3907 // which requires this error check 3908 error(U_REGEX_PROPERTY_SYNTAX); 3909 return 0; 3910 } 3911 charName.extract(0, charName.length(), name, sizeof(name), US_INV); 3912 3913 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus); 3914 if (U_FAILURE(*fStatus)) { 3915 error(U_REGEX_PROPERTY_SYNTAX); 3916 } 3917 3918 nextChar(fC); // Continue overall regex pattern processing with char after the '}' 3919 return theChar; 3920} 3921 3922//------------------------------------------------------------------------------ 3923// 3924// scanProp Construct a UnicodeSet from the text at the current scan 3925// position, which will be of the form \p{whaterver} 3926// 3927// The scan position will be at the 'p' or 'P'. On return 3928// the scan position should be just after the '}' 3929// 3930// Return a UnicodeSet, constructed from the \P pattern, 3931// or NULL if the pattern is invalid. 3932// 3933//------------------------------------------------------------------------------ 3934UnicodeSet *RegexCompile::scanProp() { 3935 UnicodeSet *uset = NULL; 3936 3937 if (U_FAILURE(*fStatus)) { 3938 return NULL; 3939 } 3940 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP); 3941 UBool negated = (fC.fChar == chP); 3942 3943 UnicodeString propertyName; 3944 nextChar(fC); 3945 if (fC.fChar != chLBrace) { 3946 error(U_REGEX_PROPERTY_SYNTAX); 3947 return NULL; 3948 } 3949 for (;;) { 3950 nextChar(fC); 3951 if (fC.fChar == chRBrace) { 3952 break; 3953 } 3954 if (fC.fChar == -1) { 3955 // Hit the end of the input string without finding the closing '}' 3956 error(U_REGEX_PROPERTY_SYNTAX); 3957 return NULL; 3958 } 3959 propertyName.append(fC.fChar); 3960 } 3961 uset = createSetForProperty(propertyName, negated); 3962 nextChar(fC); // Move input scan to position following the closing '}' 3963 return uset; 3964} 3965 3966//------------------------------------------------------------------------------ 3967// 3968// scanPosixProp Construct a UnicodeSet from the text at the current scan 3969// position, which is expected be of the form [:property expression:] 3970// 3971// The scan position will be at the opening ':'. On return 3972// the scan position must be on the closing ']' 3973// 3974// Return a UnicodeSet constructed from the pattern, 3975// or NULL if this is not a valid POSIX-style set expression. 3976// If not a property expression, restore the initial scan position 3977// (to the opening ':') 3978// 3979// Note: the opening '[:' is not sufficient to guarantee that 3980// this is a [:property:] expression. 3981// [:'+=,] is a perfectly good ordinary set expression that 3982// happens to include ':' as one of its characters. 3983// 3984//------------------------------------------------------------------------------ 3985UnicodeSet *RegexCompile::scanPosixProp() { 3986 UnicodeSet *uset = NULL; 3987 3988 if (U_FAILURE(*fStatus)) { 3989 return NULL; 3990 } 3991 3992 U_ASSERT(fC.fChar == chColon); 3993 3994 // Save the scanner state. 3995 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062 3996 int64_t savedScanIndex = fScanIndex; 3997 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern); 3998 UBool savedQuoteMode = fQuoteMode; 3999 UBool savedInBackslashQuote = fInBackslashQuote; 4000 UBool savedEOLComments = fEOLComments; 4001 int64_t savedLineNum = fLineNum; 4002 int64_t savedCharNum = fCharNum; 4003 UChar32 savedLastChar = fLastChar; 4004 UChar32 savedPeekChar = fPeekChar; 4005 RegexPatternChar savedfC = fC; 4006 4007 // Scan for a closing ]. A little tricky because there are some perverse 4008 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression, 4009 // ending on the second closing ]. 4010 4011 UnicodeString propName; 4012 UBool negated = FALSE; 4013 4014 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:] 4015 nextChar(fC); 4016 if (fC.fChar == chUp) { 4017 negated = TRUE; 4018 nextChar(fC); 4019 } 4020 4021 // Scan for the closing ":]", collecting the property name along the way. 4022 UBool sawPropSetTerminator = FALSE; 4023 for (;;) { 4024 propName.append(fC.fChar); 4025 nextChar(fC); 4026 if (fC.fQuoted || fC.fChar == -1) { 4027 // Escaped characters or end of input - either says this isn't a [:Property:] 4028 break; 4029 } 4030 if (fC.fChar == chColon) { 4031 nextChar(fC); 4032 if (fC.fChar == chRBracket) { 4033 sawPropSetTerminator = TRUE; 4034 } 4035 break; 4036 } 4037 } 4038 4039 if (sawPropSetTerminator) { 4040 uset = createSetForProperty(propName, negated); 4041 } 4042 else 4043 { 4044 // No closing ":]". 4045 // Restore the original scan position. 4046 // The main scanner will retry the input as a normal set expression, 4047 // not a [:Property:] expression. 4048 fScanIndex = savedScanIndex; 4049 fQuoteMode = savedQuoteMode; 4050 fInBackslashQuote = savedInBackslashQuote; 4051 fEOLComments = savedEOLComments; 4052 fLineNum = savedLineNum; 4053 fCharNum = savedCharNum; 4054 fLastChar = savedLastChar; 4055 fPeekChar = savedPeekChar; 4056 fC = savedfC; 4057 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex); 4058 } 4059 return uset; 4060} 4061 4062static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { 4063 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f); 4064 addCategory(set, U_GC_CF_MASK, ec); 4065} 4066 4067// 4068// Create a Unicode Set from a Unicode Property expression. 4069// This is common code underlying both \p{...} ane [:...:] expressions. 4070// Includes trying the Java "properties" that aren't supported as 4071// normal ICU UnicodeSet properties 4072// 4073static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" 4074static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" 4075UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { 4076 UnicodeString setExpr; 4077 UnicodeSet *set; 4078 uint32_t usetFlags = 0; 4079 4080 if (U_FAILURE(*fStatus)) { 4081 return NULL; 4082 } 4083 4084 // 4085 // First try the property as we received it 4086 // 4087 if (negated) { 4088 setExpr.append(negSetPrefix, -1); 4089 } else { 4090 setExpr.append(posSetPrefix, -1); 4091 } 4092 setExpr.append(propName); 4093 setExpr.append(chRBrace); 4094 setExpr.append(chRBracket); 4095 if (fModeFlags & UREGEX_CASE_INSENSITIVE) { 4096 usetFlags |= USET_CASE_INSENSITIVE; 4097 } 4098 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4099 if (U_SUCCESS(*fStatus)) { 4100 return set; 4101 } 4102 delete set; 4103 set = NULL; 4104 4105 // 4106 // The property as it was didn't work. 4107 4108 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX 4109 // or standard Java, but many other regular expression packages do recognize it. 4110 4111 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { 4112 *fStatus = U_ZERO_ERROR; 4113 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); 4114 if (set == NULL) { 4115 *fStatus = U_MEMORY_ALLOCATION_ERROR; 4116 return set; 4117 } 4118 if (negated) { 4119 set->complement(); 4120 } 4121 return set; 4122 } 4123 4124 4125 // Do Java fixes - 4126 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block. 4127 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols. 4128 // 4129 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols" 4130 // is accepted by Java. The property part of the name is compared 4131 // case-insenstively. The spaces must be exactly as shown, either 4132 // all there, or all omitted, with exactly one at each position 4133 // if they are present. From checking against JDK 1.6 4134 // 4135 // This code should be removed when ICU properties support the Java compatibility names 4136 // (ICU 4.0?) 4137 // 4138 UnicodeString mPropName = propName; 4139 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { 4140 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); 4141 } 4142 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 || 4143 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) { 4144 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols"); 4145 } 4146 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4147 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); 4148 } 4149 4150 // See if the property looks like a Java "InBlockName", which 4151 // we will recast as "Block=BlockName" 4152 // 4153 static const UChar IN[] = {0x49, 0x6E, 0}; // "In" 4154 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block=" 4155 if (mPropName.startsWith(IN, 2) && propName.length()>=3) { 4156 setExpr.truncate(4); // Leaves "[\p{", or "[\P{" 4157 setExpr.append(BLOCK, -1); 4158 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed. 4159 setExpr.append(chRBrace); 4160 setExpr.append(chRBracket); 4161 *fStatus = U_ZERO_ERROR; 4162 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); 4163 if (U_SUCCESS(*fStatus)) { 4164 return set; 4165 } 4166 delete set; 4167 set = NULL; 4168 } 4169 4170 if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) || 4171 propName.compare(UNICODE_STRING_SIMPLE("all")) == 0) 4172 { 4173 UErrorCode localStatus = U_ZERO_ERROR; 4174 //setExpr.remove(); 4175 set = new UnicodeSet(); 4176 // 4177 // Try the various Java specific properties. 4178 // These all begin with "java" 4179 // 4180 if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) { 4181 addCategory(set, U_GC_CN_MASK, localStatus); 4182 set->complement(); 4183 } 4184 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) { 4185 addCategory(set, U_GC_ND_MASK, localStatus); 4186 } 4187 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) { 4188 addIdentifierIgnorable(set, localStatus); 4189 } 4190 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) { 4191 set->add(0, 0x1F).add(0x7F, 0x9F); 4192 } 4193 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) { 4194 addCategory(set, U_GC_L_MASK, localStatus); 4195 addCategory(set, U_GC_SC_MASK, localStatus); 4196 addCategory(set, U_GC_PC_MASK, localStatus); 4197 addCategory(set, U_GC_ND_MASK, localStatus); 4198 addCategory(set, U_GC_NL_MASK, localStatus); 4199 addCategory(set, U_GC_MC_MASK, localStatus); 4200 addCategory(set, U_GC_MN_MASK, localStatus); 4201 addIdentifierIgnorable(set, localStatus); 4202 } 4203 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) { 4204 addCategory(set, U_GC_L_MASK, localStatus); 4205 addCategory(set, U_GC_NL_MASK, localStatus); 4206 addCategory(set, U_GC_SC_MASK, localStatus); 4207 addCategory(set, U_GC_PC_MASK, localStatus); 4208 } 4209 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) { 4210 addCategory(set, U_GC_L_MASK, localStatus); 4211 } 4212 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) { 4213 addCategory(set, U_GC_L_MASK, localStatus); 4214 addCategory(set, U_GC_ND_MASK, localStatus); 4215 } 4216 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) { 4217 addCategory(set, U_GC_LL_MASK, localStatus); 4218 } 4219 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) { 4220 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus); 4221 } 4222 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) { 4223 addCategory(set, U_GC_Z_MASK, localStatus); 4224 } 4225 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) { 4226 set->add(0x10000, UnicodeSet::MAX_VALUE); 4227 } 4228 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) { 4229 addCategory(set, U_GC_LT_MASK, localStatus); 4230 } 4231 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) { 4232 addCategory(set, U_GC_L_MASK, localStatus); 4233 addCategory(set, U_GC_NL_MASK, localStatus); 4234 } 4235 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) { 4236 addCategory(set, U_GC_L_MASK, localStatus); 4237 addCategory(set, U_GC_PC_MASK, localStatus); 4238 addCategory(set, U_GC_ND_MASK, localStatus); 4239 addCategory(set, U_GC_NL_MASK, localStatus); 4240 addCategory(set, U_GC_MC_MASK, localStatus); 4241 addCategory(set, U_GC_MN_MASK, localStatus); 4242 addIdentifierIgnorable(set, localStatus); 4243 } 4244 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) { 4245 addCategory(set, U_GC_LU_MASK, localStatus); 4246 } 4247 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) { 4248 set->add(0, UnicodeSet::MAX_VALUE); 4249 } 4250 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) { 4251 addCategory(set, U_GC_Z_MASK, localStatus); 4252 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); 4253 set->add(9, 0x0d).add(0x1c, 0x1f); 4254 } 4255 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { 4256 set->add(0, UnicodeSet::MAX_VALUE); 4257 } 4258 4259 if (U_SUCCESS(localStatus) && !set->isEmpty()) { 4260 *fStatus = U_ZERO_ERROR; 4261 if (usetFlags & USET_CASE_INSENSITIVE) { 4262 set->closeOver(USET_CASE_INSENSITIVE); 4263 } 4264 if (negated) { 4265 set->complement(); 4266 } 4267 return set; 4268 } 4269 delete set; 4270 set = NULL; 4271 } 4272 error(*fStatus); 4273 return NULL; 4274} 4275 4276 4277 4278// 4279// SetEval Part of the evaluation of [set expressions]. 4280// Perform any pending (stacked) operations with precedence 4281// equal or greater to that of the next operator encountered 4282// in the expression. 4283// 4284void RegexCompile::setEval(int32_t nextOp) { 4285 UnicodeSet *rightOperand = NULL; 4286 UnicodeSet *leftOperand = NULL; 4287 for (;;) { 4288 U_ASSERT(fSetOpStack.empty()==FALSE); 4289 int32_t pendingSetOperation = fSetOpStack.peeki(); 4290 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) { 4291 break; 4292 } 4293 fSetOpStack.popi(); 4294 U_ASSERT(fSetStack.empty() == FALSE); 4295 rightOperand = (UnicodeSet *)fSetStack.peek(); 4296 switch (pendingSetOperation) { 4297 case setNegation: 4298 rightOperand->complement(); 4299 break; 4300 case setCaseClose: 4301 // TODO: need a simple close function. Ticket 6065 4302 rightOperand->closeOver(USET_CASE_INSENSITIVE); 4303 rightOperand->removeAllStrings(); 4304 break; 4305 case setDifference1: 4306 case setDifference2: 4307 fSetStack.pop(); 4308 leftOperand = (UnicodeSet *)fSetStack.peek(); 4309 leftOperand->removeAll(*rightOperand); 4310 delete rightOperand; 4311 break; 4312 case setIntersection1: 4313 case setIntersection2: 4314 fSetStack.pop(); 4315 leftOperand = (UnicodeSet *)fSetStack.peek(); 4316 leftOperand->retainAll(*rightOperand); 4317 delete rightOperand; 4318 break; 4319 case setUnion: 4320 fSetStack.pop(); 4321 leftOperand = (UnicodeSet *)fSetStack.peek(); 4322 leftOperand->addAll(*rightOperand); 4323 delete rightOperand; 4324 break; 4325 default: 4326 U_ASSERT(FALSE); 4327 break; 4328 } 4329 } 4330 } 4331 4332void RegexCompile::setPushOp(int32_t op) { 4333 setEval(op); 4334 fSetOpStack.push(op, *fStatus); 4335 fSetStack.push(new UnicodeSet(), *fStatus); 4336} 4337 4338U_NAMESPACE_END 4339#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 4340 4341