parse.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright 2006 The RE2 Authors. All Rights Reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Regular expression parser. 6 7// The parser is a simple precedence-based parser with a 8// manual stack. The parsing work is done by the methods 9// of the ParseState class. The Regexp::Parse function is 10// essentially just a lexer that calls the ParseState method 11// for each token. 12 13// The parser recognizes POSIX extended regular expressions 14// excluding backreferences, collating elements, and collating 15// classes. It also allows the empty string as a regular expression 16// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W. 17// See regexp.h for rationale. 18 19#include <ctype.h> 20 21#include "util/util.h" 22#include "re2/regexp.h" 23#include "re2/stringpiece.h" 24#include "re2/unicode_casefold.h" 25#include "re2/unicode_groups.h" 26 27namespace re2 { 28 29// Regular expression parse state. 30// The list of parsed regexps so far is maintained as a vector of 31// Regexp pointers called the stack. Left parenthesis and vertical 32// bar markers are also placed on the stack, as Regexps with 33// non-standard opcodes. 34// Scanning a left parenthesis causes the parser to push a left parenthesis 35// marker on the stack. 36// Scanning a vertical bar causes the parser to pop the stack until it finds a 37// vertical bar or left parenthesis marker (not popping the marker), 38// concatenate all the popped results, and push them back on 39// the stack (DoConcatenation). 40// Scanning a right parenthesis causes the parser to act as though it 41// has seen a vertical bar, which then leaves the top of the stack in the 42// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar. 43// The parser pops all this off the stack and creates an alternation of the 44// regexps (DoAlternation). 45 46class Regexp::ParseState { 47 public: 48 ParseState(ParseFlags flags, const StringPiece& whole_regexp, 49 RegexpStatus* status); 50 ~ParseState(); 51 52 ParseFlags flags() { return flags_; } 53 int rune_max() { return rune_max_; } 54 55 // Parse methods. All public methods return a bool saying 56 // whether parsing should continue. If a method returns 57 // false, it has set fields in *status_, and the parser 58 // should return NULL. 59 60 // Pushes the given regular expression onto the stack. 61 // Could check for too much memory used here. 62 bool PushRegexp(Regexp* re); 63 64 // Pushes the literal rune r onto the stack. 65 bool PushLiteral(Rune r); 66 67 // Pushes a regexp with the given op (and no args) onto the stack. 68 bool PushSimpleOp(RegexpOp op); 69 70 // Pushes a ^ onto the stack. 71 bool PushCarat(); 72 73 // Pushes a \b (word == true) or \B (word == false) onto the stack. 74 bool PushWordBoundary(bool word); 75 76 // Pushes a $ onto the stack. 77 bool PushDollar(); 78 79 // Pushes a . onto the stack 80 bool PushDot(); 81 82 // Pushes a repeat operator regexp onto the stack. 83 // A valid argument for the operator must already be on the stack. 84 // s is the name of the operator, for use in error messages. 85 bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy); 86 87 // Pushes a repetition regexp onto the stack. 88 // A valid argument for the operator must already be on the stack. 89 bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy); 90 91 // Checks whether a particular regexp op is a marker. 92 bool IsMarker(RegexpOp op); 93 94 // Processes a left parenthesis in the input. 95 // Pushes a marker onto the stack. 96 bool DoLeftParen(const StringPiece& name); 97 bool DoLeftParenNoCapture(); 98 99 // Processes a vertical bar in the input. 100 bool DoVerticalBar(); 101 102 // Processes a right parenthesis in the input. 103 bool DoRightParen(); 104 105 // Processes the end of input, returning the final regexp. 106 Regexp* DoFinish(); 107 108 // Finishes the regexp if necessary, preparing it for use 109 // in a more complicated expression. 110 // If it is a CharClassBuilder, converts into a CharClass. 111 Regexp* FinishRegexp(Regexp*); 112 113 // These routines don't manipulate the parse stack 114 // directly, but they do need to look at flags_. 115 // ParseCharClass also manipulates the internals of Regexp 116 // while creating *out_re. 117 118 // Parse a character class into *out_re. 119 // Removes parsed text from s. 120 bool ParseCharClass(StringPiece* s, Regexp** out_re, 121 RegexpStatus* status); 122 123 // Parse a character class character into *rp. 124 // Removes parsed text from s. 125 bool ParseCCCharacter(StringPiece* s, Rune *rp, 126 const StringPiece& whole_class, 127 RegexpStatus* status); 128 129 // Parse a character class range into rr. 130 // Removes parsed text from s. 131 bool ParseCCRange(StringPiece* s, RuneRange* rr, 132 const StringPiece& whole_class, 133 RegexpStatus* status); 134 135 // Parse a Perl flag set or non-capturing group from s. 136 bool ParsePerlFlags(StringPiece* s); 137 138 139 // Finishes the current concatenation, 140 // collapsing it into a single regexp on the stack. 141 void DoConcatenation(); 142 143 // Finishes the current alternation, 144 // collapsing it to a single regexp on the stack. 145 void DoAlternation(); 146 147 // Generalized DoAlternation/DoConcatenation. 148 void DoCollapse(RegexpOp op); 149 150 // Maybe concatenate Literals into LiteralString. 151 bool MaybeConcatString(int r, ParseFlags flags); 152 153private: 154 ParseFlags flags_; 155 StringPiece whole_regexp_; 156 RegexpStatus* status_; 157 Regexp* stacktop_; 158 int ncap_; // number of capturing parens seen 159 int rune_max_; // maximum char value for this encoding 160 161 DISALLOW_EVIL_CONSTRUCTORS(ParseState); 162}; 163 164// Pseudo-operators - only on parse stack. 165const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1); 166const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2); 167 168Regexp::ParseState::ParseState(ParseFlags flags, 169 const StringPiece& whole_regexp, 170 RegexpStatus* status) 171 : flags_(flags), whole_regexp_(whole_regexp), 172 status_(status), stacktop_(NULL), ncap_(0) { 173 if (flags_ & Latin1) 174 rune_max_ = 0xFF; 175 else 176 rune_max_ = Runemax; 177} 178 179// Cleans up by freeing all the regexps on the stack. 180Regexp::ParseState::~ParseState() { 181 Regexp* next; 182 for (Regexp* re = stacktop_; re != NULL; re = next) { 183 next = re->down_; 184 re->down_ = NULL; 185 if (re->op() == kLeftParen) 186 delete re->name_; 187 re->Decref(); 188 } 189} 190 191// Finishes the regexp if necessary, preparing it for use in 192// a more complex expression. 193// If it is a CharClassBuilder, converts into a CharClass. 194Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) { 195 if (re == NULL) 196 return NULL; 197 re->down_ = NULL; 198 199 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { 200 CharClassBuilder* ccb = re->ccb_; 201 re->ccb_ = NULL; 202 re->cc_ = ccb->GetCharClass(); 203 delete ccb; 204 } 205 206 return re; 207} 208 209// Pushes the given regular expression onto the stack. 210// Could check for too much memory used here. 211bool Regexp::ParseState::PushRegexp(Regexp* re) { 212 MaybeConcatString(-1, NoParseFlags); 213 214 // Special case: a character class of one character is just 215 // a literal. This is a common idiom for escaping 216 // single characters (e.g., [.] instead of \.), and some 217 // analysis does better with fewer character classes. 218 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. 219 if (re->op_ == kRegexpCharClass) { 220 if (re->ccb_->size() == 1) { 221 Rune r = re->ccb_->begin()->lo; 222 re->Decref(); 223 re = new Regexp(kRegexpLiteral, flags_); 224 re->rune_ = r; 225 } else if (re->ccb_->size() == 2) { 226 Rune r = re->ccb_->begin()->lo; 227 if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) { 228 re->Decref(); 229 re = new Regexp(kRegexpLiteral, flags_ | FoldCase); 230 re->rune_ = r + 'a' - 'A'; 231 } 232 } 233 } 234 235 if (!IsMarker(re->op())) 236 re->simple_ = re->ComputeSimple(); 237 re->down_ = stacktop_; 238 stacktop_ = re; 239 return true; 240} 241 242// Searches the case folding tables and returns the CaseFold* that contains r. 243// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. 244// If there isn't one, returns NULL. 245CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) { 246 CaseFold* ef = f + n; 247 248 // Binary search for entry containing r. 249 while (n > 0) { 250 int m = n/2; 251 if (f[m].lo <= r && r <= f[m].hi) 252 return &f[m]; 253 if (r < f[m].lo) { 254 n = m; 255 } else { 256 f += m+1; 257 n -= m+1; 258 } 259 } 260 261 // There is no entry that contains r, but f points 262 // where it would have been. Unless f points at 263 // the end of the array, it points at the next entry 264 // after r. 265 if (f < ef) 266 return f; 267 268 // No entry contains r; no entry contains runes > r. 269 return NULL; 270} 271 272// Returns the result of applying the fold f to the rune r. 273Rune ApplyFold(CaseFold *f, Rune r) { 274 switch (f->delta) { 275 default: 276 return r + f->delta; 277 278 case EvenOddSkip: // even <-> odd but only applies to every other 279 if ((r - f->lo) % 2) 280 return r; 281 // fall through 282 case EvenOdd: // even <-> odd 283 if (r%2 == 0) 284 return r + 1; 285 return r - 1; 286 287 case OddEvenSkip: // odd <-> even but only applies to every other 288 if ((r - f->lo) % 2) 289 return r; 290 // fall through 291 case OddEven: // odd <-> even 292 if (r%2 == 1) 293 return r + 1; 294 return r - 1; 295 } 296} 297 298// Returns the next Rune in r's folding cycle (see unicode_casefold.h). 299// Examples: 300// CycleFoldRune('A') = 'a' 301// CycleFoldRune('a') = 'A' 302// 303// CycleFoldRune('K') = 'k' 304// CycleFoldRune('k') = 0x212A (Kelvin) 305// CycleFoldRune(0x212A) = 'K' 306// 307// CycleFoldRune('?') = '?' 308Rune CycleFoldRune(Rune r) { 309 CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); 310 if (f == NULL || r < f->lo) 311 return r; 312 return ApplyFold(f, r); 313} 314 315// Add lo-hi to the class, along with their fold-equivalent characters. 316// If lo-hi is already in the class, assume that the fold-equivalent 317// chars are there too, so there's no work to do. 318static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { 319 // AddFoldedRange calls itself recursively for each rune in the fold cycle. 320 // Most folding cycles are small: there aren't any bigger than four in the 321 // current Unicode tables. make_unicode_casefold.py checks that 322 // the cycles are not too long, and we double-check here using depth. 323 if (depth > 10) { 324 LOG(DFATAL) << "AddFoldedRange recurses too much."; 325 return; 326 } 327 328 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done 329 return; 330 331 while (lo <= hi) { 332 CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); 333 if (f == NULL) // lo has no fold, nor does anything above lo 334 break; 335 if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo 336 lo = f->lo; 337 continue; 338 } 339 340 // Add in the result of folding the range lo - f->hi 341 // and that range's fold, recursively. 342 Rune lo1 = lo; 343 Rune hi1 = min<Rune>(hi, f->hi); 344 switch (f->delta) { 345 default: 346 lo1 += f->delta; 347 hi1 += f->delta; 348 break; 349 case EvenOdd: 350 if (lo1%2 == 1) 351 lo1--; 352 if (hi1%2 == 0) 353 hi1++; 354 break; 355 case OddEven: 356 if (lo1%2 == 0) 357 lo1--; 358 if (hi1%2 == 1) 359 hi1++; 360 break; 361 } 362 AddFoldedRange(cc, lo1, hi1, depth+1); 363 364 // Pick up where this fold left off. 365 lo = f->hi + 1; 366 } 367} 368 369// Pushes the literal rune r onto the stack. 370bool Regexp::ParseState::PushLiteral(Rune r) { 371 // Do case folding if needed. 372 if ((flags_ & FoldCase) && CycleFoldRune(r) != r) { 373 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 374 re->ccb_ = new CharClassBuilder; 375 Rune r1 = r; 376 do { 377 if (!(flags_ & NeverNL) || r != '\n') { 378 re->ccb_->AddRange(r, r); 379 } 380 r = CycleFoldRune(r); 381 } while (r != r1); 382 re->ccb_->RemoveAbove(rune_max_); 383 return PushRegexp(re); 384 } 385 386 // Exclude newline if applicable. 387 if ((flags_ & NeverNL) && r == '\n') 388 return PushRegexp(new Regexp(kRegexpNoMatch, flags_)); 389 390 // No fancy stuff worked. Ordinary literal. 391 if (MaybeConcatString(r, flags_)) 392 return true; 393 394 Regexp* re = new Regexp(kRegexpLiteral, flags_); 395 re->rune_ = r; 396 return PushRegexp(re); 397} 398 399// Pushes a ^ onto the stack. 400bool Regexp::ParseState::PushCarat() { 401 if (flags_ & OneLine) { 402 return PushSimpleOp(kRegexpBeginText); 403 } 404 return PushSimpleOp(kRegexpBeginLine); 405} 406 407// Pushes a \b or \B onto the stack. 408bool Regexp::ParseState::PushWordBoundary(bool word) { 409 if (word) 410 return PushSimpleOp(kRegexpWordBoundary); 411 return PushSimpleOp(kRegexpNoWordBoundary); 412} 413 414// Pushes a $ onto the stack. 415bool Regexp::ParseState::PushDollar() { 416 if (flags_ & OneLine) { 417 // Clumsy marker so that MimicsPCRE() can tell whether 418 // this kRegexpEndText was a $ and not a \z. 419 Regexp::ParseFlags oflags = flags_; 420 flags_ = flags_ | WasDollar; 421 bool ret = PushSimpleOp(kRegexpEndText); 422 flags_ = oflags; 423 return ret; 424 } 425 return PushSimpleOp(kRegexpEndLine); 426} 427 428// Pushes a . onto the stack. 429bool Regexp::ParseState::PushDot() { 430 if ((flags_ & DotNL) && !(flags_ & NeverNL)) 431 return PushSimpleOp(kRegexpAnyChar); 432 // Rewrite . into [^\n] 433 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 434 re->ccb_ = new CharClassBuilder; 435 re->ccb_->AddRange(0, '\n' - 1); 436 re->ccb_->AddRange('\n' + 1, rune_max_); 437 return PushRegexp(re); 438} 439 440// Pushes a regexp with the given op (and no args) onto the stack. 441bool Regexp::ParseState::PushSimpleOp(RegexpOp op) { 442 Regexp* re = new Regexp(op, flags_); 443 return PushRegexp(re); 444} 445 446// Pushes a repeat operator regexp onto the stack. 447// A valid argument for the operator must already be on the stack. 448// The char c is the name of the operator, for use in error messages. 449bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, 450 bool nongreedy) { 451 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 452 status_->set_code(kRegexpRepeatArgument); 453 status_->set_error_arg(s); 454 return false; 455 } 456 Regexp::ParseFlags fl = flags_; 457 if (nongreedy) 458 fl = fl ^ NonGreedy; 459 Regexp* re = new Regexp(op, fl); 460 re->AllocSub(1); 461 re->down_ = stacktop_->down_; 462 re->sub()[0] = FinishRegexp(stacktop_); 463 re->simple_ = re->ComputeSimple(); 464 stacktop_ = re; 465 return true; 466} 467 468// Pushes a repetition regexp onto the stack. 469// A valid argument for the operator must already be on the stack. 470bool Regexp::ParseState::PushRepetition(int min, int max, 471 const StringPiece& s, 472 bool nongreedy) { 473 if ((max != -1 && max < min) || min > 1000 || max > 1000) { 474 status_->set_code(kRegexpRepeatSize); 475 status_->set_error_arg(s); 476 return false; 477 } 478 if (stacktop_ == NULL || IsMarker(stacktop_->op())) { 479 status_->set_code(kRegexpRepeatArgument); 480 status_->set_error_arg(s); 481 return false; 482 } 483 Regexp::ParseFlags fl = flags_; 484 if (nongreedy) 485 fl = fl ^ NonGreedy; 486 Regexp* re = new Regexp(kRegexpRepeat, fl); 487 re->min_ = min; 488 re->max_ = max; 489 re->AllocSub(1); 490 re->down_ = stacktop_->down_; 491 re->sub()[0] = FinishRegexp(stacktop_); 492 re->simple_ = re->ComputeSimple(); 493 494 stacktop_ = re; 495 return true; 496} 497 498// Checks whether a particular regexp op is a marker. 499bool Regexp::ParseState::IsMarker(RegexpOp op) { 500 return op >= kLeftParen; 501} 502 503// Processes a left parenthesis in the input. 504// Pushes a marker onto the stack. 505bool Regexp::ParseState::DoLeftParen(const StringPiece& name) { 506 Regexp* re = new Regexp(kLeftParen, flags_); 507 re->cap_ = ++ncap_; 508 if (name.data() != NULL) 509 re->name_ = new string(name.as_string()); 510 return PushRegexp(re); 511} 512 513// Pushes a non-capturing marker onto the stack. 514bool Regexp::ParseState::DoLeftParenNoCapture() { 515 Regexp* re = new Regexp(kLeftParen, flags_); 516 re->cap_ = -1; 517 return PushRegexp(re); 518} 519 520// Adds r to cc, along with r's upper case if foldascii is set. 521static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { 522 cc->AddRange(r, r); 523 if (foldascii && 'a' <= r && r <= 'z') 524 cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); 525} 526 527// Processes a vertical bar in the input. 528bool Regexp::ParseState::DoVerticalBar() { 529 MaybeConcatString(-1, NoParseFlags); 530 DoConcatenation(); 531 532 // Below the vertical bar is a list to alternate. 533 // Above the vertical bar is a list to concatenate. 534 // We just did the concatenation, so either swap 535 // the result below the vertical bar or push a new 536 // vertical bar on the stack. 537 Regexp* r1; 538 Regexp* r2; 539 if ((r1 = stacktop_) != NULL && 540 (r2 = stacktop_->down_) != NULL && 541 r2->op() == kVerticalBar) { 542 // If above and below vertical bar are literal or char class, 543 // can merge into a single char class. 544 Regexp* r3; 545 if ((r1->op() == kRegexpLiteral || 546 r1->op() == kRegexpCharClass || 547 r1->op() == kRegexpAnyChar) && 548 (r3 = r2->down_) != NULL) { 549 Rune rune; 550 switch (r3->op()) { 551 case kRegexpLiteral: // convert to char class 552 rune = r3->rune_; 553 r3->op_ = kRegexpCharClass; 554 r3->cc_ = NULL; 555 r3->ccb_ = new CharClassBuilder; 556 AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); 557 // fall through 558 case kRegexpCharClass: 559 if (r1->op() == kRegexpLiteral) 560 AddLiteral(r3->ccb_, r1->rune_, 561 r1->parse_flags_ & Regexp::FoldCase); 562 else if (r1->op() == kRegexpCharClass) 563 r3->ccb_->AddCharClass(r1->ccb_); 564 if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { 565 delete r3->ccb_; 566 r3->ccb_ = NULL; 567 r3->op_ = kRegexpAnyChar; 568 } 569 // fall through 570 case kRegexpAnyChar: 571 // pop r1 572 stacktop_ = r2; 573 r1->Decref(); 574 return true; 575 default: 576 break; 577 } 578 } 579 580 // Swap r1 below vertical bar (r2). 581 r1->down_ = r2->down_; 582 r2->down_ = r1; 583 stacktop_ = r2; 584 return true; 585 } 586 return PushSimpleOp(kVerticalBar); 587} 588 589// Processes a right parenthesis in the input. 590bool Regexp::ParseState::DoRightParen() { 591 // Finish the current concatenation and alternation. 592 DoAlternation(); 593 594 // The stack should be: LeftParen regexp 595 // Remove the LeftParen, leaving the regexp, 596 // parenthesized. 597 Regexp* r1; 598 Regexp* r2; 599 if ((r1 = stacktop_) == NULL || 600 (r2 = r1->down_) == NULL || 601 r2->op() != kLeftParen) { 602 status_->set_code(kRegexpMissingParen); 603 status_->set_error_arg(whole_regexp_); 604 return false; 605 } 606 607 // Pop off r1, r2. Will Decref or reuse below. 608 stacktop_ = r2->down_; 609 610 // Restore flags from when paren opened. 611 Regexp* re = r2; 612 flags_ = re->parse_flags(); 613 614 // Rewrite LeftParen as capture if needed. 615 if (re->cap_ > 0) { 616 re->op_ = kRegexpCapture; 617 // re->cap_ is already set 618 re->AllocSub(1); 619 re->sub()[0] = FinishRegexp(r1); 620 re->simple_ = re->ComputeSimple(); 621 } else { 622 re->Decref(); 623 re = r1; 624 } 625 return PushRegexp(re); 626} 627 628// Processes the end of input, returning the final regexp. 629Regexp* Regexp::ParseState::DoFinish() { 630 DoAlternation(); 631 Regexp* re = stacktop_; 632 if (re != NULL && re->down_ != NULL) { 633 status_->set_code(kRegexpMissingParen); 634 status_->set_error_arg(whole_regexp_); 635 return NULL; 636 } 637 stacktop_ = NULL; 638 return FinishRegexp(re); 639} 640 641// Returns the leading regexp that re starts with. 642// The returned Regexp* points into a piece of re, 643// so it must not be used after the caller calls re->Decref(). 644Regexp* Regexp::LeadingRegexp(Regexp* re) { 645 if (re->op() == kRegexpEmptyMatch) 646 return NULL; 647 if (re->op() == kRegexpConcat && re->nsub() >= 2) { 648 Regexp** sub = re->sub(); 649 if (sub[0]->op() == kRegexpEmptyMatch) 650 return NULL; 651 return sub[0]; 652 } 653 return re; 654} 655 656// Removes LeadingRegexp(re) from re and returns what's left. 657// Consumes the reference to re and may edit it in place. 658// If caller wants to hold on to LeadingRegexp(re), 659// must have already Incref'ed it. 660Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) { 661 if (re->op() == kRegexpEmptyMatch) 662 return re; 663 if (re->op() == kRegexpConcat && re->nsub() >= 2) { 664 Regexp** sub = re->sub(); 665 if (sub[0]->op() == kRegexpEmptyMatch) 666 return re; 667 sub[0]->Decref(); 668 sub[0] = NULL; 669 if (re->nsub() == 2) { 670 // Collapse concatenation to single regexp. 671 Regexp* nre = sub[1]; 672 sub[1] = NULL; 673 re->Decref(); 674 return nre; 675 } 676 // 3 or more -> 2 or more. 677 re->nsub_--; 678 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); 679 return re; 680 } 681 Regexp::ParseFlags pf = re->parse_flags(); 682 re->Decref(); 683 return new Regexp(kRegexpEmptyMatch, pf); 684} 685 686// Returns the leading string that re starts with. 687// The returned Rune* points into a piece of re, 688// so it must not be used after the caller calls re->Decref(). 689Rune* Regexp::LeadingString(Regexp* re, int *nrune, 690 Regexp::ParseFlags *flags) { 691 while (re->op() == kRegexpConcat && re->nsub() > 0) 692 re = re->sub()[0]; 693 694 *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase); 695 696 if (re->op() == kRegexpLiteral) { 697 *nrune = 1; 698 return &re->rune_; 699 } 700 701 if (re->op() == kRegexpLiteralString) { 702 *nrune = re->nrunes_; 703 return re->runes_; 704 } 705 706 *nrune = 0; 707 return NULL; 708} 709 710// Removes the first n leading runes from the beginning of re. 711// Edits re in place. 712void Regexp::RemoveLeadingString(Regexp* re, int n) { 713 // Chase down concats to find first string. 714 // For regexps generated by parser, nested concats are 715 // flattened except when doing so would overflow the 16-bit 716 // limit on the size of a concatenation, so we should never 717 // see more than two here. 718 Regexp* stk[4]; 719 int d = 0; 720 while (re->op() == kRegexpConcat) { 721 if (d < arraysize(stk)) 722 stk[d++] = re; 723 re = re->sub()[0]; 724 } 725 726 // Remove leading string from re. 727 if (re->op() == kRegexpLiteral) { 728 re->rune_ = 0; 729 re->op_ = kRegexpEmptyMatch; 730 } else if (re->op() == kRegexpLiteralString) { 731 if (n >= re->nrunes_) { 732 delete[] re->runes_; 733 re->runes_ = NULL; 734 re->nrunes_ = 0; 735 re->op_ = kRegexpEmptyMatch; 736 } else if (n == re->nrunes_ - 1) { 737 Rune rune = re->runes_[re->nrunes_ - 1]; 738 delete[] re->runes_; 739 re->runes_ = NULL; 740 re->nrunes_ = 0; 741 re->rune_ = rune; 742 re->op_ = kRegexpLiteral; 743 } else { 744 re->nrunes_ -= n; 745 memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]); 746 } 747 } 748 749 // If re is now empty, concatenations might simplify too. 750 while (d-- > 0) { 751 re = stk[d]; 752 Regexp** sub = re->sub(); 753 if (sub[0]->op() == kRegexpEmptyMatch) { 754 sub[0]->Decref(); 755 sub[0] = NULL; 756 // Delete first element of concat. 757 switch (re->nsub()) { 758 case 0: 759 case 1: 760 // Impossible. 761 LOG(DFATAL) << "Concat of " << re->nsub(); 762 re->submany_ = NULL; 763 re->op_ = kRegexpEmptyMatch; 764 break; 765 766 case 2: { 767 // Replace re with sub[1]. 768 Regexp* old = sub[1]; 769 sub[1] = NULL; 770 re->Swap(old); 771 old->Decref(); 772 break; 773 } 774 775 default: 776 // Slide down. 777 re->nsub_--; 778 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]); 779 break; 780 } 781 } 782 } 783} 784 785// Factors common prefixes from alternation. 786// For example, 787// ABC|ABD|AEF|BCX|BCY 788// simplifies to 789// A(B(C|D)|EF)|BC(X|Y) 790// which the normal parse state routines will further simplify to 791// A(B[CD]|EF)|BC[XY] 792// 793// Rewrites sub to contain simplified list to alternate and returns 794// the new length of sub. Adjusts reference counts accordingly 795// (incoming sub[i] decremented, outgoing sub[i] incremented). 796 797// It's too much of a pain to write this code with an explicit stack, 798// so instead we let the caller specify a maximum depth and 799// don't simplify beyond that. There are around 15 words of local 800// variables and parameters in the frame, so allowing 8 levels 801// on a 64-bit machine is still less than a kilobyte of stack and 802// probably enough benefit for practical uses. 803const int kFactorAlternationMaxDepth = 8; 804 805int Regexp::FactorAlternation( 806 Regexp** sub, int n, 807 Regexp::ParseFlags altflags) { 808 return FactorAlternationRecursive(sub, n, altflags, 809 kFactorAlternationMaxDepth); 810} 811 812int Regexp::FactorAlternationRecursive( 813 Regexp** sub, int n, 814 Regexp::ParseFlags altflags, 815 int maxdepth) { 816 817 if (maxdepth <= 0) 818 return n; 819 820 // Round 1: Factor out common literal prefixes. 821 Rune *rune = NULL; 822 int nrune = 0; 823 Regexp::ParseFlags runeflags = Regexp::NoParseFlags; 824 int start = 0; 825 int out = 0; 826 for (int i = 0; i <= n; i++) { 827 // Invariant: what was in sub[0:start] has been Decref'ed 828 // and that space has been reused for sub[0:out] (out <= start). 829 // 830 // Invariant: sub[start:i] consists of regexps that all begin 831 // with the string rune[0:nrune]. 832 833 Rune* rune_i = NULL; 834 int nrune_i = 0; 835 Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags; 836 if (i < n) { 837 rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i); 838 if (runeflags_i == runeflags) { 839 int same = 0; 840 while (same < nrune && same < nrune_i && rune[same] == rune_i[same]) 841 same++; 842 if (same > 0) { 843 // Matches at least one rune in current range. Keep going around. 844 nrune = same; 845 continue; 846 } 847 } 848 } 849 850 // Found end of a run with common leading literal string: 851 // sub[start:i] all begin with rune[0:nrune] but sub[i] 852 // does not even begin with rune[0]. 853 // 854 // Factor out common string and append factored expression to sub[0:out]. 855 if (i == start) { 856 // Nothing to do - first iteration. 857 } else if (i == start+1) { 858 // Just one: don't bother factoring. 859 sub[out++] = sub[start]; 860 } else { 861 // Construct factored form: prefix(suffix1|suffix2|...) 862 Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 863 x[0] = LiteralString(rune, nrune, runeflags); 864 for (int j = start; j < i; j++) 865 RemoveLeadingString(sub[j], nrune); 866 int nn = FactorAlternationRecursive(sub + start, i - start, altflags, 867 maxdepth - 1); 868 x[1] = AlternateNoFactor(sub + start, nn, altflags); 869 sub[out++] = Concat(x, 2, altflags); 870 } 871 872 // Prepare for next round (if there is one). 873 if (i < n) { 874 start = i; 875 rune = rune_i; 876 nrune = nrune_i; 877 runeflags = runeflags_i; 878 } 879 } 880 n = out; 881 882 // Round 2: Factor out common complex prefixes, 883 // just the first piece of each concatenation, 884 // whatever it is. This is good enough a lot of the time. 885 start = 0; 886 out = 0; 887 Regexp* first = NULL; 888 for (int i = 0; i <= n; i++) { 889 // Invariant: what was in sub[0:start] has been Decref'ed 890 // and that space has been reused for sub[0:out] (out <= start). 891 // 892 // Invariant: sub[start:i] consists of regexps that all begin with first. 893 894 Regexp* first_i = NULL; 895 if (i < n) { 896 first_i = LeadingRegexp(sub[i]); 897 if (first != NULL && Regexp::Equal(first, first_i)) { 898 continue; 899 } 900 } 901 902 // Found end of a run with common leading regexp: 903 // sub[start:i] all begin with first but sub[i] does not. 904 // 905 // Factor out common regexp and append factored expression to sub[0:out]. 906 if (i == start) { 907 // Nothing to do - first iteration. 908 } else if (i == start+1) { 909 // Just one: don't bother factoring. 910 sub[out++] = sub[start]; 911 } else { 912 // Construct factored form: prefix(suffix1|suffix2|...) 913 Regexp* x[2]; // x[0] = prefix, x[1] = suffix1|suffix2|... 914 x[0] = first->Incref(); 915 for (int j = start; j < i; j++) 916 sub[j] = RemoveLeadingRegexp(sub[j]); 917 int nn = FactorAlternationRecursive(sub + start, i - start, altflags, 918 maxdepth - 1); 919 x[1] = AlternateNoFactor(sub + start, nn, altflags); 920 sub[out++] = Concat(x, 2, altflags); 921 } 922 923 // Prepare for next round (if there is one). 924 if (i < n) { 925 start = i; 926 first = first_i; 927 } 928 } 929 n = out; 930 931 // Round 3: Collapse runs of single literals into character classes. 932 start = 0; 933 out = 0; 934 for (int i = 0; i <= n; i++) { 935 // Invariant: what was in sub[0:start] has been Decref'ed 936 // and that space has been reused for sub[0:out] (out <= start). 937 // 938 // Invariant: sub[start:i] consists of regexps that are either 939 // literal runes or character classes. 940 941 if (i < n && 942 (sub[i]->op() == kRegexpLiteral || 943 sub[i]->op() == kRegexpCharClass)) 944 continue; 945 946 // sub[i] is not a char or char class; 947 // emit char class for sub[start:i]... 948 if (i == start) { 949 // Nothing to do. 950 } else if (i == start+1) { 951 sub[out++] = sub[start]; 952 } else { 953 // Make new char class. 954 CharClassBuilder ccb; 955 for (int j = start; j < i; j++) { 956 Regexp* re = sub[j]; 957 if (re->op() == kRegexpCharClass) { 958 CharClass* cc = re->cc(); 959 for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it) 960 ccb.AddRange(it->lo, it->hi); 961 } else if (re->op() == kRegexpLiteral) { 962 ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags()); 963 } else { 964 LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " " 965 << re->ToString(); 966 } 967 re->Decref(); 968 } 969 sub[out++] = NewCharClass(ccb.GetCharClass(), altflags); 970 } 971 972 // ... and then emit sub[i]. 973 if (i < n) 974 sub[out++] = sub[i]; 975 start = i+1; 976 } 977 n = out; 978 979 // Round 4: Collapse runs of empty matches into single empty match. 980 start = 0; 981 out = 0; 982 for (int i = 0; i < n; i++) { 983 if (i + 1 < n && 984 sub[i]->op() == kRegexpEmptyMatch && 985 sub[i+1]->op() == kRegexpEmptyMatch) { 986 sub[i]->Decref(); 987 continue; 988 } 989 sub[out++] = sub[i]; 990 } 991 n = out; 992 993 return n; 994} 995 996// Collapse the regexps on top of the stack, down to the 997// first marker, into a new op node (op == kRegexpAlternate 998// or op == kRegexpConcat). 999void Regexp::ParseState::DoCollapse(RegexpOp op) { 1000 // Scan backward to marker, counting children of composite. 1001 int n = 0; 1002 Regexp* next = NULL; 1003 Regexp* sub; 1004 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 1005 next = sub->down_; 1006 if (sub->op_ == op) 1007 n += sub->nsub_; 1008 else 1009 n++; 1010 } 1011 1012 // If there's just one child, leave it alone. 1013 // (Concat of one thing is that one thing; alternate of one thing is same.) 1014 if (stacktop_ != NULL && stacktop_->down_ == next) 1015 return; 1016 1017 // Construct op (alternation or concatenation), flattening op of op. 1018 Regexp** subs = new Regexp*[n]; 1019 next = NULL; 1020 int i = n; 1021 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) { 1022 next = sub->down_; 1023 if (sub->op_ == op) { 1024 Regexp** sub_subs = sub->sub(); 1025 for (int k = sub->nsub_ - 1; k >= 0; k--) 1026 subs[--i] = sub_subs[k]->Incref(); 1027 sub->Decref(); 1028 } else { 1029 subs[--i] = FinishRegexp(sub); 1030 } 1031 } 1032 1033 Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true); 1034 delete[] subs; 1035 re->simple_ = re->ComputeSimple(); 1036 re->down_ = next; 1037 stacktop_ = re; 1038} 1039 1040// Finishes the current concatenation, 1041// collapsing it into a single regexp on the stack. 1042void Regexp::ParseState::DoConcatenation() { 1043 Regexp* r1 = stacktop_; 1044 if (r1 == NULL || IsMarker(r1->op())) { 1045 // empty concatenation is special case 1046 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_); 1047 PushRegexp(re); 1048 } 1049 DoCollapse(kRegexpConcat); 1050} 1051 1052// Finishes the current alternation, 1053// collapsing it to a single regexp on the stack. 1054void Regexp::ParseState::DoAlternation() { 1055 DoVerticalBar(); 1056 // Now stack top is kVerticalBar. 1057 Regexp* r1 = stacktop_; 1058 stacktop_ = r1->down_; 1059 r1->Decref(); 1060 DoCollapse(kRegexpAlternate); 1061} 1062 1063// Incremental conversion of concatenated literals into strings. 1064// If top two elements on stack are both literal or string, 1065// collapse into single string. 1066// Don't walk down the stack -- the parser calls this frequently 1067// enough that below the bottom two is known to be collapsed. 1068// Only called when another regexp is about to be pushed 1069// on the stack, so that the topmost literal is not being considered. 1070// (Otherwise ab* would turn into (ab)*.) 1071// If r >= 0, consider pushing a literal r on the stack. 1072// Return whether that happened. 1073bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { 1074 Regexp* re1; 1075 Regexp* re2; 1076 if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL) 1077 return false; 1078 1079 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString) 1080 return false; 1081 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString) 1082 return false; 1083 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase)) 1084 return false; 1085 1086 if (re2->op_ == kRegexpLiteral) { 1087 // convert into string 1088 Rune rune = re2->rune_; 1089 re2->op_ = kRegexpLiteralString; 1090 re2->nrunes_ = 0; 1091 re2->runes_ = NULL; 1092 re2->AddRuneToString(rune); 1093 } 1094 1095 // push re1 into re2. 1096 if (re1->op_ == kRegexpLiteral) { 1097 re2->AddRuneToString(re1->rune_); 1098 } else { 1099 for (int i = 0; i < re1->nrunes_; i++) 1100 re2->AddRuneToString(re1->runes_[i]); 1101 re1->nrunes_ = 0; 1102 delete[] re1->runes_; 1103 re1->runes_ = NULL; 1104 } 1105 1106 // reuse re1 if possible 1107 if (r >= 0) { 1108 re1->op_ = kRegexpLiteral; 1109 re1->rune_ = r; 1110 re1->parse_flags_ = flags; 1111 return true; 1112 } 1113 1114 stacktop_ = re2; 1115 re1->Decref(); 1116 return false; 1117} 1118 1119// Lexing routines. 1120 1121// Parses a decimal integer, storing it in *n. 1122// Sets *s to span the remainder of the string. 1123// Sets *out_re to the regexp for the class. 1124static bool ParseInteger(StringPiece* s, int* np) { 1125 if (s->size() == 0 || !isdigit((*s)[0] & 0xFF)) 1126 return false; 1127 // Disallow leading zeros. 1128 if (s->size() >= 2 && (*s)[0] == '0' && isdigit((*s)[1] & 0xFF)) 1129 return false; 1130 int n = 0; 1131 int c; 1132 while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) { 1133 // Avoid overflow. 1134 if (n >= 100000000) 1135 return false; 1136 n = n*10 + c - '0'; 1137 s->remove_prefix(1); // digit 1138 } 1139 *np = n; 1140 return true; 1141} 1142 1143// Parses a repetition suffix like {1,2} or {2} or {2,}. 1144// Sets *s to span the remainder of the string on success. 1145// Sets *lo and *hi to the given range. 1146// In the case of {2,}, the high number is unbounded; 1147// sets *hi to -1 to signify this. 1148// {,2} is NOT a valid suffix. 1149// The Maybe in the name signifies that the regexp parse 1150// doesn't fail even if ParseRepetition does, so the StringPiece 1151// s must NOT be edited unless MaybeParseRepetition returns true. 1152static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) { 1153 StringPiece s = *sp; 1154 if (s.size() == 0 || s[0] != '{') 1155 return false; 1156 s.remove_prefix(1); // '{' 1157 if (!ParseInteger(&s, lo)) 1158 return false; 1159 if (s.size() == 0) 1160 return false; 1161 if (s[0] == ',') { 1162 s.remove_prefix(1); // ',' 1163 if (s.size() == 0) 1164 return false; 1165 if (s[0] == '}') { 1166 // {2,} means at least 2 1167 *hi = -1; 1168 } else { 1169 // {2,4} means 2, 3, or 4. 1170 if (!ParseInteger(&s, hi)) 1171 return false; 1172 } 1173 } else { 1174 // {2} means exactly two 1175 *hi = *lo; 1176 } 1177 if (s.size() == 0 || s[0] != '}') 1178 return false; 1179 s.remove_prefix(1); // '}' 1180 *sp = s; 1181 return true; 1182} 1183 1184// Removes the next Rune from the StringPiece and stores it in *r. 1185// Returns number of bytes removed from sp. 1186// Behaves as though there is a terminating NUL at the end of sp. 1187// Argument order is backwards from usual Google style 1188// but consistent with chartorune. 1189static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { 1190 int n; 1191 if (fullrune(sp->data(), sp->size())) { 1192 n = chartorune(r, sp->data()); 1193 if (!(n == 1 && *r == Runeerror)) { // no decoding error 1194 sp->remove_prefix(n); 1195 return n; 1196 } 1197 } 1198 1199 status->set_code(kRegexpBadUTF8); 1200 status->set_error_arg(NULL); 1201 return -1; 1202} 1203 1204// Return whether name is valid UTF-8. 1205// If not, set status to kRegexpBadUTF8. 1206static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) { 1207 StringPiece t = s; 1208 Rune r; 1209 while (t.size() > 0) { 1210 if (StringPieceToRune(&r, &t, status) < 0) 1211 return false; 1212 } 1213 return true; 1214} 1215 1216// Is c a hex digit? 1217static int IsHex(int c) { 1218 return ('0' <= c && c <= '9') || 1219 ('A' <= c && c <= 'F') || 1220 ('a' <= c && c <= 'f'); 1221} 1222 1223// Convert hex digit to value. 1224static int UnHex(int c) { 1225 if ('0' <= c && c <= '9') 1226 return c - '0'; 1227 if ('A' <= c && c <= 'F') 1228 return c - 'A' + 10; 1229 if ('a' <= c && c <= 'f') 1230 return c - 'a' + 10; 1231 LOG(DFATAL) << "Bad hex digit " << c; 1232 return 0; 1233} 1234 1235// Parse an escape sequence (e.g., \n, \{). 1236// Sets *s to span the remainder of the string. 1237// Sets *rp to the named character. 1238static bool ParseEscape(StringPiece* s, Rune* rp, 1239 RegexpStatus* status, int rune_max) { 1240 const char* begin = s->begin(); 1241 if (s->size() < 1 || (*s)[0] != '\\') { 1242 // Should not happen - caller always checks. 1243 status->set_code(kRegexpInternalError); 1244 status->set_error_arg(NULL); 1245 return false; 1246 } 1247 if (s->size() < 2) { 1248 status->set_code(kRegexpTrailingBackslash); 1249 status->set_error_arg(NULL); 1250 return false; 1251 } 1252 Rune c, c1; 1253 s->remove_prefix(1); // backslash 1254 if (StringPieceToRune(&c, s, status) < 0) 1255 return false; 1256 int code; 1257 switch (c) { 1258 default: 1259 if (c < Runeself && !isalpha(c) && !isdigit(c)) { 1260 // Escaped non-word characters are always themselves. 1261 // PCRE is not quite so rigorous: it accepts things like 1262 // \q, but we don't. We once rejected \_, but too many 1263 // programs and people insist on using it, so allow \_. 1264 *rp = c; 1265 return true; 1266 } 1267 goto BadEscape; 1268 1269 // Octal escapes. 1270 case '1': 1271 case '2': 1272 case '3': 1273 case '4': 1274 case '5': 1275 case '6': 1276 case '7': 1277 // Single non-zero octal digit is a backreference; not supported. 1278 if (s->size() == 0 || (*s)[0] < '0' || (*s)[0] > '7') 1279 goto BadEscape; 1280 // fall through 1281 case '0': 1282 // consume up to three octal digits; already have one. 1283 code = c - '0'; 1284 if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') { 1285 code = code * 8 + c - '0'; 1286 s->remove_prefix(1); // digit 1287 if (s->size() > 0) { 1288 c = (*s)[0]; 1289 if ('0' <= c && c <= '7') { 1290 code = code * 8 + c - '0'; 1291 s->remove_prefix(1); // digit 1292 } 1293 } 1294 } 1295 *rp = code; 1296 return true; 1297 1298 // Hexadecimal escapes 1299 case 'x': 1300 if (s->size() == 0) 1301 goto BadEscape; 1302 if (StringPieceToRune(&c, s, status) < 0) 1303 return false; 1304 if (c == '{') { 1305 // Any number of digits in braces. 1306 // Update n as we consume the string, so that 1307 // the whole thing gets shown in the error message. 1308 // Perl accepts any text at all; it ignores all text 1309 // after the first non-hex digit. We require only hex digits, 1310 // and at least one. 1311 if (StringPieceToRune(&c, s, status) < 0) 1312 return false; 1313 int nhex = 0; 1314 code = 0; 1315 while (IsHex(c)) { 1316 nhex++; 1317 code = code * 16 + UnHex(c); 1318 if (code > rune_max) 1319 goto BadEscape; 1320 if (s->size() == 0) 1321 goto BadEscape; 1322 if (StringPieceToRune(&c, s, status) < 0) 1323 return false; 1324 } 1325 if (c != '}' || nhex == 0) 1326 goto BadEscape; 1327 *rp = code; 1328 return true; 1329 } 1330 // Easy case: two hex digits. 1331 if (s->size() == 0) 1332 goto BadEscape; 1333 if (StringPieceToRune(&c1, s, status) < 0) 1334 return false; 1335 if (!IsHex(c) || !IsHex(c1)) 1336 goto BadEscape; 1337 *rp = UnHex(c) * 16 + UnHex(c1); 1338 return true; 1339 1340 // C escapes. 1341 case 'n': 1342 *rp = '\n'; 1343 return true; 1344 case 'r': 1345 *rp = '\r'; 1346 return true; 1347 case 't': 1348 *rp = '\t'; 1349 return true; 1350 1351 // Less common C escapes. 1352 case 'a': 1353 *rp = '\a'; 1354 return true; 1355 case 'f': 1356 *rp = '\f'; 1357 return true; 1358 case 'v': 1359 *rp = '\v'; 1360 return true; 1361 1362 // This code is disabled to avoid misparsing 1363 // the Perl word-boundary \b as a backspace 1364 // when in POSIX regexp mode. Surprisingly, 1365 // in Perl, \b means word-boundary but [\b] 1366 // means backspace. We don't support that: 1367 // if you want a backspace embed a literal 1368 // backspace character or use \x08. 1369 // 1370 // case 'b': 1371 // *rp = '\b'; 1372 // return true; 1373 } 1374 1375 LOG(DFATAL) << "Not reached in ParseEscape."; 1376 1377BadEscape: 1378 // Unrecognized escape sequence. 1379 status->set_code(kRegexpBadEscape); 1380 status->set_error_arg(StringPiece(begin, s->data() - begin)); 1381 return false; 1382} 1383 1384// Add a range to the character class, but exclude newline if asked. 1385// Also handle case folding. 1386void CharClassBuilder::AddRangeFlags( 1387 Rune lo, Rune hi, Regexp::ParseFlags parse_flags) { 1388 1389 // Take out \n if the flags say so. 1390 bool cutnl = !(parse_flags & Regexp::ClassNL) || 1391 (parse_flags & Regexp::NeverNL); 1392 if (cutnl && lo <= '\n' && '\n' <= hi) { 1393 if (lo < '\n') 1394 AddRangeFlags(lo, '\n' - 1, parse_flags); 1395 if (hi > '\n') 1396 AddRangeFlags('\n' + 1, hi, parse_flags); 1397 return; 1398 } 1399 1400 // If folding case, add fold-equivalent characters too. 1401 if (parse_flags & Regexp::FoldCase) 1402 AddFoldedRange(this, lo, hi, 0); 1403 else 1404 AddRange(lo, hi); 1405} 1406 1407// Look for a group with the given name. 1408static UGroup* LookupGroup(const StringPiece& name, 1409 UGroup *groups, int ngroups) { 1410 // Simple name lookup. 1411 for (int i = 0; i < ngroups; i++) 1412 if (StringPiece(groups[i].name) == name) 1413 return &groups[i]; 1414 return NULL; 1415} 1416 1417// Fake UGroup containing all Runes 1418static URange16 any16[] = { { 0, 65535 } }; 1419static URange32 any32[] = { { 65536, Runemax } }; 1420static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; 1421 1422// Look for a POSIX group with the given name (e.g., "[:^alpha:]") 1423static UGroup* LookupPosixGroup(const StringPiece& name) { 1424 return LookupGroup(name, posix_groups, num_posix_groups); 1425} 1426 1427static UGroup* LookupPerlGroup(const StringPiece& name) { 1428 return LookupGroup(name, perl_groups, num_perl_groups); 1429} 1430 1431// Look for a Unicode group with the given name (e.g., "Han") 1432static UGroup* LookupUnicodeGroup(const StringPiece& name) { 1433 // Special case: "Any" means any. 1434 if (name == StringPiece("Any")) 1435 return &anygroup; 1436 return LookupGroup(name, unicode_groups, num_unicode_groups); 1437} 1438 1439// Add a UGroup or its negation to the character class. 1440static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign, 1441 Regexp::ParseFlags parse_flags) { 1442 if (sign == +1) { 1443 for (int i = 0; i < g->nr16; i++) { 1444 cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags); 1445 } 1446 for (int i = 0; i < g->nr32; i++) { 1447 cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags); 1448 } 1449 } else { 1450 if (parse_flags & Regexp::FoldCase) { 1451 // Normally adding a case-folded group means 1452 // adding all the extra fold-equivalent runes too. 1453 // But if we're adding the negation of the group, 1454 // we have to exclude all the runes that are fold-equivalent 1455 // to what's already missing. Too hard, so do in two steps. 1456 CharClassBuilder ccb1; 1457 AddUGroup(&ccb1, g, +1, parse_flags); 1458 // If the flags say to take out \n, put it in, so that negating will take it out. 1459 // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags. 1460 bool cutnl = !(parse_flags & Regexp::ClassNL) || 1461 (parse_flags & Regexp::NeverNL); 1462 if (cutnl) { 1463 ccb1.AddRange('\n', '\n'); 1464 } 1465 ccb1.Negate(); 1466 cc->AddCharClass(&ccb1); 1467 return; 1468 } 1469 int next = 0; 1470 for (int i = 0; i < g->nr16; i++) { 1471 if (next < g->r16[i].lo) 1472 cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags); 1473 next = g->r16[i].hi + 1; 1474 } 1475 for (int i = 0; i < g->nr32; i++) { 1476 if (next < g->r32[i].lo) 1477 cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags); 1478 next = g->r32[i].hi + 1; 1479 } 1480 if (next <= Runemax) 1481 cc->AddRangeFlags(next, Runemax, parse_flags); 1482 } 1483} 1484 1485// Maybe parse a Perl character class escape sequence. 1486// Only recognizes the Perl character classes (\d \s \w \D \S \W), 1487// not the Perl empty-string classes (\b \B \A \Z \z). 1488// On success, sets *s to span the remainder of the string 1489// and returns the corresponding UGroup. 1490// The StringPiece must *NOT* be edited unless the call succeeds. 1491UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { 1492 if (!(parse_flags & Regexp::PerlClasses)) 1493 return NULL; 1494 if (s->size() < 2 || (*s)[0] != '\\') 1495 return NULL; 1496 // Could use StringPieceToRune, but there aren't 1497 // any non-ASCII Perl group names. 1498 StringPiece name(s->begin(), 2); 1499 UGroup *g = LookupPerlGroup(name); 1500 if (g == NULL) 1501 return NULL; 1502 s->remove_prefix(name.size()); 1503 return g; 1504} 1505 1506enum ParseStatus { 1507 kParseOk, // Did some parsing. 1508 kParseError, // Found an error. 1509 kParseNothing, // Decided not to parse. 1510}; 1511 1512// Maybe parses a Unicode character group like \p{Han} or \P{Han} 1513// (the latter is a negated group). 1514ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, 1515 CharClassBuilder *cc, 1516 RegexpStatus* status) { 1517 // Decide whether to parse. 1518 if (!(parse_flags & Regexp::UnicodeGroups)) 1519 return kParseNothing; 1520 if (s->size() < 2 || (*s)[0] != '\\') 1521 return kParseNothing; 1522 Rune c = (*s)[1]; 1523 if (c != 'p' && c != 'P') 1524 return kParseNothing; 1525 1526 // Committed to parse. Results: 1527 int sign = +1; // -1 = negated char class 1528 if (c == 'P') 1529 sign = -1; 1530 StringPiece seq = *s; // \p{Han} or \pL 1531 StringPiece name; // Han or L 1532 s->remove_prefix(2); // '\\', 'p' 1533 1534 if (!StringPieceToRune(&c, s, status)) 1535 return kParseError; 1536 if (c != '{') { 1537 // Name is the bit of string we just skipped over for c. 1538 const char* p = seq.begin() + 2; 1539 name = StringPiece(p, s->begin() - p); 1540 } else { 1541 // Name is in braces. Look for closing } 1542 int end = s->find('}', 0); 1543 if (end == s->npos) { 1544 if (!IsValidUTF8(seq, status)) 1545 return kParseError; 1546 status->set_code(kRegexpBadCharRange); 1547 status->set_error_arg(seq); 1548 return kParseError; 1549 } 1550 name = StringPiece(s->begin(), end); // without '}' 1551 s->remove_prefix(end + 1); // with '}' 1552 if (!IsValidUTF8(name, status)) 1553 return kParseError; 1554 } 1555 1556 // Chop seq where s now begins. 1557 seq = StringPiece(seq.begin(), s->begin() - seq.begin()); 1558 1559 // Look up group 1560 if (name.size() > 0 && name[0] == '^') { 1561 sign = -sign; 1562 name.remove_prefix(1); // '^' 1563 } 1564 UGroup *g = LookupUnicodeGroup(name); 1565 if (g == NULL) { 1566 status->set_code(kRegexpBadCharRange); 1567 status->set_error_arg(seq); 1568 return kParseError; 1569 } 1570 1571 AddUGroup(cc, g, sign, parse_flags); 1572 return kParseOk; 1573} 1574 1575// Parses a character class name like [:alnum:]. 1576// Sets *s to span the remainder of the string. 1577// Adds the ranges corresponding to the class to ranges. 1578static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, 1579 CharClassBuilder *cc, 1580 RegexpStatus* status) { 1581 // Check begins with [: 1582 const char* p = s->data(); 1583 const char* ep = s->data() + s->size(); 1584 if (ep - p < 2 || p[0] != '[' || p[1] != ':') 1585 return kParseNothing; 1586 1587 // Look for closing :]. 1588 const char* q; 1589 for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++) 1590 ; 1591 1592 // If no closing :], then ignore. 1593 if (q > ep-2) 1594 return kParseNothing; 1595 1596 // Got it. Check that it's valid. 1597 q += 2; 1598 StringPiece name(p, q-p); 1599 1600 UGroup *g = LookupPosixGroup(name); 1601 if (g == NULL) { 1602 status->set_code(kRegexpBadCharRange); 1603 status->set_error_arg(name); 1604 return kParseError; 1605 } 1606 1607 s->remove_prefix(name.size()); 1608 AddUGroup(cc, g, g->sign, parse_flags); 1609 return kParseOk; 1610} 1611 1612// Parses a character inside a character class. 1613// There are fewer special characters here than in the rest of the regexp. 1614// Sets *s to span the remainder of the string. 1615// Sets *rp to the character. 1616bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp, 1617 const StringPiece& whole_class, 1618 RegexpStatus* status) { 1619 if (s->size() == 0) { 1620 status->set_code(kRegexpMissingBracket); 1621 status->set_error_arg(whole_class); 1622 return false; 1623 } 1624 1625 // Allow regular escape sequences even though 1626 // many need not be escaped in this context. 1627 if (s->size() >= 1 && (*s)[0] == '\\') 1628 return ParseEscape(s, rp, status, rune_max_); 1629 1630 // Otherwise take the next rune. 1631 return StringPieceToRune(rp, s, status) >= 0; 1632} 1633 1634// Parses a character class character, or, if the character 1635// is followed by a hyphen, parses a character class range. 1636// For single characters, rr->lo == rr->hi. 1637// Sets *s to span the remainder of the string. 1638// Sets *rp to the character. 1639bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, 1640 const StringPiece& whole_class, 1641 RegexpStatus* status) { 1642 StringPiece os = *s; 1643 if (!ParseCCCharacter(s, &rr->lo, whole_class, status)) 1644 return false; 1645 // [a-] means (a|-), so check for final ]. 1646 if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') { 1647 s->remove_prefix(1); // '-' 1648 if (!ParseCCCharacter(s, &rr->hi, whole_class, status)) 1649 return false; 1650 if (rr->hi < rr->lo) { 1651 status->set_code(kRegexpBadCharRange); 1652 status->set_error_arg(StringPiece(os.data(), s->data() - os.data())); 1653 return false; 1654 } 1655 } else { 1656 rr->hi = rr->lo; 1657 } 1658 return true; 1659} 1660 1661// Parses a possibly-negated character class expression like [^abx-z[:digit:]]. 1662// Sets *s to span the remainder of the string. 1663// Sets *out_re to the regexp for the class. 1664bool Regexp::ParseState::ParseCharClass(StringPiece* s, 1665 Regexp** out_re, 1666 RegexpStatus* status) { 1667 StringPiece whole_class = *s; 1668 if (s->size() == 0 || (*s)[0] != '[') { 1669 // Caller checked this. 1670 status->set_code(kRegexpInternalError); 1671 status->set_error_arg(NULL); 1672 return false; 1673 } 1674 bool negated = false; 1675 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase); 1676 re->ccb_ = new CharClassBuilder; 1677 s->remove_prefix(1); // '[' 1678 if (s->size() > 0 && (*s)[0] == '^') { 1679 s->remove_prefix(1); // '^' 1680 negated = true; 1681 if (!(flags_ & ClassNL) || (flags_ & NeverNL)) { 1682 // If NL can't match implicitly, then pretend 1683 // negated classes include a leading \n. 1684 re->ccb_->AddRange('\n', '\n'); 1685 } 1686 } 1687 bool first = true; // ] is okay as first char in class 1688 while (s->size() > 0 && ((*s)[0] != ']' || first)) { 1689 // - is only okay unescaped as first or last in class. 1690 // Except that Perl allows - anywhere. 1691 if ((*s)[0] == '-' && !first && !(flags_&PerlX) && 1692 (s->size() == 1 || (*s)[1] != ']')) { 1693 StringPiece t = *s; 1694 t.remove_prefix(1); // '-' 1695 Rune r; 1696 int n = StringPieceToRune(&r, &t, status); 1697 if (n < 0) { 1698 re->Decref(); 1699 return false; 1700 } 1701 status->set_code(kRegexpBadCharRange); 1702 status->set_error_arg(StringPiece(s->data(), 1+n)); 1703 re->Decref(); 1704 return false; 1705 } 1706 first = false; 1707 1708 // Look for [:alnum:] etc. 1709 if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') { 1710 switch (ParseCCName(s, flags_, re->ccb_, status)) { 1711 case kParseOk: 1712 continue; 1713 case kParseError: 1714 re->Decref(); 1715 return false; 1716 case kParseNothing: 1717 break; 1718 } 1719 } 1720 1721 // Look for Unicode character group like \p{Han} 1722 if (s->size() > 2 && 1723 (*s)[0] == '\\' && 1724 ((*s)[1] == 'p' || (*s)[1] == 'P')) { 1725 switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) { 1726 case kParseOk: 1727 continue; 1728 case kParseError: 1729 re->Decref(); 1730 return false; 1731 case kParseNothing: 1732 break; 1733 } 1734 } 1735 1736 // Look for Perl character class symbols (extension). 1737 UGroup *g = MaybeParsePerlCCEscape(s, flags_); 1738 if (g != NULL) { 1739 AddUGroup(re->ccb_, g, g->sign, flags_); 1740 continue; 1741 } 1742 1743 // Otherwise assume single character or simple range. 1744 RuneRange rr; 1745 if (!ParseCCRange(s, &rr, whole_class, status)) { 1746 re->Decref(); 1747 return false; 1748 } 1749 // AddRangeFlags is usually called in response to a class like 1750 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless 1751 // Regexp::ClassNL is set. In an explicit range or singleton 1752 // like we just parsed, we do not filter \n out, so set ClassNL 1753 // in the flags. 1754 re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL); 1755 } 1756 if (s->size() == 0) { 1757 status->set_code(kRegexpMissingBracket); 1758 status->set_error_arg(whole_class); 1759 re->Decref(); 1760 return false; 1761 } 1762 s->remove_prefix(1); // ']' 1763 1764 if (negated) 1765 re->ccb_->Negate(); 1766 re->ccb_->RemoveAbove(rune_max_); 1767 1768 *out_re = re; 1769 return true; 1770} 1771 1772// Is this a valid capture name? [A-Za-z0-9_]+ 1773// PCRE limits names to 32 bytes. 1774// Python rejects names starting with digits. 1775// We don't enforce either of those. 1776static bool IsValidCaptureName(const StringPiece& name) { 1777 if (name.size() == 0) 1778 return false; 1779 for (int i = 0; i < name.size(); i++) { 1780 int c = name[i]; 1781 if (('0' <= c && c <= '9') || 1782 ('a' <= c && c <= 'z') || 1783 ('A' <= c && c <= 'Z') || 1784 c == '_') 1785 continue; 1786 return false; 1787 } 1788 return true; 1789} 1790 1791// Parses a Perl flag setting or non-capturing group or both, 1792// like (?i) or (?: or (?i:. Removes from s, updates parse state. 1793// The caller must check that s begins with "(?". 1794// Returns true on success. If the Perl flag is not 1795// well-formed or not supported, sets status_ and returns false. 1796bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { 1797 StringPiece t = *s; 1798 1799 // Caller is supposed to check this. 1800 if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') { 1801 LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags"; 1802 status_->set_code(kRegexpInternalError); 1803 return false; 1804 } 1805 1806 t.remove_prefix(2); // "(?" 1807 1808 // Check for named captures, first introduced in Python's regexp library. 1809 // As usual, there are three slightly different syntaxes: 1810 // 1811 // (?P<name>expr) the original, introduced by Python 1812 // (?<name>expr) the .NET alteration, adopted by Perl 5.10 1813 // (?'name'expr) another .NET alteration, adopted by Perl 5.10 1814 // 1815 // Perl 5.10 gave in and implemented the Python version too, 1816 // but they claim that the last two are the preferred forms. 1817 // PCRE and languages based on it (specifically, PHP and Ruby) 1818 // support all three as well. EcmaScript 4 uses only the Python form. 1819 // 1820 // In both the open source world (via Code Search) and the 1821 // Google source tree, (?P<expr>name) is the dominant form, 1822 // so that's the one we implement. One is enough. 1823 if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { 1824 // Pull out name. 1825 int end = t.find('>', 2); 1826 if (end == t.npos) { 1827 if (!IsValidUTF8(*s, status_)) 1828 return false; 1829 status_->set_code(kRegexpBadNamedCapture); 1830 status_->set_error_arg(*s); 1831 return false; 1832 } 1833 1834 // t is "P<name>...", t[end] == '>' 1835 StringPiece capture(t.begin()-2, end+3); // "(?P<name>" 1836 StringPiece name(t.begin()+2, end-2); // "name" 1837 if (!IsValidUTF8(name, status_)) 1838 return false; 1839 if (!IsValidCaptureName(name)) { 1840 status_->set_code(kRegexpBadNamedCapture); 1841 status_->set_error_arg(capture); 1842 return false; 1843 } 1844 1845 if (!DoLeftParen(name)) { 1846 // DoLeftParen's failure set status_. 1847 return false; 1848 } 1849 1850 s->remove_prefix(capture.end() - s->begin()); 1851 return true; 1852 } 1853 1854 bool negated = false; 1855 bool sawflags = false; 1856 int nflags = flags_; 1857 Rune c; 1858 for (bool done = false; !done; ) { 1859 if (t.size() == 0) 1860 goto BadPerlOp; 1861 if (StringPieceToRune(&c, &t, status_) < 0) 1862 return false; 1863 switch (c) { 1864 default: 1865 goto BadPerlOp; 1866 1867 // Parse flags. 1868 case 'i': 1869 sawflags = true; 1870 if (negated) 1871 nflags &= ~FoldCase; 1872 else 1873 nflags |= FoldCase; 1874 break; 1875 1876 case 'm': // opposite of our OneLine 1877 sawflags = true; 1878 if (negated) 1879 nflags |= OneLine; 1880 else 1881 nflags &= ~OneLine; 1882 break; 1883 1884 case 's': 1885 sawflags = true; 1886 if (negated) 1887 nflags &= ~DotNL; 1888 else 1889 nflags |= DotNL; 1890 break; 1891 1892 case 'U': 1893 sawflags = true; 1894 if (negated) 1895 nflags &= ~NonGreedy; 1896 else 1897 nflags |= NonGreedy; 1898 break; 1899 1900 // Negation 1901 case '-': 1902 if (negated) 1903 goto BadPerlOp; 1904 negated = true; 1905 sawflags = false; 1906 break; 1907 1908 // Open new group. 1909 case ':': 1910 if (!DoLeftParenNoCapture()) { 1911 // DoLeftParenNoCapture's failure set status_. 1912 return false; 1913 } 1914 done = true; 1915 break; 1916 1917 // Finish flags. 1918 case ')': 1919 done = true; 1920 break; 1921 } 1922 } 1923 1924 if (negated && !sawflags) 1925 goto BadPerlOp; 1926 1927 flags_ = static_cast<Regexp::ParseFlags>(nflags); 1928 *s = t; 1929 return true; 1930 1931BadPerlOp: 1932 status_->set_code(kRegexpBadPerlOp); 1933 status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin())); 1934 return false; 1935} 1936 1937// Converts latin1 (assumed to be encoded as Latin1 bytes) 1938// into UTF8 encoding in string. 1939// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is 1940// deprecated and because it rejects code points 0x80-0x9F. 1941void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) { 1942 char buf[UTFmax]; 1943 1944 utf->clear(); 1945 for (int i = 0; i < latin1.size(); i++) { 1946 Rune r = latin1[i] & 0xFF; 1947 int n = runetochar(buf, &r); 1948 utf->append(buf, n); 1949 } 1950} 1951 1952// Parses the regular expression given by s, 1953// returning the corresponding Regexp tree. 1954// The caller must Decref the return value when done with it. 1955// Returns NULL on error. 1956Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, 1957 RegexpStatus* status) { 1958 // Make status non-NULL (easier on everyone else). 1959 RegexpStatus xstatus; 1960 if (status == NULL) 1961 status = &xstatus; 1962 1963 ParseState ps(global_flags, s, status); 1964 StringPiece t = s; 1965 1966 // Convert regexp to UTF-8 (easier on the rest of the parser). 1967 if (global_flags & Latin1) { 1968 string* tmp = new string; 1969 ConvertLatin1ToUTF8(t, tmp); 1970 status->set_tmp(tmp); 1971 t = *tmp; 1972 } 1973 1974 if (global_flags & Literal) { 1975 // Special parse loop for literal string. 1976 while (t.size() > 0) { 1977 Rune r; 1978 if (StringPieceToRune(&r, &t, status) < 0) 1979 return NULL; 1980 if (!ps.PushLiteral(r)) 1981 return NULL; 1982 } 1983 return ps.DoFinish(); 1984 } 1985 1986 StringPiece lastunary = NULL; 1987 while (t.size() > 0) { 1988 StringPiece isunary = NULL; 1989 switch (t[0]) { 1990 default: { 1991 Rune r; 1992 if (StringPieceToRune(&r, &t, status) < 0) 1993 return NULL; 1994 if (!ps.PushLiteral(r)) 1995 return NULL; 1996 break; 1997 } 1998 1999 case '(': 2000 // "(?" introduces Perl escape. 2001 if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) { 2002 // Flag changes and non-capturing groups. 2003 if (!ps.ParsePerlFlags(&t)) 2004 return NULL; 2005 break; 2006 } 2007 if (ps.flags() & NeverCapture) { 2008 if (!ps.DoLeftParenNoCapture()) 2009 return NULL; 2010 } else { 2011 if (!ps.DoLeftParen(NULL)) 2012 return NULL; 2013 } 2014 t.remove_prefix(1); // '(' 2015 break; 2016 2017 case '|': 2018 if (!ps.DoVerticalBar()) 2019 return NULL; 2020 t.remove_prefix(1); // '|' 2021 break; 2022 2023 case ')': 2024 if (!ps.DoRightParen()) 2025 return NULL; 2026 t.remove_prefix(1); // ')' 2027 break; 2028 2029 case '^': // Beginning of line. 2030 if (!ps.PushCarat()) 2031 return NULL; 2032 t.remove_prefix(1); // '^' 2033 break; 2034 2035 case '$': // End of line. 2036 if (!ps.PushDollar()) 2037 return NULL; 2038 t.remove_prefix(1); // '$' 2039 break; 2040 2041 case '.': // Any character (possibly except newline). 2042 if (!ps.PushDot()) 2043 return NULL; 2044 t.remove_prefix(1); // '.' 2045 break; 2046 2047 case '[': { // Character class. 2048 Regexp* re; 2049 if (!ps.ParseCharClass(&t, &re, status)) 2050 return NULL; 2051 if (!ps.PushRegexp(re)) 2052 return NULL; 2053 break; 2054 } 2055 2056 case '*': { // Zero or more. 2057 RegexpOp op; 2058 op = kRegexpStar; 2059 goto Rep; 2060 case '+': // One or more. 2061 op = kRegexpPlus; 2062 goto Rep; 2063 case '?': // Zero or one. 2064 op = kRegexpQuest; 2065 goto Rep; 2066 Rep: 2067 StringPiece opstr = t; 2068 bool nongreedy = false; 2069 t.remove_prefix(1); // '*' or '+' or '?' 2070 if (ps.flags() & PerlX) { 2071 if (t.size() > 0 && t[0] == '?') { 2072 nongreedy = true; 2073 t.remove_prefix(1); // '?' 2074 } 2075 if (lastunary.size() > 0) { 2076 // In Perl it is not allowed to stack repetition operators: 2077 // a** is a syntax error, not a double-star. 2078 // (and a++ means something else entirely, which we don't support!) 2079 status->set_code(kRegexpRepeatOp); 2080 status->set_error_arg(StringPiece(lastunary.begin(), 2081 t.begin() - lastunary.begin())); 2082 return NULL; 2083 } 2084 } 2085 opstr.set(opstr.data(), t.data() - opstr.data()); 2086 if (!ps.PushRepeatOp(op, opstr, nongreedy)) 2087 return NULL; 2088 isunary = opstr; 2089 break; 2090 } 2091 2092 case '{': { // Counted repetition. 2093 int lo, hi; 2094 StringPiece opstr = t; 2095 if (!MaybeParseRepetition(&t, &lo, &hi)) { 2096 // Treat like a literal. 2097 if (!ps.PushLiteral('{')) 2098 return NULL; 2099 t.remove_prefix(1); // '{' 2100 break; 2101 } 2102 bool nongreedy = false; 2103 if (ps.flags() & PerlX) { 2104 if (t.size() > 0 && t[0] == '?') { 2105 nongreedy = true; 2106 t.remove_prefix(1); // '?' 2107 } 2108 if (lastunary.size() > 0) { 2109 // Not allowed to stack repetition operators. 2110 status->set_code(kRegexpRepeatOp); 2111 status->set_error_arg(StringPiece(lastunary.begin(), 2112 t.begin() - lastunary.begin())); 2113 return NULL; 2114 } 2115 } 2116 opstr.set(opstr.data(), t.data() - opstr.data()); 2117 if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) 2118 return NULL; 2119 isunary = opstr; 2120 break; 2121 } 2122 2123 case '\\': { // Escaped character or Perl sequence. 2124 // \b and \B: word boundary or not 2125 if ((ps.flags() & Regexp::PerlB) && 2126 t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) { 2127 if (!ps.PushWordBoundary(t[1] == 'b')) 2128 return NULL; 2129 t.remove_prefix(2); // '\\', 'b' 2130 break; 2131 } 2132 2133 if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) { 2134 if (t[1] == 'A') { 2135 if (!ps.PushSimpleOp(kRegexpBeginText)) 2136 return NULL; 2137 t.remove_prefix(2); // '\\', 'A' 2138 break; 2139 } 2140 if (t[1] == 'z') { 2141 if (!ps.PushSimpleOp(kRegexpEndText)) 2142 return NULL; 2143 t.remove_prefix(2); // '\\', 'z' 2144 break; 2145 } 2146 // Do not recognize \Z, because this library can't 2147 // implement the exact Perl/PCRE semantics. 2148 // (This library treats "(?-m)$" as \z, even though 2149 // in Perl and PCRE it is equivalent to \Z.) 2150 2151 if (t[1] == 'C') { // \C: any byte [sic] 2152 if (!ps.PushSimpleOp(kRegexpAnyByte)) 2153 return NULL; 2154 t.remove_prefix(2); // '\\', 'C' 2155 break; 2156 } 2157 2158 if (t[1] == 'Q') { // \Q ... \E: the ... is always literals 2159 t.remove_prefix(2); // '\\', 'Q' 2160 while (t.size() > 0) { 2161 if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') { 2162 t.remove_prefix(2); // '\\', 'E' 2163 break; 2164 } 2165 Rune r; 2166 if (StringPieceToRune(&r, &t, status) < 0) 2167 return NULL; 2168 if (!ps.PushLiteral(r)) 2169 return NULL; 2170 } 2171 break; 2172 } 2173 } 2174 2175 if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) { 2176 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 2177 re->ccb_ = new CharClassBuilder; 2178 switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) { 2179 case kParseOk: 2180 if (!ps.PushRegexp(re)) 2181 return NULL; 2182 goto Break2; 2183 case kParseError: 2184 re->Decref(); 2185 return NULL; 2186 case kParseNothing: 2187 re->Decref(); 2188 break; 2189 } 2190 } 2191 2192 UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); 2193 if (g != NULL) { 2194 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); 2195 re->ccb_ = new CharClassBuilder; 2196 AddUGroup(re->ccb_, g, g->sign, ps.flags()); 2197 if (!ps.PushRegexp(re)) 2198 return NULL; 2199 break; 2200 } 2201 2202 Rune r; 2203 if (!ParseEscape(&t, &r, status, ps.rune_max())) 2204 return NULL; 2205 if (!ps.PushLiteral(r)) 2206 return NULL; 2207 break; 2208 } 2209 } 2210 Break2: 2211 lastunary = isunary; 2212 } 2213 return ps.DoFinish(); 2214} 2215 2216} // namespace re2 2217