1// Copyright 2016 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "src/regexp/regexp-parser.h" 6 7#include "src/char-predicates-inl.h" 8#include "src/factory.h" 9#include "src/isolate.h" 10#include "src/objects-inl.h" 11#include "src/regexp/jsregexp.h" 12#include "src/utils.h" 13 14namespace v8 { 15namespace internal { 16 17RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, 18 bool multiline, bool unicode, Isolate* isolate, 19 Zone* zone) 20 : isolate_(isolate), 21 zone_(zone), 22 error_(error), 23 captures_(NULL), 24 in_(in), 25 current_(kEndMarker), 26 next_pos_(0), 27 captures_started_(0), 28 capture_count_(0), 29 has_more_(true), 30 multiline_(multiline), 31 unicode_(unicode), 32 simple_(false), 33 contains_anchor_(false), 34 is_scanned_for_captures_(false), 35 failed_(false) { 36 Advance(); 37} 38 39 40uc32 RegExpParser::Next() { 41 if (has_next()) { 42 return in()->Get(next_pos_); 43 } else { 44 return kEndMarker; 45 } 46} 47 48 49void RegExpParser::Advance() { 50 if (next_pos_ < in()->length()) { 51 StackLimitCheck check(isolate()); 52 if (check.HasOverflowed()) { 53 ReportError(CStrVector(Isolate::kStackOverflowMessage)); 54 } else if (zone()->excess_allocation()) { 55 ReportError(CStrVector("Regular expression too large")); 56 } else { 57 current_ = in()->Get(next_pos_); 58 next_pos_++; 59 // Read the whole surrogate pair in case of unicode flag, if possible. 60 if (unicode_ && next_pos_ < in()->length() && 61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) { 62 uc16 trail = in()->Get(next_pos_); 63 if (unibrow::Utf16::IsTrailSurrogate(trail)) { 64 current_ = unibrow::Utf16::CombineSurrogatePair( 65 static_cast<uc16>(current_), trail); 66 next_pos_++; 67 } 68 } 69 } 70 } else { 71 current_ = kEndMarker; 72 // Advance so that position() points to 1-after-the-last-character. This is 73 // important so that Reset() to this position works correctly. 74 next_pos_ = in()->length() + 1; 75 has_more_ = false; 76 } 77} 78 79 80void RegExpParser::Reset(int pos) { 81 next_pos_ = pos; 82 has_more_ = (pos < in()->length()); 83 Advance(); 84} 85 86 87void RegExpParser::Advance(int dist) { 88 next_pos_ += dist - 1; 89 Advance(); 90} 91 92 93bool RegExpParser::simple() { return simple_; } 94 95 96bool RegExpParser::IsSyntaxCharacter(uc32 c) { 97 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || 98 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || 99 c == '{' || c == '}' || c == '|'; 100} 101 102 103RegExpTree* RegExpParser::ReportError(Vector<const char> message) { 104 failed_ = true; 105 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); 106 // Zip to the end to make sure the no more input is read. 107 current_ = kEndMarker; 108 next_pos_ = in()->length(); 109 return NULL; 110} 111 112 113#define CHECK_FAILED /**/); \ 114 if (failed_) return NULL; \ 115 ((void)0 116 117 118// Pattern :: 119// Disjunction 120RegExpTree* RegExpParser::ParsePattern() { 121 RegExpTree* result = ParseDisjunction(CHECK_FAILED); 122 DCHECK(!has_more()); 123 // If the result of parsing is a literal string atom, and it has the 124 // same length as the input, then the atom is identical to the input. 125 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) { 126 simple_ = true; 127 } 128 return result; 129} 130 131 132// Disjunction :: 133// Alternative 134// Alternative | Disjunction 135// Alternative :: 136// [empty] 137// Term Alternative 138// Term :: 139// Assertion 140// Atom 141// Atom Quantifier 142RegExpTree* RegExpParser::ParseDisjunction() { 143 // Used to store current state while parsing subexpressions. 144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, 145 zone()); 146 RegExpParserState* state = &initial_state; 147 // Cache the builder in a local variable for quick access. 148 RegExpBuilder* builder = initial_state.builder(); 149 while (true) { 150 switch (current()) { 151 case kEndMarker: 152 if (state->IsSubexpression()) { 153 // Inside a parenthesized group when hitting end of input. 154 ReportError(CStrVector("Unterminated group") CHECK_FAILED); 155 } 156 DCHECK_EQ(INITIAL, state->group_type()); 157 // Parsing completed successfully. 158 return builder->ToRegExp(); 159 case ')': { 160 if (!state->IsSubexpression()) { 161 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); 162 } 163 DCHECK_NE(INITIAL, state->group_type()); 164 165 Advance(); 166 // End disjunction parsing and convert builder content to new single 167 // regexp atom. 168 RegExpTree* body = builder->ToRegExp(); 169 170 int end_capture_index = captures_started(); 171 172 int capture_index = state->capture_index(); 173 SubexpressionType group_type = state->group_type(); 174 175 // Build result of subexpression. 176 if (group_type == CAPTURE) { 177 RegExpCapture* capture = GetCapture(capture_index); 178 capture->set_body(body); 179 body = capture; 180 } else if (group_type != GROUPING) { 181 DCHECK(group_type == POSITIVE_LOOKAROUND || 182 group_type == NEGATIVE_LOOKAROUND); 183 bool is_positive = (group_type == POSITIVE_LOOKAROUND); 184 body = new (zone()) RegExpLookaround( 185 body, is_positive, end_capture_index - capture_index, 186 capture_index, state->lookaround_type()); 187 } 188 189 // Restore previous state. 190 state = state->previous_state(); 191 builder = state->builder(); 192 193 builder->AddAtom(body); 194 // For compatability with JSC and ES3, we allow quantifiers after 195 // lookaheads, and break in all cases. 196 break; 197 } 198 case '|': { 199 Advance(); 200 builder->NewAlternative(); 201 continue; 202 } 203 case '*': 204 case '+': 205 case '?': 206 return ReportError(CStrVector("Nothing to repeat")); 207 case '^': { 208 Advance(); 209 if (multiline_) { 210 builder->AddAssertion( 211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); 212 } else { 213 builder->AddAssertion( 214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); 215 set_contains_anchor(); 216 } 217 continue; 218 } 219 case '$': { 220 Advance(); 221 RegExpAssertion::AssertionType assertion_type = 222 multiline_ ? RegExpAssertion::END_OF_LINE 223 : RegExpAssertion::END_OF_INPUT; 224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); 225 continue; 226 } 227 case '.': { 228 Advance(); 229 // everything except \x0a, \x0d, \u2028 and \u2029 230 ZoneList<CharacterRange>* ranges = 231 new (zone()) ZoneList<CharacterRange>(2, zone()); 232 CharacterRange::AddClassEscape('.', ranges, zone()); 233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); 234 builder->AddAtom(atom); 235 break; 236 } 237 case '(': { 238 SubexpressionType subexpr_type = CAPTURE; 239 RegExpLookaround::Type lookaround_type = state->lookaround_type(); 240 Advance(); 241 if (current() == '?') { 242 switch (Next()) { 243 case ':': 244 subexpr_type = GROUPING; 245 break; 246 case '=': 247 lookaround_type = RegExpLookaround::LOOKAHEAD; 248 subexpr_type = POSITIVE_LOOKAROUND; 249 break; 250 case '!': 251 lookaround_type = RegExpLookaround::LOOKAHEAD; 252 subexpr_type = NEGATIVE_LOOKAROUND; 253 break; 254 case '<': 255 if (FLAG_harmony_regexp_lookbehind) { 256 Advance(); 257 lookaround_type = RegExpLookaround::LOOKBEHIND; 258 if (Next() == '=') { 259 subexpr_type = POSITIVE_LOOKAROUND; 260 break; 261 } else if (Next() == '!') { 262 subexpr_type = NEGATIVE_LOOKAROUND; 263 break; 264 } 265 } 266 // Fall through. 267 default: 268 ReportError(CStrVector("Invalid group") CHECK_FAILED); 269 break; 270 } 271 Advance(2); 272 } else { 273 if (captures_started_ >= kMaxCaptures) { 274 ReportError(CStrVector("Too many captures") CHECK_FAILED); 275 } 276 captures_started_++; 277 } 278 // Store current state and begin new disjunction parsing. 279 state = new (zone()) RegExpParserState( 280 state, subexpr_type, lookaround_type, captures_started_, zone()); 281 builder = state->builder(); 282 continue; 283 } 284 case '[': { 285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); 286 builder->AddAtom(atom); 287 break; 288 } 289 // Atom :: 290 // \ AtomEscape 291 case '\\': 292 switch (Next()) { 293 case kEndMarker: 294 return ReportError(CStrVector("\\ at end of pattern")); 295 case 'b': 296 Advance(2); 297 builder->AddAssertion( 298 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY)); 299 continue; 300 case 'B': 301 Advance(2); 302 builder->AddAssertion( 303 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY)); 304 continue; 305 // AtomEscape :: 306 // CharacterClassEscape 307 // 308 // CharacterClassEscape :: one of 309 // d D s S w W 310 case 'd': 311 case 'D': 312 case 's': 313 case 'S': 314 case 'w': 315 case 'W': { 316 uc32 c = Next(); 317 Advance(2); 318 ZoneList<CharacterRange>* ranges = 319 new (zone()) ZoneList<CharacterRange>(2, zone()); 320 CharacterRange::AddClassEscape(c, ranges, zone()); 321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); 322 builder->AddAtom(atom); 323 break; 324 } 325 case '1': 326 case '2': 327 case '3': 328 case '4': 329 case '5': 330 case '6': 331 case '7': 332 case '8': 333 case '9': { 334 int index = 0; 335 if (ParseBackReferenceIndex(&index)) { 336 if (state->IsInsideCaptureGroup(index)) { 337 // The back reference is inside the capture group it refers to. 338 // Nothing can possibly have been captured yet, so we use empty 339 // instead. This ensures that, when checking a back reference, 340 // the capture registers of the referenced capture are either 341 // both set or both cleared. 342 builder->AddEmpty(); 343 } else { 344 RegExpCapture* capture = GetCapture(index); 345 RegExpTree* atom = new (zone()) RegExpBackReference(capture); 346 builder->AddAtom(atom); 347 } 348 break; 349 } 350 uc32 first_digit = Next(); 351 if (first_digit == '8' || first_digit == '9') { 352 // If the 'u' flag is present, only syntax characters can be 353 // escaped, 354 // no other identity escapes are allowed. If the 'u' flag is not 355 // present, all identity escapes are allowed. 356 if (!unicode_) { 357 builder->AddCharacter(first_digit); 358 Advance(2); 359 } else { 360 return ReportError(CStrVector("Invalid escape")); 361 } 362 break; 363 } 364 } 365 // FALLTHROUGH 366 case '0': { 367 Advance(); 368 uc32 octal = ParseOctalLiteral(); 369 builder->AddCharacter(octal); 370 break; 371 } 372 // ControlEscape :: one of 373 // f n r t v 374 case 'f': 375 Advance(2); 376 builder->AddCharacter('\f'); 377 break; 378 case 'n': 379 Advance(2); 380 builder->AddCharacter('\n'); 381 break; 382 case 'r': 383 Advance(2); 384 builder->AddCharacter('\r'); 385 break; 386 case 't': 387 Advance(2); 388 builder->AddCharacter('\t'); 389 break; 390 case 'v': 391 Advance(2); 392 builder->AddCharacter('\v'); 393 break; 394 case 'c': { 395 Advance(); 396 uc32 controlLetter = Next(); 397 // Special case if it is an ASCII letter. 398 // Convert lower case letters to uppercase. 399 uc32 letter = controlLetter & ~('a' ^ 'A'); 400 if (letter < 'A' || 'Z' < letter) { 401 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. 402 // This is outside the specification. We match JSC in 403 // reading the backslash as a literal character instead 404 // of as starting an escape. 405 builder->AddCharacter('\\'); 406 } else { 407 Advance(2); 408 builder->AddCharacter(controlLetter & 0x1f); 409 } 410 break; 411 } 412 case 'x': { 413 Advance(2); 414 uc32 value; 415 if (ParseHexEscape(2, &value)) { 416 builder->AddCharacter(value); 417 } else if (!unicode_) { 418 builder->AddCharacter('x'); 419 } else { 420 // If the 'u' flag is present, invalid escapes are not treated as 421 // identity escapes. 422 return ReportError(CStrVector("Invalid escape")); 423 } 424 break; 425 } 426 case 'u': { 427 Advance(2); 428 uc32 value; 429 if (ParseUnicodeEscape(&value)) { 430 builder->AddUnicodeCharacter(value); 431 } else if (!unicode_) { 432 builder->AddCharacter('u'); 433 } else { 434 // If the 'u' flag is present, invalid escapes are not treated as 435 // identity escapes. 436 return ReportError(CStrVector("Invalid unicode escape")); 437 } 438 break; 439 } 440 default: 441 Advance(); 442 // If the 'u' flag is present, only syntax characters can be 443 // escaped, no 444 // other identity escapes are allowed. If the 'u' flag is not 445 // present, 446 // all identity escapes are allowed. 447 if (!unicode_ || IsSyntaxCharacter(current())) { 448 builder->AddCharacter(current()); 449 Advance(); 450 } else { 451 return ReportError(CStrVector("Invalid escape")); 452 } 453 break; 454 } 455 break; 456 case '{': { 457 int dummy; 458 if (ParseIntervalQuantifier(&dummy, &dummy)) { 459 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); 460 } 461 // fallthrough 462 } 463 default: 464 builder->AddUnicodeCharacter(current()); 465 Advance(); 466 break; 467 } // end switch(current()) 468 469 int min; 470 int max; 471 switch (current()) { 472 // QuantifierPrefix :: 473 // * 474 // + 475 // ? 476 // { 477 case '*': 478 min = 0; 479 max = RegExpTree::kInfinity; 480 Advance(); 481 break; 482 case '+': 483 min = 1; 484 max = RegExpTree::kInfinity; 485 Advance(); 486 break; 487 case '?': 488 min = 0; 489 max = 1; 490 Advance(); 491 break; 492 case '{': 493 if (ParseIntervalQuantifier(&min, &max)) { 494 if (max < min) { 495 ReportError(CStrVector("numbers out of order in {} quantifier.") 496 CHECK_FAILED); 497 } 498 break; 499 } else { 500 continue; 501 } 502 default: 503 continue; 504 } 505 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; 506 if (current() == '?') { 507 quantifier_type = RegExpQuantifier::NON_GREEDY; 508 Advance(); 509 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { 510 // FLAG_regexp_possessive_quantifier is a debug-only flag. 511 quantifier_type = RegExpQuantifier::POSSESSIVE; 512 Advance(); 513 } 514 builder->AddQuantifierToAtom(min, max, quantifier_type); 515 } 516} 517 518 519#ifdef DEBUG 520// Currently only used in an DCHECK. 521static bool IsSpecialClassEscape(uc32 c) { 522 switch (c) { 523 case 'd': 524 case 'D': 525 case 's': 526 case 'S': 527 case 'w': 528 case 'W': 529 return true; 530 default: 531 return false; 532 } 533} 534#endif 535 536 537// In order to know whether an escape is a backreference or not we have to scan 538// the entire regexp and find the number of capturing parentheses. However we 539// don't want to scan the regexp twice unless it is necessary. This mini-parser 540// is called when needed. It can see the difference between capturing and 541// noncapturing parentheses and can skip character classes and backslash-escaped 542// characters. 543void RegExpParser::ScanForCaptures() { 544 // Start with captures started previous to current position 545 int capture_count = captures_started(); 546 // Add count of captures after this position. 547 int n; 548 while ((n = current()) != kEndMarker) { 549 Advance(); 550 switch (n) { 551 case '\\': 552 Advance(); 553 break; 554 case '[': { 555 int c; 556 while ((c = current()) != kEndMarker) { 557 Advance(); 558 if (c == '\\') { 559 Advance(); 560 } else { 561 if (c == ']') break; 562 } 563 } 564 break; 565 } 566 case '(': 567 if (current() != '?') capture_count++; 568 break; 569 } 570 } 571 capture_count_ = capture_count; 572 is_scanned_for_captures_ = true; 573} 574 575 576bool RegExpParser::ParseBackReferenceIndex(int* index_out) { 577 DCHECK_EQ('\\', current()); 578 DCHECK('1' <= Next() && Next() <= '9'); 579 // Try to parse a decimal literal that is no greater than the total number 580 // of left capturing parentheses in the input. 581 int start = position(); 582 int value = Next() - '0'; 583 Advance(2); 584 while (true) { 585 uc32 c = current(); 586 if (IsDecimalDigit(c)) { 587 value = 10 * value + (c - '0'); 588 if (value > kMaxCaptures) { 589 Reset(start); 590 return false; 591 } 592 Advance(); 593 } else { 594 break; 595 } 596 } 597 if (value > captures_started()) { 598 if (!is_scanned_for_captures_) { 599 int saved_position = position(); 600 ScanForCaptures(); 601 Reset(saved_position); 602 } 603 if (value > capture_count_) { 604 Reset(start); 605 return false; 606 } 607 } 608 *index_out = value; 609 return true; 610} 611 612 613RegExpCapture* RegExpParser::GetCapture(int index) { 614 // The index for the capture groups are one-based. Its index in the list is 615 // zero-based. 616 int know_captures = 617 is_scanned_for_captures_ ? capture_count_ : captures_started_; 618 DCHECK(index <= know_captures); 619 if (captures_ == NULL) { 620 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone()); 621 } 622 while (captures_->length() < know_captures) { 623 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone()); 624 } 625 return captures_->at(index - 1); 626} 627 628 629bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) { 630 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) { 631 if (s->group_type() != CAPTURE) continue; 632 // Return true if we found the matching capture index. 633 if (index == s->capture_index()) return true; 634 // Abort if index is larger than what has been parsed up till this state. 635 if (index > s->capture_index()) return false; 636 } 637 return false; 638} 639 640 641// QuantifierPrefix :: 642// { DecimalDigits } 643// { DecimalDigits , } 644// { DecimalDigits , DecimalDigits } 645// 646// Returns true if parsing succeeds, and set the min_out and max_out 647// values. Values are truncated to RegExpTree::kInfinity if they overflow. 648bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) { 649 DCHECK_EQ(current(), '{'); 650 int start = position(); 651 Advance(); 652 int min = 0; 653 if (!IsDecimalDigit(current())) { 654 Reset(start); 655 return false; 656 } 657 while (IsDecimalDigit(current())) { 658 int next = current() - '0'; 659 if (min > (RegExpTree::kInfinity - next) / 10) { 660 // Overflow. Skip past remaining decimal digits and return -1. 661 do { 662 Advance(); 663 } while (IsDecimalDigit(current())); 664 min = RegExpTree::kInfinity; 665 break; 666 } 667 min = 10 * min + next; 668 Advance(); 669 } 670 int max = 0; 671 if (current() == '}') { 672 max = min; 673 Advance(); 674 } else if (current() == ',') { 675 Advance(); 676 if (current() == '}') { 677 max = RegExpTree::kInfinity; 678 Advance(); 679 } else { 680 while (IsDecimalDigit(current())) { 681 int next = current() - '0'; 682 if (max > (RegExpTree::kInfinity - next) / 10) { 683 do { 684 Advance(); 685 } while (IsDecimalDigit(current())); 686 max = RegExpTree::kInfinity; 687 break; 688 } 689 max = 10 * max + next; 690 Advance(); 691 } 692 if (current() != '}') { 693 Reset(start); 694 return false; 695 } 696 Advance(); 697 } 698 } else { 699 Reset(start); 700 return false; 701 } 702 *min_out = min; 703 *max_out = max; 704 return true; 705} 706 707 708uc32 RegExpParser::ParseOctalLiteral() { 709 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker); 710 // For compatibility with some other browsers (not all), we parse 711 // up to three octal digits with a value below 256. 712 uc32 value = current() - '0'; 713 Advance(); 714 if ('0' <= current() && current() <= '7') { 715 value = value * 8 + current() - '0'; 716 Advance(); 717 if (value < 32 && '0' <= current() && current() <= '7') { 718 value = value * 8 + current() - '0'; 719 Advance(); 720 } 721 } 722 return value; 723} 724 725 726bool RegExpParser::ParseHexEscape(int length, uc32* value) { 727 int start = position(); 728 uc32 val = 0; 729 for (int i = 0; i < length; ++i) { 730 uc32 c = current(); 731 int d = HexValue(c); 732 if (d < 0) { 733 Reset(start); 734 return false; 735 } 736 val = val * 16 + d; 737 Advance(); 738 } 739 *value = val; 740 return true; 741} 742 743 744bool RegExpParser::ParseUnicodeEscape(uc32* value) { 745 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are 746 // allowed). In the latter case, the number of hex digits between { } is 747 // arbitrary. \ and u have already been read. 748 if (current() == '{' && unicode_) { 749 int start = position(); 750 Advance(); 751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { 752 if (current() == '}') { 753 Advance(); 754 return true; 755 } 756 } 757 Reset(start); 758 return false; 759 } 760 // \u but no {, or \u{...} escapes not allowed. 761 return ParseHexEscape(4, value); 762} 763 764 765bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { 766 uc32 x = 0; 767 int d = HexValue(current()); 768 if (d < 0) { 769 return false; 770 } 771 while (d >= 0) { 772 x = x * 16 + d; 773 if (x > max_value) { 774 return false; 775 } 776 Advance(); 777 d = HexValue(current()); 778 } 779 *value = x; 780 return true; 781} 782 783 784uc32 RegExpParser::ParseClassCharacterEscape() { 785 DCHECK(current() == '\\'); 786 DCHECK(has_next() && !IsSpecialClassEscape(Next())); 787 Advance(); 788 switch (current()) { 789 case 'b': 790 Advance(); 791 return '\b'; 792 // ControlEscape :: one of 793 // f n r t v 794 case 'f': 795 Advance(); 796 return '\f'; 797 case 'n': 798 Advance(); 799 return '\n'; 800 case 'r': 801 Advance(); 802 return '\r'; 803 case 't': 804 Advance(); 805 return '\t'; 806 case 'v': 807 Advance(); 808 return '\v'; 809 case 'c': { 810 uc32 controlLetter = Next(); 811 uc32 letter = controlLetter & ~('A' ^ 'a'); 812 // For compatibility with JSC, inside a character class 813 // we also accept digits and underscore as control characters. 814 if ((controlLetter >= '0' && controlLetter <= '9') || 815 controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) { 816 Advance(2); 817 // Control letters mapped to ASCII control characters in the range 818 // 0x00-0x1f. 819 return controlLetter & 0x1f; 820 } 821 // We match JSC in reading the backslash as a literal 822 // character instead of as starting an escape. 823 return '\\'; 824 } 825 case '0': 826 case '1': 827 case '2': 828 case '3': 829 case '4': 830 case '5': 831 case '6': 832 case '7': 833 // For compatibility, we interpret a decimal escape that isn't 834 // a back reference (and therefore either \0 or not valid according 835 // to the specification) as a 1..3 digit octal character code. 836 return ParseOctalLiteral(); 837 case 'x': { 838 Advance(); 839 uc32 value; 840 if (ParseHexEscape(2, &value)) { 841 return value; 842 } 843 if (!unicode_) { 844 // If \x is not followed by a two-digit hexadecimal, treat it 845 // as an identity escape. 846 return 'x'; 847 } 848 // If the 'u' flag is present, invalid escapes are not treated as 849 // identity escapes. 850 ReportError(CStrVector("Invalid escape")); 851 return 0; 852 } 853 case 'u': { 854 Advance(); 855 uc32 value; 856 if (ParseUnicodeEscape(&value)) { 857 return value; 858 } 859 if (!unicode_) { 860 return 'u'; 861 } 862 // If the 'u' flag is present, invalid escapes are not treated as 863 // identity escapes. 864 ReportError(CStrVector("Invalid unicode escape")); 865 return 0; 866 } 867 default: { 868 uc32 result = current(); 869 // If the 'u' flag is present, only syntax characters can be escaped, no 870 // other identity escapes are allowed. If the 'u' flag is not present, all 871 // identity escapes are allowed. 872 if (!unicode_ || IsSyntaxCharacter(result)) { 873 Advance(); 874 return result; 875 } 876 ReportError(CStrVector("Invalid escape")); 877 return 0; 878 } 879 } 880 return 0; 881} 882 883 884CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { 885 DCHECK_EQ(0, *char_class); 886 uc32 first = current(); 887 if (first == '\\') { 888 switch (Next()) { 889 case 'w': 890 case 'W': 891 case 'd': 892 case 'D': 893 case 's': 894 case 'S': { 895 *char_class = Next(); 896 Advance(2); 897 return CharacterRange::Singleton(0); // Return dummy value. 898 } 899 case kEndMarker: 900 return ReportError(CStrVector("\\ at end of pattern")); 901 default: 902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED); 903 return CharacterRange::Singleton(c); 904 } 905 } else { 906 Advance(); 907 return CharacterRange::Singleton(first); 908 } 909} 910 911 912static const uc16 kNoCharClass = 0; 913 914// Adds range or pre-defined character class to character ranges. 915// If char_class is not kInvalidClass, it's interpreted as a class 916// escape (i.e., 's' means whitespace, from '\s'). 917static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 918 uc16 char_class, CharacterRange range, 919 Zone* zone) { 920 if (char_class != kNoCharClass) { 921 CharacterRange::AddClassEscape(char_class, ranges, zone); 922 } else { 923 ranges->Add(range, zone); 924 } 925} 926 927 928RegExpTree* RegExpParser::ParseCharacterClass() { 929 static const char* kUnterminated = "Unterminated character class"; 930 static const char* kRangeOutOfOrder = "Range out of order in character class"; 931 932 DCHECK_EQ(current(), '['); 933 Advance(); 934 bool is_negated = false; 935 if (current() == '^') { 936 is_negated = true; 937 Advance(); 938 } 939 ZoneList<CharacterRange>* ranges = 940 new (zone()) ZoneList<CharacterRange>(2, zone()); 941 while (has_more() && current() != ']') { 942 uc16 char_class = kNoCharClass; 943 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); 944 if (current() == '-') { 945 Advance(); 946 if (current() == kEndMarker) { 947 // If we reach the end we break out of the loop and let the 948 // following code report an error. 949 break; 950 } else if (current() == ']') { 951 AddRangeOrEscape(ranges, char_class, first, zone()); 952 ranges->Add(CharacterRange::Singleton('-'), zone()); 953 break; 954 } 955 uc16 char_class_2 = kNoCharClass; 956 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); 957 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { 958 // Either end is an escaped character class. Treat the '-' verbatim. 959 AddRangeOrEscape(ranges, char_class, first, zone()); 960 ranges->Add(CharacterRange::Singleton('-'), zone()); 961 AddRangeOrEscape(ranges, char_class_2, next, zone()); 962 continue; 963 } 964 if (first.from() > next.to()) { 965 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); 966 } 967 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); 968 } else { 969 AddRangeOrEscape(ranges, char_class, first, zone()); 970 } 971 } 972 if (!has_more()) { 973 return ReportError(CStrVector(kUnterminated) CHECK_FAILED); 974 } 975 Advance(); 976 if (ranges->length() == 0) { 977 ranges->Add(CharacterRange::Everything(), zone()); 978 is_negated = !is_negated; 979 } 980 return new (zone()) RegExpCharacterClass(ranges, is_negated); 981} 982 983 984#undef CHECK_FAILED 985 986 987bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, 988 FlatStringReader* input, bool multiline, 989 bool unicode, RegExpCompileData* result) { 990 DCHECK(result != NULL); 991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); 992 RegExpTree* tree = parser.ParsePattern(); 993 if (parser.failed()) { 994 DCHECK(tree == NULL); 995 DCHECK(!result->error.is_null()); 996 } else { 997 DCHECK(tree != NULL); 998 DCHECK(result->error.is_null()); 999 if (FLAG_trace_regexp_parser) { 1000 OFStream os(stdout); 1001 tree->Print(os, zone); 1002 os << "\n"; 1003 } 1004 result->tree = tree; 1005 int capture_count = parser.captures_started(); 1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; 1007 result->contains_anchor = parser.contains_anchor(); 1008 result->capture_count = capture_count; 1009 } 1010 return !parser.failed(); 1011} 1012 1013 1014RegExpBuilder::RegExpBuilder(Zone* zone) 1015 : zone_(zone), 1016 pending_empty_(false), 1017 characters_(NULL), 1018 terms_(), 1019 alternatives_() 1020#ifdef DEBUG 1021 , 1022 last_added_(ADD_NONE) 1023#endif 1024{ 1025} 1026 1027 1028void RegExpBuilder::FlushCharacters() { 1029 pending_empty_ = false; 1030 if (characters_ != NULL) { 1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); 1032 characters_ = NULL; 1033 text_.Add(atom, zone()); 1034 LAST(ADD_ATOM); 1035 } 1036} 1037 1038 1039void RegExpBuilder::FlushText() { 1040 FlushCharacters(); 1041 int num_text = text_.length(); 1042 if (num_text == 0) { 1043 return; 1044 } else if (num_text == 1) { 1045 terms_.Add(text_.last(), zone()); 1046 } else { 1047 RegExpText* text = new (zone()) RegExpText(zone()); 1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); 1049 terms_.Add(text, zone()); 1050 } 1051 text_.Clear(); 1052} 1053 1054 1055void RegExpBuilder::AddCharacter(uc16 c) { 1056 pending_empty_ = false; 1057 if (characters_ == NULL) { 1058 characters_ = new (zone()) ZoneList<uc16>(4, zone()); 1059 } 1060 characters_->Add(c, zone()); 1061 LAST(ADD_CHAR); 1062} 1063 1064 1065void RegExpBuilder::AddUnicodeCharacter(uc32 c) { 1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { 1067 ZoneList<uc16> surrogate_pair(2, zone()); 1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); 1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); 1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); 1071 AddAtom(atom); 1072 } else { 1073 AddCharacter(static_cast<uc16>(c)); 1074 } 1075} 1076 1077 1078void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1079 1080 1081void RegExpBuilder::AddAtom(RegExpTree* term) { 1082 if (term->IsEmpty()) { 1083 AddEmpty(); 1084 return; 1085 } 1086 if (term->IsTextElement()) { 1087 FlushCharacters(); 1088 text_.Add(term, zone()); 1089 } else { 1090 FlushText(); 1091 terms_.Add(term, zone()); 1092 } 1093 LAST(ADD_ATOM); 1094} 1095 1096 1097void RegExpBuilder::AddAssertion(RegExpTree* assert) { 1098 FlushText(); 1099 terms_.Add(assert, zone()); 1100 LAST(ADD_ASSERT); 1101} 1102 1103 1104void RegExpBuilder::NewAlternative() { FlushTerms(); } 1105 1106 1107void RegExpBuilder::FlushTerms() { 1108 FlushText(); 1109 int num_terms = terms_.length(); 1110 RegExpTree* alternative; 1111 if (num_terms == 0) { 1112 alternative = new (zone()) RegExpEmpty(); 1113 } else if (num_terms == 1) { 1114 alternative = terms_.last(); 1115 } else { 1116 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); 1117 } 1118 alternatives_.Add(alternative, zone()); 1119 terms_.Clear(); 1120 LAST(ADD_NONE); 1121} 1122 1123 1124RegExpTree* RegExpBuilder::ToRegExp() { 1125 FlushTerms(); 1126 int num_alternatives = alternatives_.length(); 1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); 1128 if (num_alternatives == 1) return alternatives_.last(); 1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); 1130} 1131 1132 1133void RegExpBuilder::AddQuantifierToAtom( 1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { 1135 if (pending_empty_) { 1136 pending_empty_ = false; 1137 return; 1138 } 1139 RegExpTree* atom; 1140 if (characters_ != NULL) { 1141 DCHECK(last_added_ == ADD_CHAR); 1142 // Last atom was character. 1143 Vector<const uc16> char_vector = characters_->ToConstVector(); 1144 int num_chars = char_vector.length(); 1145 if (num_chars > 1) { 1146 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); 1147 text_.Add(new (zone()) RegExpAtom(prefix), zone()); 1148 char_vector = char_vector.SubVector(num_chars - 1, num_chars); 1149 } 1150 characters_ = NULL; 1151 atom = new (zone()) RegExpAtom(char_vector); 1152 FlushText(); 1153 } else if (text_.length() > 0) { 1154 DCHECK(last_added_ == ADD_ATOM); 1155 atom = text_.RemoveLast(); 1156 FlushText(); 1157 } else if (terms_.length() > 0) { 1158 DCHECK(last_added_ == ADD_ATOM); 1159 atom = terms_.RemoveLast(); 1160 if (atom->max_match() == 0) { 1161 // Guaranteed to only match an empty string. 1162 LAST(ADD_TERM); 1163 if (min == 0) { 1164 return; 1165 } 1166 terms_.Add(atom, zone()); 1167 return; 1168 } 1169 } else { 1170 // Only call immediately after adding an atom or character! 1171 UNREACHABLE(); 1172 return; 1173 } 1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1175 zone()); 1176 LAST(ADD_TERM); 1177} 1178 1179} // namespace internal 1180} // namespace v8 1181