1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-parser.h"
6
7#include "src/char-predicates-inl.h"
8#include "src/factory.h"
9#include "src/isolate.h"
10#include "src/objects-inl.h"
11#include "src/regexp/jsregexp.h"
12#include "src/utils.h"
13
14namespace v8 {
15namespace internal {
16
17RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
18                           bool multiline, bool unicode, Isolate* isolate,
19                           Zone* zone)
20    : isolate_(isolate),
21      zone_(zone),
22      error_(error),
23      captures_(NULL),
24      in_(in),
25      current_(kEndMarker),
26      next_pos_(0),
27      captures_started_(0),
28      capture_count_(0),
29      has_more_(true),
30      multiline_(multiline),
31      unicode_(unicode),
32      simple_(false),
33      contains_anchor_(false),
34      is_scanned_for_captures_(false),
35      failed_(false) {
36  Advance();
37}
38
39
40uc32 RegExpParser::Next() {
41  if (has_next()) {
42    return in()->Get(next_pos_);
43  } else {
44    return kEndMarker;
45  }
46}
47
48
49void RegExpParser::Advance() {
50  if (next_pos_ < in()->length()) {
51    StackLimitCheck check(isolate());
52    if (check.HasOverflowed()) {
53      ReportError(CStrVector(Isolate::kStackOverflowMessage));
54    } else if (zone()->excess_allocation()) {
55      ReportError(CStrVector("Regular expression too large"));
56    } else {
57      current_ = in()->Get(next_pos_);
58      next_pos_++;
59      // Read the whole surrogate pair in case of unicode flag, if possible.
60      if (unicode_ && next_pos_ < in()->length() &&
61          unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
62        uc16 trail = in()->Get(next_pos_);
63        if (unibrow::Utf16::IsTrailSurrogate(trail)) {
64          current_ = unibrow::Utf16::CombineSurrogatePair(
65              static_cast<uc16>(current_), trail);
66          next_pos_++;
67        }
68      }
69    }
70  } else {
71    current_ = kEndMarker;
72    // Advance so that position() points to 1-after-the-last-character. This is
73    // important so that Reset() to this position works correctly.
74    next_pos_ = in()->length() + 1;
75    has_more_ = false;
76  }
77}
78
79
80void RegExpParser::Reset(int pos) {
81  next_pos_ = pos;
82  has_more_ = (pos < in()->length());
83  Advance();
84}
85
86
87void RegExpParser::Advance(int dist) {
88  next_pos_ += dist - 1;
89  Advance();
90}
91
92
93bool RegExpParser::simple() { return simple_; }
94
95
96bool RegExpParser::IsSyntaxCharacter(uc32 c) {
97  return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
98         c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
99         c == '{' || c == '}' || c == '|';
100}
101
102
103RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
104  failed_ = true;
105  *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
106  // Zip to the end to make sure the no more input is read.
107  current_ = kEndMarker;
108  next_pos_ = in()->length();
109  return NULL;
110}
111
112
113#define CHECK_FAILED /**/); \
114  if (failed_) return NULL; \
115  ((void)0
116
117
118// Pattern ::
119//   Disjunction
120RegExpTree* RegExpParser::ParsePattern() {
121  RegExpTree* result = ParseDisjunction(CHECK_FAILED);
122  DCHECK(!has_more());
123  // If the result of parsing is a literal string atom, and it has the
124  // same length as the input, then the atom is identical to the input.
125  if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
126    simple_ = true;
127  }
128  return result;
129}
130
131
132// Disjunction ::
133//   Alternative
134//   Alternative | Disjunction
135// Alternative ::
136//   [empty]
137//   Term Alternative
138// Term ::
139//   Assertion
140//   Atom
141//   Atom Quantifier
142RegExpTree* RegExpParser::ParseDisjunction() {
143  // Used to store current state while parsing subexpressions.
144  RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
145                                  zone());
146  RegExpParserState* state = &initial_state;
147  // Cache the builder in a local variable for quick access.
148  RegExpBuilder* builder = initial_state.builder();
149  while (true) {
150    switch (current()) {
151      case kEndMarker:
152        if (state->IsSubexpression()) {
153          // Inside a parenthesized group when hitting end of input.
154          ReportError(CStrVector("Unterminated group") CHECK_FAILED);
155        }
156        DCHECK_EQ(INITIAL, state->group_type());
157        // Parsing completed successfully.
158        return builder->ToRegExp();
159      case ')': {
160        if (!state->IsSubexpression()) {
161          ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
162        }
163        DCHECK_NE(INITIAL, state->group_type());
164
165        Advance();
166        // End disjunction parsing and convert builder content to new single
167        // regexp atom.
168        RegExpTree* body = builder->ToRegExp();
169
170        int end_capture_index = captures_started();
171
172        int capture_index = state->capture_index();
173        SubexpressionType group_type = state->group_type();
174
175        // Build result of subexpression.
176        if (group_type == CAPTURE) {
177          RegExpCapture* capture = GetCapture(capture_index);
178          capture->set_body(body);
179          body = capture;
180        } else if (group_type != GROUPING) {
181          DCHECK(group_type == POSITIVE_LOOKAROUND ||
182                 group_type == NEGATIVE_LOOKAROUND);
183          bool is_positive = (group_type == POSITIVE_LOOKAROUND);
184          body = new (zone()) RegExpLookaround(
185              body, is_positive, end_capture_index - capture_index,
186              capture_index, state->lookaround_type());
187        }
188
189        // Restore previous state.
190        state = state->previous_state();
191        builder = state->builder();
192
193        builder->AddAtom(body);
194        // For compatability with JSC and ES3, we allow quantifiers after
195        // lookaheads, and break in all cases.
196        break;
197      }
198      case '|': {
199        Advance();
200        builder->NewAlternative();
201        continue;
202      }
203      case '*':
204      case '+':
205      case '?':
206        return ReportError(CStrVector("Nothing to repeat"));
207      case '^': {
208        Advance();
209        if (multiline_) {
210          builder->AddAssertion(
211              new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
212        } else {
213          builder->AddAssertion(
214              new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
215          set_contains_anchor();
216        }
217        continue;
218      }
219      case '$': {
220        Advance();
221        RegExpAssertion::AssertionType assertion_type =
222            multiline_ ? RegExpAssertion::END_OF_LINE
223                       : RegExpAssertion::END_OF_INPUT;
224        builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
225        continue;
226      }
227      case '.': {
228        Advance();
229        // everything except \x0a, \x0d, \u2028 and \u2029
230        ZoneList<CharacterRange>* ranges =
231            new (zone()) ZoneList<CharacterRange>(2, zone());
232        CharacterRange::AddClassEscape('.', ranges, zone());
233        RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
234        builder->AddAtom(atom);
235        break;
236      }
237      case '(': {
238        SubexpressionType subexpr_type = CAPTURE;
239        RegExpLookaround::Type lookaround_type = state->lookaround_type();
240        Advance();
241        if (current() == '?') {
242          switch (Next()) {
243            case ':':
244              subexpr_type = GROUPING;
245              break;
246            case '=':
247              lookaround_type = RegExpLookaround::LOOKAHEAD;
248              subexpr_type = POSITIVE_LOOKAROUND;
249              break;
250            case '!':
251              lookaround_type = RegExpLookaround::LOOKAHEAD;
252              subexpr_type = NEGATIVE_LOOKAROUND;
253              break;
254            case '<':
255              if (FLAG_harmony_regexp_lookbehind) {
256                Advance();
257                lookaround_type = RegExpLookaround::LOOKBEHIND;
258                if (Next() == '=') {
259                  subexpr_type = POSITIVE_LOOKAROUND;
260                  break;
261                } else if (Next() == '!') {
262                  subexpr_type = NEGATIVE_LOOKAROUND;
263                  break;
264                }
265              }
266            // Fall through.
267            default:
268              ReportError(CStrVector("Invalid group") CHECK_FAILED);
269              break;
270          }
271          Advance(2);
272        } else {
273          if (captures_started_ >= kMaxCaptures) {
274            ReportError(CStrVector("Too many captures") CHECK_FAILED);
275          }
276          captures_started_++;
277        }
278        // Store current state and begin new disjunction parsing.
279        state = new (zone()) RegExpParserState(
280            state, subexpr_type, lookaround_type, captures_started_, zone());
281        builder = state->builder();
282        continue;
283      }
284      case '[': {
285        RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
286        builder->AddAtom(atom);
287        break;
288      }
289      // Atom ::
290      //   \ AtomEscape
291      case '\\':
292        switch (Next()) {
293          case kEndMarker:
294            return ReportError(CStrVector("\\ at end of pattern"));
295          case 'b':
296            Advance(2);
297            builder->AddAssertion(
298                new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
299            continue;
300          case 'B':
301            Advance(2);
302            builder->AddAssertion(
303                new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
304            continue;
305          // AtomEscape ::
306          //   CharacterClassEscape
307          //
308          // CharacterClassEscape :: one of
309          //   d D s S w W
310          case 'd':
311          case 'D':
312          case 's':
313          case 'S':
314          case 'w':
315          case 'W': {
316            uc32 c = Next();
317            Advance(2);
318            ZoneList<CharacterRange>* ranges =
319                new (zone()) ZoneList<CharacterRange>(2, zone());
320            CharacterRange::AddClassEscape(c, ranges, zone());
321            RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
322            builder->AddAtom(atom);
323            break;
324          }
325          case '1':
326          case '2':
327          case '3':
328          case '4':
329          case '5':
330          case '6':
331          case '7':
332          case '8':
333          case '9': {
334            int index = 0;
335            if (ParseBackReferenceIndex(&index)) {
336              if (state->IsInsideCaptureGroup(index)) {
337                // The back reference is inside the capture group it refers to.
338                // Nothing can possibly have been captured yet, so we use empty
339                // instead. This ensures that, when checking a back reference,
340                // the capture registers of the referenced capture are either
341                // both set or both cleared.
342                builder->AddEmpty();
343              } else {
344                RegExpCapture* capture = GetCapture(index);
345                RegExpTree* atom = new (zone()) RegExpBackReference(capture);
346                builder->AddAtom(atom);
347              }
348              break;
349            }
350            uc32 first_digit = Next();
351            if (first_digit == '8' || first_digit == '9') {
352              // If the 'u' flag is present, only syntax characters can be
353              // escaped,
354              // no other identity escapes are allowed. If the 'u' flag is not
355              // present, all identity escapes are allowed.
356              if (!unicode_) {
357                builder->AddCharacter(first_digit);
358                Advance(2);
359              } else {
360                return ReportError(CStrVector("Invalid escape"));
361              }
362              break;
363            }
364          }
365          // FALLTHROUGH
366          case '0': {
367            Advance();
368            uc32 octal = ParseOctalLiteral();
369            builder->AddCharacter(octal);
370            break;
371          }
372          // ControlEscape :: one of
373          //   f n r t v
374          case 'f':
375            Advance(2);
376            builder->AddCharacter('\f');
377            break;
378          case 'n':
379            Advance(2);
380            builder->AddCharacter('\n');
381            break;
382          case 'r':
383            Advance(2);
384            builder->AddCharacter('\r');
385            break;
386          case 't':
387            Advance(2);
388            builder->AddCharacter('\t');
389            break;
390          case 'v':
391            Advance(2);
392            builder->AddCharacter('\v');
393            break;
394          case 'c': {
395            Advance();
396            uc32 controlLetter = Next();
397            // Special case if it is an ASCII letter.
398            // Convert lower case letters to uppercase.
399            uc32 letter = controlLetter & ~('a' ^ 'A');
400            if (letter < 'A' || 'Z' < letter) {
401              // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
402              // This is outside the specification. We match JSC in
403              // reading the backslash as a literal character instead
404              // of as starting an escape.
405              builder->AddCharacter('\\');
406            } else {
407              Advance(2);
408              builder->AddCharacter(controlLetter & 0x1f);
409            }
410            break;
411          }
412          case 'x': {
413            Advance(2);
414            uc32 value;
415            if (ParseHexEscape(2, &value)) {
416              builder->AddCharacter(value);
417            } else if (!unicode_) {
418              builder->AddCharacter('x');
419            } else {
420              // If the 'u' flag is present, invalid escapes are not treated as
421              // identity escapes.
422              return ReportError(CStrVector("Invalid escape"));
423            }
424            break;
425          }
426          case 'u': {
427            Advance(2);
428            uc32 value;
429            if (ParseUnicodeEscape(&value)) {
430              builder->AddUnicodeCharacter(value);
431            } else if (!unicode_) {
432              builder->AddCharacter('u');
433            } else {
434              // If the 'u' flag is present, invalid escapes are not treated as
435              // identity escapes.
436              return ReportError(CStrVector("Invalid unicode escape"));
437            }
438            break;
439          }
440          default:
441            Advance();
442            // If the 'u' flag is present, only syntax characters can be
443            // escaped, no
444            // other identity escapes are allowed. If the 'u' flag is not
445            // present,
446            // all identity escapes are allowed.
447            if (!unicode_ || IsSyntaxCharacter(current())) {
448              builder->AddCharacter(current());
449              Advance();
450            } else {
451              return ReportError(CStrVector("Invalid escape"));
452            }
453            break;
454        }
455        break;
456      case '{': {
457        int dummy;
458        if (ParseIntervalQuantifier(&dummy, &dummy)) {
459          ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
460        }
461        // fallthrough
462      }
463      default:
464        builder->AddUnicodeCharacter(current());
465        Advance();
466        break;
467    }  // end switch(current())
468
469    int min;
470    int max;
471    switch (current()) {
472      // QuantifierPrefix ::
473      //   *
474      //   +
475      //   ?
476      //   {
477      case '*':
478        min = 0;
479        max = RegExpTree::kInfinity;
480        Advance();
481        break;
482      case '+':
483        min = 1;
484        max = RegExpTree::kInfinity;
485        Advance();
486        break;
487      case '?':
488        min = 0;
489        max = 1;
490        Advance();
491        break;
492      case '{':
493        if (ParseIntervalQuantifier(&min, &max)) {
494          if (max < min) {
495            ReportError(CStrVector("numbers out of order in {} quantifier.")
496                            CHECK_FAILED);
497          }
498          break;
499        } else {
500          continue;
501        }
502      default:
503        continue;
504    }
505    RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
506    if (current() == '?') {
507      quantifier_type = RegExpQuantifier::NON_GREEDY;
508      Advance();
509    } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
510      // FLAG_regexp_possessive_quantifier is a debug-only flag.
511      quantifier_type = RegExpQuantifier::POSSESSIVE;
512      Advance();
513    }
514    builder->AddQuantifierToAtom(min, max, quantifier_type);
515  }
516}
517
518
519#ifdef DEBUG
520// Currently only used in an DCHECK.
521static bool IsSpecialClassEscape(uc32 c) {
522  switch (c) {
523    case 'd':
524    case 'D':
525    case 's':
526    case 'S':
527    case 'w':
528    case 'W':
529      return true;
530    default:
531      return false;
532  }
533}
534#endif
535
536
537// In order to know whether an escape is a backreference or not we have to scan
538// the entire regexp and find the number of capturing parentheses.  However we
539// don't want to scan the regexp twice unless it is necessary.  This mini-parser
540// is called when needed.  It can see the difference between capturing and
541// noncapturing parentheses and can skip character classes and backslash-escaped
542// characters.
543void RegExpParser::ScanForCaptures() {
544  // Start with captures started previous to current position
545  int capture_count = captures_started();
546  // Add count of captures after this position.
547  int n;
548  while ((n = current()) != kEndMarker) {
549    Advance();
550    switch (n) {
551      case '\\':
552        Advance();
553        break;
554      case '[': {
555        int c;
556        while ((c = current()) != kEndMarker) {
557          Advance();
558          if (c == '\\') {
559            Advance();
560          } else {
561            if (c == ']') break;
562          }
563        }
564        break;
565      }
566      case '(':
567        if (current() != '?') capture_count++;
568        break;
569    }
570  }
571  capture_count_ = capture_count;
572  is_scanned_for_captures_ = true;
573}
574
575
576bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
577  DCHECK_EQ('\\', current());
578  DCHECK('1' <= Next() && Next() <= '9');
579  // Try to parse a decimal literal that is no greater than the total number
580  // of left capturing parentheses in the input.
581  int start = position();
582  int value = Next() - '0';
583  Advance(2);
584  while (true) {
585    uc32 c = current();
586    if (IsDecimalDigit(c)) {
587      value = 10 * value + (c - '0');
588      if (value > kMaxCaptures) {
589        Reset(start);
590        return false;
591      }
592      Advance();
593    } else {
594      break;
595    }
596  }
597  if (value > captures_started()) {
598    if (!is_scanned_for_captures_) {
599      int saved_position = position();
600      ScanForCaptures();
601      Reset(saved_position);
602    }
603    if (value > capture_count_) {
604      Reset(start);
605      return false;
606    }
607  }
608  *index_out = value;
609  return true;
610}
611
612
613RegExpCapture* RegExpParser::GetCapture(int index) {
614  // The index for the capture groups are one-based. Its index in the list is
615  // zero-based.
616  int know_captures =
617      is_scanned_for_captures_ ? capture_count_ : captures_started_;
618  DCHECK(index <= know_captures);
619  if (captures_ == NULL) {
620    captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
621  }
622  while (captures_->length() < know_captures) {
623    captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
624  }
625  return captures_->at(index - 1);
626}
627
628
629bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
630  for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
631    if (s->group_type() != CAPTURE) continue;
632    // Return true if we found the matching capture index.
633    if (index == s->capture_index()) return true;
634    // Abort if index is larger than what has been parsed up till this state.
635    if (index > s->capture_index()) return false;
636  }
637  return false;
638}
639
640
641// QuantifierPrefix ::
642//   { DecimalDigits }
643//   { DecimalDigits , }
644//   { DecimalDigits , DecimalDigits }
645//
646// Returns true if parsing succeeds, and set the min_out and max_out
647// values. Values are truncated to RegExpTree::kInfinity if they overflow.
648bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
649  DCHECK_EQ(current(), '{');
650  int start = position();
651  Advance();
652  int min = 0;
653  if (!IsDecimalDigit(current())) {
654    Reset(start);
655    return false;
656  }
657  while (IsDecimalDigit(current())) {
658    int next = current() - '0';
659    if (min > (RegExpTree::kInfinity - next) / 10) {
660      // Overflow. Skip past remaining decimal digits and return -1.
661      do {
662        Advance();
663      } while (IsDecimalDigit(current()));
664      min = RegExpTree::kInfinity;
665      break;
666    }
667    min = 10 * min + next;
668    Advance();
669  }
670  int max = 0;
671  if (current() == '}') {
672    max = min;
673    Advance();
674  } else if (current() == ',') {
675    Advance();
676    if (current() == '}') {
677      max = RegExpTree::kInfinity;
678      Advance();
679    } else {
680      while (IsDecimalDigit(current())) {
681        int next = current() - '0';
682        if (max > (RegExpTree::kInfinity - next) / 10) {
683          do {
684            Advance();
685          } while (IsDecimalDigit(current()));
686          max = RegExpTree::kInfinity;
687          break;
688        }
689        max = 10 * max + next;
690        Advance();
691      }
692      if (current() != '}') {
693        Reset(start);
694        return false;
695      }
696      Advance();
697    }
698  } else {
699    Reset(start);
700    return false;
701  }
702  *min_out = min;
703  *max_out = max;
704  return true;
705}
706
707
708uc32 RegExpParser::ParseOctalLiteral() {
709  DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
710  // For compatibility with some other browsers (not all), we parse
711  // up to three octal digits with a value below 256.
712  uc32 value = current() - '0';
713  Advance();
714  if ('0' <= current() && current() <= '7') {
715    value = value * 8 + current() - '0';
716    Advance();
717    if (value < 32 && '0' <= current() && current() <= '7') {
718      value = value * 8 + current() - '0';
719      Advance();
720    }
721  }
722  return value;
723}
724
725
726bool RegExpParser::ParseHexEscape(int length, uc32* value) {
727  int start = position();
728  uc32 val = 0;
729  for (int i = 0; i < length; ++i) {
730    uc32 c = current();
731    int d = HexValue(c);
732    if (d < 0) {
733      Reset(start);
734      return false;
735    }
736    val = val * 16 + d;
737    Advance();
738  }
739  *value = val;
740  return true;
741}
742
743
744bool RegExpParser::ParseUnicodeEscape(uc32* value) {
745  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
746  // allowed). In the latter case, the number of hex digits between { } is
747  // arbitrary. \ and u have already been read.
748  if (current() == '{' && unicode_) {
749    int start = position();
750    Advance();
751    if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
752      if (current() == '}') {
753        Advance();
754        return true;
755      }
756    }
757    Reset(start);
758    return false;
759  }
760  // \u but no {, or \u{...} escapes not allowed.
761  return ParseHexEscape(4, value);
762}
763
764
765bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
766  uc32 x = 0;
767  int d = HexValue(current());
768  if (d < 0) {
769    return false;
770  }
771  while (d >= 0) {
772    x = x * 16 + d;
773    if (x > max_value) {
774      return false;
775    }
776    Advance();
777    d = HexValue(current());
778  }
779  *value = x;
780  return true;
781}
782
783
784uc32 RegExpParser::ParseClassCharacterEscape() {
785  DCHECK(current() == '\\');
786  DCHECK(has_next() && !IsSpecialClassEscape(Next()));
787  Advance();
788  switch (current()) {
789    case 'b':
790      Advance();
791      return '\b';
792    // ControlEscape :: one of
793    //   f n r t v
794    case 'f':
795      Advance();
796      return '\f';
797    case 'n':
798      Advance();
799      return '\n';
800    case 'r':
801      Advance();
802      return '\r';
803    case 't':
804      Advance();
805      return '\t';
806    case 'v':
807      Advance();
808      return '\v';
809    case 'c': {
810      uc32 controlLetter = Next();
811      uc32 letter = controlLetter & ~('A' ^ 'a');
812      // For compatibility with JSC, inside a character class
813      // we also accept digits and underscore as control characters.
814      if ((controlLetter >= '0' && controlLetter <= '9') ||
815          controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
816        Advance(2);
817        // Control letters mapped to ASCII control characters in the range
818        // 0x00-0x1f.
819        return controlLetter & 0x1f;
820      }
821      // We match JSC in reading the backslash as a literal
822      // character instead of as starting an escape.
823      return '\\';
824    }
825    case '0':
826    case '1':
827    case '2':
828    case '3':
829    case '4':
830    case '5':
831    case '6':
832    case '7':
833      // For compatibility, we interpret a decimal escape that isn't
834      // a back reference (and therefore either \0 or not valid according
835      // to the specification) as a 1..3 digit octal character code.
836      return ParseOctalLiteral();
837    case 'x': {
838      Advance();
839      uc32 value;
840      if (ParseHexEscape(2, &value)) {
841        return value;
842      }
843      if (!unicode_) {
844        // If \x is not followed by a two-digit hexadecimal, treat it
845        // as an identity escape.
846        return 'x';
847      }
848      // If the 'u' flag is present, invalid escapes are not treated as
849      // identity escapes.
850      ReportError(CStrVector("Invalid escape"));
851      return 0;
852    }
853    case 'u': {
854      Advance();
855      uc32 value;
856      if (ParseUnicodeEscape(&value)) {
857        return value;
858      }
859      if (!unicode_) {
860        return 'u';
861      }
862      // If the 'u' flag is present, invalid escapes are not treated as
863      // identity escapes.
864      ReportError(CStrVector("Invalid unicode escape"));
865      return 0;
866    }
867    default: {
868      uc32 result = current();
869      // If the 'u' flag is present, only syntax characters can be escaped, no
870      // other identity escapes are allowed. If the 'u' flag is not present, all
871      // identity escapes are allowed.
872      if (!unicode_ || IsSyntaxCharacter(result)) {
873        Advance();
874        return result;
875      }
876      ReportError(CStrVector("Invalid escape"));
877      return 0;
878    }
879  }
880  return 0;
881}
882
883
884CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
885  DCHECK_EQ(0, *char_class);
886  uc32 first = current();
887  if (first == '\\') {
888    switch (Next()) {
889      case 'w':
890      case 'W':
891      case 'd':
892      case 'D':
893      case 's':
894      case 'S': {
895        *char_class = Next();
896        Advance(2);
897        return CharacterRange::Singleton(0);  // Return dummy value.
898      }
899      case kEndMarker:
900        return ReportError(CStrVector("\\ at end of pattern"));
901      default:
902        uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
903        return CharacterRange::Singleton(c);
904    }
905  } else {
906    Advance();
907    return CharacterRange::Singleton(first);
908  }
909}
910
911
912static const uc16 kNoCharClass = 0;
913
914// Adds range or pre-defined character class to character ranges.
915// If char_class is not kInvalidClass, it's interpreted as a class
916// escape (i.e., 's' means whitespace, from '\s').
917static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
918                                    uc16 char_class, CharacterRange range,
919                                    Zone* zone) {
920  if (char_class != kNoCharClass) {
921    CharacterRange::AddClassEscape(char_class, ranges, zone);
922  } else {
923    ranges->Add(range, zone);
924  }
925}
926
927
928RegExpTree* RegExpParser::ParseCharacterClass() {
929  static const char* kUnterminated = "Unterminated character class";
930  static const char* kRangeOutOfOrder = "Range out of order in character class";
931
932  DCHECK_EQ(current(), '[');
933  Advance();
934  bool is_negated = false;
935  if (current() == '^') {
936    is_negated = true;
937    Advance();
938  }
939  ZoneList<CharacterRange>* ranges =
940      new (zone()) ZoneList<CharacterRange>(2, zone());
941  while (has_more() && current() != ']') {
942    uc16 char_class = kNoCharClass;
943    CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
944    if (current() == '-') {
945      Advance();
946      if (current() == kEndMarker) {
947        // If we reach the end we break out of the loop and let the
948        // following code report an error.
949        break;
950      } else if (current() == ']') {
951        AddRangeOrEscape(ranges, char_class, first, zone());
952        ranges->Add(CharacterRange::Singleton('-'), zone());
953        break;
954      }
955      uc16 char_class_2 = kNoCharClass;
956      CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
957      if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
958        // Either end is an escaped character class. Treat the '-' verbatim.
959        AddRangeOrEscape(ranges, char_class, first, zone());
960        ranges->Add(CharacterRange::Singleton('-'), zone());
961        AddRangeOrEscape(ranges, char_class_2, next, zone());
962        continue;
963      }
964      if (first.from() > next.to()) {
965        return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
966      }
967      ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
968    } else {
969      AddRangeOrEscape(ranges, char_class, first, zone());
970    }
971  }
972  if (!has_more()) {
973    return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
974  }
975  Advance();
976  if (ranges->length() == 0) {
977    ranges->Add(CharacterRange::Everything(), zone());
978    is_negated = !is_negated;
979  }
980  return new (zone()) RegExpCharacterClass(ranges, is_negated);
981}
982
983
984#undef CHECK_FAILED
985
986
987bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
988                               FlatStringReader* input, bool multiline,
989                               bool unicode, RegExpCompileData* result) {
990  DCHECK(result != NULL);
991  RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
992  RegExpTree* tree = parser.ParsePattern();
993  if (parser.failed()) {
994    DCHECK(tree == NULL);
995    DCHECK(!result->error.is_null());
996  } else {
997    DCHECK(tree != NULL);
998    DCHECK(result->error.is_null());
999    if (FLAG_trace_regexp_parser) {
1000      OFStream os(stdout);
1001      tree->Print(os, zone);
1002      os << "\n";
1003    }
1004    result->tree = tree;
1005    int capture_count = parser.captures_started();
1006    result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1007    result->contains_anchor = parser.contains_anchor();
1008    result->capture_count = capture_count;
1009  }
1010  return !parser.failed();
1011}
1012
1013
1014RegExpBuilder::RegExpBuilder(Zone* zone)
1015    : zone_(zone),
1016      pending_empty_(false),
1017      characters_(NULL),
1018      terms_(),
1019      alternatives_()
1020#ifdef DEBUG
1021      ,
1022      last_added_(ADD_NONE)
1023#endif
1024{
1025}
1026
1027
1028void RegExpBuilder::FlushCharacters() {
1029  pending_empty_ = false;
1030  if (characters_ != NULL) {
1031    RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1032    characters_ = NULL;
1033    text_.Add(atom, zone());
1034    LAST(ADD_ATOM);
1035  }
1036}
1037
1038
1039void RegExpBuilder::FlushText() {
1040  FlushCharacters();
1041  int num_text = text_.length();
1042  if (num_text == 0) {
1043    return;
1044  } else if (num_text == 1) {
1045    terms_.Add(text_.last(), zone());
1046  } else {
1047    RegExpText* text = new (zone()) RegExpText(zone());
1048    for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1049    terms_.Add(text, zone());
1050  }
1051  text_.Clear();
1052}
1053
1054
1055void RegExpBuilder::AddCharacter(uc16 c) {
1056  pending_empty_ = false;
1057  if (characters_ == NULL) {
1058    characters_ = new (zone()) ZoneList<uc16>(4, zone());
1059  }
1060  characters_->Add(c, zone());
1061  LAST(ADD_CHAR);
1062}
1063
1064
1065void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1066  if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
1067    ZoneList<uc16> surrogate_pair(2, zone());
1068    surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
1069    surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
1070    RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1071    AddAtom(atom);
1072  } else {
1073    AddCharacter(static_cast<uc16>(c));
1074  }
1075}
1076
1077
1078void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1079
1080
1081void RegExpBuilder::AddAtom(RegExpTree* term) {
1082  if (term->IsEmpty()) {
1083    AddEmpty();
1084    return;
1085  }
1086  if (term->IsTextElement()) {
1087    FlushCharacters();
1088    text_.Add(term, zone());
1089  } else {
1090    FlushText();
1091    terms_.Add(term, zone());
1092  }
1093  LAST(ADD_ATOM);
1094}
1095
1096
1097void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1098  FlushText();
1099  terms_.Add(assert, zone());
1100  LAST(ADD_ASSERT);
1101}
1102
1103
1104void RegExpBuilder::NewAlternative() { FlushTerms(); }
1105
1106
1107void RegExpBuilder::FlushTerms() {
1108  FlushText();
1109  int num_terms = terms_.length();
1110  RegExpTree* alternative;
1111  if (num_terms == 0) {
1112    alternative = new (zone()) RegExpEmpty();
1113  } else if (num_terms == 1) {
1114    alternative = terms_.last();
1115  } else {
1116    alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1117  }
1118  alternatives_.Add(alternative, zone());
1119  terms_.Clear();
1120  LAST(ADD_NONE);
1121}
1122
1123
1124RegExpTree* RegExpBuilder::ToRegExp() {
1125  FlushTerms();
1126  int num_alternatives = alternatives_.length();
1127  if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1128  if (num_alternatives == 1) return alternatives_.last();
1129  return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1130}
1131
1132
1133void RegExpBuilder::AddQuantifierToAtom(
1134    int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1135  if (pending_empty_) {
1136    pending_empty_ = false;
1137    return;
1138  }
1139  RegExpTree* atom;
1140  if (characters_ != NULL) {
1141    DCHECK(last_added_ == ADD_CHAR);
1142    // Last atom was character.
1143    Vector<const uc16> char_vector = characters_->ToConstVector();
1144    int num_chars = char_vector.length();
1145    if (num_chars > 1) {
1146      Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1147      text_.Add(new (zone()) RegExpAtom(prefix), zone());
1148      char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1149    }
1150    characters_ = NULL;
1151    atom = new (zone()) RegExpAtom(char_vector);
1152    FlushText();
1153  } else if (text_.length() > 0) {
1154    DCHECK(last_added_ == ADD_ATOM);
1155    atom = text_.RemoveLast();
1156    FlushText();
1157  } else if (terms_.length() > 0) {
1158    DCHECK(last_added_ == ADD_ATOM);
1159    atom = terms_.RemoveLast();
1160    if (atom->max_match() == 0) {
1161      // Guaranteed to only match an empty string.
1162      LAST(ADD_TERM);
1163      if (min == 0) {
1164        return;
1165      }
1166      terms_.Add(atom, zone());
1167      return;
1168    }
1169  } else {
1170    // Only call immediately after adding an atom or character!
1171    UNREACHABLE();
1172    return;
1173  }
1174  terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1175             zone());
1176  LAST(ADD_TERM);
1177}
1178
1179}  // namespace internal
1180}  // namespace v8
1181