1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-parser.h"
6
7#include "src/char-predicates-inl.h"
8#include "src/factory.h"
9#include "src/isolate.h"
10#include "src/objects-inl.h"
11#include "src/ostreams.h"
12#include "src/regexp/jsregexp.h"
13#include "src/utils.h"
14
15#ifdef V8_I18N_SUPPORT
16#include "unicode/uniset.h"
17#endif  // V8_I18N_SUPPORT
18
19namespace v8 {
20namespace internal {
21
22RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
23                           JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
24    : isolate_(isolate),
25      zone_(zone),
26      error_(error),
27      captures_(NULL),
28      named_captures_(NULL),
29      named_back_references_(NULL),
30      in_(in),
31      current_(kEndMarker),
32      ignore_case_(flags & JSRegExp::kIgnoreCase),
33      multiline_(flags & JSRegExp::kMultiline),
34      unicode_(flags & JSRegExp::kUnicode),
35      next_pos_(0),
36      captures_started_(0),
37      capture_count_(0),
38      has_more_(true),
39      simple_(false),
40      contains_anchor_(false),
41      is_scanned_for_captures_(false),
42      failed_(false) {
43  Advance();
44}
45
46template <bool update_position>
47inline uc32 RegExpParser::ReadNext() {
48  int position = next_pos_;
49  uc32 c0 = in()->Get(position);
50  position++;
51  // Read the whole surrogate pair in case of unicode flag, if possible.
52  if (unicode() && position < in()->length() &&
53      unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
54    uc16 c1 = in()->Get(position);
55    if (unibrow::Utf16::IsTrailSurrogate(c1)) {
56      c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
57      position++;
58    }
59  }
60  if (update_position) next_pos_ = position;
61  return c0;
62}
63
64
65uc32 RegExpParser::Next() {
66  if (has_next()) {
67    return ReadNext<false>();
68  } else {
69    return kEndMarker;
70  }
71}
72
73
74void RegExpParser::Advance() {
75  if (has_next()) {
76    StackLimitCheck check(isolate());
77    if (check.HasOverflowed()) {
78      if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");
79      ReportError(CStrVector(
80          MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
81    } else if (zone()->excess_allocation()) {
82      ReportError(CStrVector("Regular expression too large"));
83    } else {
84      current_ = ReadNext<true>();
85    }
86  } else {
87    current_ = kEndMarker;
88    // Advance so that position() points to 1-after-the-last-character. This is
89    // important so that Reset() to this position works correctly.
90    next_pos_ = in()->length() + 1;
91    has_more_ = false;
92  }
93}
94
95
96void RegExpParser::Reset(int pos) {
97  next_pos_ = pos;
98  has_more_ = (pos < in()->length());
99  Advance();
100}
101
102
103void RegExpParser::Advance(int dist) {
104  next_pos_ += dist - 1;
105  Advance();
106}
107
108
109bool RegExpParser::simple() { return simple_; }
110
111bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
112  switch (c) {
113    case '^':
114    case '$':
115    case '\\':
116    case '.':
117    case '*':
118    case '+':
119    case '?':
120    case '(':
121    case ')':
122    case '[':
123    case ']':
124    case '{':
125    case '}':
126    case '|':
127    case '/':
128      return true;
129    default:
130      break;
131  }
132  return false;
133}
134
135
136RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
137  if (failed_) return NULL;  // Do not overwrite any existing error.
138  failed_ = true;
139  *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
140  // Zip to the end to make sure the no more input is read.
141  current_ = kEndMarker;
142  next_pos_ = in()->length();
143  return NULL;
144}
145
146
147#define CHECK_FAILED /**/); \
148  if (failed_) return NULL; \
149  ((void)0
150
151
152// Pattern ::
153//   Disjunction
154RegExpTree* RegExpParser::ParsePattern() {
155  RegExpTree* result = ParseDisjunction(CHECK_FAILED);
156  PatchNamedBackReferences(CHECK_FAILED);
157  DCHECK(!has_more());
158  // If the result of parsing is a literal string atom, and it has the
159  // same length as the input, then the atom is identical to the input.
160  if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
161    simple_ = true;
162  }
163  return result;
164}
165
166
167// Disjunction ::
168//   Alternative
169//   Alternative | Disjunction
170// Alternative ::
171//   [empty]
172//   Term Alternative
173// Term ::
174//   Assertion
175//   Atom
176//   Atom Quantifier
177RegExpTree* RegExpParser::ParseDisjunction() {
178  // Used to store current state while parsing subexpressions.
179  RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
180                                  nullptr, ignore_case(), unicode(), zone());
181  RegExpParserState* state = &initial_state;
182  // Cache the builder in a local variable for quick access.
183  RegExpBuilder* builder = initial_state.builder();
184  while (true) {
185    switch (current()) {
186      case kEndMarker:
187        if (state->IsSubexpression()) {
188          // Inside a parenthesized group when hitting end of input.
189          return ReportError(CStrVector("Unterminated group"));
190        }
191        DCHECK_EQ(INITIAL, state->group_type());
192        // Parsing completed successfully.
193        return builder->ToRegExp();
194      case ')': {
195        if (!state->IsSubexpression()) {
196          return ReportError(CStrVector("Unmatched ')'"));
197        }
198        DCHECK_NE(INITIAL, state->group_type());
199
200        Advance();
201        // End disjunction parsing and convert builder content to new single
202        // regexp atom.
203        RegExpTree* body = builder->ToRegExp();
204
205        int end_capture_index = captures_started();
206
207        int capture_index = state->capture_index();
208        SubexpressionType group_type = state->group_type();
209
210        // Build result of subexpression.
211        if (group_type == CAPTURE) {
212          if (state->IsNamedCapture()) {
213            CreateNamedCaptureAtIndex(state->capture_name(),
214                                      capture_index CHECK_FAILED);
215          }
216          RegExpCapture* capture = GetCapture(capture_index);
217          capture->set_body(body);
218          body = capture;
219        } else if (group_type == GROUPING) {
220          body = new (zone()) RegExpGroup(body);
221        } else {
222          DCHECK(group_type == POSITIVE_LOOKAROUND ||
223                 group_type == NEGATIVE_LOOKAROUND);
224          bool is_positive = (group_type == POSITIVE_LOOKAROUND);
225          body = new (zone()) RegExpLookaround(
226              body, is_positive, end_capture_index - capture_index,
227              capture_index, state->lookaround_type());
228        }
229
230        // Restore previous state.
231        state = state->previous_state();
232        builder = state->builder();
233
234        builder->AddAtom(body);
235        // For compatability with JSC and ES3, we allow quantifiers after
236        // lookaheads, and break in all cases.
237        break;
238      }
239      case '|': {
240        Advance();
241        builder->NewAlternative();
242        continue;
243      }
244      case '*':
245      case '+':
246      case '?':
247        return ReportError(CStrVector("Nothing to repeat"));
248      case '^': {
249        Advance();
250        if (multiline()) {
251          builder->AddAssertion(
252              new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
253        } else {
254          builder->AddAssertion(
255              new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
256          set_contains_anchor();
257        }
258        continue;
259      }
260      case '$': {
261        Advance();
262        RegExpAssertion::AssertionType assertion_type =
263            multiline() ? RegExpAssertion::END_OF_LINE
264                        : RegExpAssertion::END_OF_INPUT;
265        builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
266        continue;
267      }
268      case '.': {
269        Advance();
270        // everything except \x0a, \x0d, \u2028 and \u2029
271        ZoneList<CharacterRange>* ranges =
272            new (zone()) ZoneList<CharacterRange>(2, zone());
273        CharacterRange::AddClassEscape('.', ranges, zone());
274        RegExpCharacterClass* cc =
275            new (zone()) RegExpCharacterClass(ranges, false);
276        builder->AddCharacterClass(cc);
277        break;
278      }
279      case '(': {
280        SubexpressionType subexpr_type = CAPTURE;
281        RegExpLookaround::Type lookaround_type = state->lookaround_type();
282        bool is_named_capture = false;
283        Advance();
284        if (current() == '?') {
285          switch (Next()) {
286            case ':':
287              subexpr_type = GROUPING;
288              Advance(2);
289              break;
290            case '=':
291              lookaround_type = RegExpLookaround::LOOKAHEAD;
292              subexpr_type = POSITIVE_LOOKAROUND;
293              Advance(2);
294              break;
295            case '!':
296              lookaround_type = RegExpLookaround::LOOKAHEAD;
297              subexpr_type = NEGATIVE_LOOKAROUND;
298              Advance(2);
299              break;
300            case '<':
301              Advance();
302              if (FLAG_harmony_regexp_lookbehind) {
303                if (Next() == '=') {
304                  subexpr_type = POSITIVE_LOOKAROUND;
305                  lookaround_type = RegExpLookaround::LOOKBEHIND;
306                  Advance(2);
307                  break;
308                } else if (Next() == '!') {
309                  subexpr_type = NEGATIVE_LOOKAROUND;
310                  lookaround_type = RegExpLookaround::LOOKBEHIND;
311                  Advance(2);
312                  break;
313                }
314              }
315              if (FLAG_harmony_regexp_named_captures && unicode()) {
316                is_named_capture = true;
317                Advance();
318                break;
319              }
320            // Fall through.
321            default:
322              return ReportError(CStrVector("Invalid group"));
323          }
324        }
325
326        const ZoneVector<uc16>* capture_name = nullptr;
327        if (subexpr_type == CAPTURE) {
328          if (captures_started_ >= kMaxCaptures) {
329            return ReportError(CStrVector("Too many captures"));
330          }
331          captures_started_++;
332
333          if (is_named_capture) {
334            capture_name = ParseCaptureGroupName(CHECK_FAILED);
335          }
336        }
337        // Store current state and begin new disjunction parsing.
338        state = new (zone()) RegExpParserState(
339            state, subexpr_type, lookaround_type, captures_started_,
340            capture_name, ignore_case(), unicode(), zone());
341        builder = state->builder();
342        continue;
343      }
344      case '[': {
345        RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
346        builder->AddCharacterClass(cc->AsCharacterClass());
347        break;
348      }
349      // Atom ::
350      //   \ AtomEscape
351      case '\\':
352        switch (Next()) {
353          case kEndMarker:
354            return ReportError(CStrVector("\\ at end of pattern"));
355          case 'b':
356            Advance(2);
357            builder->AddAssertion(
358                new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
359            continue;
360          case 'B':
361            Advance(2);
362            builder->AddAssertion(
363                new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
364            continue;
365          // AtomEscape ::
366          //   CharacterClassEscape
367          //
368          // CharacterClassEscape :: one of
369          //   d D s S w W
370          case 'd':
371          case 'D':
372          case 's':
373          case 'S':
374          case 'w':
375          case 'W': {
376            uc32 c = Next();
377            Advance(2);
378            ZoneList<CharacterRange>* ranges =
379                new (zone()) ZoneList<CharacterRange>(2, zone());
380            CharacterRange::AddClassEscape(c, ranges, zone());
381            RegExpCharacterClass* cc =
382                new (zone()) RegExpCharacterClass(ranges, false);
383            builder->AddCharacterClass(cc);
384            break;
385          }
386          case 'p':
387          case 'P': {
388            uc32 p = Next();
389            Advance(2);
390            if (unicode()) {
391              if (FLAG_harmony_regexp_property) {
392                ZoneList<CharacterRange>* ranges =
393                    new (zone()) ZoneList<CharacterRange>(2, zone());
394                if (!ParsePropertyClass(ranges, p == 'P')) {
395                  return ReportError(CStrVector("Invalid property name"));
396                }
397                RegExpCharacterClass* cc =
398                    new (zone()) RegExpCharacterClass(ranges, false);
399                builder->AddCharacterClass(cc);
400              } else {
401                // With /u, no identity escapes except for syntax characters
402                // are allowed. Otherwise, all identity escapes are allowed.
403                return ReportError(CStrVector("Invalid escape"));
404              }
405            } else {
406              builder->AddCharacter(p);
407            }
408            break;
409          }
410          case '1':
411          case '2':
412          case '3':
413          case '4':
414          case '5':
415          case '6':
416          case '7':
417          case '8':
418          case '9': {
419            int index = 0;
420            bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
421            if (is_backref) {
422              if (state->IsInsideCaptureGroup(index)) {
423                // The back reference is inside the capture group it refers to.
424                // Nothing can possibly have been captured yet, so we use empty
425                // instead. This ensures that, when checking a back reference,
426                // the capture registers of the referenced capture are either
427                // both set or both cleared.
428                builder->AddEmpty();
429              } else {
430                RegExpCapture* capture = GetCapture(index);
431                RegExpTree* atom = new (zone()) RegExpBackReference(capture);
432                builder->AddAtom(atom);
433              }
434              break;
435            }
436            // With /u, no identity escapes except for syntax characters
437            // are allowed. Otherwise, all identity escapes are allowed.
438            if (unicode()) {
439              return ReportError(CStrVector("Invalid escape"));
440            }
441            uc32 first_digit = Next();
442            if (first_digit == '8' || first_digit == '9') {
443              builder->AddCharacter(first_digit);
444              Advance(2);
445              break;
446            }
447          }
448          // Fall through.
449          case '0': {
450            Advance();
451            if (unicode() && Next() >= '0' && Next() <= '9') {
452              // With /u, decimal escape with leading 0 are not parsed as octal.
453              return ReportError(CStrVector("Invalid decimal escape"));
454            }
455            uc32 octal = ParseOctalLiteral();
456            builder->AddCharacter(octal);
457            break;
458          }
459          // ControlEscape :: one of
460          //   f n r t v
461          case 'f':
462            Advance(2);
463            builder->AddCharacter('\f');
464            break;
465          case 'n':
466            Advance(2);
467            builder->AddCharacter('\n');
468            break;
469          case 'r':
470            Advance(2);
471            builder->AddCharacter('\r');
472            break;
473          case 't':
474            Advance(2);
475            builder->AddCharacter('\t');
476            break;
477          case 'v':
478            Advance(2);
479            builder->AddCharacter('\v');
480            break;
481          case 'c': {
482            Advance();
483            uc32 controlLetter = Next();
484            // Special case if it is an ASCII letter.
485            // Convert lower case letters to uppercase.
486            uc32 letter = controlLetter & ~('a' ^ 'A');
487            if (letter < 'A' || 'Z' < letter) {
488              // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
489              // This is outside the specification. We match JSC in
490              // reading the backslash as a literal character instead
491              // of as starting an escape.
492              if (unicode()) {
493                // With /u, invalid escapes are not treated as identity escapes.
494                return ReportError(CStrVector("Invalid unicode escape"));
495              }
496              builder->AddCharacter('\\');
497            } else {
498              Advance(2);
499              builder->AddCharacter(controlLetter & 0x1f);
500            }
501            break;
502          }
503          case 'x': {
504            Advance(2);
505            uc32 value;
506            if (ParseHexEscape(2, &value)) {
507              builder->AddCharacter(value);
508            } else if (!unicode()) {
509              builder->AddCharacter('x');
510            } else {
511              // With /u, invalid escapes are not treated as identity escapes.
512              return ReportError(CStrVector("Invalid escape"));
513            }
514            break;
515          }
516          case 'u': {
517            Advance(2);
518            uc32 value;
519            if (ParseUnicodeEscape(&value)) {
520              builder->AddEscapedUnicodeCharacter(value);
521            } else if (!unicode()) {
522              builder->AddCharacter('u');
523            } else {
524              // With /u, invalid escapes are not treated as identity escapes.
525              return ReportError(CStrVector("Invalid unicode escape"));
526            }
527            break;
528          }
529          case 'k':
530            if (FLAG_harmony_regexp_named_captures && unicode()) {
531              Advance(2);
532              ParseNamedBackReference(builder, state CHECK_FAILED);
533              break;
534            }
535          // Fall through.
536          default:
537            Advance();
538            // With /u, no identity escapes except for syntax characters
539            // are allowed. Otherwise, all identity escapes are allowed.
540            if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
541              builder->AddCharacter(current());
542              Advance();
543            } else {
544              return ReportError(CStrVector("Invalid escape"));
545            }
546            break;
547        }
548        break;
549      case '{': {
550        int dummy;
551        bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
552        if (parsed) return ReportError(CStrVector("Nothing to repeat"));
553        // Fall through.
554      }
555      case '}':
556      case ']':
557        if (unicode()) {
558          return ReportError(CStrVector("Lone quantifier brackets"));
559        }
560      // Fall through.
561      default:
562        builder->AddUnicodeCharacter(current());
563        Advance();
564        break;
565    }  // end switch(current())
566
567    int min;
568    int max;
569    switch (current()) {
570      // QuantifierPrefix ::
571      //   *
572      //   +
573      //   ?
574      //   {
575      case '*':
576        min = 0;
577        max = RegExpTree::kInfinity;
578        Advance();
579        break;
580      case '+':
581        min = 1;
582        max = RegExpTree::kInfinity;
583        Advance();
584        break;
585      case '?':
586        min = 0;
587        max = 1;
588        Advance();
589        break;
590      case '{':
591        if (ParseIntervalQuantifier(&min, &max)) {
592          if (max < min) {
593            return ReportError(
594                CStrVector("numbers out of order in {} quantifier"));
595          }
596          break;
597        } else if (unicode()) {
598          // With /u, incomplete quantifiers are not allowed.
599          return ReportError(CStrVector("Incomplete quantifier"));
600        }
601        continue;
602      default:
603        continue;
604    }
605    RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
606    if (current() == '?') {
607      quantifier_type = RegExpQuantifier::NON_GREEDY;
608      Advance();
609    } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
610      // FLAG_regexp_possessive_quantifier is a debug-only flag.
611      quantifier_type = RegExpQuantifier::POSSESSIVE;
612      Advance();
613    }
614    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
615      return ReportError(CStrVector("Invalid quantifier"));
616    }
617  }
618}
619
620
621#ifdef DEBUG
622// Currently only used in an DCHECK.
623static bool IsSpecialClassEscape(uc32 c) {
624  switch (c) {
625    case 'd':
626    case 'D':
627    case 's':
628    case 'S':
629    case 'w':
630    case 'W':
631      return true;
632    default:
633      return false;
634  }
635}
636#endif
637
638
639// In order to know whether an escape is a backreference or not we have to scan
640// the entire regexp and find the number of capturing parentheses.  However we
641// don't want to scan the regexp twice unless it is necessary.  This mini-parser
642// is called when needed.  It can see the difference between capturing and
643// noncapturing parentheses and can skip character classes and backslash-escaped
644// characters.
645void RegExpParser::ScanForCaptures() {
646  // Start with captures started previous to current position
647  int capture_count = captures_started();
648  // Add count of captures after this position.
649  int n;
650  while ((n = current()) != kEndMarker) {
651    Advance();
652    switch (n) {
653      case '\\':
654        Advance();
655        break;
656      case '[': {
657        int c;
658        while ((c = current()) != kEndMarker) {
659          Advance();
660          if (c == '\\') {
661            Advance();
662          } else {
663            if (c == ']') break;
664          }
665        }
666        break;
667      }
668      case '(':
669        if (current() != '?') capture_count++;
670        break;
671    }
672  }
673  capture_count_ = capture_count;
674  is_scanned_for_captures_ = true;
675}
676
677
678bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
679  DCHECK_EQ('\\', current());
680  DCHECK('1' <= Next() && Next() <= '9');
681  // Try to parse a decimal literal that is no greater than the total number
682  // of left capturing parentheses in the input.
683  int start = position();
684  int value = Next() - '0';
685  Advance(2);
686  while (true) {
687    uc32 c = current();
688    if (IsDecimalDigit(c)) {
689      value = 10 * value + (c - '0');
690      if (value > kMaxCaptures) {
691        Reset(start);
692        return false;
693      }
694      Advance();
695    } else {
696      break;
697    }
698  }
699  if (value > captures_started()) {
700    if (!is_scanned_for_captures_) {
701      int saved_position = position();
702      ScanForCaptures();
703      Reset(saved_position);
704    }
705    if (value > capture_count_) {
706      Reset(start);
707      return false;
708    }
709  }
710  *index_out = value;
711  return true;
712}
713
714static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
715  if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
716    v->push_back(code_unit);
717  } else {
718    v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
719    v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
720  }
721}
722
723const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
724  DCHECK(FLAG_harmony_regexp_named_captures);
725  DCHECK(unicode());
726
727  ZoneVector<uc16>* name =
728      new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
729
730  bool at_start = true;
731  while (true) {
732    uc32 c = current();
733    Advance();
734
735    // Convert unicode escapes.
736    if (c == '\\' && current() == 'u') {
737      Advance();
738      if (!ParseUnicodeEscape(&c)) {
739        ReportError(CStrVector("Invalid Unicode escape sequence"));
740        return nullptr;
741      }
742    }
743
744    if (at_start) {
745      if (!IdentifierStart::Is(c)) {
746        ReportError(CStrVector("Invalid capture group name"));
747        return nullptr;
748      }
749      push_code_unit(name, c);
750      at_start = false;
751    } else {
752      if (c == '>') {
753        break;
754      } else if (IdentifierPart::Is(c)) {
755        push_code_unit(name, c);
756      } else {
757        ReportError(CStrVector("Invalid capture group name"));
758        return nullptr;
759      }
760    }
761  }
762
763  return name;
764}
765
766bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
767                                             int index) {
768  DCHECK(FLAG_harmony_regexp_named_captures);
769  DCHECK(unicode());
770  DCHECK(0 < index && index <= captures_started_);
771  DCHECK_NOT_NULL(name);
772
773  if (named_captures_ == nullptr) {
774    named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
775  } else {
776    // Check for duplicates and bail if we find any.
777    for (const auto& named_capture : *named_captures_) {
778      if (*named_capture->name() == *name) {
779        ReportError(CStrVector("Duplicate capture group name"));
780        return false;
781      }
782    }
783  }
784
785  RegExpCapture* capture = GetCapture(index);
786  DCHECK(capture->name() == nullptr);
787
788  capture->set_name(name);
789  named_captures_->Add(capture, zone());
790
791  return true;
792}
793
794bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
795                                           RegExpParserState* state) {
796  // The parser is assumed to be on the '<' in \k<name>.
797  if (current() != '<') {
798    ReportError(CStrVector("Invalid named reference"));
799    return false;
800  }
801
802  Advance();
803  const ZoneVector<uc16>* name = ParseCaptureGroupName();
804  if (name == nullptr) {
805    return false;
806  }
807
808  if (state->IsInsideCaptureGroup(name)) {
809    builder->AddEmpty();
810  } else {
811    RegExpBackReference* atom = new (zone()) RegExpBackReference();
812    atom->set_name(name);
813
814    builder->AddAtom(atom);
815
816    if (named_back_references_ == nullptr) {
817      named_back_references_ =
818          new (zone()) ZoneList<RegExpBackReference*>(1, zone());
819    }
820    named_back_references_->Add(atom, zone());
821  }
822
823  return true;
824}
825
826void RegExpParser::PatchNamedBackReferences() {
827  if (named_back_references_ == nullptr) return;
828
829  if (named_captures_ == nullptr) {
830    ReportError(CStrVector("Invalid named capture referenced"));
831    return;
832  }
833
834  // Look up and patch the actual capture for each named back reference.
835  // TODO(jgruber): O(n^2), optimize if necessary.
836
837  for (int i = 0; i < named_back_references_->length(); i++) {
838    RegExpBackReference* ref = named_back_references_->at(i);
839
840    int index = -1;
841    for (const auto& capture : *named_captures_) {
842      if (*capture->name() == *ref->name()) {
843        index = capture->index();
844        break;
845      }
846    }
847
848    if (index == -1) {
849      ReportError(CStrVector("Invalid named capture referenced"));
850      return;
851    }
852
853    ref->set_capture(GetCapture(index));
854  }
855}
856
857RegExpCapture* RegExpParser::GetCapture(int index) {
858  // The index for the capture groups are one-based. Its index in the list is
859  // zero-based.
860  int know_captures =
861      is_scanned_for_captures_ ? capture_count_ : captures_started_;
862  DCHECK(index <= know_captures);
863  if (captures_ == NULL) {
864    captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
865  }
866  while (captures_->length() < know_captures) {
867    captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
868  }
869  return captures_->at(index - 1);
870}
871
872Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
873  if (named_captures_ == nullptr || named_captures_->is_empty())
874    return Handle<FixedArray>();
875
876  Factory* factory = isolate()->factory();
877
878  int len = named_captures_->length() * 2;
879  Handle<FixedArray> array = factory->NewFixedArray(len);
880
881  for (int i = 0; i < named_captures_->length(); i++) {
882    RegExpCapture* capture = named_captures_->at(i);
883    MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
884    array->set(i * 2, *name.ToHandleChecked());
885    array->set(i * 2 + 1, Smi::FromInt(capture->index()));
886  }
887
888  return array;
889}
890
891bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
892  for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
893    if (s->group_type() != CAPTURE) continue;
894    // Return true if we found the matching capture index.
895    if (index == s->capture_index()) return true;
896    // Abort if index is larger than what has been parsed up till this state.
897    if (index > s->capture_index()) return false;
898  }
899  return false;
900}
901
902bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
903    const ZoneVector<uc16>* name) {
904  DCHECK_NOT_NULL(name);
905  for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
906    if (s->capture_name() == nullptr) continue;
907    if (*s->capture_name() == *name) return true;
908  }
909  return false;
910}
911
912// QuantifierPrefix ::
913//   { DecimalDigits }
914//   { DecimalDigits , }
915//   { DecimalDigits , DecimalDigits }
916//
917// Returns true if parsing succeeds, and set the min_out and max_out
918// values. Values are truncated to RegExpTree::kInfinity if they overflow.
919bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
920  DCHECK_EQ(current(), '{');
921  int start = position();
922  Advance();
923  int min = 0;
924  if (!IsDecimalDigit(current())) {
925    Reset(start);
926    return false;
927  }
928  while (IsDecimalDigit(current())) {
929    int next = current() - '0';
930    if (min > (RegExpTree::kInfinity - next) / 10) {
931      // Overflow. Skip past remaining decimal digits and return -1.
932      do {
933        Advance();
934      } while (IsDecimalDigit(current()));
935      min = RegExpTree::kInfinity;
936      break;
937    }
938    min = 10 * min + next;
939    Advance();
940  }
941  int max = 0;
942  if (current() == '}') {
943    max = min;
944    Advance();
945  } else if (current() == ',') {
946    Advance();
947    if (current() == '}') {
948      max = RegExpTree::kInfinity;
949      Advance();
950    } else {
951      while (IsDecimalDigit(current())) {
952        int next = current() - '0';
953        if (max > (RegExpTree::kInfinity - next) / 10) {
954          do {
955            Advance();
956          } while (IsDecimalDigit(current()));
957          max = RegExpTree::kInfinity;
958          break;
959        }
960        max = 10 * max + next;
961        Advance();
962      }
963      if (current() != '}') {
964        Reset(start);
965        return false;
966      }
967      Advance();
968    }
969  } else {
970    Reset(start);
971    return false;
972  }
973  *min_out = min;
974  *max_out = max;
975  return true;
976}
977
978
979uc32 RegExpParser::ParseOctalLiteral() {
980  DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
981  // For compatibility with some other browsers (not all), we parse
982  // up to three octal digits with a value below 256.
983  uc32 value = current() - '0';
984  Advance();
985  if ('0' <= current() && current() <= '7') {
986    value = value * 8 + current() - '0';
987    Advance();
988    if (value < 32 && '0' <= current() && current() <= '7') {
989      value = value * 8 + current() - '0';
990      Advance();
991    }
992  }
993  return value;
994}
995
996
997bool RegExpParser::ParseHexEscape(int length, uc32* value) {
998  int start = position();
999  uc32 val = 0;
1000  for (int i = 0; i < length; ++i) {
1001    uc32 c = current();
1002    int d = HexValue(c);
1003    if (d < 0) {
1004      Reset(start);
1005      return false;
1006    }
1007    val = val * 16 + d;
1008    Advance();
1009  }
1010  *value = val;
1011  return true;
1012}
1013
1014// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1015bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1016  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1017  // allowed). In the latter case, the number of hex digits between { } is
1018  // arbitrary. \ and u have already been read.
1019  if (current() == '{' && unicode()) {
1020    int start = position();
1021    Advance();
1022    if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
1023      if (current() == '}') {
1024        Advance();
1025        return true;
1026      }
1027    }
1028    Reset(start);
1029    return false;
1030  }
1031  // \u but no {, or \u{...} escapes not allowed.
1032  bool result = ParseHexEscape(4, value);
1033  if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1034      current() == '\\') {
1035    // Attempt to read trail surrogate.
1036    int start = position();
1037    if (Next() == 'u') {
1038      Advance(2);
1039      uc32 trail;
1040      if (ParseHexEscape(4, &trail) &&
1041          unibrow::Utf16::IsTrailSurrogate(trail)) {
1042        *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1043                                                      static_cast<uc16>(trail));
1044        return true;
1045      }
1046    }
1047    Reset(start);
1048  }
1049  return result;
1050}
1051
1052#ifdef V8_I18N_SUPPORT
1053
1054namespace {
1055
1056bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1057  const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1058  if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
1059  for (int i = 0;; i++) {
1060    const char* long_name = u_getPropertyName(
1061        property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1062    if (long_name == NULL) break;
1063    if (strcmp(property_name, long_name) == 0) return true;
1064  }
1065  return false;
1066}
1067
1068bool IsExactPropertyValueAlias(const char* property_value_name,
1069                               UProperty property, int32_t property_value) {
1070  const char* short_name =
1071      u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1072  if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
1073    return true;
1074  }
1075  for (int i = 0;; i++) {
1076    const char* long_name = u_getPropertyValueName(
1077        property, property_value,
1078        static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1079    if (long_name == NULL) break;
1080    if (strcmp(property_value_name, long_name) == 0) return true;
1081  }
1082  return false;
1083}
1084
1085bool LookupPropertyValueName(UProperty property,
1086                             const char* property_value_name, bool negate,
1087                             ZoneList<CharacterRange>* result, Zone* zone) {
1088  UProperty property_for_lookup = property;
1089  if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1090    // For the property Script_Extensions, we have to do the property value
1091    // name lookup as if the property is Script.
1092    property_for_lookup = UCHAR_SCRIPT;
1093  }
1094  int32_t property_value =
1095      u_getPropertyValueEnum(property_for_lookup, property_value_name);
1096  if (property_value == UCHAR_INVALID_CODE) return false;
1097
1098  // We require the property name to match exactly to one of the property value
1099  // aliases. However, u_getPropertyValueEnum uses loose matching.
1100  if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1101                                 property_value)) {
1102    return false;
1103  }
1104
1105  UErrorCode ec = U_ZERO_ERROR;
1106  icu::UnicodeSet set;
1107  set.applyIntPropertyValue(property, property_value, ec);
1108  bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1109
1110  if (success) {
1111    set.removeAllStrings();
1112    if (negate) set.complement();
1113    for (int i = 0; i < set.getRangeCount(); i++) {
1114      result->Add(
1115          CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1116          zone);
1117    }
1118  }
1119  return success;
1120}
1121
1122template <size_t N>
1123inline bool NameEquals(const char* name, const char (&literal)[N]) {
1124  return strncmp(name, literal, N + 1) == 0;
1125}
1126
1127bool LookupSpecialPropertyValueName(const char* name,
1128                                    ZoneList<CharacterRange>* result,
1129                                    bool negate, Zone* zone) {
1130  if (NameEquals(name, "Any")) {
1131    if (!negate) result->Add(CharacterRange::Everything(), zone);
1132  } else if (NameEquals(name, "ASCII")) {
1133    result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1134                       : CharacterRange::Range(0x0, 0x7f),
1135                zone);
1136  } else if (NameEquals(name, "Assigned")) {
1137    return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1138                                   !negate, result, zone);
1139  } else {
1140    return false;
1141  }
1142  return true;
1143}
1144
1145}  // anonymous namespace
1146
1147bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1148                                      bool negate) {
1149  // Parse the property class as follows:
1150  // - In \p{name}, 'name' is interpreted
1151  //   - either as a general category property value name.
1152  //   - or as a binary property name.
1153  // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1154  //   and 'value' is interpreted as one of the available property value names.
1155  // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1156  // - Loose matching is not applied.
1157  List<char> first_part;
1158  List<char> second_part;
1159  if (current() == '{') {
1160    // Parse \p{[PropertyName=]PropertyNameValue}
1161    for (Advance(); current() != '}' && current() != '='; Advance()) {
1162      if (!has_next()) return false;
1163      first_part.Add(static_cast<char>(current()));
1164    }
1165    if (current() == '=') {
1166      for (Advance(); current() != '}'; Advance()) {
1167        if (!has_next()) return false;
1168        second_part.Add(static_cast<char>(current()));
1169      }
1170      second_part.Add(0);  // null-terminate string.
1171    }
1172  } else {
1173    return false;
1174  }
1175  Advance();
1176  first_part.Add(0);  // null-terminate string.
1177
1178  if (second_part.is_empty()) {
1179    // First attempt to interpret as general category property value name.
1180    const char* name = first_part.ToConstVector().start();
1181    if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1182                                result, zone())) {
1183      return true;
1184    }
1185    // Interpret "Any", "ASCII", and "Assigned".
1186    if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1187      return true;
1188    }
1189    // Then attempt to interpret as binary property name with value name 'Y'.
1190    UProperty property = u_getPropertyEnum(name);
1191    if (property < UCHAR_BINARY_START) return false;
1192    if (property >= UCHAR_BINARY_LIMIT) return false;
1193    if (!IsExactPropertyAlias(name, property)) return false;
1194    return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1195                                   zone());
1196  } else {
1197    // Both property name and value name are specified. Attempt to interpret
1198    // the property name as enumerated property.
1199    const char* property_name = first_part.ToConstVector().start();
1200    const char* value_name = second_part.ToConstVector().start();
1201    UProperty property = u_getPropertyEnum(property_name);
1202    if (!IsExactPropertyAlias(property_name, property)) return false;
1203    if (property == UCHAR_GENERAL_CATEGORY) {
1204      // We want to allow aggregate value names such as "Letter".
1205      property = UCHAR_GENERAL_CATEGORY_MASK;
1206    } else if (property != UCHAR_SCRIPT &&
1207               property != UCHAR_SCRIPT_EXTENSIONS) {
1208      return false;
1209    }
1210    return LookupPropertyValueName(property, value_name, negate, result,
1211                                   zone());
1212  }
1213}
1214
1215#else  // V8_I18N_SUPPORT
1216
1217bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1218                                      bool negate) {
1219  return false;
1220}
1221
1222#endif  // V8_I18N_SUPPORT
1223
1224bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1225  uc32 x = 0;
1226  int d = HexValue(current());
1227  if (d < 0) {
1228    return false;
1229  }
1230  while (d >= 0) {
1231    x = x * 16 + d;
1232    if (x > max_value) {
1233      return false;
1234    }
1235    Advance();
1236    d = HexValue(current());
1237  }
1238  *value = x;
1239  return true;
1240}
1241
1242
1243uc32 RegExpParser::ParseClassCharacterEscape() {
1244  DCHECK(current() == '\\');
1245  DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1246  Advance();
1247  switch (current()) {
1248    case 'b':
1249      Advance();
1250      return '\b';
1251    // ControlEscape :: one of
1252    //   f n r t v
1253    case 'f':
1254      Advance();
1255      return '\f';
1256    case 'n':
1257      Advance();
1258      return '\n';
1259    case 'r':
1260      Advance();
1261      return '\r';
1262    case 't':
1263      Advance();
1264      return '\t';
1265    case 'v':
1266      Advance();
1267      return '\v';
1268    case 'c': {
1269      uc32 controlLetter = Next();
1270      uc32 letter = controlLetter & ~('A' ^ 'a');
1271      // For compatibility with JSC, inside a character class. We also accept
1272      // digits and underscore as control characters, unless with /u.
1273      if (letter >= 'A' && letter <= 'Z') {
1274        Advance(2);
1275        // Control letters mapped to ASCII control characters in the range
1276        // 0x00-0x1f.
1277        return controlLetter & 0x1f;
1278      }
1279      if (unicode()) {
1280        // With /u, invalid escapes are not treated as identity escapes.
1281        ReportError(CStrVector("Invalid class escape"));
1282        return 0;
1283      }
1284      if ((controlLetter >= '0' && controlLetter <= '9') ||
1285          controlLetter == '_') {
1286        Advance(2);
1287        return controlLetter & 0x1f;
1288      }
1289      // We match JSC in reading the backslash as a literal
1290      // character instead of as starting an escape.
1291      return '\\';
1292    }
1293    case '0':
1294      // With /u, \0 is interpreted as NUL if not followed by another digit.
1295      if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1296        Advance();
1297        return 0;
1298      }
1299    // Fall through.
1300    case '1':
1301    case '2':
1302    case '3':
1303    case '4':
1304    case '5':
1305    case '6':
1306    case '7':
1307      // For compatibility, we interpret a decimal escape that isn't
1308      // a back reference (and therefore either \0 or not valid according
1309      // to the specification) as a 1..3 digit octal character code.
1310      if (unicode()) {
1311        // With /u, decimal escape is not interpreted as octal character code.
1312        ReportError(CStrVector("Invalid class escape"));
1313        return 0;
1314      }
1315      return ParseOctalLiteral();
1316    case 'x': {
1317      Advance();
1318      uc32 value;
1319      if (ParseHexEscape(2, &value)) return value;
1320      if (unicode()) {
1321        // With /u, invalid escapes are not treated as identity escapes.
1322        ReportError(CStrVector("Invalid escape"));
1323        return 0;
1324      }
1325      // If \x is not followed by a two-digit hexadecimal, treat it
1326      // as an identity escape.
1327      return 'x';
1328    }
1329    case 'u': {
1330      Advance();
1331      uc32 value;
1332      if (ParseUnicodeEscape(&value)) return value;
1333      if (unicode()) {
1334        // With /u, invalid escapes are not treated as identity escapes.
1335        ReportError(CStrVector("Invalid unicode escape"));
1336        return 0;
1337      }
1338      // If \u is not followed by a two-digit hexadecimal, treat it
1339      // as an identity escape.
1340      return 'u';
1341    }
1342    default: {
1343      uc32 result = current();
1344      // With /u, no identity escapes except for syntax characters and '-' are
1345      // allowed. Otherwise, all identity escapes are allowed.
1346      if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1347        Advance();
1348        return result;
1349      }
1350      ReportError(CStrVector("Invalid escape"));
1351      return 0;
1352    }
1353  }
1354  return 0;
1355}
1356
1357
1358CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1359  DCHECK_EQ(0, *char_class);
1360  uc32 first = current();
1361  if (first == '\\') {
1362    switch (Next()) {
1363      case 'w':
1364      case 'W':
1365      case 'd':
1366      case 'D':
1367      case 's':
1368      case 'S': {
1369        *char_class = Next();
1370        Advance(2);
1371        return CharacterRange::Singleton(0);  // Return dummy value.
1372      }
1373      case kEndMarker:
1374        return ReportError(CStrVector("\\ at end of pattern"));
1375      default:
1376        first = ParseClassCharacterEscape(CHECK_FAILED);
1377    }
1378  } else {
1379    Advance();
1380  }
1381
1382  return CharacterRange::Singleton(first);
1383}
1384
1385static const uc16 kNoCharClass = 0;
1386
1387// Adds range or pre-defined character class to character ranges.
1388// If char_class is not kInvalidClass, it's interpreted as a class
1389// escape (i.e., 's' means whitespace, from '\s').
1390static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1391                                    uc16 char_class, CharacterRange range,
1392                                    Zone* zone) {
1393  if (char_class != kNoCharClass) {
1394    CharacterRange::AddClassEscape(char_class, ranges, zone);
1395  } else {
1396    ranges->Add(range, zone);
1397  }
1398}
1399
1400bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1401  if (!FLAG_harmony_regexp_property) return false;
1402  if (!unicode()) return false;
1403  if (current() != '\\') return false;
1404  uc32 next = Next();
1405  bool parse_success = false;
1406  if (next == 'p') {
1407    Advance(2);
1408    parse_success = ParsePropertyClass(ranges, false);
1409  } else if (next == 'P') {
1410    Advance(2);
1411    parse_success = ParsePropertyClass(ranges, true);
1412  } else {
1413    return false;
1414  }
1415  if (!parse_success)
1416    ReportError(CStrVector("Invalid property name in character class"));
1417  return parse_success;
1418}
1419
1420RegExpTree* RegExpParser::ParseCharacterClass() {
1421  static const char* kUnterminated = "Unterminated character class";
1422  static const char* kRangeInvalid = "Invalid character class";
1423  static const char* kRangeOutOfOrder = "Range out of order in character class";
1424
1425  DCHECK_EQ(current(), '[');
1426  Advance();
1427  bool is_negated = false;
1428  if (current() == '^') {
1429    is_negated = true;
1430    Advance();
1431  }
1432  ZoneList<CharacterRange>* ranges =
1433      new (zone()) ZoneList<CharacterRange>(2, zone());
1434  while (has_more() && current() != ']') {
1435    bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1436    if (parsed_property) continue;
1437    uc16 char_class = kNoCharClass;
1438    CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1439    if (current() == '-') {
1440      Advance();
1441      if (current() == kEndMarker) {
1442        // If we reach the end we break out of the loop and let the
1443        // following code report an error.
1444        break;
1445      } else if (current() == ']') {
1446        AddRangeOrEscape(ranges, char_class, first, zone());
1447        ranges->Add(CharacterRange::Singleton('-'), zone());
1448        break;
1449      }
1450      uc16 char_class_2 = kNoCharClass;
1451      CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1452      if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1453        // Either end is an escaped character class. Treat the '-' verbatim.
1454        if (unicode()) {
1455          // ES2015 21.2.2.15.1 step 1.
1456          return ReportError(CStrVector(kRangeInvalid));
1457        }
1458        AddRangeOrEscape(ranges, char_class, first, zone());
1459        ranges->Add(CharacterRange::Singleton('-'), zone());
1460        AddRangeOrEscape(ranges, char_class_2, next, zone());
1461        continue;
1462      }
1463      // ES2015 21.2.2.15.1 step 6.
1464      if (first.from() > next.to()) {
1465        return ReportError(CStrVector(kRangeOutOfOrder));
1466      }
1467      ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1468    } else {
1469      AddRangeOrEscape(ranges, char_class, first, zone());
1470    }
1471  }
1472  if (!has_more()) {
1473    return ReportError(CStrVector(kUnterminated));
1474  }
1475  Advance();
1476  if (ranges->length() == 0) {
1477    ranges->Add(CharacterRange::Everything(), zone());
1478    is_negated = !is_negated;
1479  }
1480  return new (zone()) RegExpCharacterClass(ranges, is_negated);
1481}
1482
1483
1484#undef CHECK_FAILED
1485
1486
1487bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1488                               FlatStringReader* input, JSRegExp::Flags flags,
1489                               RegExpCompileData* result) {
1490  DCHECK(result != NULL);
1491  RegExpParser parser(input, &result->error, flags, isolate, zone);
1492  RegExpTree* tree = parser.ParsePattern();
1493  if (parser.failed()) {
1494    DCHECK(tree == NULL);
1495    DCHECK(!result->error.is_null());
1496  } else {
1497    DCHECK(tree != NULL);
1498    DCHECK(result->error.is_null());
1499    if (FLAG_trace_regexp_parser) {
1500      OFStream os(stdout);
1501      tree->Print(os, zone);
1502      os << "\n";
1503    }
1504    result->tree = tree;
1505    int capture_count = parser.captures_started();
1506    result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1507    result->contains_anchor = parser.contains_anchor();
1508    result->capture_name_map = parser.CreateCaptureNameMap();
1509    result->capture_count = capture_count;
1510  }
1511  return !parser.failed();
1512}
1513
1514RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
1515    : zone_(zone),
1516      pending_empty_(false),
1517      ignore_case_(ignore_case),
1518      unicode_(unicode),
1519      characters_(NULL),
1520      pending_surrogate_(kNoPendingSurrogate),
1521      terms_(),
1522      alternatives_()
1523#ifdef DEBUG
1524      ,
1525      last_added_(ADD_NONE)
1526#endif
1527{
1528}
1529
1530
1531void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1532  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1533  FlushPendingSurrogate();
1534  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1535  pending_surrogate_ = lead_surrogate;
1536}
1537
1538
1539void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1540  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1541  if (pending_surrogate_ != kNoPendingSurrogate) {
1542    uc16 lead_surrogate = pending_surrogate_;
1543    pending_surrogate_ = kNoPendingSurrogate;
1544    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1545    uc32 combined =
1546        unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1547    if (NeedsDesugaringForIgnoreCase(combined)) {
1548      AddCharacterClassForDesugaring(combined);
1549    } else {
1550      ZoneList<uc16> surrogate_pair(2, zone());
1551      surrogate_pair.Add(lead_surrogate, zone());
1552      surrogate_pair.Add(trail_surrogate, zone());
1553      RegExpAtom* atom =
1554          new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1555      AddAtom(atom);
1556    }
1557  } else {
1558    pending_surrogate_ = trail_surrogate;
1559    FlushPendingSurrogate();
1560  }
1561}
1562
1563
1564void RegExpBuilder::FlushPendingSurrogate() {
1565  if (pending_surrogate_ != kNoPendingSurrogate) {
1566    DCHECK(unicode());
1567    uc32 c = pending_surrogate_;
1568    pending_surrogate_ = kNoPendingSurrogate;
1569    AddCharacterClassForDesugaring(c);
1570  }
1571}
1572
1573
1574void RegExpBuilder::FlushCharacters() {
1575  FlushPendingSurrogate();
1576  pending_empty_ = false;
1577  if (characters_ != NULL) {
1578    RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1579    characters_ = NULL;
1580    text_.Add(atom, zone());
1581    LAST(ADD_ATOM);
1582  }
1583}
1584
1585
1586void RegExpBuilder::FlushText() {
1587  FlushCharacters();
1588  int num_text = text_.length();
1589  if (num_text == 0) {
1590    return;
1591  } else if (num_text == 1) {
1592    terms_.Add(text_.last(), zone());
1593  } else {
1594    RegExpText* text = new (zone()) RegExpText(zone());
1595    for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1596    terms_.Add(text, zone());
1597  }
1598  text_.Clear();
1599}
1600
1601
1602void RegExpBuilder::AddCharacter(uc16 c) {
1603  FlushPendingSurrogate();
1604  pending_empty_ = false;
1605  if (NeedsDesugaringForIgnoreCase(c)) {
1606    AddCharacterClassForDesugaring(c);
1607  } else {
1608    if (characters_ == NULL) {
1609      characters_ = new (zone()) ZoneList<uc16>(4, zone());
1610    }
1611    characters_->Add(c, zone());
1612    LAST(ADD_CHAR);
1613  }
1614}
1615
1616
1617void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1618  if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1619    DCHECK(unicode());
1620    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1621    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1622  } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1623    AddLeadSurrogate(c);
1624  } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1625    AddTrailSurrogate(c);
1626  } else {
1627    AddCharacter(static_cast<uc16>(c));
1628  }
1629}
1630
1631void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1632  // A lead or trail surrogate parsed via escape sequence will not
1633  // pair up with any preceding lead or following trail surrogate.
1634  FlushPendingSurrogate();
1635  AddUnicodeCharacter(character);
1636  FlushPendingSurrogate();
1637}
1638
1639void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1640
1641
1642void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1643  if (NeedsDesugaringForUnicode(cc)) {
1644    // With /u, character class needs to be desugared, so it
1645    // must be a standalone term instead of being part of a RegExpText.
1646    AddTerm(cc);
1647  } else {
1648    AddAtom(cc);
1649  }
1650}
1651
1652void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1653  AddTerm(new (zone()) RegExpCharacterClass(
1654      CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1655}
1656
1657
1658void RegExpBuilder::AddAtom(RegExpTree* term) {
1659  if (term->IsEmpty()) {
1660    AddEmpty();
1661    return;
1662  }
1663  if (term->IsTextElement()) {
1664    FlushCharacters();
1665    text_.Add(term, zone());
1666  } else {
1667    FlushText();
1668    terms_.Add(term, zone());
1669  }
1670  LAST(ADD_ATOM);
1671}
1672
1673
1674void RegExpBuilder::AddTerm(RegExpTree* term) {
1675  FlushText();
1676  terms_.Add(term, zone());
1677  LAST(ADD_ATOM);
1678}
1679
1680
1681void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1682  FlushText();
1683  terms_.Add(assert, zone());
1684  LAST(ADD_ASSERT);
1685}
1686
1687
1688void RegExpBuilder::NewAlternative() { FlushTerms(); }
1689
1690
1691void RegExpBuilder::FlushTerms() {
1692  FlushText();
1693  int num_terms = terms_.length();
1694  RegExpTree* alternative;
1695  if (num_terms == 0) {
1696    alternative = new (zone()) RegExpEmpty();
1697  } else if (num_terms == 1) {
1698    alternative = terms_.last();
1699  } else {
1700    alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1701  }
1702  alternatives_.Add(alternative, zone());
1703  terms_.Clear();
1704  LAST(ADD_NONE);
1705}
1706
1707
1708bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1709  if (!unicode()) return false;
1710  // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1711  // necessarily mean that we need to desugar. It's probably nicer to have a
1712  // separate pass to figure out unicode desugarings.
1713  if (ignore_case()) return true;
1714  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1715  CharacterRange::Canonicalize(ranges);
1716  for (int i = ranges->length() - 1; i >= 0; i--) {
1717    uc32 from = ranges->at(i).from();
1718    uc32 to = ranges->at(i).to();
1719    // Check for non-BMP characters.
1720    if (to >= kNonBmpStart) return true;
1721    // Check for lone surrogates.
1722    if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1723  }
1724  return false;
1725}
1726
1727
1728bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1729#ifdef V8_I18N_SUPPORT
1730  if (unicode() && ignore_case()) {
1731    icu::UnicodeSet set(c, c);
1732    set.closeOver(USET_CASE_INSENSITIVE);
1733    set.removeAllStrings();
1734    return set.size() > 1;
1735  }
1736  // In the case where ICU is not included, we act as if the unicode flag is
1737  // not set, and do not desugar.
1738#endif  // V8_I18N_SUPPORT
1739  return false;
1740}
1741
1742
1743RegExpTree* RegExpBuilder::ToRegExp() {
1744  FlushTerms();
1745  int num_alternatives = alternatives_.length();
1746  if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1747  if (num_alternatives == 1) return alternatives_.last();
1748  return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1749}
1750
1751bool RegExpBuilder::AddQuantifierToAtom(
1752    int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1753  FlushPendingSurrogate();
1754  if (pending_empty_) {
1755    pending_empty_ = false;
1756    return true;
1757  }
1758  RegExpTree* atom;
1759  if (characters_ != NULL) {
1760    DCHECK(last_added_ == ADD_CHAR);
1761    // Last atom was character.
1762    Vector<const uc16> char_vector = characters_->ToConstVector();
1763    int num_chars = char_vector.length();
1764    if (num_chars > 1) {
1765      Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1766      text_.Add(new (zone()) RegExpAtom(prefix), zone());
1767      char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1768    }
1769    characters_ = NULL;
1770    atom = new (zone()) RegExpAtom(char_vector);
1771    FlushText();
1772  } else if (text_.length() > 0) {
1773    DCHECK(last_added_ == ADD_ATOM);
1774    atom = text_.RemoveLast();
1775    FlushText();
1776  } else if (terms_.length() > 0) {
1777    DCHECK(last_added_ == ADD_ATOM);
1778    atom = terms_.RemoveLast();
1779    // With /u, lookarounds are not quantifiable.
1780    if (unicode() && atom->IsLookaround()) return false;
1781    if (atom->max_match() == 0) {
1782      // Guaranteed to only match an empty string.
1783      LAST(ADD_TERM);
1784      if (min == 0) {
1785        return true;
1786      }
1787      terms_.Add(atom, zone());
1788      return true;
1789    }
1790  } else {
1791    // Only call immediately after adding an atom or character!
1792    UNREACHABLE();
1793    return false;
1794  }
1795  terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1796             zone());
1797  LAST(ADD_TERM);
1798  return true;
1799}
1800
1801}  // namespace internal
1802}  // namespace v8
1803