1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp-parser.h"
6
7#include "src/char-predicates-inl.h"
8#include "src/factory.h"
9#include "src/isolate.h"
10#include "src/objects-inl.h"
11#include "src/ostreams.h"
12#include "src/regexp/jsregexp.h"
13#include "src/utils.h"
14
15#ifdef V8_I18N_SUPPORT
16#include "unicode/uset.h"
17#endif  // V8_I18N_SUPPORT
18
19namespace v8 {
20namespace internal {
21
22RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
23                           JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
24    : isolate_(isolate),
25      zone_(zone),
26      error_(error),
27      captures_(NULL),
28      named_captures_(NULL),
29      named_back_references_(NULL),
30      in_(in),
31      current_(kEndMarker),
32      ignore_case_(flags & JSRegExp::kIgnoreCase),
33      multiline_(flags & JSRegExp::kMultiline),
34      unicode_(flags & JSRegExp::kUnicode),
35      next_pos_(0),
36      captures_started_(0),
37      capture_count_(0),
38      has_more_(true),
39      simple_(false),
40      contains_anchor_(false),
41      is_scanned_for_captures_(false),
42      failed_(false) {
43  Advance();
44}
45
46template <bool update_position>
47inline uc32 RegExpParser::ReadNext() {
48  int position = next_pos_;
49  uc32 c0 = in()->Get(position);
50  position++;
51  // Read the whole surrogate pair in case of unicode flag, if possible.
52  if (unicode() && position < in()->length() &&
53      unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
54    uc16 c1 = in()->Get(position);
55    if (unibrow::Utf16::IsTrailSurrogate(c1)) {
56      c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
57      position++;
58    }
59  }
60  if (update_position) next_pos_ = position;
61  return c0;
62}
63
64
65uc32 RegExpParser::Next() {
66  if (has_next()) {
67    return ReadNext<false>();
68  } else {
69    return kEndMarker;
70  }
71}
72
73
74void RegExpParser::Advance() {
75  if (has_next()) {
76    StackLimitCheck check(isolate());
77    if (check.HasOverflowed()) {
78      ReportError(CStrVector(
79          MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
80    } else if (zone()->excess_allocation()) {
81      ReportError(CStrVector("Regular expression too large"));
82    } else {
83      current_ = ReadNext<true>();
84    }
85  } else {
86    current_ = kEndMarker;
87    // Advance so that position() points to 1-after-the-last-character. This is
88    // important so that Reset() to this position works correctly.
89    next_pos_ = in()->length() + 1;
90    has_more_ = false;
91  }
92}
93
94
95void RegExpParser::Reset(int pos) {
96  next_pos_ = pos;
97  has_more_ = (pos < in()->length());
98  Advance();
99}
100
101
102void RegExpParser::Advance(int dist) {
103  next_pos_ += dist - 1;
104  Advance();
105}
106
107
108bool RegExpParser::simple() { return simple_; }
109
110bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
111  switch (c) {
112    case '^':
113    case '$':
114    case '\\':
115    case '.':
116    case '*':
117    case '+':
118    case '?':
119    case '(':
120    case ')':
121    case '[':
122    case ']':
123    case '{':
124    case '}':
125    case '|':
126    case '/':
127      return true;
128    default:
129      break;
130  }
131  return false;
132}
133
134
135RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
136  if (failed_) return NULL;  // Do not overwrite any existing error.
137  failed_ = true;
138  *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
139  // Zip to the end to make sure the no more input is read.
140  current_ = kEndMarker;
141  next_pos_ = in()->length();
142  return NULL;
143}
144
145
146#define CHECK_FAILED /**/); \
147  if (failed_) return NULL; \
148  ((void)0
149
150
151// Pattern ::
152//   Disjunction
153RegExpTree* RegExpParser::ParsePattern() {
154  RegExpTree* result = ParseDisjunction(CHECK_FAILED);
155  PatchNamedBackReferences(CHECK_FAILED);
156  DCHECK(!has_more());
157  // If the result of parsing is a literal string atom, and it has the
158  // same length as the input, then the atom is identical to the input.
159  if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
160    simple_ = true;
161  }
162  return result;
163}
164
165
166// Disjunction ::
167//   Alternative
168//   Alternative | Disjunction
169// Alternative ::
170//   [empty]
171//   Term Alternative
172// Term ::
173//   Assertion
174//   Atom
175//   Atom Quantifier
176RegExpTree* RegExpParser::ParseDisjunction() {
177  // Used to store current state while parsing subexpressions.
178  RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
179                                  nullptr, ignore_case(), unicode(), zone());
180  RegExpParserState* state = &initial_state;
181  // Cache the builder in a local variable for quick access.
182  RegExpBuilder* builder = initial_state.builder();
183  while (true) {
184    switch (current()) {
185      case kEndMarker:
186        if (state->IsSubexpression()) {
187          // Inside a parenthesized group when hitting end of input.
188          return ReportError(CStrVector("Unterminated group"));
189        }
190        DCHECK_EQ(INITIAL, state->group_type());
191        // Parsing completed successfully.
192        return builder->ToRegExp();
193      case ')': {
194        if (!state->IsSubexpression()) {
195          return ReportError(CStrVector("Unmatched ')'"));
196        }
197        DCHECK_NE(INITIAL, state->group_type());
198
199        Advance();
200        // End disjunction parsing and convert builder content to new single
201        // regexp atom.
202        RegExpTree* body = builder->ToRegExp();
203
204        int end_capture_index = captures_started();
205
206        int capture_index = state->capture_index();
207        SubexpressionType group_type = state->group_type();
208
209        // Build result of subexpression.
210        if (group_type == CAPTURE) {
211          if (state->IsNamedCapture()) {
212            CreateNamedCaptureAtIndex(state->capture_name(),
213                                      capture_index CHECK_FAILED);
214          }
215          RegExpCapture* capture = GetCapture(capture_index);
216          capture->set_body(body);
217          body = capture;
218        } else if (group_type != GROUPING) {
219          DCHECK(group_type == POSITIVE_LOOKAROUND ||
220                 group_type == NEGATIVE_LOOKAROUND);
221          bool is_positive = (group_type == POSITIVE_LOOKAROUND);
222          body = new (zone()) RegExpLookaround(
223              body, is_positive, end_capture_index - capture_index,
224              capture_index, state->lookaround_type());
225        }
226
227        // Restore previous state.
228        state = state->previous_state();
229        builder = state->builder();
230
231        builder->AddAtom(body);
232        // For compatability with JSC and ES3, we allow quantifiers after
233        // lookaheads, and break in all cases.
234        break;
235      }
236      case '|': {
237        Advance();
238        builder->NewAlternative();
239        continue;
240      }
241      case '*':
242      case '+':
243      case '?':
244        return ReportError(CStrVector("Nothing to repeat"));
245      case '^': {
246        Advance();
247        if (multiline()) {
248          builder->AddAssertion(
249              new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
250        } else {
251          builder->AddAssertion(
252              new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
253          set_contains_anchor();
254        }
255        continue;
256      }
257      case '$': {
258        Advance();
259        RegExpAssertion::AssertionType assertion_type =
260            multiline() ? RegExpAssertion::END_OF_LINE
261                        : RegExpAssertion::END_OF_INPUT;
262        builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
263        continue;
264      }
265      case '.': {
266        Advance();
267        // everything except \x0a, \x0d, \u2028 and \u2029
268        ZoneList<CharacterRange>* ranges =
269            new (zone()) ZoneList<CharacterRange>(2, zone());
270        CharacterRange::AddClassEscape('.', ranges, zone());
271        RegExpCharacterClass* cc =
272            new (zone()) RegExpCharacterClass(ranges, false);
273        builder->AddCharacterClass(cc);
274        break;
275      }
276      case '(': {
277        SubexpressionType subexpr_type = CAPTURE;
278        RegExpLookaround::Type lookaround_type = state->lookaround_type();
279        bool is_named_capture = false;
280        Advance();
281        if (current() == '?') {
282          switch (Next()) {
283            case ':':
284              subexpr_type = GROUPING;
285              Advance(2);
286              break;
287            case '=':
288              lookaround_type = RegExpLookaround::LOOKAHEAD;
289              subexpr_type = POSITIVE_LOOKAROUND;
290              Advance(2);
291              break;
292            case '!':
293              lookaround_type = RegExpLookaround::LOOKAHEAD;
294              subexpr_type = NEGATIVE_LOOKAROUND;
295              Advance(2);
296              break;
297            case '<':
298              Advance();
299              if (FLAG_harmony_regexp_lookbehind) {
300                if (Next() == '=') {
301                  subexpr_type = POSITIVE_LOOKAROUND;
302                  lookaround_type = RegExpLookaround::LOOKBEHIND;
303                  Advance(2);
304                  break;
305                } else if (Next() == '!') {
306                  subexpr_type = NEGATIVE_LOOKAROUND;
307                  lookaround_type = RegExpLookaround::LOOKBEHIND;
308                  Advance(2);
309                  break;
310                }
311              }
312              if (FLAG_harmony_regexp_named_captures && unicode()) {
313                is_named_capture = true;
314                Advance();
315                break;
316              }
317            // Fall through.
318            default:
319              return ReportError(CStrVector("Invalid group"));
320          }
321        }
322
323        const ZoneVector<uc16>* capture_name = nullptr;
324        if (subexpr_type == CAPTURE) {
325          if (captures_started_ >= kMaxCaptures) {
326            return ReportError(CStrVector("Too many captures"));
327          }
328          captures_started_++;
329
330          if (is_named_capture) {
331            capture_name = ParseCaptureGroupName(CHECK_FAILED);
332          }
333        }
334        // Store current state and begin new disjunction parsing.
335        state = new (zone()) RegExpParserState(
336            state, subexpr_type, lookaround_type, captures_started_,
337            capture_name, ignore_case(), unicode(), zone());
338        builder = state->builder();
339        continue;
340      }
341      case '[': {
342        RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
343        builder->AddCharacterClass(cc->AsCharacterClass());
344        break;
345      }
346      // Atom ::
347      //   \ AtomEscape
348      case '\\':
349        switch (Next()) {
350          case kEndMarker:
351            return ReportError(CStrVector("\\ at end of pattern"));
352          case 'b':
353            Advance(2);
354            builder->AddAssertion(
355                new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
356            continue;
357          case 'B':
358            Advance(2);
359            builder->AddAssertion(
360                new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
361            continue;
362          // AtomEscape ::
363          //   CharacterClassEscape
364          //
365          // CharacterClassEscape :: one of
366          //   d D s S w W
367          case 'd':
368          case 'D':
369          case 's':
370          case 'S':
371          case 'w':
372          case 'W': {
373            uc32 c = Next();
374            Advance(2);
375            ZoneList<CharacterRange>* ranges =
376                new (zone()) ZoneList<CharacterRange>(2, zone());
377            CharacterRange::AddClassEscape(c, ranges, zone());
378            RegExpCharacterClass* cc =
379                new (zone()) RegExpCharacterClass(ranges, false);
380            builder->AddCharacterClass(cc);
381            break;
382          }
383          case 'p':
384          case 'P': {
385            uc32 p = Next();
386            Advance(2);
387            if (unicode()) {
388              if (FLAG_harmony_regexp_property) {
389                ZoneList<CharacterRange>* ranges =
390                    new (zone()) ZoneList<CharacterRange>(2, zone());
391                if (!ParsePropertyClass(ranges, p == 'P')) {
392                  return ReportError(CStrVector("Invalid property name"));
393                }
394                RegExpCharacterClass* cc =
395                    new (zone()) RegExpCharacterClass(ranges, false);
396                builder->AddCharacterClass(cc);
397              } else {
398                // With /u, no identity escapes except for syntax characters
399                // are allowed. Otherwise, all identity escapes are allowed.
400                return ReportError(CStrVector("Invalid escape"));
401              }
402            } else {
403              builder->AddCharacter(p);
404            }
405            break;
406          }
407          case '1':
408          case '2':
409          case '3':
410          case '4':
411          case '5':
412          case '6':
413          case '7':
414          case '8':
415          case '9': {
416            int index = 0;
417            bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
418            if (is_backref) {
419              if (state->IsInsideCaptureGroup(index)) {
420                // The back reference is inside the capture group it refers to.
421                // Nothing can possibly have been captured yet, so we use empty
422                // instead. This ensures that, when checking a back reference,
423                // the capture registers of the referenced capture are either
424                // both set or both cleared.
425                builder->AddEmpty();
426              } else {
427                RegExpCapture* capture = GetCapture(index);
428                RegExpTree* atom = new (zone()) RegExpBackReference(capture);
429                builder->AddAtom(atom);
430              }
431              break;
432            }
433            // With /u, no identity escapes except for syntax characters
434            // are allowed. Otherwise, all identity escapes are allowed.
435            if (unicode()) {
436              return ReportError(CStrVector("Invalid escape"));
437            }
438            uc32 first_digit = Next();
439            if (first_digit == '8' || first_digit == '9') {
440              builder->AddCharacter(first_digit);
441              Advance(2);
442              break;
443            }
444          }
445          // Fall through.
446          case '0': {
447            Advance();
448            if (unicode() && Next() >= '0' && Next() <= '9') {
449              // With /u, decimal escape with leading 0 are not parsed as octal.
450              return ReportError(CStrVector("Invalid decimal escape"));
451            }
452            uc32 octal = ParseOctalLiteral();
453            builder->AddCharacter(octal);
454            break;
455          }
456          // ControlEscape :: one of
457          //   f n r t v
458          case 'f':
459            Advance(2);
460            builder->AddCharacter('\f');
461            break;
462          case 'n':
463            Advance(2);
464            builder->AddCharacter('\n');
465            break;
466          case 'r':
467            Advance(2);
468            builder->AddCharacter('\r');
469            break;
470          case 't':
471            Advance(2);
472            builder->AddCharacter('\t');
473            break;
474          case 'v':
475            Advance(2);
476            builder->AddCharacter('\v');
477            break;
478          case 'c': {
479            Advance();
480            uc32 controlLetter = Next();
481            // Special case if it is an ASCII letter.
482            // Convert lower case letters to uppercase.
483            uc32 letter = controlLetter & ~('a' ^ 'A');
484            if (letter < 'A' || 'Z' < letter) {
485              // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
486              // This is outside the specification. We match JSC in
487              // reading the backslash as a literal character instead
488              // of as starting an escape.
489              if (unicode()) {
490                // With /u, invalid escapes are not treated as identity escapes.
491                return ReportError(CStrVector("Invalid unicode escape"));
492              }
493              builder->AddCharacter('\\');
494            } else {
495              Advance(2);
496              builder->AddCharacter(controlLetter & 0x1f);
497            }
498            break;
499          }
500          case 'x': {
501            Advance(2);
502            uc32 value;
503            if (ParseHexEscape(2, &value)) {
504              builder->AddCharacter(value);
505            } else if (!unicode()) {
506              builder->AddCharacter('x');
507            } else {
508              // With /u, invalid escapes are not treated as identity escapes.
509              return ReportError(CStrVector("Invalid escape"));
510            }
511            break;
512          }
513          case 'u': {
514            Advance(2);
515            uc32 value;
516            if (ParseUnicodeEscape(&value)) {
517              builder->AddEscapedUnicodeCharacter(value);
518            } else if (!unicode()) {
519              builder->AddCharacter('u');
520            } else {
521              // With /u, invalid escapes are not treated as identity escapes.
522              return ReportError(CStrVector("Invalid unicode escape"));
523            }
524            break;
525          }
526          case 'k':
527            if (FLAG_harmony_regexp_named_captures && unicode()) {
528              Advance(2);
529              ParseNamedBackReference(builder, state CHECK_FAILED);
530              break;
531            }
532          // Fall through.
533          default:
534            Advance();
535            // With /u, no identity escapes except for syntax characters
536            // are allowed. Otherwise, all identity escapes are allowed.
537            if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
538              builder->AddCharacter(current());
539              Advance();
540            } else {
541              return ReportError(CStrVector("Invalid escape"));
542            }
543            break;
544        }
545        break;
546      case '{': {
547        int dummy;
548        bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
549        if (parsed) return ReportError(CStrVector("Nothing to repeat"));
550        // Fall through.
551      }
552      case '}':
553      case ']':
554        if (unicode()) {
555          return ReportError(CStrVector("Lone quantifier brackets"));
556        }
557      // Fall through.
558      default:
559        builder->AddUnicodeCharacter(current());
560        Advance();
561        break;
562    }  // end switch(current())
563
564    int min;
565    int max;
566    switch (current()) {
567      // QuantifierPrefix ::
568      //   *
569      //   +
570      //   ?
571      //   {
572      case '*':
573        min = 0;
574        max = RegExpTree::kInfinity;
575        Advance();
576        break;
577      case '+':
578        min = 1;
579        max = RegExpTree::kInfinity;
580        Advance();
581        break;
582      case '?':
583        min = 0;
584        max = 1;
585        Advance();
586        break;
587      case '{':
588        if (ParseIntervalQuantifier(&min, &max)) {
589          if (max < min) {
590            return ReportError(
591                CStrVector("numbers out of order in {} quantifier"));
592          }
593          break;
594        } else if (unicode()) {
595          // With /u, incomplete quantifiers are not allowed.
596          return ReportError(CStrVector("Incomplete quantifier"));
597        }
598        continue;
599      default:
600        continue;
601    }
602    RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
603    if (current() == '?') {
604      quantifier_type = RegExpQuantifier::NON_GREEDY;
605      Advance();
606    } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
607      // FLAG_regexp_possessive_quantifier is a debug-only flag.
608      quantifier_type = RegExpQuantifier::POSSESSIVE;
609      Advance();
610    }
611    if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
612      return ReportError(CStrVector("Invalid quantifier"));
613    }
614  }
615}
616
617
618#ifdef DEBUG
619// Currently only used in an DCHECK.
620static bool IsSpecialClassEscape(uc32 c) {
621  switch (c) {
622    case 'd':
623    case 'D':
624    case 's':
625    case 'S':
626    case 'w':
627    case 'W':
628      return true;
629    default:
630      return false;
631  }
632}
633#endif
634
635
636// In order to know whether an escape is a backreference or not we have to scan
637// the entire regexp and find the number of capturing parentheses.  However we
638// don't want to scan the regexp twice unless it is necessary.  This mini-parser
639// is called when needed.  It can see the difference between capturing and
640// noncapturing parentheses and can skip character classes and backslash-escaped
641// characters.
642void RegExpParser::ScanForCaptures() {
643  // Start with captures started previous to current position
644  int capture_count = captures_started();
645  // Add count of captures after this position.
646  int n;
647  while ((n = current()) != kEndMarker) {
648    Advance();
649    switch (n) {
650      case '\\':
651        Advance();
652        break;
653      case '[': {
654        int c;
655        while ((c = current()) != kEndMarker) {
656          Advance();
657          if (c == '\\') {
658            Advance();
659          } else {
660            if (c == ']') break;
661          }
662        }
663        break;
664      }
665      case '(':
666        if (current() != '?') capture_count++;
667        break;
668    }
669  }
670  capture_count_ = capture_count;
671  is_scanned_for_captures_ = true;
672}
673
674
675bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
676  DCHECK_EQ('\\', current());
677  DCHECK('1' <= Next() && Next() <= '9');
678  // Try to parse a decimal literal that is no greater than the total number
679  // of left capturing parentheses in the input.
680  int start = position();
681  int value = Next() - '0';
682  Advance(2);
683  while (true) {
684    uc32 c = current();
685    if (IsDecimalDigit(c)) {
686      value = 10 * value + (c - '0');
687      if (value > kMaxCaptures) {
688        Reset(start);
689        return false;
690      }
691      Advance();
692    } else {
693      break;
694    }
695  }
696  if (value > captures_started()) {
697    if (!is_scanned_for_captures_) {
698      int saved_position = position();
699      ScanForCaptures();
700      Reset(saved_position);
701    }
702    if (value > capture_count_) {
703      Reset(start);
704      return false;
705    }
706  }
707  *index_out = value;
708  return true;
709}
710
711static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
712  if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
713    v->push_back(code_unit);
714  } else {
715    v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
716    v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
717  }
718}
719
720const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
721  DCHECK(FLAG_harmony_regexp_named_captures);
722  DCHECK(unicode());
723
724  ZoneVector<uc16>* name =
725      new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
726
727  bool at_start = true;
728  while (true) {
729    uc32 c = current();
730    Advance();
731
732    // Convert unicode escapes.
733    if (c == '\\' && current() == 'u') {
734      Advance();
735      if (!ParseUnicodeEscape(&c)) {
736        ReportError(CStrVector("Invalid Unicode escape sequence"));
737        return nullptr;
738      }
739    }
740
741    if (at_start) {
742      if (!IdentifierStart::Is(c)) {
743        ReportError(CStrVector("Invalid capture group name"));
744        return nullptr;
745      }
746      push_code_unit(name, c);
747      at_start = false;
748    } else {
749      if (c == '>') {
750        break;
751      } else if (IdentifierPart::Is(c)) {
752        push_code_unit(name, c);
753      } else {
754        ReportError(CStrVector("Invalid capture group name"));
755        return nullptr;
756      }
757    }
758  }
759
760  return name;
761}
762
763bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
764                                             int index) {
765  DCHECK(FLAG_harmony_regexp_named_captures);
766  DCHECK(unicode());
767  DCHECK(0 < index && index <= captures_started_);
768  DCHECK_NOT_NULL(name);
769
770  if (named_captures_ == nullptr) {
771    named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
772  } else {
773    // Check for duplicates and bail if we find any.
774    for (const auto& named_capture : *named_captures_) {
775      if (*named_capture->name() == *name) {
776        ReportError(CStrVector("Duplicate capture group name"));
777        return false;
778      }
779    }
780  }
781
782  RegExpCapture* capture = GetCapture(index);
783  DCHECK(capture->name() == nullptr);
784
785  capture->set_name(name);
786  named_captures_->Add(capture, zone());
787
788  return true;
789}
790
791bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
792                                           RegExpParserState* state) {
793  // The parser is assumed to be on the '<' in \k<name>.
794  if (current() != '<') {
795    ReportError(CStrVector("Invalid named reference"));
796    return false;
797  }
798
799  Advance();
800  const ZoneVector<uc16>* name = ParseCaptureGroupName();
801  if (name == nullptr) {
802    return false;
803  }
804
805  if (state->IsInsideCaptureGroup(name)) {
806    builder->AddEmpty();
807  } else {
808    RegExpBackReference* atom = new (zone()) RegExpBackReference();
809    atom->set_name(name);
810
811    builder->AddAtom(atom);
812
813    if (named_back_references_ == nullptr) {
814      named_back_references_ =
815          new (zone()) ZoneList<RegExpBackReference*>(1, zone());
816    }
817    named_back_references_->Add(atom, zone());
818  }
819
820  return true;
821}
822
823void RegExpParser::PatchNamedBackReferences() {
824  if (named_back_references_ == nullptr) return;
825
826  if (named_captures_ == nullptr) {
827    ReportError(CStrVector("Invalid named capture referenced"));
828    return;
829  }
830
831  // Look up and patch the actual capture for each named back reference.
832  // TODO(jgruber): O(n^2), optimize if necessary.
833
834  for (int i = 0; i < named_back_references_->length(); i++) {
835    RegExpBackReference* ref = named_back_references_->at(i);
836
837    int index = -1;
838    for (const auto& capture : *named_captures_) {
839      if (*capture->name() == *ref->name()) {
840        index = capture->index();
841        break;
842      }
843    }
844
845    if (index == -1) {
846      ReportError(CStrVector("Invalid named capture referenced"));
847      return;
848    }
849
850    ref->set_capture(GetCapture(index));
851  }
852}
853
854RegExpCapture* RegExpParser::GetCapture(int index) {
855  // The index for the capture groups are one-based. Its index in the list is
856  // zero-based.
857  int know_captures =
858      is_scanned_for_captures_ ? capture_count_ : captures_started_;
859  DCHECK(index <= know_captures);
860  if (captures_ == NULL) {
861    captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
862  }
863  while (captures_->length() < know_captures) {
864    captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
865  }
866  return captures_->at(index - 1);
867}
868
869Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
870  if (named_captures_ == nullptr || named_captures_->is_empty())
871    return Handle<FixedArray>();
872
873  Factory* factory = isolate()->factory();
874
875  int len = named_captures_->length() * 2;
876  Handle<FixedArray> array = factory->NewFixedArray(len);
877
878  for (int i = 0; i < named_captures_->length(); i++) {
879    RegExpCapture* capture = named_captures_->at(i);
880    MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
881    array->set(i * 2, *name.ToHandleChecked());
882    array->set(i * 2 + 1, Smi::FromInt(capture->index()));
883  }
884
885  return array;
886}
887
888bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
889  for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
890    if (s->group_type() != CAPTURE) continue;
891    // Return true if we found the matching capture index.
892    if (index == s->capture_index()) return true;
893    // Abort if index is larger than what has been parsed up till this state.
894    if (index > s->capture_index()) return false;
895  }
896  return false;
897}
898
899bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
900    const ZoneVector<uc16>* name) {
901  DCHECK_NOT_NULL(name);
902  for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
903    if (s->capture_name() == nullptr) continue;
904    if (*s->capture_name() == *name) return true;
905  }
906  return false;
907}
908
909// QuantifierPrefix ::
910//   { DecimalDigits }
911//   { DecimalDigits , }
912//   { DecimalDigits , DecimalDigits }
913//
914// Returns true if parsing succeeds, and set the min_out and max_out
915// values. Values are truncated to RegExpTree::kInfinity if they overflow.
916bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
917  DCHECK_EQ(current(), '{');
918  int start = position();
919  Advance();
920  int min = 0;
921  if (!IsDecimalDigit(current())) {
922    Reset(start);
923    return false;
924  }
925  while (IsDecimalDigit(current())) {
926    int next = current() - '0';
927    if (min > (RegExpTree::kInfinity - next) / 10) {
928      // Overflow. Skip past remaining decimal digits and return -1.
929      do {
930        Advance();
931      } while (IsDecimalDigit(current()));
932      min = RegExpTree::kInfinity;
933      break;
934    }
935    min = 10 * min + next;
936    Advance();
937  }
938  int max = 0;
939  if (current() == '}') {
940    max = min;
941    Advance();
942  } else if (current() == ',') {
943    Advance();
944    if (current() == '}') {
945      max = RegExpTree::kInfinity;
946      Advance();
947    } else {
948      while (IsDecimalDigit(current())) {
949        int next = current() - '0';
950        if (max > (RegExpTree::kInfinity - next) / 10) {
951          do {
952            Advance();
953          } while (IsDecimalDigit(current()));
954          max = RegExpTree::kInfinity;
955          break;
956        }
957        max = 10 * max + next;
958        Advance();
959      }
960      if (current() != '}') {
961        Reset(start);
962        return false;
963      }
964      Advance();
965    }
966  } else {
967    Reset(start);
968    return false;
969  }
970  *min_out = min;
971  *max_out = max;
972  return true;
973}
974
975
976uc32 RegExpParser::ParseOctalLiteral() {
977  DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
978  // For compatibility with some other browsers (not all), we parse
979  // up to three octal digits with a value below 256.
980  uc32 value = current() - '0';
981  Advance();
982  if ('0' <= current() && current() <= '7') {
983    value = value * 8 + current() - '0';
984    Advance();
985    if (value < 32 && '0' <= current() && current() <= '7') {
986      value = value * 8 + current() - '0';
987      Advance();
988    }
989  }
990  return value;
991}
992
993
994bool RegExpParser::ParseHexEscape(int length, uc32* value) {
995  int start = position();
996  uc32 val = 0;
997  for (int i = 0; i < length; ++i) {
998    uc32 c = current();
999    int d = HexValue(c);
1000    if (d < 0) {
1001      Reset(start);
1002      return false;
1003    }
1004    val = val * 16 + d;
1005    Advance();
1006  }
1007  *value = val;
1008  return true;
1009}
1010
1011// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1012bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1013  // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1014  // allowed). In the latter case, the number of hex digits between { } is
1015  // arbitrary. \ and u have already been read.
1016  if (current() == '{' && unicode()) {
1017    int start = position();
1018    Advance();
1019    if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
1020      if (current() == '}') {
1021        Advance();
1022        return true;
1023      }
1024    }
1025    Reset(start);
1026    return false;
1027  }
1028  // \u but no {, or \u{...} escapes not allowed.
1029  bool result = ParseHexEscape(4, value);
1030  if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1031      current() == '\\') {
1032    // Attempt to read trail surrogate.
1033    int start = position();
1034    if (Next() == 'u') {
1035      Advance(2);
1036      uc32 trail;
1037      if (ParseHexEscape(4, &trail) &&
1038          unibrow::Utf16::IsTrailSurrogate(trail)) {
1039        *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1040                                                      static_cast<uc16>(trail));
1041        return true;
1042      }
1043    }
1044    Reset(start);
1045  }
1046  return result;
1047}
1048
1049#ifdef V8_I18N_SUPPORT
1050
1051namespace {
1052
1053bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1054  const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1055  if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
1056  for (int i = 0;; i++) {
1057    const char* long_name = u_getPropertyName(
1058        property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1059    if (long_name == NULL) break;
1060    if (strcmp(property_name, long_name) == 0) return true;
1061  }
1062  return false;
1063}
1064
1065bool IsExactPropertyValueAlias(const char* property_value_name,
1066                               UProperty property, int32_t property_value) {
1067  const char* short_name =
1068      u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1069  if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
1070    return true;
1071  }
1072  for (int i = 0;; i++) {
1073    const char* long_name = u_getPropertyValueName(
1074        property, property_value,
1075        static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1076    if (long_name == NULL) break;
1077    if (strcmp(property_value_name, long_name) == 0) return true;
1078  }
1079  return false;
1080}
1081
1082bool LookupPropertyValueName(UProperty property,
1083                             const char* property_value_name, bool negate,
1084                             ZoneList<CharacterRange>* result, Zone* zone) {
1085  int32_t property_value =
1086      u_getPropertyValueEnum(property, property_value_name);
1087  if (property_value == UCHAR_INVALID_CODE) return false;
1088
1089  // We require the property name to match exactly to one of the property value
1090  // aliases. However, u_getPropertyValueEnum uses loose matching.
1091  if (!IsExactPropertyValueAlias(property_value_name, property,
1092                                 property_value)) {
1093    return false;
1094  }
1095
1096  USet* set = uset_openEmpty();
1097  UErrorCode ec = U_ZERO_ERROR;
1098  uset_applyIntPropertyValue(set, property, property_value, &ec);
1099  bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set);
1100
1101  if (success) {
1102    uset_removeAllStrings(set);
1103    if (negate) uset_complement(set);
1104    int item_count = uset_getItemCount(set);
1105    int item_result = 0;
1106    for (int i = 0; i < item_count; i++) {
1107      uc32 start = 0;
1108      uc32 end = 0;
1109      item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
1110      result->Add(CharacterRange::Range(start, end), zone);
1111    }
1112    DCHECK_EQ(U_ZERO_ERROR, ec);
1113    DCHECK_EQ(0, item_result);
1114  }
1115  uset_close(set);
1116  return success;
1117}
1118
1119template <size_t N>
1120inline bool NameEquals(const char* name, const char (&literal)[N]) {
1121  return strncmp(name, literal, N + 1) == 0;
1122}
1123
1124bool LookupSpecialPropertyValueName(const char* name,
1125                                    ZoneList<CharacterRange>* result,
1126                                    bool negate, Zone* zone) {
1127  if (NameEquals(name, "Any")) {
1128    if (!negate) result->Add(CharacterRange::Everything(), zone);
1129  } else if (NameEquals(name, "ASCII")) {
1130    result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1131                       : CharacterRange::Range(0x0, 0x7f),
1132                zone);
1133  } else if (NameEquals(name, "Assigned")) {
1134    return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1135                                   !negate, result, zone);
1136  } else {
1137    return false;
1138  }
1139  return true;
1140}
1141
1142}  // anonymous namespace
1143
1144bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1145                                      bool negate) {
1146  // Parse the property class as follows:
1147  // - In \p{name}, 'name' is interpreted
1148  //   - either as a general category property value name.
1149  //   - or as a binary property name.
1150  // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1151  //   and 'value' is interpreted as one of the available property value names.
1152  // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1153  // - Loose matching is not applied.
1154  List<char> first_part;
1155  List<char> second_part;
1156  if (current() == '{') {
1157    // Parse \p{[PropertyName=]PropertyNameValue}
1158    for (Advance(); current() != '}' && current() != '='; Advance()) {
1159      if (!has_next()) return false;
1160      first_part.Add(static_cast<char>(current()));
1161    }
1162    if (current() == '=') {
1163      for (Advance(); current() != '}'; Advance()) {
1164        if (!has_next()) return false;
1165        second_part.Add(static_cast<char>(current()));
1166      }
1167      second_part.Add(0);  // null-terminate string.
1168    }
1169  } else {
1170    return false;
1171  }
1172  Advance();
1173  first_part.Add(0);  // null-terminate string.
1174
1175  if (second_part.is_empty()) {
1176    // First attempt to interpret as general category property value name.
1177    const char* name = first_part.ToConstVector().start();
1178    if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1179                                result, zone())) {
1180      return true;
1181    }
1182    // Interpret "Any", "ASCII", and "Assigned".
1183    if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1184      return true;
1185    }
1186    // Then attempt to interpret as binary property name with value name 'Y'.
1187    UProperty property = u_getPropertyEnum(name);
1188    if (property < UCHAR_BINARY_START) return false;
1189    if (property >= UCHAR_BINARY_LIMIT) return false;
1190    if (!IsExactPropertyAlias(name, property)) return false;
1191    return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1192                                   zone());
1193  } else {
1194    // Both property name and value name are specified. Attempt to interpret
1195    // the property name as enumerated property.
1196    const char* property_name = first_part.ToConstVector().start();
1197    const char* value_name = second_part.ToConstVector().start();
1198    UProperty property = u_getPropertyEnum(property_name);
1199    if (property < UCHAR_INT_START) return false;
1200    if (property >= UCHAR_INT_LIMIT) return false;
1201    if (!IsExactPropertyAlias(property_name, property)) return false;
1202    return LookupPropertyValueName(property, value_name, negate, result,
1203                                   zone());
1204  }
1205}
1206
1207#else  // V8_I18N_SUPPORT
1208
1209bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1210                                      bool negate) {
1211  return false;
1212}
1213
1214#endif  // V8_I18N_SUPPORT
1215
1216bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1217  uc32 x = 0;
1218  int d = HexValue(current());
1219  if (d < 0) {
1220    return false;
1221  }
1222  while (d >= 0) {
1223    x = x * 16 + d;
1224    if (x > max_value) {
1225      return false;
1226    }
1227    Advance();
1228    d = HexValue(current());
1229  }
1230  *value = x;
1231  return true;
1232}
1233
1234
1235uc32 RegExpParser::ParseClassCharacterEscape() {
1236  DCHECK(current() == '\\');
1237  DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1238  Advance();
1239  switch (current()) {
1240    case 'b':
1241      Advance();
1242      return '\b';
1243    // ControlEscape :: one of
1244    //   f n r t v
1245    case 'f':
1246      Advance();
1247      return '\f';
1248    case 'n':
1249      Advance();
1250      return '\n';
1251    case 'r':
1252      Advance();
1253      return '\r';
1254    case 't':
1255      Advance();
1256      return '\t';
1257    case 'v':
1258      Advance();
1259      return '\v';
1260    case 'c': {
1261      uc32 controlLetter = Next();
1262      uc32 letter = controlLetter & ~('A' ^ 'a');
1263      // For compatibility with JSC, inside a character class. We also accept
1264      // digits and underscore as control characters, unless with /u.
1265      if (letter >= 'A' && letter <= 'Z') {
1266        Advance(2);
1267        // Control letters mapped to ASCII control characters in the range
1268        // 0x00-0x1f.
1269        return controlLetter & 0x1f;
1270      }
1271      if (unicode()) {
1272        // With /u, invalid escapes are not treated as identity escapes.
1273        ReportError(CStrVector("Invalid class escape"));
1274        return 0;
1275      }
1276      if ((controlLetter >= '0' && controlLetter <= '9') ||
1277          controlLetter == '_') {
1278        Advance(2);
1279        return controlLetter & 0x1f;
1280      }
1281      // We match JSC in reading the backslash as a literal
1282      // character instead of as starting an escape.
1283      return '\\';
1284    }
1285    case '0':
1286      // With /u, \0 is interpreted as NUL if not followed by another digit.
1287      if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1288        Advance();
1289        return 0;
1290      }
1291    // Fall through.
1292    case '1':
1293    case '2':
1294    case '3':
1295    case '4':
1296    case '5':
1297    case '6':
1298    case '7':
1299      // For compatibility, we interpret a decimal escape that isn't
1300      // a back reference (and therefore either \0 or not valid according
1301      // to the specification) as a 1..3 digit octal character code.
1302      if (unicode()) {
1303        // With /u, decimal escape is not interpreted as octal character code.
1304        ReportError(CStrVector("Invalid class escape"));
1305        return 0;
1306      }
1307      return ParseOctalLiteral();
1308    case 'x': {
1309      Advance();
1310      uc32 value;
1311      if (ParseHexEscape(2, &value)) return value;
1312      if (unicode()) {
1313        // With /u, invalid escapes are not treated as identity escapes.
1314        ReportError(CStrVector("Invalid escape"));
1315        return 0;
1316      }
1317      // If \x is not followed by a two-digit hexadecimal, treat it
1318      // as an identity escape.
1319      return 'x';
1320    }
1321    case 'u': {
1322      Advance();
1323      uc32 value;
1324      if (ParseUnicodeEscape(&value)) return value;
1325      if (unicode()) {
1326        // With /u, invalid escapes are not treated as identity escapes.
1327        ReportError(CStrVector("Invalid unicode escape"));
1328        return 0;
1329      }
1330      // If \u is not followed by a two-digit hexadecimal, treat it
1331      // as an identity escape.
1332      return 'u';
1333    }
1334    default: {
1335      uc32 result = current();
1336      // With /u, no identity escapes except for syntax characters and '-' are
1337      // allowed. Otherwise, all identity escapes are allowed.
1338      if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1339        Advance();
1340        return result;
1341      }
1342      ReportError(CStrVector("Invalid escape"));
1343      return 0;
1344    }
1345  }
1346  return 0;
1347}
1348
1349
1350CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1351  DCHECK_EQ(0, *char_class);
1352  uc32 first = current();
1353  if (first == '\\') {
1354    switch (Next()) {
1355      case 'w':
1356      case 'W':
1357      case 'd':
1358      case 'D':
1359      case 's':
1360      case 'S': {
1361        *char_class = Next();
1362        Advance(2);
1363        return CharacterRange::Singleton(0);  // Return dummy value.
1364      }
1365      case kEndMarker:
1366        return ReportError(CStrVector("\\ at end of pattern"));
1367      default:
1368        first = ParseClassCharacterEscape(CHECK_FAILED);
1369    }
1370  } else {
1371    Advance();
1372  }
1373
1374  return CharacterRange::Singleton(first);
1375}
1376
1377static const uc16 kNoCharClass = 0;
1378
1379// Adds range or pre-defined character class to character ranges.
1380// If char_class is not kInvalidClass, it's interpreted as a class
1381// escape (i.e., 's' means whitespace, from '\s').
1382static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1383                                    uc16 char_class, CharacterRange range,
1384                                    Zone* zone) {
1385  if (char_class != kNoCharClass) {
1386    CharacterRange::AddClassEscape(char_class, ranges, zone);
1387  } else {
1388    ranges->Add(range, zone);
1389  }
1390}
1391
1392bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1393  if (!FLAG_harmony_regexp_property) return false;
1394  if (!unicode()) return false;
1395  if (current() != '\\') return false;
1396  uc32 next = Next();
1397  bool parse_success = false;
1398  if (next == 'p') {
1399    Advance(2);
1400    parse_success = ParsePropertyClass(ranges, false);
1401  } else if (next == 'P') {
1402    Advance(2);
1403    parse_success = ParsePropertyClass(ranges, true);
1404  } else {
1405    return false;
1406  }
1407  if (!parse_success)
1408    ReportError(CStrVector("Invalid property name in character class"));
1409  return parse_success;
1410}
1411
1412RegExpTree* RegExpParser::ParseCharacterClass() {
1413  static const char* kUnterminated = "Unterminated character class";
1414  static const char* kRangeInvalid = "Invalid character class";
1415  static const char* kRangeOutOfOrder = "Range out of order in character class";
1416
1417  DCHECK_EQ(current(), '[');
1418  Advance();
1419  bool is_negated = false;
1420  if (current() == '^') {
1421    is_negated = true;
1422    Advance();
1423  }
1424  ZoneList<CharacterRange>* ranges =
1425      new (zone()) ZoneList<CharacterRange>(2, zone());
1426  while (has_more() && current() != ']') {
1427    bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1428    if (parsed_property) continue;
1429    uc16 char_class = kNoCharClass;
1430    CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1431    if (current() == '-') {
1432      Advance();
1433      if (current() == kEndMarker) {
1434        // If we reach the end we break out of the loop and let the
1435        // following code report an error.
1436        break;
1437      } else if (current() == ']') {
1438        AddRangeOrEscape(ranges, char_class, first, zone());
1439        ranges->Add(CharacterRange::Singleton('-'), zone());
1440        break;
1441      }
1442      uc16 char_class_2 = kNoCharClass;
1443      CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1444      if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1445        // Either end is an escaped character class. Treat the '-' verbatim.
1446        if (unicode()) {
1447          // ES2015 21.2.2.15.1 step 1.
1448          return ReportError(CStrVector(kRangeInvalid));
1449        }
1450        AddRangeOrEscape(ranges, char_class, first, zone());
1451        ranges->Add(CharacterRange::Singleton('-'), zone());
1452        AddRangeOrEscape(ranges, char_class_2, next, zone());
1453        continue;
1454      }
1455      // ES2015 21.2.2.15.1 step 6.
1456      if (first.from() > next.to()) {
1457        return ReportError(CStrVector(kRangeOutOfOrder));
1458      }
1459      ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1460    } else {
1461      AddRangeOrEscape(ranges, char_class, first, zone());
1462    }
1463  }
1464  if (!has_more()) {
1465    return ReportError(CStrVector(kUnterminated));
1466  }
1467  Advance();
1468  if (ranges->length() == 0) {
1469    ranges->Add(CharacterRange::Everything(), zone());
1470    is_negated = !is_negated;
1471  }
1472  return new (zone()) RegExpCharacterClass(ranges, is_negated);
1473}
1474
1475
1476#undef CHECK_FAILED
1477
1478
1479bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1480                               FlatStringReader* input, JSRegExp::Flags flags,
1481                               RegExpCompileData* result) {
1482  DCHECK(result != NULL);
1483  RegExpParser parser(input, &result->error, flags, isolate, zone);
1484  RegExpTree* tree = parser.ParsePattern();
1485  if (parser.failed()) {
1486    DCHECK(tree == NULL);
1487    DCHECK(!result->error.is_null());
1488  } else {
1489    DCHECK(tree != NULL);
1490    DCHECK(result->error.is_null());
1491    if (FLAG_trace_regexp_parser) {
1492      OFStream os(stdout);
1493      tree->Print(os, zone);
1494      os << "\n";
1495    }
1496    result->tree = tree;
1497    int capture_count = parser.captures_started();
1498    result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1499    result->contains_anchor = parser.contains_anchor();
1500    result->capture_name_map = parser.CreateCaptureNameMap();
1501    result->capture_count = capture_count;
1502  }
1503  return !parser.failed();
1504}
1505
1506RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
1507    : zone_(zone),
1508      pending_empty_(false),
1509      ignore_case_(ignore_case),
1510      unicode_(unicode),
1511      characters_(NULL),
1512      pending_surrogate_(kNoPendingSurrogate),
1513      terms_(),
1514      alternatives_()
1515#ifdef DEBUG
1516      ,
1517      last_added_(ADD_NONE)
1518#endif
1519{
1520}
1521
1522
1523void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1524  DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1525  FlushPendingSurrogate();
1526  // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1527  pending_surrogate_ = lead_surrogate;
1528}
1529
1530
1531void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1532  DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1533  if (pending_surrogate_ != kNoPendingSurrogate) {
1534    uc16 lead_surrogate = pending_surrogate_;
1535    pending_surrogate_ = kNoPendingSurrogate;
1536    DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1537    uc32 combined =
1538        unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1539    if (NeedsDesugaringForIgnoreCase(combined)) {
1540      AddCharacterClassForDesugaring(combined);
1541    } else {
1542      ZoneList<uc16> surrogate_pair(2, zone());
1543      surrogate_pair.Add(lead_surrogate, zone());
1544      surrogate_pair.Add(trail_surrogate, zone());
1545      RegExpAtom* atom =
1546          new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1547      AddAtom(atom);
1548    }
1549  } else {
1550    pending_surrogate_ = trail_surrogate;
1551    FlushPendingSurrogate();
1552  }
1553}
1554
1555
1556void RegExpBuilder::FlushPendingSurrogate() {
1557  if (pending_surrogate_ != kNoPendingSurrogate) {
1558    DCHECK(unicode());
1559    uc32 c = pending_surrogate_;
1560    pending_surrogate_ = kNoPendingSurrogate;
1561    AddCharacterClassForDesugaring(c);
1562  }
1563}
1564
1565
1566void RegExpBuilder::FlushCharacters() {
1567  FlushPendingSurrogate();
1568  pending_empty_ = false;
1569  if (characters_ != NULL) {
1570    RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1571    characters_ = NULL;
1572    text_.Add(atom, zone());
1573    LAST(ADD_ATOM);
1574  }
1575}
1576
1577
1578void RegExpBuilder::FlushText() {
1579  FlushCharacters();
1580  int num_text = text_.length();
1581  if (num_text == 0) {
1582    return;
1583  } else if (num_text == 1) {
1584    terms_.Add(text_.last(), zone());
1585  } else {
1586    RegExpText* text = new (zone()) RegExpText(zone());
1587    for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1588    terms_.Add(text, zone());
1589  }
1590  text_.Clear();
1591}
1592
1593
1594void RegExpBuilder::AddCharacter(uc16 c) {
1595  FlushPendingSurrogate();
1596  pending_empty_ = false;
1597  if (NeedsDesugaringForIgnoreCase(c)) {
1598    AddCharacterClassForDesugaring(c);
1599  } else {
1600    if (characters_ == NULL) {
1601      characters_ = new (zone()) ZoneList<uc16>(4, zone());
1602    }
1603    characters_->Add(c, zone());
1604    LAST(ADD_CHAR);
1605  }
1606}
1607
1608
1609void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1610  if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
1611    DCHECK(unicode());
1612    AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1613    AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1614  } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1615    AddLeadSurrogate(c);
1616  } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1617    AddTrailSurrogate(c);
1618  } else {
1619    AddCharacter(static_cast<uc16>(c));
1620  }
1621}
1622
1623void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1624  // A lead or trail surrogate parsed via escape sequence will not
1625  // pair up with any preceding lead or following trail surrogate.
1626  FlushPendingSurrogate();
1627  AddUnicodeCharacter(character);
1628  FlushPendingSurrogate();
1629}
1630
1631void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1632
1633
1634void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1635  if (NeedsDesugaringForUnicode(cc)) {
1636    // With /u, character class needs to be desugared, so it
1637    // must be a standalone term instead of being part of a RegExpText.
1638    AddTerm(cc);
1639  } else {
1640    AddAtom(cc);
1641  }
1642}
1643
1644void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1645  AddTerm(new (zone()) RegExpCharacterClass(
1646      CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1647}
1648
1649
1650void RegExpBuilder::AddAtom(RegExpTree* term) {
1651  if (term->IsEmpty()) {
1652    AddEmpty();
1653    return;
1654  }
1655  if (term->IsTextElement()) {
1656    FlushCharacters();
1657    text_.Add(term, zone());
1658  } else {
1659    FlushText();
1660    terms_.Add(term, zone());
1661  }
1662  LAST(ADD_ATOM);
1663}
1664
1665
1666void RegExpBuilder::AddTerm(RegExpTree* term) {
1667  FlushText();
1668  terms_.Add(term, zone());
1669  LAST(ADD_ATOM);
1670}
1671
1672
1673void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1674  FlushText();
1675  terms_.Add(assert, zone());
1676  LAST(ADD_ASSERT);
1677}
1678
1679
1680void RegExpBuilder::NewAlternative() { FlushTerms(); }
1681
1682
1683void RegExpBuilder::FlushTerms() {
1684  FlushText();
1685  int num_terms = terms_.length();
1686  RegExpTree* alternative;
1687  if (num_terms == 0) {
1688    alternative = new (zone()) RegExpEmpty();
1689  } else if (num_terms == 1) {
1690    alternative = terms_.last();
1691  } else {
1692    alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1693  }
1694  alternatives_.Add(alternative, zone());
1695  terms_.Clear();
1696  LAST(ADD_NONE);
1697}
1698
1699
1700bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1701  if (!unicode()) return false;
1702  // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1703  // necessarily mean that we need to desugar. It's probably nicer to have a
1704  // separate pass to figure out unicode desugarings.
1705  if (ignore_case()) return true;
1706  ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1707  CharacterRange::Canonicalize(ranges);
1708  for (int i = ranges->length() - 1; i >= 0; i--) {
1709    uc32 from = ranges->at(i).from();
1710    uc32 to = ranges->at(i).to();
1711    // Check for non-BMP characters.
1712    if (to >= kNonBmpStart) return true;
1713    // Check for lone surrogates.
1714    if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1715  }
1716  return false;
1717}
1718
1719
1720bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1721#ifdef V8_I18N_SUPPORT
1722  if (unicode() && ignore_case()) {
1723    USet* set = uset_open(c, c);
1724    uset_closeOver(set, USET_CASE_INSENSITIVE);
1725    uset_removeAllStrings(set);
1726    bool result = uset_size(set) > 1;
1727    uset_close(set);
1728    return result;
1729  }
1730  // In the case where ICU is not included, we act as if the unicode flag is
1731  // not set, and do not desugar.
1732#endif  // V8_I18N_SUPPORT
1733  return false;
1734}
1735
1736
1737RegExpTree* RegExpBuilder::ToRegExp() {
1738  FlushTerms();
1739  int num_alternatives = alternatives_.length();
1740  if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1741  if (num_alternatives == 1) return alternatives_.last();
1742  return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1743}
1744
1745bool RegExpBuilder::AddQuantifierToAtom(
1746    int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1747  FlushPendingSurrogate();
1748  if (pending_empty_) {
1749    pending_empty_ = false;
1750    return true;
1751  }
1752  RegExpTree* atom;
1753  if (characters_ != NULL) {
1754    DCHECK(last_added_ == ADD_CHAR);
1755    // Last atom was character.
1756    Vector<const uc16> char_vector = characters_->ToConstVector();
1757    int num_chars = char_vector.length();
1758    if (num_chars > 1) {
1759      Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1760      text_.Add(new (zone()) RegExpAtom(prefix), zone());
1761      char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1762    }
1763    characters_ = NULL;
1764    atom = new (zone()) RegExpAtom(char_vector);
1765    FlushText();
1766  } else if (text_.length() > 0) {
1767    DCHECK(last_added_ == ADD_ATOM);
1768    atom = text_.RemoveLast();
1769    FlushText();
1770  } else if (terms_.length() > 0) {
1771    DCHECK(last_added_ == ADD_ATOM);
1772    atom = terms_.RemoveLast();
1773    // With /u, lookarounds are not quantifiable.
1774    if (unicode() && atom->IsLookaround()) return false;
1775    if (atom->max_match() == 0) {
1776      // Guaranteed to only match an empty string.
1777      LAST(ADD_TERM);
1778      if (min == 0) {
1779        return true;
1780      }
1781      terms_.Add(atom, zone());
1782      return true;
1783    }
1784  } else {
1785    // Only call immediately after adding an atom or character!
1786    UNREACHABLE();
1787    return false;
1788  }
1789  terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1790             zone());
1791  LAST(ADD_TERM);
1792  return true;
1793}
1794
1795}  // namespace internal
1796}  // namespace v8
1797