1014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// Copyright 2016 the V8 project authors. All rights reserved.
2014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// Use of this source code is governed by a BSD-style license that can be
3014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// found in the LICENSE file.
4014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
5014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#ifndef V8_REGEXP_REGEXP_PARSER_H_
6014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#define V8_REGEXP_REGEXP_PARSER_H_
7014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
8014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#include "src/objects.h"
9014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#include "src/regexp/regexp-ast.h"
10f3b273f5e6ffd2f6ba1c18a27a17db41dfb113c3Ben Murdoch#include "src/zone/zone.h"
11014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
12014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochnamespace v8 {
13014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochnamespace internal {
14014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
15014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochstruct RegExpCompileData;
16014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
17014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
18014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// A BufferedZoneList is an automatically growing list, just like (and backed
19014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// by) a ZoneList, that is optimized for the case of adding and removing
20014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// a single element. The last element added is stored outside the backing list,
21014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// and if no more than one element is ever added, the ZoneList isn't even
22014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// allocated.
23014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// Elements must not be NULL pointers.
24014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochtemplate <typename T, int initial_size>
25014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochclass BufferedZoneList {
26014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch public:
27014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  BufferedZoneList() : list_(NULL), last_(NULL) {}
28014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
29014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // Adds element at end of list. This element is buffered and can
30014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // be read using last() or removed using RemoveLast until a new Add or until
31014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // RemoveLast or GetList has been called.
32014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void Add(T* value, Zone* zone) {
33014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    if (last_ != NULL) {
34014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      if (list_ == NULL) {
35014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch        list_ = new (zone) ZoneList<T*>(initial_size, zone);
36014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      }
37014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      list_->Add(last_, zone);
38014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    }
39014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    last_ = value;
40014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  }
41014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
42014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  T* last() {
43014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    DCHECK(last_ != NULL);
44014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    return last_;
45014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  }
46014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
47014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  T* RemoveLast() {
48014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    DCHECK(last_ != NULL);
49014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    T* result = last_;
50014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    if ((list_ != NULL) && (list_->length() > 0))
51014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      last_ = list_->RemoveLast();
52014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    else
53014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      last_ = NULL;
54014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    return result;
55014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  }
56014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
57014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  T* Get(int i) {
58014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    DCHECK((0 <= i) && (i < length()));
59014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    if (list_ == NULL) {
60014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      DCHECK_EQ(0, i);
61014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      return last_;
62014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    } else {
63014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      if (i == list_->length()) {
64014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch        DCHECK(last_ != NULL);
65014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch        return last_;
66014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      } else {
67014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch        return list_->at(i);
68014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      }
69014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    }
70014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  }
71014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
72014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void Clear() {
73014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    list_ = NULL;
74014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    last_ = NULL;
75014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  }
76014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
77014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  int length() {
78014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    int length = (list_ == NULL) ? 0 : list_->length();
79014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    return length + ((last_ == NULL) ? 0 : 1);
80014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  }
81014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
82014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  ZoneList<T*>* GetList(Zone* zone) {
83014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    if (list_ == NULL) {
84014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      list_ = new (zone) ZoneList<T*>(initial_size, zone);
85014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    }
86014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    if (last_ != NULL) {
87014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      list_->Add(last_, zone);
88014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch      last_ = NULL;
89014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    }
90014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    return list_;
91014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  }
92014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
93014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch private:
94014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  ZoneList<T*>* list_;
95014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  T* last_;
96014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch};
97014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
98014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
99014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochclass RegExpBuilder : public ZoneObject {
101014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch public:
102109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
103014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void AddCharacter(uc16 character);
104014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void AddUnicodeCharacter(uc32 character);
105109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  void AddEscapedUnicodeCharacter(uc32 character);
106014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // "Adds" an empty expression. Does nothing except consume a
107014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // following quantifier
108014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void AddEmpty();
109109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  void AddCharacterClass(RegExpCharacterClass* cc);
110109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  void AddCharacterClassForDesugaring(uc32 c);
111014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void AddAtom(RegExpTree* tree);
112109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  void AddTerm(RegExpTree* tree);
113014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void AddAssertion(RegExpTree* tree);
114014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void NewAlternative();  // '|'
115109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool AddQuantifierToAtom(int min, int max,
116014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch                           RegExpQuantifier::QuantifierType type);
117014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  RegExpTree* ToRegExp();
118014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
119014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch private:
120109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  static const uc16 kNoPendingSurrogate = 0;
121109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  void AddLeadSurrogate(uc16 lead_surrogate);
122109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  void AddTrailSurrogate(uc16 trail_surrogate);
123109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  void FlushPendingSurrogate();
124014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void FlushCharacters();
125014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void FlushText();
126014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void FlushTerms();
127109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
128109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool NeedsDesugaringForIgnoreCase(uc32 c);
129014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  Zone* zone() const { return zone_; }
130109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool ignore_case() const { return ignore_case_; }
131109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool unicode() const { return unicode_; }
132014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
133014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  Zone* zone_;
134014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool pending_empty_;
135109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool ignore_case_;
136109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool unicode_;
137014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  ZoneList<uc16>* characters_;
138109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  uc16 pending_surrogate_;
139014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  BufferedZoneList<RegExpTree, 2> terms_;
140014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  BufferedZoneList<RegExpTree, 2> text_;
141014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  BufferedZoneList<RegExpTree, 2> alternatives_;
142014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#ifdef DEBUG
143014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
144014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#define LAST(x) last_added_ = x;
145014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#else
146014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#define LAST(x)
147014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#endif
148014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch};
149014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
150014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
151014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdochclass RegExpParser BASE_EMBEDDED {
152014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch public:
153109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  RegExpParser(FlatStringReader* in, Handle<String>* error,
154109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch               JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
155014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
156014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
157109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch                          JSRegExp::Flags flags, RegExpCompileData* result);
158014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
159014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  RegExpTree* ParsePattern();
160014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  RegExpTree* ParseDisjunction();
161014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  RegExpTree* ParseGroup();
162014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  RegExpTree* ParseCharacterClass();
163014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
164014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // Parses a {...,...} quantifier and stores the range in the given
165014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // out parameters.
166014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool ParseIntervalQuantifier(int* min_out, int* max_out);
167014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
168014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // Parses and returns a single escaped character.  The character
169014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // must not be 'b' or 'B' since they are usually handle specially.
170014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  uc32 ParseClassCharacterEscape();
171014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
172014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // Checks whether the following is a length-digit hexadecimal number,
173014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // and sets the value if it is.
174014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool ParseHexEscape(int length, uc32* value);
175014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool ParseUnicodeEscape(uc32* value);
176014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
17713e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
178014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
179014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  uc32 ParseOctalLiteral();
180014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
181014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // Tries to parse the input as a back reference.  If successful it
182014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // stores the result in the output parameter and returns true.  If
183014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // it fails it will push back the characters read so the same characters
184014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // can be reparsed.
185014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool ParseBackReferenceIndex(int* index_out);
186014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
1873b9bc31999c9787eb726ecdbfd5796bfdec32a18Ben Murdoch  bool ParseClassProperty(ZoneList<CharacterRange>* result);
188014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  CharacterRange ParseClassAtom(uc16* char_class);
189014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  RegExpTree* ReportError(Vector<const char> message);
190014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void Advance();
191014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void Advance(int dist);
192014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void Reset(int pos);
193014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
194014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // Reports whether the pattern might be used as a literal search string.
195014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // Only use if the result of the parse is a single atom node.
196014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool simple();
197014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool contains_anchor() { return contains_anchor_; }
198014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void set_contains_anchor() { contains_anchor_ = true; }
199014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  int captures_started() { return captures_started_; }
200014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  int position() { return next_pos_ - 1; }
201014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool failed() { return failed_; }
202109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool ignore_case() const { return ignore_case_; }
203109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool multiline() const { return multiline_; }
204109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool unicode() const { return unicode_; }
205014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
206109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  static bool IsSyntaxCharacterOrSlash(uc32 c);
207014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
208014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  static const int kMaxCaptures = 1 << 16;
209014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  static const uc32 kEndMarker = (1 << 21);
210014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
211014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch private:
212014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  enum SubexpressionType {
213014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    INITIAL,
214014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    CAPTURE,  // All positive values represent captures.
215014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    POSITIVE_LOOKAROUND,
216014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    NEGATIVE_LOOKAROUND,
217014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    GROUPING
218014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  };
219014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
220014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  class RegExpParserState : public ZoneObject {
221014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch   public:
222014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    RegExpParserState(RegExpParserState* previous_state,
223014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch                      SubexpressionType group_type,
224014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch                      RegExpLookaround::Type lookaround_type,
22513e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch                      int disjunction_capture_index,
22613e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch                      const ZoneVector<uc16>* capture_name, bool ignore_case,
227109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch                      bool unicode, Zone* zone)
228014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch        : previous_state_(previous_state),
229109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch          builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
230014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch          group_type_(group_type),
231014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch          lookaround_type_(lookaround_type),
23213e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch          disjunction_capture_index_(disjunction_capture_index),
23313e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch          capture_name_(capture_name) {}
234014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Parser state of containing expression, if any.
235014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    RegExpParserState* previous_state() { return previous_state_; }
236014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    bool IsSubexpression() { return previous_state_ != NULL; }
237014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // RegExpBuilder building this regexp's AST.
238014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    RegExpBuilder* builder() { return builder_; }
239014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Type of regexp being parsed (parenthesized group or entire regexp).
240014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    SubexpressionType group_type() { return group_type_; }
241014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Lookahead or Lookbehind.
242014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
243014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Index in captures array of first capture in this sub-expression, if any.
244014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Also the capture index of this sub-expression itself, if group_type
245014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // is CAPTURE.
246014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    int capture_index() { return disjunction_capture_index_; }
24713e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch    // The name of the current sub-expression, if group_type is CAPTURE. Only
24813e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch    // used for named captures.
24913e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch    const ZoneVector<uc16>* capture_name() { return capture_name_; }
25013e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch
25113e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch    bool IsNamedCapture() const { return capture_name_ != nullptr; }
252014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
253014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Check whether the parser is inside a capture group with the given index.
254014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    bool IsInsideCaptureGroup(int index);
25513e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch    // Check whether the parser is inside a capture group with the given name.
25613e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch    bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
257014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
258014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch   private:
259014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Linked list implementation of stack of states.
260014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    RegExpParserState* previous_state_;
261014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Builder for the stored disjunction.
262014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    RegExpBuilder* builder_;
263014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Stored disjunction type (capture, look-ahead or grouping), if any.
264014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    SubexpressionType group_type_;
265014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Stored read direction.
266014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    RegExpLookaround::Type lookaround_type_;
267014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    // Stored disjunction's capture index (if any).
268014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch    int disjunction_capture_index_;
26913e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch    // Stored capture name (if any).
27013e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch    const ZoneVector<uc16>* capture_name_;
271014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  };
272014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
273014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // Return the 1-indexed RegExpCapture object, allocate if necessary.
274014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  RegExpCapture* GetCapture(int index);
275014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
27613e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  // Creates a new named capture at the specified index. Must be called exactly
27713e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  // once for each named capture. Fails if a capture with the same name is
27813e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  // encountered.
27913e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
28013e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch
28113e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  // Parses the name of a capture group (?<name>pattern). The name must adhere
28213e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  // to IdentifierName in the ECMAScript standard.
28313e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  const ZoneVector<uc16>* ParseCaptureGroupName();
28413e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch
28513e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  bool ParseNamedBackReference(RegExpBuilder* builder,
28613e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch                               RegExpParserState* state);
28713e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch
28813e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  // After the initial parsing pass, patch corresponding RegExpCapture objects
28913e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  // into all RegExpBackReferences. This is done after initial parsing in order
29013e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  // to avoid complicating cases in which references comes before the capture.
29113e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  void PatchNamedBackReferences();
29213e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch
29313e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  Handle<FixedArray> CreateCaptureNameMap();
29413e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch
295014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  Isolate* isolate() { return isolate_; }
296014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  Zone* zone() const { return zone_; }
297014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
298014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  uc32 current() { return current_; }
299014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool has_more() { return has_more_; }
300014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool has_next() { return next_pos_ < in()->length(); }
301014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  uc32 Next();
302109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  template <bool update_position>
303109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  uc32 ReadNext();
304014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  FlatStringReader* in() { return in_; }
305014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  void ScanForCaptures();
306014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
307014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  Isolate* isolate_;
308014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  Zone* zone_;
309014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  Handle<String>* error_;
310014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  ZoneList<RegExpCapture*>* captures_;
31113e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  ZoneList<RegExpCapture*>* named_captures_;
31213e2dadd00298019ed862f2b2fc5068bba730bcfBen Murdoch  ZoneList<RegExpBackReference*>* named_back_references_;
313014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  FlatStringReader* in_;
314014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  uc32 current_;
315109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool ignore_case_;
316109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool multiline_;
317109988c7ccb6f3fd1a58574fa3dfb88beaef6632Ben Murdoch  bool unicode_;
318014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  int next_pos_;
319014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  int captures_started_;
320014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  // The capture count is only valid after we have scanned for captures.
321014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  int capture_count_;
322014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool has_more_;
323014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool simple_;
324014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool contains_anchor_;
325014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool is_scanned_for_captures_;
326014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch  bool failed_;
327014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch};
328014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
329014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch}  // namespace internal
330014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch}  // namespace v8
331014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch
332014dc512cdd3e367bee49a713fdc5ed92584a3e5Ben Murdoch#endif  // V8_REGEXP_REGEXP_PARSER_H_
333