1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_PARSER_H_
6#define V8_REGEXP_REGEXP_PARSER_H_
7
8#include "src/objects.h"
9#include "src/regexp/regexp-ast.h"
10#include "src/zone/zone.h"
11
12namespace v8 {
13namespace internal {
14
15struct RegExpCompileData;
16
17
18// A BufferedZoneList is an automatically growing list, just like (and backed
19// by) a ZoneList, that is optimized for the case of adding and removing
20// a single element. The last element added is stored outside the backing list,
21// and if no more than one element is ever added, the ZoneList isn't even
22// allocated.
23// Elements must not be NULL pointers.
24template <typename T, int initial_size>
25class BufferedZoneList {
26 public:
27  BufferedZoneList() : list_(NULL), last_(NULL) {}
28
29  // Adds element at end of list. This element is buffered and can
30  // be read using last() or removed using RemoveLast until a new Add or until
31  // RemoveLast or GetList has been called.
32  void Add(T* value, Zone* zone) {
33    if (last_ != NULL) {
34      if (list_ == NULL) {
35        list_ = new (zone) ZoneList<T*>(initial_size, zone);
36      }
37      list_->Add(last_, zone);
38    }
39    last_ = value;
40  }
41
42  T* last() {
43    DCHECK(last_ != NULL);
44    return last_;
45  }
46
47  T* RemoveLast() {
48    DCHECK(last_ != NULL);
49    T* result = last_;
50    if ((list_ != NULL) && (list_->length() > 0))
51      last_ = list_->RemoveLast();
52    else
53      last_ = NULL;
54    return result;
55  }
56
57  T* Get(int i) {
58    DCHECK((0 <= i) && (i < length()));
59    if (list_ == NULL) {
60      DCHECK_EQ(0, i);
61      return last_;
62    } else {
63      if (i == list_->length()) {
64        DCHECK(last_ != NULL);
65        return last_;
66      } else {
67        return list_->at(i);
68      }
69    }
70  }
71
72  void Clear() {
73    list_ = NULL;
74    last_ = NULL;
75  }
76
77  int length() {
78    int length = (list_ == NULL) ? 0 : list_->length();
79    return length + ((last_ == NULL) ? 0 : 1);
80  }
81
82  ZoneList<T*>* GetList(Zone* zone) {
83    if (list_ == NULL) {
84      list_ = new (zone) ZoneList<T*>(initial_size, zone);
85    }
86    if (last_ != NULL) {
87      list_->Add(last_, zone);
88      last_ = NULL;
89    }
90    return list_;
91  }
92
93 private:
94  ZoneList<T*>* list_;
95  T* last_;
96};
97
98
99// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100class RegExpBuilder : public ZoneObject {
101 public:
102  RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
103  void AddCharacter(uc16 character);
104  void AddUnicodeCharacter(uc32 character);
105  void AddEscapedUnicodeCharacter(uc32 character);
106  // "Adds" an empty expression. Does nothing except consume a
107  // following quantifier
108  void AddEmpty();
109  void AddCharacterClass(RegExpCharacterClass* cc);
110  void AddCharacterClassForDesugaring(uc32 c);
111  void AddAtom(RegExpTree* tree);
112  void AddTerm(RegExpTree* tree);
113  void AddAssertion(RegExpTree* tree);
114  void NewAlternative();  // '|'
115  bool AddQuantifierToAtom(int min, int max,
116                           RegExpQuantifier::QuantifierType type);
117  RegExpTree* ToRegExp();
118
119 private:
120  static const uc16 kNoPendingSurrogate = 0;
121  void AddLeadSurrogate(uc16 lead_surrogate);
122  void AddTrailSurrogate(uc16 trail_surrogate);
123  void FlushPendingSurrogate();
124  void FlushCharacters();
125  void FlushText();
126  void FlushTerms();
127  bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
128  bool NeedsDesugaringForIgnoreCase(uc32 c);
129  Zone* zone() const { return zone_; }
130  bool ignore_case() const { return ignore_case_; }
131  bool unicode() const { return unicode_; }
132
133  Zone* zone_;
134  bool pending_empty_;
135  bool ignore_case_;
136  bool unicode_;
137  ZoneList<uc16>* characters_;
138  uc16 pending_surrogate_;
139  BufferedZoneList<RegExpTree, 2> terms_;
140  BufferedZoneList<RegExpTree, 2> text_;
141  BufferedZoneList<RegExpTree, 2> alternatives_;
142#ifdef DEBUG
143  enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
144#define LAST(x) last_added_ = x;
145#else
146#define LAST(x)
147#endif
148};
149
150
151class RegExpParser BASE_EMBEDDED {
152 public:
153  RegExpParser(FlatStringReader* in, Handle<String>* error,
154               JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
155
156  static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
157                          JSRegExp::Flags flags, RegExpCompileData* result);
158
159  RegExpTree* ParsePattern();
160  RegExpTree* ParseDisjunction();
161  RegExpTree* ParseGroup();
162  RegExpTree* ParseCharacterClass();
163
164  // Parses a {...,...} quantifier and stores the range in the given
165  // out parameters.
166  bool ParseIntervalQuantifier(int* min_out, int* max_out);
167
168  // Parses and returns a single escaped character.  The character
169  // must not be 'b' or 'B' since they are usually handle specially.
170  uc32 ParseClassCharacterEscape();
171
172  // Checks whether the following is a length-digit hexadecimal number,
173  // and sets the value if it is.
174  bool ParseHexEscape(int length, uc32* value);
175  bool ParseUnicodeEscape(uc32* value);
176  bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
177  bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
178
179  uc32 ParseOctalLiteral();
180
181  // Tries to parse the input as a back reference.  If successful it
182  // stores the result in the output parameter and returns true.  If
183  // it fails it will push back the characters read so the same characters
184  // can be reparsed.
185  bool ParseBackReferenceIndex(int* index_out);
186
187  bool ParseClassProperty(ZoneList<CharacterRange>* result);
188  CharacterRange ParseClassAtom(uc16* char_class);
189  RegExpTree* ReportError(Vector<const char> message);
190  void Advance();
191  void Advance(int dist);
192  void Reset(int pos);
193
194  // Reports whether the pattern might be used as a literal search string.
195  // Only use if the result of the parse is a single atom node.
196  bool simple();
197  bool contains_anchor() { return contains_anchor_; }
198  void set_contains_anchor() { contains_anchor_ = true; }
199  int captures_started() { return captures_started_; }
200  int position() { return next_pos_ - 1; }
201  bool failed() { return failed_; }
202  bool ignore_case() const { return ignore_case_; }
203  bool multiline() const { return multiline_; }
204  bool unicode() const { return unicode_; }
205
206  static bool IsSyntaxCharacterOrSlash(uc32 c);
207
208  static const int kMaxCaptures = 1 << 16;
209  static const uc32 kEndMarker = (1 << 21);
210
211 private:
212  enum SubexpressionType {
213    INITIAL,
214    CAPTURE,  // All positive values represent captures.
215    POSITIVE_LOOKAROUND,
216    NEGATIVE_LOOKAROUND,
217    GROUPING
218  };
219
220  class RegExpParserState : public ZoneObject {
221   public:
222    RegExpParserState(RegExpParserState* previous_state,
223                      SubexpressionType group_type,
224                      RegExpLookaround::Type lookaround_type,
225                      int disjunction_capture_index,
226                      const ZoneVector<uc16>* capture_name, bool ignore_case,
227                      bool unicode, Zone* zone)
228        : previous_state_(previous_state),
229          builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
230          group_type_(group_type),
231          lookaround_type_(lookaround_type),
232          disjunction_capture_index_(disjunction_capture_index),
233          capture_name_(capture_name) {}
234    // Parser state of containing expression, if any.
235    RegExpParserState* previous_state() { return previous_state_; }
236    bool IsSubexpression() { return previous_state_ != NULL; }
237    // RegExpBuilder building this regexp's AST.
238    RegExpBuilder* builder() { return builder_; }
239    // Type of regexp being parsed (parenthesized group or entire regexp).
240    SubexpressionType group_type() { return group_type_; }
241    // Lookahead or Lookbehind.
242    RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
243    // Index in captures array of first capture in this sub-expression, if any.
244    // Also the capture index of this sub-expression itself, if group_type
245    // is CAPTURE.
246    int capture_index() { return disjunction_capture_index_; }
247    // The name of the current sub-expression, if group_type is CAPTURE. Only
248    // used for named captures.
249    const ZoneVector<uc16>* capture_name() { return capture_name_; }
250
251    bool IsNamedCapture() const { return capture_name_ != nullptr; }
252
253    // Check whether the parser is inside a capture group with the given index.
254    bool IsInsideCaptureGroup(int index);
255    // Check whether the parser is inside a capture group with the given name.
256    bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
257
258   private:
259    // Linked list implementation of stack of states.
260    RegExpParserState* previous_state_;
261    // Builder for the stored disjunction.
262    RegExpBuilder* builder_;
263    // Stored disjunction type (capture, look-ahead or grouping), if any.
264    SubexpressionType group_type_;
265    // Stored read direction.
266    RegExpLookaround::Type lookaround_type_;
267    // Stored disjunction's capture index (if any).
268    int disjunction_capture_index_;
269    // Stored capture name (if any).
270    const ZoneVector<uc16>* capture_name_;
271  };
272
273  // Return the 1-indexed RegExpCapture object, allocate if necessary.
274  RegExpCapture* GetCapture(int index);
275
276  // Creates a new named capture at the specified index. Must be called exactly
277  // once for each named capture. Fails if a capture with the same name is
278  // encountered.
279  bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
280
281  // Parses the name of a capture group (?<name>pattern). The name must adhere
282  // to IdentifierName in the ECMAScript standard.
283  const ZoneVector<uc16>* ParseCaptureGroupName();
284
285  bool ParseNamedBackReference(RegExpBuilder* builder,
286                               RegExpParserState* state);
287
288  // After the initial parsing pass, patch corresponding RegExpCapture objects
289  // into all RegExpBackReferences. This is done after initial parsing in order
290  // to avoid complicating cases in which references comes before the capture.
291  void PatchNamedBackReferences();
292
293  Handle<FixedArray> CreateCaptureNameMap();
294
295  Isolate* isolate() { return isolate_; }
296  Zone* zone() const { return zone_; }
297
298  uc32 current() { return current_; }
299  bool has_more() { return has_more_; }
300  bool has_next() { return next_pos_ < in()->length(); }
301  uc32 Next();
302  template <bool update_position>
303  uc32 ReadNext();
304  FlatStringReader* in() { return in_; }
305  void ScanForCaptures();
306
307  Isolate* isolate_;
308  Zone* zone_;
309  Handle<String>* error_;
310  ZoneList<RegExpCapture*>* captures_;
311  ZoneList<RegExpCapture*>* named_captures_;
312  ZoneList<RegExpBackReference*>* named_back_references_;
313  FlatStringReader* in_;
314  uc32 current_;
315  bool ignore_case_;
316  bool multiline_;
317  bool unicode_;
318  int next_pos_;
319  int captures_started_;
320  // The capture count is only valid after we have scanned for captures.
321  int capture_count_;
322  bool has_more_;
323  bool simple_;
324  bool contains_anchor_;
325  bool is_scanned_for_captures_;
326  bool failed_;
327};
328
329}  // namespace internal
330}  // namespace v8
331
332#endif  // V8_REGEXP_REGEXP_PARSER_H_
333