regexp-parser.h revision f2e3994fa5148cc3d9946666f0b0596290192b0e
1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_PARSER_H_
6#define V8_REGEXP_REGEXP_PARSER_H_
7
8#include "src/objects.h"
9#include "src/regexp/regexp-ast.h"
10#include "src/zone.h"
11
12namespace v8 {
13namespace internal {
14
15struct RegExpCompileData;
16
17
18// A BufferedZoneList is an automatically growing list, just like (and backed
19// by) a ZoneList, that is optimized for the case of adding and removing
20// a single element. The last element added is stored outside the backing list,
21// and if no more than one element is ever added, the ZoneList isn't even
22// allocated.
23// Elements must not be NULL pointers.
24template <typename T, int initial_size>
25class BufferedZoneList {
26 public:
27  BufferedZoneList() : list_(NULL), last_(NULL) {}
28
29  // Adds element at end of list. This element is buffered and can
30  // be read using last() or removed using RemoveLast until a new Add or until
31  // RemoveLast or GetList has been called.
32  void Add(T* value, Zone* zone) {
33    if (last_ != NULL) {
34      if (list_ == NULL) {
35        list_ = new (zone) ZoneList<T*>(initial_size, zone);
36      }
37      list_->Add(last_, zone);
38    }
39    last_ = value;
40  }
41
42  T* last() {
43    DCHECK(last_ != NULL);
44    return last_;
45  }
46
47  T* RemoveLast() {
48    DCHECK(last_ != NULL);
49    T* result = last_;
50    if ((list_ != NULL) && (list_->length() > 0))
51      last_ = list_->RemoveLast();
52    else
53      last_ = NULL;
54    return result;
55  }
56
57  T* Get(int i) {
58    DCHECK((0 <= i) && (i < length()));
59    if (list_ == NULL) {
60      DCHECK_EQ(0, i);
61      return last_;
62    } else {
63      if (i == list_->length()) {
64        DCHECK(last_ != NULL);
65        return last_;
66      } else {
67        return list_->at(i);
68      }
69    }
70  }
71
72  void Clear() {
73    list_ = NULL;
74    last_ = NULL;
75  }
76
77  int length() {
78    int length = (list_ == NULL) ? 0 : list_->length();
79    return length + ((last_ == NULL) ? 0 : 1);
80  }
81
82  ZoneList<T*>* GetList(Zone* zone) {
83    if (list_ == NULL) {
84      list_ = new (zone) ZoneList<T*>(initial_size, zone);
85    }
86    if (last_ != NULL) {
87      list_->Add(last_, zone);
88      last_ = NULL;
89    }
90    return list_;
91  }
92
93 private:
94  ZoneList<T*>* list_;
95  T* last_;
96};
97
98
99// Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100class RegExpBuilder : public ZoneObject {
101 public:
102  explicit RegExpBuilder(Zone* zone);
103  void AddCharacter(uc16 character);
104  void AddUnicodeCharacter(uc32 character);
105  // "Adds" an empty expression. Does nothing except consume a
106  // following quantifier
107  void AddEmpty();
108  void AddAtom(RegExpTree* tree);
109  void AddAssertion(RegExpTree* tree);
110  void NewAlternative();  // '|'
111  void AddQuantifierToAtom(int min, int max,
112                           RegExpQuantifier::QuantifierType type);
113  RegExpTree* ToRegExp();
114
115 private:
116  void FlushCharacters();
117  void FlushText();
118  void FlushTerms();
119  Zone* zone() const { return zone_; }
120
121  Zone* zone_;
122  bool pending_empty_;
123  ZoneList<uc16>* characters_;
124  BufferedZoneList<RegExpTree, 2> terms_;
125  BufferedZoneList<RegExpTree, 2> text_;
126  BufferedZoneList<RegExpTree, 2> alternatives_;
127#ifdef DEBUG
128  enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
129#define LAST(x) last_added_ = x;
130#else
131#define LAST(x)
132#endif
133};
134
135
136class RegExpParser BASE_EMBEDDED {
137 public:
138  RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode,
139               bool unicode, Isolate* isolate, Zone* zone);
140
141  static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
142                          bool multiline, bool unicode,
143                          RegExpCompileData* result);
144
145  RegExpTree* ParsePattern();
146  RegExpTree* ParseDisjunction();
147  RegExpTree* ParseGroup();
148  RegExpTree* ParseCharacterClass();
149
150  // Parses a {...,...} quantifier and stores the range in the given
151  // out parameters.
152  bool ParseIntervalQuantifier(int* min_out, int* max_out);
153
154  // Parses and returns a single escaped character.  The character
155  // must not be 'b' or 'B' since they are usually handle specially.
156  uc32 ParseClassCharacterEscape();
157
158  // Checks whether the following is a length-digit hexadecimal number,
159  // and sets the value if it is.
160  bool ParseHexEscape(int length, uc32* value);
161  bool ParseUnicodeEscape(uc32* value);
162  bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
163
164  uc32 ParseOctalLiteral();
165
166  // Tries to parse the input as a back reference.  If successful it
167  // stores the result in the output parameter and returns true.  If
168  // it fails it will push back the characters read so the same characters
169  // can be reparsed.
170  bool ParseBackReferenceIndex(int* index_out);
171
172  CharacterRange ParseClassAtom(uc16* char_class);
173  RegExpTree* ReportError(Vector<const char> message);
174  void Advance();
175  void Advance(int dist);
176  void Reset(int pos);
177
178  // Reports whether the pattern might be used as a literal search string.
179  // Only use if the result of the parse is a single atom node.
180  bool simple();
181  bool contains_anchor() { return contains_anchor_; }
182  void set_contains_anchor() { contains_anchor_ = true; }
183  int captures_started() { return captures_started_; }
184  int position() { return next_pos_ - 1; }
185  bool failed() { return failed_; }
186
187  static bool IsSyntaxCharacter(uc32 c);
188
189  static const int kMaxCaptures = 1 << 16;
190  static const uc32 kEndMarker = (1 << 21);
191
192 private:
193  enum SubexpressionType {
194    INITIAL,
195    CAPTURE,  // All positive values represent captures.
196    POSITIVE_LOOKAROUND,
197    NEGATIVE_LOOKAROUND,
198    GROUPING
199  };
200
201  class RegExpParserState : public ZoneObject {
202   public:
203    RegExpParserState(RegExpParserState* previous_state,
204                      SubexpressionType group_type,
205                      RegExpLookaround::Type lookaround_type,
206                      int disjunction_capture_index, Zone* zone)
207        : previous_state_(previous_state),
208          builder_(new (zone) RegExpBuilder(zone)),
209          group_type_(group_type),
210          lookaround_type_(lookaround_type),
211          disjunction_capture_index_(disjunction_capture_index) {}
212    // Parser state of containing expression, if any.
213    RegExpParserState* previous_state() { return previous_state_; }
214    bool IsSubexpression() { return previous_state_ != NULL; }
215    // RegExpBuilder building this regexp's AST.
216    RegExpBuilder* builder() { return builder_; }
217    // Type of regexp being parsed (parenthesized group or entire regexp).
218    SubexpressionType group_type() { return group_type_; }
219    // Lookahead or Lookbehind.
220    RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
221    // Index in captures array of first capture in this sub-expression, if any.
222    // Also the capture index of this sub-expression itself, if group_type
223    // is CAPTURE.
224    int capture_index() { return disjunction_capture_index_; }
225
226    // Check whether the parser is inside a capture group with the given index.
227    bool IsInsideCaptureGroup(int index);
228
229   private:
230    // Linked list implementation of stack of states.
231    RegExpParserState* previous_state_;
232    // Builder for the stored disjunction.
233    RegExpBuilder* builder_;
234    // Stored disjunction type (capture, look-ahead or grouping), if any.
235    SubexpressionType group_type_;
236    // Stored read direction.
237    RegExpLookaround::Type lookaround_type_;
238    // Stored disjunction's capture index (if any).
239    int disjunction_capture_index_;
240  };
241
242  // Return the 1-indexed RegExpCapture object, allocate if necessary.
243  RegExpCapture* GetCapture(int index);
244
245  Isolate* isolate() { return isolate_; }
246  Zone* zone() const { return zone_; }
247
248  uc32 current() { return current_; }
249  bool has_more() { return has_more_; }
250  bool has_next() { return next_pos_ < in()->length(); }
251  uc32 Next();
252  FlatStringReader* in() { return in_; }
253  void ScanForCaptures();
254
255  Isolate* isolate_;
256  Zone* zone_;
257  Handle<String>* error_;
258  ZoneList<RegExpCapture*>* captures_;
259  FlatStringReader* in_;
260  uc32 current_;
261  int next_pos_;
262  int captures_started_;
263  // The capture count is only valid after we have scanned for captures.
264  int capture_count_;
265  bool has_more_;
266  bool multiline_;
267  bool unicode_;
268  bool simple_;
269  bool contains_anchor_;
270  bool is_scanned_for_captures_;
271  bool failed_;
272};
273
274}  // namespace internal
275}  // namespace v8
276
277#endif  // V8_REGEXP_REGEXP_PARSER_H_
278