1// Copyright 2016 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_REGEXP_AST_H_
6#define V8_REGEXP_REGEXP_AST_H_
7
8#include "src/objects.h"
9#include "src/utils.h"
10#include "src/zone/zone-containers.h"
11#include "src/zone/zone.h"
12
13namespace v8 {
14namespace internal {
15
16#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
17  VISIT(Disjunction)                      \
18  VISIT(Alternative)                      \
19  VISIT(Assertion)                        \
20  VISIT(CharacterClass)                   \
21  VISIT(Atom)                             \
22  VISIT(Quantifier)                       \
23  VISIT(Capture)                          \
24  VISIT(Group)                            \
25  VISIT(Lookaround)                       \
26  VISIT(BackReference)                    \
27  VISIT(Empty)                            \
28  VISIT(Text)
29
30#define FORWARD_DECLARE(Name) class RegExp##Name;
31FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
32#undef FORWARD_DECLARE
33
34class RegExpCompiler;
35class RegExpNode;
36class RegExpTree;
37
38
39class RegExpVisitor BASE_EMBEDDED {
40 public:
41  virtual ~RegExpVisitor() {}
42#define MAKE_CASE(Name) \
43  virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
44  FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
45#undef MAKE_CASE
46};
47
48
49// A simple closed interval.
50class Interval {
51 public:
52  Interval() : from_(kNone), to_(kNone) {}
53  Interval(int from, int to) : from_(from), to_(to) {}
54  Interval Union(Interval that) {
55    if (that.from_ == kNone)
56      return *this;
57    else if (from_ == kNone)
58      return that;
59    else
60      return Interval(Min(from_, that.from_), Max(to_, that.to_));
61  }
62  bool Contains(int value) { return (from_ <= value) && (value <= to_); }
63  bool is_empty() { return from_ == kNone; }
64  int from() const { return from_; }
65  int to() const { return to_; }
66  static Interval Empty() { return Interval(); }
67  static const int kNone = -1;
68
69 private:
70  int from_;
71  int to_;
72};
73
74
75// Represents code units in the range from from_ to to_, both ends are
76// inclusive.
77class CharacterRange {
78 public:
79  CharacterRange() : from_(0), to_(0) {}
80  // For compatibility with the CHECK_OK macro
81  CharacterRange(void* null) { DCHECK_NULL(null); }  // NOLINT
82  static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
83                             Zone* zone);
84  static Vector<const int> GetWordBounds();
85  static inline CharacterRange Singleton(uc32 value) {
86    return CharacterRange(value, value);
87  }
88  static inline CharacterRange Range(uc32 from, uc32 to) {
89    DCHECK(0 <= from && to <= String::kMaxCodePoint);
90    DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
91    return CharacterRange(from, to);
92  }
93  static inline CharacterRange Everything() {
94    return CharacterRange(0, String::kMaxCodePoint);
95  }
96  static inline ZoneList<CharacterRange>* List(Zone* zone,
97                                               CharacterRange range) {
98    ZoneList<CharacterRange>* list =
99        new (zone) ZoneList<CharacterRange>(1, zone);
100    list->Add(range, zone);
101    return list;
102  }
103  bool Contains(uc32 i) { return from_ <= i && i <= to_; }
104  uc32 from() const { return from_; }
105  void set_from(uc32 value) { from_ = value; }
106  uc32 to() const { return to_; }
107  void set_to(uc32 value) { to_ = value; }
108  bool is_valid() { return from_ <= to_; }
109  bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
110  bool IsSingleton() { return (from_ == to_); }
111  static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
112                                 ZoneList<CharacterRange>* ranges,
113                                 bool is_one_byte);
114  // Whether a range list is in canonical form: Ranges ordered by from value,
115  // and ranges non-overlapping and non-adjacent.
116  static bool IsCanonical(ZoneList<CharacterRange>* ranges);
117  // Convert range list to canonical form. The characters covered by the ranges
118  // will still be the same, but no character is in more than one range, and
119  // adjacent ranges are merged. The resulting list may be shorter than the
120  // original, but cannot be longer.
121  static void Canonicalize(ZoneList<CharacterRange>* ranges);
122  // Negate the contents of a character range in canonical form.
123  static void Negate(ZoneList<CharacterRange>* src,
124                     ZoneList<CharacterRange>* dst, Zone* zone);
125  static const int kStartMarker = (1 << 24);
126  static const int kPayloadMask = (1 << 24) - 1;
127
128 private:
129  CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
130
131  uc32 from_;
132  uc32 to_;
133};
134
135
136class CharacterSet final BASE_EMBEDDED {
137 public:
138  explicit CharacterSet(uc16 standard_set_type)
139      : ranges_(NULL), standard_set_type_(standard_set_type) {}
140  explicit CharacterSet(ZoneList<CharacterRange>* ranges)
141      : ranges_(ranges), standard_set_type_(0) {}
142  ZoneList<CharacterRange>* ranges(Zone* zone);
143  uc16 standard_set_type() { return standard_set_type_; }
144  void set_standard_set_type(uc16 special_set_type) {
145    standard_set_type_ = special_set_type;
146  }
147  bool is_standard() { return standard_set_type_ != 0; }
148  void Canonicalize();
149
150 private:
151  ZoneList<CharacterRange>* ranges_;
152  // If non-zero, the value represents a standard set (e.g., all whitespace
153  // characters) without having to expand the ranges.
154  uc16 standard_set_type_;
155};
156
157
158class TextElement final BASE_EMBEDDED {
159 public:
160  enum TextType { ATOM, CHAR_CLASS };
161
162  static TextElement Atom(RegExpAtom* atom);
163  static TextElement CharClass(RegExpCharacterClass* char_class);
164
165  int cp_offset() const { return cp_offset_; }
166  void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
167  int length() const;
168
169  TextType text_type() const { return text_type_; }
170
171  RegExpTree* tree() const { return tree_; }
172
173  RegExpAtom* atom() const {
174    DCHECK(text_type() == ATOM);
175    return reinterpret_cast<RegExpAtom*>(tree());
176  }
177
178  RegExpCharacterClass* char_class() const {
179    DCHECK(text_type() == CHAR_CLASS);
180    return reinterpret_cast<RegExpCharacterClass*>(tree());
181  }
182
183 private:
184  TextElement(TextType text_type, RegExpTree* tree)
185      : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
186
187  int cp_offset_;
188  TextType text_type_;
189  RegExpTree* tree_;
190};
191
192
193class RegExpTree : public ZoneObject {
194 public:
195  static const int kInfinity = kMaxInt;
196  virtual ~RegExpTree() {}
197  virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
198  virtual RegExpNode* ToNode(RegExpCompiler* compiler,
199                             RegExpNode* on_success) = 0;
200  virtual bool IsTextElement() { return false; }
201  virtual bool IsAnchoredAtStart() { return false; }
202  virtual bool IsAnchoredAtEnd() { return false; }
203  virtual int min_match() = 0;
204  virtual int max_match() = 0;
205  // Returns the interval of registers used for captures within this
206  // expression.
207  virtual Interval CaptureRegisters() { return Interval::Empty(); }
208  virtual void AppendToText(RegExpText* text, Zone* zone);
209  std::ostream& Print(std::ostream& os, Zone* zone);  // NOLINT
210#define MAKE_ASTYPE(Name)           \
211  virtual RegExp##Name* As##Name(); \
212  virtual bool Is##Name();
213  FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
214#undef MAKE_ASTYPE
215};
216
217
218class RegExpDisjunction final : public RegExpTree {
219 public:
220  explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
221  void* Accept(RegExpVisitor* visitor, void* data) override;
222  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
223  RegExpDisjunction* AsDisjunction() override;
224  Interval CaptureRegisters() override;
225  bool IsDisjunction() override;
226  bool IsAnchoredAtStart() override;
227  bool IsAnchoredAtEnd() override;
228  int min_match() override { return min_match_; }
229  int max_match() override { return max_match_; }
230  ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
231
232 private:
233  bool SortConsecutiveAtoms(RegExpCompiler* compiler);
234  void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
235  void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
236  ZoneList<RegExpTree*>* alternatives_;
237  int min_match_;
238  int max_match_;
239};
240
241
242class RegExpAlternative final : public RegExpTree {
243 public:
244  explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
245  void* Accept(RegExpVisitor* visitor, void* data) override;
246  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
247  RegExpAlternative* AsAlternative() override;
248  Interval CaptureRegisters() override;
249  bool IsAlternative() override;
250  bool IsAnchoredAtStart() override;
251  bool IsAnchoredAtEnd() override;
252  int min_match() override { return min_match_; }
253  int max_match() override { return max_match_; }
254  ZoneList<RegExpTree*>* nodes() { return nodes_; }
255
256 private:
257  ZoneList<RegExpTree*>* nodes_;
258  int min_match_;
259  int max_match_;
260};
261
262
263class RegExpAssertion final : public RegExpTree {
264 public:
265  enum AssertionType {
266    START_OF_LINE,
267    START_OF_INPUT,
268    END_OF_LINE,
269    END_OF_INPUT,
270    BOUNDARY,
271    NON_BOUNDARY
272  };
273  explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
274  void* Accept(RegExpVisitor* visitor, void* data) override;
275  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
276  RegExpAssertion* AsAssertion() override;
277  bool IsAssertion() override;
278  bool IsAnchoredAtStart() override;
279  bool IsAnchoredAtEnd() override;
280  int min_match() override { return 0; }
281  int max_match() override { return 0; }
282  AssertionType assertion_type() { return assertion_type_; }
283
284 private:
285  AssertionType assertion_type_;
286};
287
288
289class RegExpCharacterClass final : public RegExpTree {
290 public:
291  RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
292      : set_(ranges), is_negated_(is_negated) {}
293  explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
294  void* Accept(RegExpVisitor* visitor, void* data) override;
295  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
296  RegExpCharacterClass* AsCharacterClass() override;
297  bool IsCharacterClass() override;
298  bool IsTextElement() override { return true; }
299  int min_match() override { return 1; }
300  // The character class may match two code units for unicode regexps.
301  // TODO(yangguo): we should split this class for usage in TextElement, and
302  //                make max_match() dependent on the character class content.
303  int max_match() override { return 2; }
304  void AppendToText(RegExpText* text, Zone* zone) override;
305  CharacterSet character_set() { return set_; }
306  // TODO(lrn): Remove need for complex version if is_standard that
307  // recognizes a mangled standard set and just do { return set_.is_special(); }
308  bool is_standard(Zone* zone);
309  // Returns a value representing the standard character set if is_standard()
310  // returns true.
311  // Currently used values are:
312  // s : unicode whitespace
313  // S : unicode non-whitespace
314  // w : ASCII word character (digit, letter, underscore)
315  // W : non-ASCII word character
316  // d : ASCII digit
317  // D : non-ASCII digit
318  // . : non-newline
319  // * : All characters, for advancing unanchored regexp
320  uc16 standard_type() { return set_.standard_set_type(); }
321  ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
322  bool is_negated() { return is_negated_; }
323
324 private:
325  CharacterSet set_;
326  bool is_negated_;
327};
328
329
330class RegExpAtom final : public RegExpTree {
331 public:
332  explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
333  void* Accept(RegExpVisitor* visitor, void* data) override;
334  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
335  RegExpAtom* AsAtom() override;
336  bool IsAtom() override;
337  bool IsTextElement() override { return true; }
338  int min_match() override { return data_.length(); }
339  int max_match() override { return data_.length(); }
340  void AppendToText(RegExpText* text, Zone* zone) override;
341  Vector<const uc16> data() { return data_; }
342  int length() { return data_.length(); }
343
344 private:
345  Vector<const uc16> data_;
346};
347
348
349class RegExpText final : public RegExpTree {
350 public:
351  explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
352  void* Accept(RegExpVisitor* visitor, void* data) override;
353  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
354  RegExpText* AsText() override;
355  bool IsText() override;
356  bool IsTextElement() override { return true; }
357  int min_match() override { return length_; }
358  int max_match() override { return length_; }
359  void AppendToText(RegExpText* text, Zone* zone) override;
360  void AddElement(TextElement elm, Zone* zone) {
361    elements_.Add(elm, zone);
362    length_ += elm.length();
363  }
364  ZoneList<TextElement>* elements() { return &elements_; }
365
366 private:
367  ZoneList<TextElement> elements_;
368  int length_;
369};
370
371
372class RegExpQuantifier final : public RegExpTree {
373 public:
374  enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
375  RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
376      : body_(body),
377        min_(min),
378        max_(max),
379        min_match_(min * body->min_match()),
380        quantifier_type_(type) {
381    if (max > 0 && body->max_match() > kInfinity / max) {
382      max_match_ = kInfinity;
383    } else {
384      max_match_ = max * body->max_match();
385    }
386  }
387  void* Accept(RegExpVisitor* visitor, void* data) override;
388  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
389  static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
390                            RegExpCompiler* compiler, RegExpNode* on_success,
391                            bool not_at_start = false);
392  RegExpQuantifier* AsQuantifier() override;
393  Interval CaptureRegisters() override;
394  bool IsQuantifier() override;
395  int min_match() override { return min_match_; }
396  int max_match() override { return max_match_; }
397  int min() { return min_; }
398  int max() { return max_; }
399  bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
400  bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
401  bool is_greedy() { return quantifier_type_ == GREEDY; }
402  RegExpTree* body() { return body_; }
403
404 private:
405  RegExpTree* body_;
406  int min_;
407  int max_;
408  int min_match_;
409  int max_match_;
410  QuantifierType quantifier_type_;
411};
412
413
414class RegExpCapture final : public RegExpTree {
415 public:
416  explicit RegExpCapture(int index)
417      : body_(NULL), index_(index), name_(nullptr) {}
418  void* Accept(RegExpVisitor* visitor, void* data) override;
419  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
420  static RegExpNode* ToNode(RegExpTree* body, int index,
421                            RegExpCompiler* compiler, RegExpNode* on_success);
422  RegExpCapture* AsCapture() override;
423  bool IsAnchoredAtStart() override;
424  bool IsAnchoredAtEnd() override;
425  Interval CaptureRegisters() override;
426  bool IsCapture() override;
427  int min_match() override { return body_->min_match(); }
428  int max_match() override { return body_->max_match(); }
429  RegExpTree* body() { return body_; }
430  void set_body(RegExpTree* body) { body_ = body; }
431  int index() { return index_; }
432  const ZoneVector<uc16>* name() const { return name_; }
433  void set_name(const ZoneVector<uc16>* name) { name_ = name; }
434  static int StartRegister(int index) { return index * 2; }
435  static int EndRegister(int index) { return index * 2 + 1; }
436
437 private:
438  RegExpTree* body_;
439  int index_;
440  const ZoneVector<uc16>* name_;
441};
442
443class RegExpGroup final : public RegExpTree {
444 public:
445  explicit RegExpGroup(RegExpTree* body) : body_(body) {}
446  void* Accept(RegExpVisitor* visitor, void* data) override;
447  RegExpNode* ToNode(RegExpCompiler* compiler,
448                     RegExpNode* on_success) override {
449    return body_->ToNode(compiler, on_success);
450  }
451  RegExpGroup* AsGroup() override;
452  bool IsAnchoredAtStart() override { return body_->IsAnchoredAtStart(); }
453  bool IsAnchoredAtEnd() override { return body_->IsAnchoredAtEnd(); }
454  bool IsGroup() override;
455  int min_match() override { return body_->min_match(); }
456  int max_match() override { return body_->max_match(); }
457  Interval CaptureRegisters() override { return body_->CaptureRegisters(); }
458  RegExpTree* body() { return body_; }
459
460 private:
461  RegExpTree* body_;
462};
463
464class RegExpLookaround final : public RegExpTree {
465 public:
466  enum Type { LOOKAHEAD, LOOKBEHIND };
467
468  RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
469                   int capture_from, Type type)
470      : body_(body),
471        is_positive_(is_positive),
472        capture_count_(capture_count),
473        capture_from_(capture_from),
474        type_(type) {}
475
476  void* Accept(RegExpVisitor* visitor, void* data) override;
477  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
478  RegExpLookaround* AsLookaround() override;
479  Interval CaptureRegisters() override;
480  bool IsLookaround() override;
481  bool IsAnchoredAtStart() override;
482  int min_match() override { return 0; }
483  int max_match() override { return 0; }
484  RegExpTree* body() { return body_; }
485  bool is_positive() { return is_positive_; }
486  int capture_count() { return capture_count_; }
487  int capture_from() { return capture_from_; }
488  Type type() { return type_; }
489
490  class Builder {
491   public:
492    Builder(bool is_positive, RegExpNode* on_success,
493            int stack_pointer_register, int position_register,
494            int capture_register_count = 0, int capture_register_start = 0);
495    RegExpNode* on_match_success() { return on_match_success_; }
496    RegExpNode* ForMatch(RegExpNode* match);
497
498   private:
499    bool is_positive_;
500    RegExpNode* on_match_success_;
501    RegExpNode* on_success_;
502    int stack_pointer_register_;
503    int position_register_;
504  };
505
506 private:
507  RegExpTree* body_;
508  bool is_positive_;
509  int capture_count_;
510  int capture_from_;
511  Type type_;
512};
513
514
515class RegExpBackReference final : public RegExpTree {
516 public:
517  RegExpBackReference() : capture_(nullptr), name_(nullptr) {}
518  explicit RegExpBackReference(RegExpCapture* capture)
519      : capture_(capture), name_(nullptr) {}
520  void* Accept(RegExpVisitor* visitor, void* data) override;
521  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
522  RegExpBackReference* AsBackReference() override;
523  bool IsBackReference() override;
524  int min_match() override { return 0; }
525  // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
526  // recursion, we give up. Ignorance is bliss.
527  int max_match() override { return kInfinity; }
528  int index() { return capture_->index(); }
529  RegExpCapture* capture() { return capture_; }
530  void set_capture(RegExpCapture* capture) { capture_ = capture; }
531  const ZoneVector<uc16>* name() const { return name_; }
532  void set_name(const ZoneVector<uc16>* name) { name_ = name; }
533
534 private:
535  RegExpCapture* capture_;
536  const ZoneVector<uc16>* name_;
537};
538
539
540class RegExpEmpty final : public RegExpTree {
541 public:
542  RegExpEmpty() {}
543  void* Accept(RegExpVisitor* visitor, void* data) override;
544  RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
545  RegExpEmpty* AsEmpty() override;
546  bool IsEmpty() override;
547  int min_match() override { return 0; }
548  int max_match() override { return 0; }
549};
550
551}  // namespace internal
552}  // namespace v8
553
554#endif  // V8_REGEXP_REGEXP_AST_H_
555