1// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_JSREGEXP_H_
6#define V8_JSREGEXP_H_
7
8#include "src/allocation.h"
9#include "src/assembler.h"
10#include "src/zone-inl.h"
11
12namespace v8 {
13namespace internal {
14
15class NodeVisitor;
16class RegExpCompiler;
17class RegExpMacroAssembler;
18class RegExpNode;
19class RegExpTree;
20class BoyerMooreLookahead;
21
22class RegExpImpl {
23 public:
24  // Whether V8 is compiled with native regexp support or not.
25  static bool UsesNativeRegExp() {
26#ifdef V8_INTERPRETED_REGEXP
27    return false;
28#else
29    return true;
30#endif
31  }
32
33  // Creates a regular expression literal in the old space.
34  // This function calls the garbage collector if necessary.
35  MUST_USE_RESULT static MaybeHandle<Object> CreateRegExpLiteral(
36      Handle<JSFunction> constructor,
37      Handle<String> pattern,
38      Handle<String> flags);
39
40  // Returns a string representation of a regular expression.
41  // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
42  // This function calls the garbage collector if necessary.
43  static Handle<String> ToString(Handle<Object> value);
44
45  // Parses the RegExp pattern and prepares the JSRegExp object with
46  // generic data and choice of implementation - as well as what
47  // the implementation wants to store in the data field.
48  // Returns false if compilation fails.
49  MUST_USE_RESULT static MaybeHandle<Object> Compile(
50      Handle<JSRegExp> re,
51      Handle<String> pattern,
52      Handle<String> flags);
53
54  // See ECMA-262 section 15.10.6.2.
55  // This function calls the garbage collector if necessary.
56  MUST_USE_RESULT static MaybeHandle<Object> Exec(
57      Handle<JSRegExp> regexp,
58      Handle<String> subject,
59      int index,
60      Handle<JSArray> lastMatchInfo);
61
62  // Prepares a JSRegExp object with Irregexp-specific data.
63  static void IrregexpInitialize(Handle<JSRegExp> re,
64                                 Handle<String> pattern,
65                                 JSRegExp::Flags flags,
66                                 int capture_register_count);
67
68
69  static void AtomCompile(Handle<JSRegExp> re,
70                          Handle<String> pattern,
71                          JSRegExp::Flags flags,
72                          Handle<String> match_pattern);
73
74
75  static int AtomExecRaw(Handle<JSRegExp> regexp,
76                         Handle<String> subject,
77                         int index,
78                         int32_t* output,
79                         int output_size);
80
81
82  static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
83                                 Handle<String> subject,
84                                 int index,
85                                 Handle<JSArray> lastMatchInfo);
86
87  enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
88
89  // Prepare a RegExp for being executed one or more times (using
90  // IrregexpExecOnce) on the subject.
91  // This ensures that the regexp is compiled for the subject, and that
92  // the subject is flat.
93  // Returns the number of integer spaces required by IrregexpExecOnce
94  // as its "registers" argument.  If the regexp cannot be compiled,
95  // an exception is set as pending, and this function returns negative.
96  static int IrregexpPrepare(Handle<JSRegExp> regexp,
97                             Handle<String> subject);
98
99  // Execute a regular expression on the subject, starting from index.
100  // If matching succeeds, return the number of matches.  This can be larger
101  // than one in the case of global regular expressions.
102  // The captures and subcaptures are stored into the registers vector.
103  // If matching fails, returns RE_FAILURE.
104  // If execution fails, sets a pending exception and returns RE_EXCEPTION.
105  static int IrregexpExecRaw(Handle<JSRegExp> regexp,
106                             Handle<String> subject,
107                             int index,
108                             int32_t* output,
109                             int output_size);
110
111  // Execute an Irregexp bytecode pattern.
112  // On a successful match, the result is a JSArray containing
113  // captured positions.  On a failure, the result is the null value.
114  // Returns an empty handle in case of an exception.
115  MUST_USE_RESULT static MaybeHandle<Object> IrregexpExec(
116      Handle<JSRegExp> regexp,
117      Handle<String> subject,
118      int index,
119      Handle<JSArray> lastMatchInfo);
120
121  // Set last match info.  If match is NULL, then setting captures is omitted.
122  static Handle<JSArray> SetLastMatchInfo(Handle<JSArray> last_match_info,
123                                          Handle<String> subject,
124                                          int capture_count,
125                                          int32_t* match);
126
127
128  class GlobalCache {
129   public:
130    GlobalCache(Handle<JSRegExp> regexp,
131                Handle<String> subject,
132                bool is_global,
133                Isolate* isolate);
134
135    INLINE(~GlobalCache());
136
137    // Fetch the next entry in the cache for global regexp match results.
138    // This does not set the last match info.  Upon failure, NULL is returned.
139    // The cause can be checked with Result().  The previous
140    // result is still in available in memory when a failure happens.
141    INLINE(int32_t* FetchNext());
142
143    INLINE(int32_t* LastSuccessfulMatch());
144
145    INLINE(bool HasException()) { return num_matches_ < 0; }
146
147   private:
148    int num_matches_;
149    int max_matches_;
150    int current_match_index_;
151    int registers_per_match_;
152    // Pointer to the last set of captures.
153    int32_t* register_array_;
154    int register_array_size_;
155    Handle<JSRegExp> regexp_;
156    Handle<String> subject_;
157  };
158
159
160  // Array index in the lastMatchInfo array.
161  static const int kLastCaptureCount = 0;
162  static const int kLastSubject = 1;
163  static const int kLastInput = 2;
164  static const int kFirstCapture = 3;
165  static const int kLastMatchOverhead = 3;
166
167  // Direct offset into the lastMatchInfo array.
168  static const int kLastCaptureCountOffset =
169      FixedArray::kHeaderSize + kLastCaptureCount * kPointerSize;
170  static const int kLastSubjectOffset =
171      FixedArray::kHeaderSize + kLastSubject * kPointerSize;
172  static const int kLastInputOffset =
173      FixedArray::kHeaderSize + kLastInput * kPointerSize;
174  static const int kFirstCaptureOffset =
175      FixedArray::kHeaderSize + kFirstCapture * kPointerSize;
176
177  // Used to access the lastMatchInfo array.
178  static int GetCapture(FixedArray* array, int index) {
179    return Smi::cast(array->get(index + kFirstCapture))->value();
180  }
181
182  static void SetLastCaptureCount(FixedArray* array, int to) {
183    array->set(kLastCaptureCount, Smi::FromInt(to));
184  }
185
186  static void SetLastSubject(FixedArray* array, String* to) {
187    array->set(kLastSubject, to);
188  }
189
190  static void SetLastInput(FixedArray* array, String* to) {
191    array->set(kLastInput, to);
192  }
193
194  static void SetCapture(FixedArray* array, int index, int to) {
195    array->set(index + kFirstCapture, Smi::FromInt(to));
196  }
197
198  static int GetLastCaptureCount(FixedArray* array) {
199    return Smi::cast(array->get(kLastCaptureCount))->value();
200  }
201
202  // For acting on the JSRegExp data FixedArray.
203  static int IrregexpMaxRegisterCount(FixedArray* re);
204  static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
205  static int IrregexpNumberOfCaptures(FixedArray* re);
206  static int IrregexpNumberOfRegisters(FixedArray* re);
207  static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
208  static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
209
210  // Limit the space regexps take up on the heap.  In order to limit this we
211  // would like to keep track of the amount of regexp code on the heap.  This
212  // is not tracked, however.  As a conservative approximation we track the
213  // total regexp code compiled including code that has subsequently been freed
214  // and the total executable memory at any point.
215  static const int kRegExpExecutableMemoryLimit = 16 * MB;
216  static const int kRegWxpCompiledLimit = 1 * MB;
217
218 private:
219  static bool CompileIrregexp(Handle<JSRegExp> re,
220                              Handle<String> sample_subject, bool is_one_byte);
221  static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re,
222                                            Handle<String> sample_subject,
223                                            bool is_one_byte);
224};
225
226
227// Represents the location of one element relative to the intersection of
228// two sets. Corresponds to the four areas of a Venn diagram.
229enum ElementInSetsRelation {
230  kInsideNone = 0,
231  kInsideFirst = 1,
232  kInsideSecond = 2,
233  kInsideBoth = 3
234};
235
236
237// Represents code units in the range from from_ to to_, both ends are
238// inclusive.
239class CharacterRange {
240 public:
241  CharacterRange() : from_(0), to_(0) { }
242  // For compatibility with the CHECK_OK macro
243  CharacterRange(void* null) { DCHECK_EQ(NULL, null); }  //NOLINT
244  CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { }
245  static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
246                             Zone* zone);
247  static Vector<const int> GetWordBounds();
248  static inline CharacterRange Singleton(uc16 value) {
249    return CharacterRange(value, value);
250  }
251  static inline CharacterRange Range(uc16 from, uc16 to) {
252    DCHECK(from <= to);
253    return CharacterRange(from, to);
254  }
255  static inline CharacterRange Everything() {
256    return CharacterRange(0, 0xFFFF);
257  }
258  bool Contains(uc16 i) { return from_ <= i && i <= to_; }
259  uc16 from() const { return from_; }
260  void set_from(uc16 value) { from_ = value; }
261  uc16 to() const { return to_; }
262  void set_to(uc16 value) { to_ = value; }
263  bool is_valid() { return from_ <= to_; }
264  bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
265  bool IsSingleton() { return (from_ == to_); }
266  void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_one_byte,
267                          Zone* zone);
268  static void Split(ZoneList<CharacterRange>* base,
269                    Vector<const int> overlay,
270                    ZoneList<CharacterRange>** included,
271                    ZoneList<CharacterRange>** excluded,
272                    Zone* zone);
273  // Whether a range list is in canonical form: Ranges ordered by from value,
274  // and ranges non-overlapping and non-adjacent.
275  static bool IsCanonical(ZoneList<CharacterRange>* ranges);
276  // Convert range list to canonical form. The characters covered by the ranges
277  // will still be the same, but no character is in more than one range, and
278  // adjacent ranges are merged. The resulting list may be shorter than the
279  // original, but cannot be longer.
280  static void Canonicalize(ZoneList<CharacterRange>* ranges);
281  // Negate the contents of a character range in canonical form.
282  static void Negate(ZoneList<CharacterRange>* src,
283                     ZoneList<CharacterRange>* dst,
284                     Zone* zone);
285  static const int kStartMarker = (1 << 24);
286  static const int kPayloadMask = (1 << 24) - 1;
287
288 private:
289  uc16 from_;
290  uc16 to_;
291};
292
293
294// A set of unsigned integers that behaves especially well on small
295// integers (< 32).  May do zone-allocation.
296class OutSet: public ZoneObject {
297 public:
298  OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
299  OutSet* Extend(unsigned value, Zone* zone);
300  bool Get(unsigned value) const;
301  static const unsigned kFirstLimit = 32;
302
303 private:
304  // Destructively set a value in this set.  In most cases you want
305  // to use Extend instead to ensure that only one instance exists
306  // that contains the same values.
307  void Set(unsigned value, Zone* zone);
308
309  // The successors are a list of sets that contain the same values
310  // as this set and the one more value that is not present in this
311  // set.
312  ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
313
314  OutSet(uint32_t first, ZoneList<unsigned>* remaining)
315      : first_(first), remaining_(remaining), successors_(NULL) { }
316  uint32_t first_;
317  ZoneList<unsigned>* remaining_;
318  ZoneList<OutSet*>* successors_;
319  friend class Trace;
320};
321
322
323// A mapping from integers, specified as ranges, to a set of integers.
324// Used for mapping character ranges to choices.
325class DispatchTable : public ZoneObject {
326 public:
327  explicit DispatchTable(Zone* zone) : tree_(zone) { }
328
329  class Entry {
330   public:
331    Entry() : from_(0), to_(0), out_set_(NULL) { }
332    Entry(uc16 from, uc16 to, OutSet* out_set)
333        : from_(from), to_(to), out_set_(out_set) { }
334    uc16 from() { return from_; }
335    uc16 to() { return to_; }
336    void set_to(uc16 value) { to_ = value; }
337    void AddValue(int value, Zone* zone) {
338      out_set_ = out_set_->Extend(value, zone);
339    }
340    OutSet* out_set() { return out_set_; }
341   private:
342    uc16 from_;
343    uc16 to_;
344    OutSet* out_set_;
345  };
346
347  class Config {
348   public:
349    typedef uc16 Key;
350    typedef Entry Value;
351    static const uc16 kNoKey;
352    static const Entry NoValue() { return Value(); }
353    static inline int Compare(uc16 a, uc16 b) {
354      if (a == b)
355        return 0;
356      else if (a < b)
357        return -1;
358      else
359        return 1;
360    }
361  };
362
363  void AddRange(CharacterRange range, int value, Zone* zone);
364  OutSet* Get(uc16 value);
365  void Dump();
366
367  template <typename Callback>
368  void ForEach(Callback* callback) {
369    return tree()->ForEach(callback);
370  }
371
372 private:
373  // There can't be a static empty set since it allocates its
374  // successors in a zone and caches them.
375  OutSet* empty() { return &empty_; }
376  OutSet empty_;
377  ZoneSplayTree<Config>* tree() { return &tree_; }
378  ZoneSplayTree<Config> tree_;
379};
380
381
382#define FOR_EACH_NODE_TYPE(VISIT)                                    \
383  VISIT(End)                                                         \
384  VISIT(Action)                                                      \
385  VISIT(Choice)                                                      \
386  VISIT(BackReference)                                               \
387  VISIT(Assertion)                                                   \
388  VISIT(Text)
389
390
391#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT)                            \
392  VISIT(Disjunction)                                                 \
393  VISIT(Alternative)                                                 \
394  VISIT(Assertion)                                                   \
395  VISIT(CharacterClass)                                              \
396  VISIT(Atom)                                                        \
397  VISIT(Quantifier)                                                  \
398  VISIT(Capture)                                                     \
399  VISIT(Lookahead)                                                   \
400  VISIT(BackReference)                                               \
401  VISIT(Empty)                                                       \
402  VISIT(Text)
403
404
405#define FORWARD_DECLARE(Name) class RegExp##Name;
406FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
407#undef FORWARD_DECLARE
408
409
410class TextElement FINAL BASE_EMBEDDED {
411 public:
412  enum TextType {
413    ATOM,
414    CHAR_CLASS
415  };
416
417  static TextElement Atom(RegExpAtom* atom);
418  static TextElement CharClass(RegExpCharacterClass* char_class);
419
420  int cp_offset() const { return cp_offset_; }
421  void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
422  int length() const;
423
424  TextType text_type() const { return text_type_; }
425
426  RegExpTree* tree() const { return tree_; }
427
428  RegExpAtom* atom() const {
429    DCHECK(text_type() == ATOM);
430    return reinterpret_cast<RegExpAtom*>(tree());
431  }
432
433  RegExpCharacterClass* char_class() const {
434    DCHECK(text_type() == CHAR_CLASS);
435    return reinterpret_cast<RegExpCharacterClass*>(tree());
436  }
437
438 private:
439  TextElement(TextType text_type, RegExpTree* tree)
440      : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
441
442  int cp_offset_;
443  TextType text_type_;
444  RegExpTree* tree_;
445};
446
447
448class Trace;
449struct PreloadState;
450class GreedyLoopState;
451class AlternativeGenerationList;
452
453struct NodeInfo {
454  NodeInfo()
455      : being_analyzed(false),
456        been_analyzed(false),
457        follows_word_interest(false),
458        follows_newline_interest(false),
459        follows_start_interest(false),
460        at_end(false),
461        visited(false),
462        replacement_calculated(false) { }
463
464  // Returns true if the interests and assumptions of this node
465  // matches the given one.
466  bool Matches(NodeInfo* that) {
467    return (at_end == that->at_end) &&
468           (follows_word_interest == that->follows_word_interest) &&
469           (follows_newline_interest == that->follows_newline_interest) &&
470           (follows_start_interest == that->follows_start_interest);
471  }
472
473  // Updates the interests of this node given the interests of the
474  // node preceding it.
475  void AddFromPreceding(NodeInfo* that) {
476    at_end |= that->at_end;
477    follows_word_interest |= that->follows_word_interest;
478    follows_newline_interest |= that->follows_newline_interest;
479    follows_start_interest |= that->follows_start_interest;
480  }
481
482  bool HasLookbehind() {
483    return follows_word_interest ||
484           follows_newline_interest ||
485           follows_start_interest;
486  }
487
488  // Sets the interests of this node to include the interests of the
489  // following node.
490  void AddFromFollowing(NodeInfo* that) {
491    follows_word_interest |= that->follows_word_interest;
492    follows_newline_interest |= that->follows_newline_interest;
493    follows_start_interest |= that->follows_start_interest;
494  }
495
496  void ResetCompilationState() {
497    being_analyzed = false;
498    been_analyzed = false;
499  }
500
501  bool being_analyzed: 1;
502  bool been_analyzed: 1;
503
504  // These bits are set of this node has to know what the preceding
505  // character was.
506  bool follows_word_interest: 1;
507  bool follows_newline_interest: 1;
508  bool follows_start_interest: 1;
509
510  bool at_end: 1;
511  bool visited: 1;
512  bool replacement_calculated: 1;
513};
514
515
516// Details of a quick mask-compare check that can look ahead in the
517// input stream.
518class QuickCheckDetails {
519 public:
520  QuickCheckDetails()
521      : characters_(0),
522        mask_(0),
523        value_(0),
524        cannot_match_(false) { }
525  explicit QuickCheckDetails(int characters)
526      : characters_(characters),
527        mask_(0),
528        value_(0),
529        cannot_match_(false) { }
530  bool Rationalize(bool one_byte);
531  // Merge in the information from another branch of an alternation.
532  void Merge(QuickCheckDetails* other, int from_index);
533  // Advance the current position by some amount.
534  void Advance(int by, bool one_byte);
535  void Clear();
536  bool cannot_match() { return cannot_match_; }
537  void set_cannot_match() { cannot_match_ = true; }
538  struct Position {
539    Position() : mask(0), value(0), determines_perfectly(false) { }
540    uc16 mask;
541    uc16 value;
542    bool determines_perfectly;
543  };
544  int characters() { return characters_; }
545  void set_characters(int characters) { characters_ = characters; }
546  Position* positions(int index) {
547    DCHECK(index >= 0);
548    DCHECK(index < characters_);
549    return positions_ + index;
550  }
551  uint32_t mask() { return mask_; }
552  uint32_t value() { return value_; }
553
554 private:
555  // How many characters do we have quick check information from.  This is
556  // the same for all branches of a choice node.
557  int characters_;
558  Position positions_[4];
559  // These values are the condensate of the above array after Rationalize().
560  uint32_t mask_;
561  uint32_t value_;
562  // If set to true, there is no way this quick check can match at all.
563  // E.g., if it requires to be at the start of the input, and isn't.
564  bool cannot_match_;
565};
566
567
568extern int kUninitializedRegExpNodePlaceHolder;
569
570
571class RegExpNode: public ZoneObject {
572 public:
573  explicit RegExpNode(Zone* zone)
574  : replacement_(NULL), trace_count_(0), zone_(zone) {
575    bm_info_[0] = bm_info_[1] = NULL;
576  }
577  virtual ~RegExpNode();
578  virtual void Accept(NodeVisitor* visitor) = 0;
579  // Generates a goto to this node or actually generates the code at this point.
580  virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
581  // How many characters must this node consume at a minimum in order to
582  // succeed.  If we have found at least 'still_to_find' characters that
583  // must be consumed there is no need to ask any following nodes whether
584  // they are sure to eat any more characters.  The not_at_start argument is
585  // used to indicate that we know we are not at the start of the input.  In
586  // this case anchored branches will always fail and can be ignored when
587  // determining how many characters are consumed on success.
588  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
589  // Emits some quick code that checks whether the preloaded characters match.
590  // Falls through on certain failure, jumps to the label on possible success.
591  // If the node cannot make a quick check it does nothing and returns false.
592  bool EmitQuickCheck(RegExpCompiler* compiler,
593                      Trace* bounds_check_trace,
594                      Trace* trace,
595                      bool preload_has_checked_bounds,
596                      Label* on_possible_success,
597                      QuickCheckDetails* details_return,
598                      bool fall_through_on_failure);
599  // For a given number of characters this returns a mask and a value.  The
600  // next n characters are anded with the mask and compared with the value.
601  // A comparison failure indicates the node cannot match the next n characters.
602  // A comparison success indicates the node may match.
603  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
604                                    RegExpCompiler* compiler,
605                                    int characters_filled_in,
606                                    bool not_at_start) = 0;
607  static const int kNodeIsTooComplexForGreedyLoops = -1;
608  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
609  // Only returns the successor for a text node of length 1 that matches any
610  // character and that has no guards on it.
611  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
612      RegExpCompiler* compiler) {
613    return NULL;
614  }
615
616  // Collects information on the possible code units (mod 128) that can match if
617  // we look forward.  This is used for a Boyer-Moore-like string searching
618  // implementation.  TODO(erikcorry):  This should share more code with
619  // EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
620  // the number of nodes we are willing to look at in order to create this data.
621  static const int kRecursionBudget = 200;
622  virtual void FillInBMInfo(int offset,
623                            int budget,
624                            BoyerMooreLookahead* bm,
625                            bool not_at_start) {
626    UNREACHABLE();
627  }
628
629  // If we know that the input is one-byte then there are some nodes that can
630  // never match.  This method returns a node that can be substituted for
631  // itself, or NULL if the node can never match.
632  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
633    return this;
634  }
635  // Helper for FilterOneByte.
636  RegExpNode* replacement() {
637    DCHECK(info()->replacement_calculated);
638    return replacement_;
639  }
640  RegExpNode* set_replacement(RegExpNode* replacement) {
641    info()->replacement_calculated = true;
642    replacement_ =  replacement;
643    return replacement;  // For convenience.
644  }
645
646  // We want to avoid recalculating the lookahead info, so we store it on the
647  // node.  Only info that is for this node is stored.  We can tell that the
648  // info is for this node when offset == 0, so the information is calculated
649  // relative to this node.
650  void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
651    if (offset == 0) set_bm_info(not_at_start, bm);
652  }
653
654  Label* label() { return &label_; }
655  // If non-generic code is generated for a node (i.e. the node is not at the
656  // start of the trace) then it cannot be reused.  This variable sets a limit
657  // on how often we allow that to happen before we insist on starting a new
658  // trace and generating generic code for a node that can be reused by flushing
659  // the deferred actions in the current trace and generating a goto.
660  static const int kMaxCopiesCodeGenerated = 10;
661
662  NodeInfo* info() { return &info_; }
663
664  BoyerMooreLookahead* bm_info(bool not_at_start) {
665    return bm_info_[not_at_start ? 1 : 0];
666  }
667
668  Zone* zone() const { return zone_; }
669
670 protected:
671  enum LimitResult { DONE, CONTINUE };
672  RegExpNode* replacement_;
673
674  LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
675
676  void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
677    bm_info_[not_at_start ? 1 : 0] = bm;
678  }
679
680 private:
681  static const int kFirstCharBudget = 10;
682  Label label_;
683  NodeInfo info_;
684  // This variable keeps track of how many times code has been generated for
685  // this node (in different traces).  We don't keep track of where the
686  // generated code is located unless the code is generated at the start of
687  // a trace, in which case it is generic and can be reused by flushing the
688  // deferred operations in the current trace and generating a goto.
689  int trace_count_;
690  BoyerMooreLookahead* bm_info_[2];
691
692  Zone* zone_;
693};
694
695
696// A simple closed interval.
697class Interval {
698 public:
699  Interval() : from_(kNone), to_(kNone) { }
700  Interval(int from, int to) : from_(from), to_(to) { }
701  Interval Union(Interval that) {
702    if (that.from_ == kNone)
703      return *this;
704    else if (from_ == kNone)
705      return that;
706    else
707      return Interval(Min(from_, that.from_), Max(to_, that.to_));
708  }
709  bool Contains(int value) {
710    return (from_ <= value) && (value <= to_);
711  }
712  bool is_empty() { return from_ == kNone; }
713  int from() const { return from_; }
714  int to() const { return to_; }
715  static Interval Empty() { return Interval(); }
716  static const int kNone = -1;
717 private:
718  int from_;
719  int to_;
720};
721
722
723class SeqRegExpNode: public RegExpNode {
724 public:
725  explicit SeqRegExpNode(RegExpNode* on_success)
726      : RegExpNode(on_success->zone()), on_success_(on_success) { }
727  RegExpNode* on_success() { return on_success_; }
728  void set_on_success(RegExpNode* node) { on_success_ = node; }
729  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
730  virtual void FillInBMInfo(int offset,
731                            int budget,
732                            BoyerMooreLookahead* bm,
733                            bool not_at_start) {
734    on_success_->FillInBMInfo(offset, budget - 1, bm, not_at_start);
735    if (offset == 0) set_bm_info(not_at_start, bm);
736  }
737
738 protected:
739  RegExpNode* FilterSuccessor(int depth, bool ignore_case);
740
741 private:
742  RegExpNode* on_success_;
743};
744
745
746class ActionNode: public SeqRegExpNode {
747 public:
748  enum ActionType {
749    SET_REGISTER,
750    INCREMENT_REGISTER,
751    STORE_POSITION,
752    BEGIN_SUBMATCH,
753    POSITIVE_SUBMATCH_SUCCESS,
754    EMPTY_MATCH_CHECK,
755    CLEAR_CAPTURES
756  };
757  static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
758  static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
759  static ActionNode* StorePosition(int reg,
760                                   bool is_capture,
761                                   RegExpNode* on_success);
762  static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
763  static ActionNode* BeginSubmatch(int stack_pointer_reg,
764                                   int position_reg,
765                                   RegExpNode* on_success);
766  static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
767                                             int restore_reg,
768                                             int clear_capture_count,
769                                             int clear_capture_from,
770                                             RegExpNode* on_success);
771  static ActionNode* EmptyMatchCheck(int start_register,
772                                     int repetition_register,
773                                     int repetition_limit,
774                                     RegExpNode* on_success);
775  virtual void Accept(NodeVisitor* visitor);
776  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
777  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
778  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
779                                    RegExpCompiler* compiler,
780                                    int filled_in,
781                                    bool not_at_start) {
782    return on_success()->GetQuickCheckDetails(
783        details, compiler, filled_in, not_at_start);
784  }
785  virtual void FillInBMInfo(int offset,
786                            int budget,
787                            BoyerMooreLookahead* bm,
788                            bool not_at_start);
789  ActionType action_type() { return action_type_; }
790  // TODO(erikcorry): We should allow some action nodes in greedy loops.
791  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
792
793 private:
794  union {
795    struct {
796      int reg;
797      int value;
798    } u_store_register;
799    struct {
800      int reg;
801    } u_increment_register;
802    struct {
803      int reg;
804      bool is_capture;
805    } u_position_register;
806    struct {
807      int stack_pointer_register;
808      int current_position_register;
809      int clear_register_count;
810      int clear_register_from;
811    } u_submatch;
812    struct {
813      int start_register;
814      int repetition_register;
815      int repetition_limit;
816    } u_empty_match_check;
817    struct {
818      int range_from;
819      int range_to;
820    } u_clear_captures;
821  } data_;
822  ActionNode(ActionType action_type, RegExpNode* on_success)
823      : SeqRegExpNode(on_success),
824        action_type_(action_type) { }
825  ActionType action_type_;
826  friend class DotPrinter;
827};
828
829
830class TextNode: public SeqRegExpNode {
831 public:
832  TextNode(ZoneList<TextElement>* elms,
833           RegExpNode* on_success)
834      : SeqRegExpNode(on_success),
835        elms_(elms) { }
836  TextNode(RegExpCharacterClass* that,
837           RegExpNode* on_success)
838      : SeqRegExpNode(on_success),
839        elms_(new(zone()) ZoneList<TextElement>(1, zone())) {
840    elms_->Add(TextElement::CharClass(that), zone());
841  }
842  virtual void Accept(NodeVisitor* visitor);
843  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
844  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
845  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
846                                    RegExpCompiler* compiler,
847                                    int characters_filled_in,
848                                    bool not_at_start);
849  ZoneList<TextElement>* elements() { return elms_; }
850  void MakeCaseIndependent(bool is_one_byte);
851  virtual int GreedyLoopTextLength();
852  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
853      RegExpCompiler* compiler);
854  virtual void FillInBMInfo(int offset,
855                            int budget,
856                            BoyerMooreLookahead* bm,
857                            bool not_at_start);
858  void CalculateOffsets();
859  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
860
861 private:
862  enum TextEmitPassType {
863    NON_LATIN1_MATCH,            // Check for characters that can't match.
864    SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
865    NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
866    CASE_CHARACTER_MATCH,        // Case-independent single character check.
867    CHARACTER_CLASS_MATCH        // Character class.
868  };
869  static bool SkipPass(int pass, bool ignore_case);
870  static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
871  static const int kLastPass = CHARACTER_CLASS_MATCH;
872  void TextEmitPass(RegExpCompiler* compiler,
873                    TextEmitPassType pass,
874                    bool preloaded,
875                    Trace* trace,
876                    bool first_element_checked,
877                    int* checked_up_to);
878  int Length();
879  ZoneList<TextElement>* elms_;
880};
881
882
883class AssertionNode: public SeqRegExpNode {
884 public:
885  enum AssertionType {
886    AT_END,
887    AT_START,
888    AT_BOUNDARY,
889    AT_NON_BOUNDARY,
890    AFTER_NEWLINE
891  };
892  static AssertionNode* AtEnd(RegExpNode* on_success) {
893    return new(on_success->zone()) AssertionNode(AT_END, on_success);
894  }
895  static AssertionNode* AtStart(RegExpNode* on_success) {
896    return new(on_success->zone()) AssertionNode(AT_START, on_success);
897  }
898  static AssertionNode* AtBoundary(RegExpNode* on_success) {
899    return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
900  }
901  static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
902    return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
903  }
904  static AssertionNode* AfterNewline(RegExpNode* on_success) {
905    return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
906  }
907  virtual void Accept(NodeVisitor* visitor);
908  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
909  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
910  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
911                                    RegExpCompiler* compiler,
912                                    int filled_in,
913                                    bool not_at_start);
914  virtual void FillInBMInfo(int offset,
915                            int budget,
916                            BoyerMooreLookahead* bm,
917                            bool not_at_start);
918  AssertionType assertion_type() { return assertion_type_; }
919
920 private:
921  void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
922  enum IfPrevious { kIsNonWord, kIsWord };
923  void BacktrackIfPrevious(RegExpCompiler* compiler,
924                           Trace* trace,
925                           IfPrevious backtrack_if_previous);
926  AssertionNode(AssertionType t, RegExpNode* on_success)
927      : SeqRegExpNode(on_success), assertion_type_(t) { }
928  AssertionType assertion_type_;
929};
930
931
932class BackReferenceNode: public SeqRegExpNode {
933 public:
934  BackReferenceNode(int start_reg,
935                    int end_reg,
936                    RegExpNode* on_success)
937      : SeqRegExpNode(on_success),
938        start_reg_(start_reg),
939        end_reg_(end_reg) { }
940  virtual void Accept(NodeVisitor* visitor);
941  int start_register() { return start_reg_; }
942  int end_register() { return end_reg_; }
943  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
944  virtual int EatsAtLeast(int still_to_find,
945                          int recursion_depth,
946                          bool not_at_start);
947  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
948                                    RegExpCompiler* compiler,
949                                    int characters_filled_in,
950                                    bool not_at_start) {
951    return;
952  }
953  virtual void FillInBMInfo(int offset,
954                            int budget,
955                            BoyerMooreLookahead* bm,
956                            bool not_at_start);
957
958 private:
959  int start_reg_;
960  int end_reg_;
961};
962
963
964class EndNode: public RegExpNode {
965 public:
966  enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
967  explicit EndNode(Action action, Zone* zone)
968      : RegExpNode(zone), action_(action) { }
969  virtual void Accept(NodeVisitor* visitor);
970  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
971  virtual int EatsAtLeast(int still_to_find,
972                          int recursion_depth,
973                          bool not_at_start) { return 0; }
974  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
975                                    RegExpCompiler* compiler,
976                                    int characters_filled_in,
977                                    bool not_at_start) {
978    // Returning 0 from EatsAtLeast should ensure we never get here.
979    UNREACHABLE();
980  }
981  virtual void FillInBMInfo(int offset,
982                            int budget,
983                            BoyerMooreLookahead* bm,
984                            bool not_at_start) {
985    // Returning 0 from EatsAtLeast should ensure we never get here.
986    UNREACHABLE();
987  }
988
989 private:
990  Action action_;
991};
992
993
994class NegativeSubmatchSuccess: public EndNode {
995 public:
996  NegativeSubmatchSuccess(int stack_pointer_reg,
997                          int position_reg,
998                          int clear_capture_count,
999                          int clear_capture_start,
1000                          Zone* zone)
1001      : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
1002        stack_pointer_register_(stack_pointer_reg),
1003        current_position_register_(position_reg),
1004        clear_capture_count_(clear_capture_count),
1005        clear_capture_start_(clear_capture_start) { }
1006  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1007
1008 private:
1009  int stack_pointer_register_;
1010  int current_position_register_;
1011  int clear_capture_count_;
1012  int clear_capture_start_;
1013};
1014
1015
1016class Guard: public ZoneObject {
1017 public:
1018  enum Relation { LT, GEQ };
1019  Guard(int reg, Relation op, int value)
1020      : reg_(reg),
1021        op_(op),
1022        value_(value) { }
1023  int reg() { return reg_; }
1024  Relation op() { return op_; }
1025  int value() { return value_; }
1026
1027 private:
1028  int reg_;
1029  Relation op_;
1030  int value_;
1031};
1032
1033
1034class GuardedAlternative {
1035 public:
1036  explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { }
1037  void AddGuard(Guard* guard, Zone* zone);
1038  RegExpNode* node() { return node_; }
1039  void set_node(RegExpNode* node) { node_ = node; }
1040  ZoneList<Guard*>* guards() { return guards_; }
1041
1042 private:
1043  RegExpNode* node_;
1044  ZoneList<Guard*>* guards_;
1045};
1046
1047
1048class AlternativeGeneration;
1049
1050
1051class ChoiceNode: public RegExpNode {
1052 public:
1053  explicit ChoiceNode(int expected_size, Zone* zone)
1054      : RegExpNode(zone),
1055        alternatives_(new(zone)
1056                      ZoneList<GuardedAlternative>(expected_size, zone)),
1057        table_(NULL),
1058        not_at_start_(false),
1059        being_calculated_(false) { }
1060  virtual void Accept(NodeVisitor* visitor);
1061  void AddAlternative(GuardedAlternative node) {
1062    alternatives()->Add(node, zone());
1063  }
1064  ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
1065  DispatchTable* GetTable(bool ignore_case);
1066  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1067  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1068  int EatsAtLeastHelper(int still_to_find,
1069                        int budget,
1070                        RegExpNode* ignore_this_node,
1071                        bool not_at_start);
1072  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1073                                    RegExpCompiler* compiler,
1074                                    int characters_filled_in,
1075                                    bool not_at_start);
1076  virtual void FillInBMInfo(int offset,
1077                            int budget,
1078                            BoyerMooreLookahead* bm,
1079                            bool not_at_start);
1080
1081  bool being_calculated() { return being_calculated_; }
1082  bool not_at_start() { return not_at_start_; }
1083  void set_not_at_start() { not_at_start_ = true; }
1084  void set_being_calculated(bool b) { being_calculated_ = b; }
1085  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
1086    return true;
1087  }
1088  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1089
1090 protected:
1091  int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
1092  ZoneList<GuardedAlternative>* alternatives_;
1093
1094 private:
1095  friend class DispatchTableConstructor;
1096  friend class Analysis;
1097  void GenerateGuard(RegExpMacroAssembler* macro_assembler,
1098                     Guard* guard,
1099                     Trace* trace);
1100  int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
1101  void EmitOutOfLineContinuation(RegExpCompiler* compiler,
1102                                 Trace* trace,
1103                                 GuardedAlternative alternative,
1104                                 AlternativeGeneration* alt_gen,
1105                                 int preload_characters,
1106                                 bool next_expects_preload);
1107  void SetUpPreLoad(RegExpCompiler* compiler,
1108                    Trace* current_trace,
1109                    PreloadState* preloads);
1110  void AssertGuardsMentionRegisters(Trace* trace);
1111  int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
1112  Trace* EmitGreedyLoop(RegExpCompiler* compiler,
1113                        Trace* trace,
1114                        AlternativeGenerationList* alt_gens,
1115                        PreloadState* preloads,
1116                        GreedyLoopState* greedy_loop_state,
1117                        int text_length);
1118  void EmitChoices(RegExpCompiler* compiler,
1119                   AlternativeGenerationList* alt_gens,
1120                   int first_choice,
1121                   Trace* trace,
1122                   PreloadState* preloads);
1123  DispatchTable* table_;
1124  // If true, this node is never checked at the start of the input.
1125  // Allows a new trace to start with at_start() set to false.
1126  bool not_at_start_;
1127  bool being_calculated_;
1128};
1129
1130
1131class NegativeLookaheadChoiceNode: public ChoiceNode {
1132 public:
1133  explicit NegativeLookaheadChoiceNode(GuardedAlternative this_must_fail,
1134                                       GuardedAlternative then_do_this,
1135                                       Zone* zone)
1136      : ChoiceNode(2, zone) {
1137    AddAlternative(this_must_fail);
1138    AddAlternative(then_do_this);
1139  }
1140  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1141  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1142                                    RegExpCompiler* compiler,
1143                                    int characters_filled_in,
1144                                    bool not_at_start);
1145  virtual void FillInBMInfo(int offset,
1146                            int budget,
1147                            BoyerMooreLookahead* bm,
1148                            bool not_at_start) {
1149    alternatives_->at(1).node()->FillInBMInfo(
1150        offset, budget - 1, bm, not_at_start);
1151    if (offset == 0) set_bm_info(not_at_start, bm);
1152  }
1153  // For a negative lookahead we don't emit the quick check for the
1154  // alternative that is expected to fail.  This is because quick check code
1155  // starts by loading enough characters for the alternative that takes fewest
1156  // characters, but on a negative lookahead the negative branch did not take
1157  // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1158  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
1159    return !is_first;
1160  }
1161  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1162};
1163
1164
1165class LoopChoiceNode: public ChoiceNode {
1166 public:
1167  explicit LoopChoiceNode(bool body_can_be_zero_length, Zone* zone)
1168      : ChoiceNode(2, zone),
1169        loop_node_(NULL),
1170        continue_node_(NULL),
1171        body_can_be_zero_length_(body_can_be_zero_length)
1172        { }
1173  void AddLoopAlternative(GuardedAlternative alt);
1174  void AddContinueAlternative(GuardedAlternative alt);
1175  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1176  virtual int EatsAtLeast(int still_to_find,  int budget, bool not_at_start);
1177  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1178                                    RegExpCompiler* compiler,
1179                                    int characters_filled_in,
1180                                    bool not_at_start);
1181  virtual void FillInBMInfo(int offset,
1182                            int budget,
1183                            BoyerMooreLookahead* bm,
1184                            bool not_at_start);
1185  RegExpNode* loop_node() { return loop_node_; }
1186  RegExpNode* continue_node() { return continue_node_; }
1187  bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1188  virtual void Accept(NodeVisitor* visitor);
1189  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1190
1191 private:
1192  // AddAlternative is made private for loop nodes because alternatives
1193  // should not be added freely, we need to keep track of which node
1194  // goes back to the node itself.
1195  void AddAlternative(GuardedAlternative node) {
1196    ChoiceNode::AddAlternative(node);
1197  }
1198
1199  RegExpNode* loop_node_;
1200  RegExpNode* continue_node_;
1201  bool body_can_be_zero_length_;
1202};
1203
1204
1205// Improve the speed that we scan for an initial point where a non-anchored
1206// regexp can match by using a Boyer-Moore-like table. This is done by
1207// identifying non-greedy non-capturing loops in the nodes that eat any
1208// character one at a time.  For example in the middle of the regexp
1209// /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
1210// inserted at the start of any non-anchored regexp.
1211//
1212// When we have found such a loop we look ahead in the nodes to find the set of
1213// characters that can come at given distances. For example for the regexp
1214// /.?foo/ we know that there are at least 3 characters ahead of us, and the
1215// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1216// the lookahead info where the set of characters is reasonably constrained. In
1217// our example this is from index 1 to 2 (0 is not constrained). We can now
1218// look 3 characters ahead and if we don't find one of [f, o] (the union of
1219// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1220//
1221// For Unicode input strings we do the same, but modulo 128.
1222//
1223// We also look at the first string fed to the regexp and use that to get a hint
1224// of the character frequencies in the inputs. This affects the assessment of
1225// whether the set of characters is 'reasonably constrained'.
1226//
1227// We also have another lookahead mechanism (called quick check in the code),
1228// which uses a wide load of multiple characters followed by a mask and compare
1229// to determine whether a match is possible at this point.
1230enum ContainedInLattice {
1231  kNotYet = 0,
1232  kLatticeIn = 1,
1233  kLatticeOut = 2,
1234  kLatticeUnknown = 3  // Can also mean both in and out.
1235};
1236
1237
1238inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1239  return static_cast<ContainedInLattice>(a | b);
1240}
1241
1242
1243ContainedInLattice AddRange(ContainedInLattice a,
1244                            const int* ranges,
1245                            int ranges_size,
1246                            Interval new_range);
1247
1248
1249class BoyerMoorePositionInfo : public ZoneObject {
1250 public:
1251  explicit BoyerMoorePositionInfo(Zone* zone)
1252      : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1253        map_count_(0),
1254        w_(kNotYet),
1255        s_(kNotYet),
1256        d_(kNotYet),
1257        surrogate_(kNotYet) {
1258     for (int i = 0; i < kMapSize; i++) {
1259       map_->Add(false, zone);
1260     }
1261  }
1262
1263  bool& at(int i) { return map_->at(i); }
1264
1265  static const int kMapSize = 128;
1266  static const int kMask = kMapSize - 1;
1267
1268  int map_count() const { return map_count_; }
1269
1270  void Set(int character);
1271  void SetInterval(const Interval& interval);
1272  void SetAll();
1273  bool is_non_word() { return w_ == kLatticeOut; }
1274  bool is_word() { return w_ == kLatticeIn; }
1275
1276 private:
1277  ZoneList<bool>* map_;
1278  int map_count_;  // Number of set bits in the map.
1279  ContainedInLattice w_;  // The \w character class.
1280  ContainedInLattice s_;  // The \s character class.
1281  ContainedInLattice d_;  // The \d character class.
1282  ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
1283};
1284
1285
1286class BoyerMooreLookahead : public ZoneObject {
1287 public:
1288  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1289
1290  int length() { return length_; }
1291  int max_char() { return max_char_; }
1292  RegExpCompiler* compiler() { return compiler_; }
1293
1294  int Count(int map_number) {
1295    return bitmaps_->at(map_number)->map_count();
1296  }
1297
1298  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1299
1300  void Set(int map_number, int character) {
1301    if (character > max_char_) return;
1302    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1303    info->Set(character);
1304  }
1305
1306  void SetInterval(int map_number, const Interval& interval) {
1307    if (interval.from() > max_char_) return;
1308    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1309    if (interval.to() > max_char_) {
1310      info->SetInterval(Interval(interval.from(), max_char_));
1311    } else {
1312      info->SetInterval(interval);
1313    }
1314  }
1315
1316  void SetAll(int map_number) {
1317    bitmaps_->at(map_number)->SetAll();
1318  }
1319
1320  void SetRest(int from_map) {
1321    for (int i = from_map; i < length_; i++) SetAll(i);
1322  }
1323  void EmitSkipInstructions(RegExpMacroAssembler* masm);
1324
1325 private:
1326  // This is the value obtained by EatsAtLeast.  If we do not have at least this
1327  // many characters left in the sample string then the match is bound to fail.
1328  // Therefore it is OK to read a character this far ahead of the current match
1329  // point.
1330  int length_;
1331  RegExpCompiler* compiler_;
1332  // 0xff for Latin1, 0xffff for UTF-16.
1333  int max_char_;
1334  ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1335
1336  int GetSkipTable(int min_lookahead,
1337                   int max_lookahead,
1338                   Handle<ByteArray> boolean_skip_table);
1339  bool FindWorthwhileInterval(int* from, int* to);
1340  int FindBestInterval(
1341    int max_number_of_chars, int old_biggest_points, int* from, int* to);
1342};
1343
1344
1345// There are many ways to generate code for a node.  This class encapsulates
1346// the current way we should be generating.  In other words it encapsulates
1347// the current state of the code generator.  The effect of this is that we
1348// generate code for paths that the matcher can take through the regular
1349// expression.  A given node in the regexp can be code-generated several times
1350// as it can be part of several traces.  For example for the regexp:
1351// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1352// of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
1353// to match foo is generated only once (the traces have a common prefix).  The
1354// code to store the capture is deferred and generated (twice) after the places
1355// where baz has been matched.
1356class Trace {
1357 public:
1358  // A value for a property that is either known to be true, know to be false,
1359  // or not known.
1360  enum TriBool {
1361    UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1362  };
1363
1364  class DeferredAction {
1365   public:
1366    DeferredAction(ActionNode::ActionType action_type, int reg)
1367        : action_type_(action_type), reg_(reg), next_(NULL) { }
1368    DeferredAction* next() { return next_; }
1369    bool Mentions(int reg);
1370    int reg() { return reg_; }
1371    ActionNode::ActionType action_type() { return action_type_; }
1372   private:
1373    ActionNode::ActionType action_type_;
1374    int reg_;
1375    DeferredAction* next_;
1376    friend class Trace;
1377  };
1378
1379  class DeferredCapture : public DeferredAction {
1380   public:
1381    DeferredCapture(int reg, bool is_capture, Trace* trace)
1382        : DeferredAction(ActionNode::STORE_POSITION, reg),
1383          cp_offset_(trace->cp_offset()),
1384          is_capture_(is_capture) { }
1385    int cp_offset() { return cp_offset_; }
1386    bool is_capture() { return is_capture_; }
1387   private:
1388    int cp_offset_;
1389    bool is_capture_;
1390    void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1391  };
1392
1393  class DeferredSetRegister : public DeferredAction {
1394   public:
1395    DeferredSetRegister(int reg, int value)
1396        : DeferredAction(ActionNode::SET_REGISTER, reg),
1397          value_(value) { }
1398    int value() { return value_; }
1399   private:
1400    int value_;
1401  };
1402
1403  class DeferredClearCaptures : public DeferredAction {
1404   public:
1405    explicit DeferredClearCaptures(Interval range)
1406        : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1407          range_(range) { }
1408    Interval range() { return range_; }
1409   private:
1410    Interval range_;
1411  };
1412
1413  class DeferredIncrementRegister : public DeferredAction {
1414   public:
1415    explicit DeferredIncrementRegister(int reg)
1416        : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1417  };
1418
1419  Trace()
1420      : cp_offset_(0),
1421        actions_(NULL),
1422        backtrack_(NULL),
1423        stop_node_(NULL),
1424        loop_label_(NULL),
1425        characters_preloaded_(0),
1426        bound_checked_up_to_(0),
1427        flush_budget_(100),
1428        at_start_(UNKNOWN) { }
1429
1430  // End the trace.  This involves flushing the deferred actions in the trace
1431  // and pushing a backtrack location onto the backtrack stack.  Once this is
1432  // done we can start a new trace or go to one that has already been
1433  // generated.
1434  void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1435  int cp_offset() { return cp_offset_; }
1436  DeferredAction* actions() { return actions_; }
1437  // A trivial trace is one that has no deferred actions or other state that
1438  // affects the assumptions used when generating code.  There is no recorded
1439  // backtrack location in a trivial trace, so with a trivial trace we will
1440  // generate code that, on a failure to match, gets the backtrack location
1441  // from the backtrack stack rather than using a direct jump instruction.  We
1442  // always start code generation with a trivial trace and non-trivial traces
1443  // are created as we emit code for nodes or add to the list of deferred
1444  // actions in the trace.  The location of the code generated for a node using
1445  // a trivial trace is recorded in a label in the node so that gotos can be
1446  // generated to that code.
1447  bool is_trivial() {
1448    return backtrack_ == NULL &&
1449           actions_ == NULL &&
1450           cp_offset_ == 0 &&
1451           characters_preloaded_ == 0 &&
1452           bound_checked_up_to_ == 0 &&
1453           quick_check_performed_.characters() == 0 &&
1454           at_start_ == UNKNOWN;
1455  }
1456  TriBool at_start() { return at_start_; }
1457  void set_at_start(bool at_start) {
1458    at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE;
1459  }
1460  Label* backtrack() { return backtrack_; }
1461  Label* loop_label() { return loop_label_; }
1462  RegExpNode* stop_node() { return stop_node_; }
1463  int characters_preloaded() { return characters_preloaded_; }
1464  int bound_checked_up_to() { return bound_checked_up_to_; }
1465  int flush_budget() { return flush_budget_; }
1466  QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1467  bool mentions_reg(int reg);
1468  // Returns true if a deferred position store exists to the specified
1469  // register and stores the offset in the out-parameter.  Otherwise
1470  // returns false.
1471  bool GetStoredPosition(int reg, int* cp_offset);
1472  // These set methods and AdvanceCurrentPositionInTrace should be used only on
1473  // new traces - the intention is that traces are immutable after creation.
1474  void add_action(DeferredAction* new_action) {
1475    DCHECK(new_action->next_ == NULL);
1476    new_action->next_ = actions_;
1477    actions_ = new_action;
1478  }
1479  void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1480  void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1481  void set_loop_label(Label* label) { loop_label_ = label; }
1482  void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1483  void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1484  void set_flush_budget(int to) { flush_budget_ = to; }
1485  void set_quick_check_performed(QuickCheckDetails* d) {
1486    quick_check_performed_ = *d;
1487  }
1488  void InvalidateCurrentCharacter();
1489  void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1490
1491 private:
1492  int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1493  void PerformDeferredActions(RegExpMacroAssembler* macro,
1494                              int max_register,
1495                              const OutSet& affected_registers,
1496                              OutSet* registers_to_pop,
1497                              OutSet* registers_to_clear,
1498                              Zone* zone);
1499  void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1500                                int max_register,
1501                                const OutSet& registers_to_pop,
1502                                const OutSet& registers_to_clear);
1503  int cp_offset_;
1504  DeferredAction* actions_;
1505  Label* backtrack_;
1506  RegExpNode* stop_node_;
1507  Label* loop_label_;
1508  int characters_preloaded_;
1509  int bound_checked_up_to_;
1510  QuickCheckDetails quick_check_performed_;
1511  int flush_budget_;
1512  TriBool at_start_;
1513};
1514
1515
1516class GreedyLoopState {
1517 public:
1518  explicit GreedyLoopState(bool not_at_start);
1519
1520  Label* label() { return &label_; }
1521  Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1522
1523 private:
1524  Label label_;
1525  Trace counter_backtrack_trace_;
1526};
1527
1528
1529struct PreloadState {
1530  static const int kEatsAtLeastNotYetInitialized = -1;
1531  bool preload_is_current_;
1532  bool preload_has_checked_bounds_;
1533  int preload_characters_;
1534  int eats_at_least_;
1535  void init() {
1536    eats_at_least_ = kEatsAtLeastNotYetInitialized;
1537  }
1538};
1539
1540
1541class NodeVisitor {
1542 public:
1543  virtual ~NodeVisitor() { }
1544#define DECLARE_VISIT(Type)                                          \
1545  virtual void Visit##Type(Type##Node* that) = 0;
1546FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1547#undef DECLARE_VISIT
1548  virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1549};
1550
1551
1552// Node visitor used to add the start set of the alternatives to the
1553// dispatch table of a choice node.
1554class DispatchTableConstructor: public NodeVisitor {
1555 public:
1556  DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1557                           Zone* zone)
1558      : table_(table),
1559        choice_index_(-1),
1560        ignore_case_(ignore_case),
1561        zone_(zone) { }
1562
1563  void BuildTable(ChoiceNode* node);
1564
1565  void AddRange(CharacterRange range) {
1566    table()->AddRange(range, choice_index_, zone_);
1567  }
1568
1569  void AddInverse(ZoneList<CharacterRange>* ranges);
1570
1571#define DECLARE_VISIT(Type)                                          \
1572  virtual void Visit##Type(Type##Node* that);
1573FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1574#undef DECLARE_VISIT
1575
1576  DispatchTable* table() { return table_; }
1577  void set_choice_index(int value) { choice_index_ = value; }
1578
1579 protected:
1580  DispatchTable* table_;
1581  int choice_index_;
1582  bool ignore_case_;
1583  Zone* zone_;
1584};
1585
1586
1587// Assertion propagation moves information about assertions such as
1588// \b to the affected nodes.  For instance, in /.\b./ information must
1589// be propagated to the first '.' that whatever follows needs to know
1590// if it matched a word or a non-word, and to the second '.' that it
1591// has to check if it succeeds a word or non-word.  In this case the
1592// result will be something like:
1593//
1594//   +-------+        +------------+
1595//   |   .   |        |      .     |
1596//   +-------+  --->  +------------+
1597//   | word? |        | check word |
1598//   +-------+        +------------+
1599class Analysis: public NodeVisitor {
1600 public:
1601  Analysis(bool ignore_case, bool is_one_byte)
1602      : ignore_case_(ignore_case),
1603        is_one_byte_(is_one_byte),
1604        error_message_(NULL) {}
1605  void EnsureAnalyzed(RegExpNode* node);
1606
1607#define DECLARE_VISIT(Type)                                          \
1608  virtual void Visit##Type(Type##Node* that);
1609FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1610#undef DECLARE_VISIT
1611  virtual void VisitLoopChoice(LoopChoiceNode* that);
1612
1613  bool has_failed() { return error_message_ != NULL; }
1614  const char* error_message() {
1615    DCHECK(error_message_ != NULL);
1616    return error_message_;
1617  }
1618  void fail(const char* error_message) {
1619    error_message_ = error_message;
1620  }
1621
1622 private:
1623  bool ignore_case_;
1624  bool is_one_byte_;
1625  const char* error_message_;
1626
1627  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1628};
1629
1630
1631struct RegExpCompileData {
1632  RegExpCompileData()
1633    : tree(NULL),
1634      node(NULL),
1635      simple(true),
1636      contains_anchor(false),
1637      capture_count(0) { }
1638  RegExpTree* tree;
1639  RegExpNode* node;
1640  bool simple;
1641  bool contains_anchor;
1642  Handle<String> error;
1643  int capture_count;
1644};
1645
1646
1647class RegExpEngine: public AllStatic {
1648 public:
1649  struct CompilationResult {
1650    CompilationResult(Isolate* isolate, const char* error_message)
1651        : error_message(error_message),
1652          code(isolate->heap()->the_hole_value()),
1653          num_registers(0) {}
1654    CompilationResult(Object* code, int registers)
1655      : error_message(NULL),
1656        code(code),
1657        num_registers(registers) {}
1658    const char* error_message;
1659    Object* code;
1660    int num_registers;
1661  };
1662
1663  static CompilationResult Compile(RegExpCompileData* input, bool ignore_case,
1664                                   bool global, bool multiline, bool sticky,
1665                                   Handle<String> pattern,
1666                                   Handle<String> sample_subject,
1667                                   bool is_one_byte, Zone* zone);
1668
1669  static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1670};
1671
1672
1673} }  // namespace v8::internal
1674
1675#endif  // V8_JSREGEXP_H_
1676