1// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef V8_REGEXP_JSREGEXP_H_
6#define V8_REGEXP_JSREGEXP_H_
7
8#include "src/allocation.h"
9#include "src/assembler.h"
10#include "src/regexp/regexp-ast.h"
11#include "src/regexp/regexp-macro-assembler.h"
12
13namespace v8 {
14namespace internal {
15
16class NodeVisitor;
17class RegExpCompiler;
18class RegExpMacroAssembler;
19class RegExpNode;
20class RegExpTree;
21class BoyerMooreLookahead;
22
23class RegExpImpl {
24 public:
25  // Whether V8 is compiled with native regexp support or not.
26  static bool UsesNativeRegExp() {
27#ifdef V8_INTERPRETED_REGEXP
28    return false;
29#else
30    return true;
31#endif
32  }
33
34  // Returns a string representation of a regular expression.
35  // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
36  // This function calls the garbage collector if necessary.
37  static Handle<String> ToString(Handle<Object> value);
38
39  // Parses the RegExp pattern and prepares the JSRegExp object with
40  // generic data and choice of implementation - as well as what
41  // the implementation wants to store in the data field.
42  // Returns false if compilation fails.
43  MUST_USE_RESULT static MaybeHandle<Object> Compile(Handle<JSRegExp> re,
44                                                     Handle<String> pattern,
45                                                     JSRegExp::Flags flags);
46
47  // See ECMA-262 section 15.10.6.2.
48  // This function calls the garbage collector if necessary.
49  V8_EXPORT_PRIVATE MUST_USE_RESULT static MaybeHandle<Object> Exec(
50      Handle<JSRegExp> regexp, Handle<String> subject, int index,
51      Handle<RegExpMatchInfo> last_match_info);
52
53  // Prepares a JSRegExp object with Irregexp-specific data.
54  static void IrregexpInitialize(Handle<JSRegExp> re,
55                                 Handle<String> pattern,
56                                 JSRegExp::Flags flags,
57                                 int capture_register_count);
58
59
60  static void AtomCompile(Handle<JSRegExp> re,
61                          Handle<String> pattern,
62                          JSRegExp::Flags flags,
63                          Handle<String> match_pattern);
64
65
66  static int AtomExecRaw(Handle<JSRegExp> regexp,
67                         Handle<String> subject,
68                         int index,
69                         int32_t* output,
70                         int output_size);
71
72  static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
73                                 Handle<String> subject, int index,
74                                 Handle<RegExpMatchInfo> last_match_info);
75
76  enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
77
78  // Prepare a RegExp for being executed one or more times (using
79  // IrregexpExecOnce) on the subject.
80  // This ensures that the regexp is compiled for the subject, and that
81  // the subject is flat.
82  // Returns the number of integer spaces required by IrregexpExecOnce
83  // as its "registers" argument.  If the regexp cannot be compiled,
84  // an exception is set as pending, and this function returns negative.
85  static int IrregexpPrepare(Handle<JSRegExp> regexp,
86                             Handle<String> subject);
87
88  // Execute a regular expression on the subject, starting from index.
89  // If matching succeeds, return the number of matches.  This can be larger
90  // than one in the case of global regular expressions.
91  // The captures and subcaptures are stored into the registers vector.
92  // If matching fails, returns RE_FAILURE.
93  // If execution fails, sets a pending exception and returns RE_EXCEPTION.
94  static int IrregexpExecRaw(Handle<JSRegExp> regexp,
95                             Handle<String> subject,
96                             int index,
97                             int32_t* output,
98                             int output_size);
99
100  // Execute an Irregexp bytecode pattern.
101  // On a successful match, the result is a JSArray containing
102  // captured positions.  On a failure, the result is the null value.
103  // Returns an empty handle in case of an exception.
104  MUST_USE_RESULT static MaybeHandle<Object> IrregexpExec(
105      Handle<JSRegExp> regexp, Handle<String> subject, int index,
106      Handle<RegExpMatchInfo> last_match_info);
107
108  // Set last match info.  If match is NULL, then setting captures is omitted.
109  static Handle<RegExpMatchInfo> SetLastMatchInfo(
110      Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
111      int capture_count, int32_t* match);
112
113  class GlobalCache {
114   public:
115    GlobalCache(Handle<JSRegExp> regexp,
116                Handle<String> subject,
117                Isolate* isolate);
118
119    INLINE(~GlobalCache());
120
121    // Fetch the next entry in the cache for global regexp match results.
122    // This does not set the last match info.  Upon failure, NULL is returned.
123    // The cause can be checked with Result().  The previous
124    // result is still in available in memory when a failure happens.
125    INLINE(int32_t* FetchNext());
126
127    INLINE(int32_t* LastSuccessfulMatch());
128
129    INLINE(bool HasException()) { return num_matches_ < 0; }
130
131   private:
132    int AdvanceZeroLength(int last_index);
133
134    int num_matches_;
135    int max_matches_;
136    int current_match_index_;
137    int registers_per_match_;
138    // Pointer to the last set of captures.
139    int32_t* register_array_;
140    int register_array_size_;
141    Handle<JSRegExp> regexp_;
142    Handle<String> subject_;
143  };
144
145  // For acting on the JSRegExp data FixedArray.
146  static int IrregexpMaxRegisterCount(FixedArray* re);
147  static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
148  static void SetIrregexpCaptureNameMap(FixedArray* re,
149                                        Handle<FixedArray> value);
150  static int IrregexpNumberOfCaptures(FixedArray* re);
151  static int IrregexpNumberOfRegisters(FixedArray* re);
152  static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
153  static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
154
155  // Limit the space regexps take up on the heap.  In order to limit this we
156  // would like to keep track of the amount of regexp code on the heap.  This
157  // is not tracked, however.  As a conservative approximation we track the
158  // total regexp code compiled including code that has subsequently been freed
159  // and the total executable memory at any point.
160  static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
161  static const int kRegExpCompiledLimit = 1 * MB;
162  static const int kRegExpTooLargeToOptimize = 20 * KB;
163
164 private:
165  static bool CompileIrregexp(Handle<JSRegExp> re,
166                              Handle<String> sample_subject, bool is_one_byte);
167  static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re,
168                                            Handle<String> sample_subject,
169                                            bool is_one_byte);
170};
171
172
173// Represents the location of one element relative to the intersection of
174// two sets. Corresponds to the four areas of a Venn diagram.
175enum ElementInSetsRelation {
176  kInsideNone = 0,
177  kInsideFirst = 1,
178  kInsideSecond = 2,
179  kInsideBoth = 3
180};
181
182
183// A set of unsigned integers that behaves especially well on small
184// integers (< 32).  May do zone-allocation.
185class OutSet: public ZoneObject {
186 public:
187  OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
188  OutSet* Extend(unsigned value, Zone* zone);
189  bool Get(unsigned value) const;
190  static const unsigned kFirstLimit = 32;
191
192 private:
193  // Destructively set a value in this set.  In most cases you want
194  // to use Extend instead to ensure that only one instance exists
195  // that contains the same values.
196  void Set(unsigned value, Zone* zone);
197
198  // The successors are a list of sets that contain the same values
199  // as this set and the one more value that is not present in this
200  // set.
201  ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
202
203  OutSet(uint32_t first, ZoneList<unsigned>* remaining)
204      : first_(first), remaining_(remaining), successors_(NULL) { }
205  uint32_t first_;
206  ZoneList<unsigned>* remaining_;
207  ZoneList<OutSet*>* successors_;
208  friend class Trace;
209};
210
211
212// A mapping from integers, specified as ranges, to a set of integers.
213// Used for mapping character ranges to choices.
214class DispatchTable : public ZoneObject {
215 public:
216  explicit DispatchTable(Zone* zone) : tree_(zone) { }
217
218  class Entry {
219   public:
220    Entry() : from_(0), to_(0), out_set_(NULL) { }
221    Entry(uc32 from, uc32 to, OutSet* out_set)
222        : from_(from), to_(to), out_set_(out_set) {
223      DCHECK(from <= to);
224    }
225    uc32 from() { return from_; }
226    uc32 to() { return to_; }
227    void set_to(uc32 value) { to_ = value; }
228    void AddValue(int value, Zone* zone) {
229      out_set_ = out_set_->Extend(value, zone);
230    }
231    OutSet* out_set() { return out_set_; }
232   private:
233    uc32 from_;
234    uc32 to_;
235    OutSet* out_set_;
236  };
237
238  class Config {
239   public:
240    typedef uc32 Key;
241    typedef Entry Value;
242    static const uc32 kNoKey;
243    static const Entry NoValue() { return Value(); }
244    static inline int Compare(uc32 a, uc32 b) {
245      if (a == b)
246        return 0;
247      else if (a < b)
248        return -1;
249      else
250        return 1;
251    }
252  };
253
254  void AddRange(CharacterRange range, int value, Zone* zone);
255  OutSet* Get(uc32 value);
256  void Dump();
257
258  template <typename Callback>
259  void ForEach(Callback* callback) {
260    return tree()->ForEach(callback);
261  }
262
263 private:
264  // There can't be a static empty set since it allocates its
265  // successors in a zone and caches them.
266  OutSet* empty() { return &empty_; }
267  OutSet empty_;
268  ZoneSplayTree<Config>* tree() { return &tree_; }
269  ZoneSplayTree<Config> tree_;
270};
271
272
273// Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
274class UnicodeRangeSplitter {
275 public:
276  UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
277  void Call(uc32 from, DispatchTable::Entry entry);
278
279  ZoneList<CharacterRange>* bmp() { return bmp_; }
280  ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
281  ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
282  ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
283
284 private:
285  static const int kBase = 0;
286  // Separate ranges into
287  static const int kBmpCodePoints = 1;
288  static const int kLeadSurrogates = 2;
289  static const int kTrailSurrogates = 3;
290  static const int kNonBmpCodePoints = 4;
291
292  Zone* zone_;
293  DispatchTable table_;
294  ZoneList<CharacterRange>* bmp_;
295  ZoneList<CharacterRange>* lead_surrogates_;
296  ZoneList<CharacterRange>* trail_surrogates_;
297  ZoneList<CharacterRange>* non_bmp_;
298};
299
300
301#define FOR_EACH_NODE_TYPE(VISIT)                                    \
302  VISIT(End)                                                         \
303  VISIT(Action)                                                      \
304  VISIT(Choice)                                                      \
305  VISIT(BackReference)                                               \
306  VISIT(Assertion)                                                   \
307  VISIT(Text)
308
309
310class Trace;
311struct PreloadState;
312class GreedyLoopState;
313class AlternativeGenerationList;
314
315struct NodeInfo {
316  NodeInfo()
317      : being_analyzed(false),
318        been_analyzed(false),
319        follows_word_interest(false),
320        follows_newline_interest(false),
321        follows_start_interest(false),
322        at_end(false),
323        visited(false),
324        replacement_calculated(false) { }
325
326  // Returns true if the interests and assumptions of this node
327  // matches the given one.
328  bool Matches(NodeInfo* that) {
329    return (at_end == that->at_end) &&
330           (follows_word_interest == that->follows_word_interest) &&
331           (follows_newline_interest == that->follows_newline_interest) &&
332           (follows_start_interest == that->follows_start_interest);
333  }
334
335  // Updates the interests of this node given the interests of the
336  // node preceding it.
337  void AddFromPreceding(NodeInfo* that) {
338    at_end |= that->at_end;
339    follows_word_interest |= that->follows_word_interest;
340    follows_newline_interest |= that->follows_newline_interest;
341    follows_start_interest |= that->follows_start_interest;
342  }
343
344  bool HasLookbehind() {
345    return follows_word_interest ||
346           follows_newline_interest ||
347           follows_start_interest;
348  }
349
350  // Sets the interests of this node to include the interests of the
351  // following node.
352  void AddFromFollowing(NodeInfo* that) {
353    follows_word_interest |= that->follows_word_interest;
354    follows_newline_interest |= that->follows_newline_interest;
355    follows_start_interest |= that->follows_start_interest;
356  }
357
358  void ResetCompilationState() {
359    being_analyzed = false;
360    been_analyzed = false;
361  }
362
363  bool being_analyzed: 1;
364  bool been_analyzed: 1;
365
366  // These bits are set of this node has to know what the preceding
367  // character was.
368  bool follows_word_interest: 1;
369  bool follows_newline_interest: 1;
370  bool follows_start_interest: 1;
371
372  bool at_end: 1;
373  bool visited: 1;
374  bool replacement_calculated: 1;
375};
376
377
378// Details of a quick mask-compare check that can look ahead in the
379// input stream.
380class QuickCheckDetails {
381 public:
382  QuickCheckDetails()
383      : characters_(0),
384        mask_(0),
385        value_(0),
386        cannot_match_(false) { }
387  explicit QuickCheckDetails(int characters)
388      : characters_(characters),
389        mask_(0),
390        value_(0),
391        cannot_match_(false) { }
392  bool Rationalize(bool one_byte);
393  // Merge in the information from another branch of an alternation.
394  void Merge(QuickCheckDetails* other, int from_index);
395  // Advance the current position by some amount.
396  void Advance(int by, bool one_byte);
397  void Clear();
398  bool cannot_match() { return cannot_match_; }
399  void set_cannot_match() { cannot_match_ = true; }
400  struct Position {
401    Position() : mask(0), value(0), determines_perfectly(false) { }
402    uc16 mask;
403    uc16 value;
404    bool determines_perfectly;
405  };
406  int characters() { return characters_; }
407  void set_characters(int characters) { characters_ = characters; }
408  Position* positions(int index) {
409    DCHECK(index >= 0);
410    DCHECK(index < characters_);
411    return positions_ + index;
412  }
413  uint32_t mask() { return mask_; }
414  uint32_t value() { return value_; }
415
416 private:
417  // How many characters do we have quick check information from.  This is
418  // the same for all branches of a choice node.
419  int characters_;
420  Position positions_[4];
421  // These values are the condensate of the above array after Rationalize().
422  uint32_t mask_;
423  uint32_t value_;
424  // If set to true, there is no way this quick check can match at all.
425  // E.g., if it requires to be at the start of the input, and isn't.
426  bool cannot_match_;
427};
428
429
430extern int kUninitializedRegExpNodePlaceHolder;
431
432
433class RegExpNode: public ZoneObject {
434 public:
435  explicit RegExpNode(Zone* zone)
436      : replacement_(NULL), on_work_list_(false), trace_count_(0), zone_(zone) {
437    bm_info_[0] = bm_info_[1] = NULL;
438  }
439  virtual ~RegExpNode();
440  virtual void Accept(NodeVisitor* visitor) = 0;
441  // Generates a goto to this node or actually generates the code at this point.
442  virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
443  // How many characters must this node consume at a minimum in order to
444  // succeed.  If we have found at least 'still_to_find' characters that
445  // must be consumed there is no need to ask any following nodes whether
446  // they are sure to eat any more characters.  The not_at_start argument is
447  // used to indicate that we know we are not at the start of the input.  In
448  // this case anchored branches will always fail and can be ignored when
449  // determining how many characters are consumed on success.
450  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
451  // Emits some quick code that checks whether the preloaded characters match.
452  // Falls through on certain failure, jumps to the label on possible success.
453  // If the node cannot make a quick check it does nothing and returns false.
454  bool EmitQuickCheck(RegExpCompiler* compiler,
455                      Trace* bounds_check_trace,
456                      Trace* trace,
457                      bool preload_has_checked_bounds,
458                      Label* on_possible_success,
459                      QuickCheckDetails* details_return,
460                      bool fall_through_on_failure);
461  // For a given number of characters this returns a mask and a value.  The
462  // next n characters are anded with the mask and compared with the value.
463  // A comparison failure indicates the node cannot match the next n characters.
464  // A comparison success indicates the node may match.
465  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
466                                    RegExpCompiler* compiler,
467                                    int characters_filled_in,
468                                    bool not_at_start) = 0;
469  static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
470  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
471  // Only returns the successor for a text node of length 1 that matches any
472  // character and that has no guards on it.
473  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
474      RegExpCompiler* compiler) {
475    return NULL;
476  }
477
478  // Collects information on the possible code units (mod 128) that can match if
479  // we look forward.  This is used for a Boyer-Moore-like string searching
480  // implementation.  TODO(erikcorry):  This should share more code with
481  // EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
482  // the number of nodes we are willing to look at in order to create this data.
483  static const int kRecursionBudget = 200;
484  bool KeepRecursing(RegExpCompiler* compiler);
485  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
486                            BoyerMooreLookahead* bm, bool not_at_start) {
487    UNREACHABLE();
488  }
489
490  // If we know that the input is one-byte then there are some nodes that can
491  // never match.  This method returns a node that can be substituted for
492  // itself, or NULL if the node can never match.
493  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
494    return this;
495  }
496  // Helper for FilterOneByte.
497  RegExpNode* replacement() {
498    DCHECK(info()->replacement_calculated);
499    return replacement_;
500  }
501  RegExpNode* set_replacement(RegExpNode* replacement) {
502    info()->replacement_calculated = true;
503    replacement_ =  replacement;
504    return replacement;  // For convenience.
505  }
506
507  // We want to avoid recalculating the lookahead info, so we store it on the
508  // node.  Only info that is for this node is stored.  We can tell that the
509  // info is for this node when offset == 0, so the information is calculated
510  // relative to this node.
511  void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
512    if (offset == 0) set_bm_info(not_at_start, bm);
513  }
514
515  Label* label() { return &label_; }
516  // If non-generic code is generated for a node (i.e. the node is not at the
517  // start of the trace) then it cannot be reused.  This variable sets a limit
518  // on how often we allow that to happen before we insist on starting a new
519  // trace and generating generic code for a node that can be reused by flushing
520  // the deferred actions in the current trace and generating a goto.
521  static const int kMaxCopiesCodeGenerated = 10;
522
523  bool on_work_list() { return on_work_list_; }
524  void set_on_work_list(bool value) { on_work_list_ = value; }
525
526  NodeInfo* info() { return &info_; }
527
528  BoyerMooreLookahead* bm_info(bool not_at_start) {
529    return bm_info_[not_at_start ? 1 : 0];
530  }
531
532  Zone* zone() const { return zone_; }
533
534 protected:
535  enum LimitResult { DONE, CONTINUE };
536  RegExpNode* replacement_;
537
538  LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
539
540  void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
541    bm_info_[not_at_start ? 1 : 0] = bm;
542  }
543
544 private:
545  static const int kFirstCharBudget = 10;
546  Label label_;
547  bool on_work_list_;
548  NodeInfo info_;
549  // This variable keeps track of how many times code has been generated for
550  // this node (in different traces).  We don't keep track of where the
551  // generated code is located unless the code is generated at the start of
552  // a trace, in which case it is generic and can be reused by flushing the
553  // deferred operations in the current trace and generating a goto.
554  int trace_count_;
555  BoyerMooreLookahead* bm_info_[2];
556
557  Zone* zone_;
558};
559
560
561class SeqRegExpNode: public RegExpNode {
562 public:
563  explicit SeqRegExpNode(RegExpNode* on_success)
564      : RegExpNode(on_success->zone()), on_success_(on_success) { }
565  RegExpNode* on_success() { return on_success_; }
566  void set_on_success(RegExpNode* node) { on_success_ = node; }
567  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
568  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
569                            BoyerMooreLookahead* bm, bool not_at_start) {
570    on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
571    if (offset == 0) set_bm_info(not_at_start, bm);
572  }
573
574 protected:
575  RegExpNode* FilterSuccessor(int depth, bool ignore_case);
576
577 private:
578  RegExpNode* on_success_;
579};
580
581
582class ActionNode: public SeqRegExpNode {
583 public:
584  enum ActionType {
585    SET_REGISTER,
586    INCREMENT_REGISTER,
587    STORE_POSITION,
588    BEGIN_SUBMATCH,
589    POSITIVE_SUBMATCH_SUCCESS,
590    EMPTY_MATCH_CHECK,
591    CLEAR_CAPTURES
592  };
593  static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
594  static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
595  static ActionNode* StorePosition(int reg,
596                                   bool is_capture,
597                                   RegExpNode* on_success);
598  static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
599  static ActionNode* BeginSubmatch(int stack_pointer_reg,
600                                   int position_reg,
601                                   RegExpNode* on_success);
602  static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
603                                             int restore_reg,
604                                             int clear_capture_count,
605                                             int clear_capture_from,
606                                             RegExpNode* on_success);
607  static ActionNode* EmptyMatchCheck(int start_register,
608                                     int repetition_register,
609                                     int repetition_limit,
610                                     RegExpNode* on_success);
611  virtual void Accept(NodeVisitor* visitor);
612  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
613  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
614  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
615                                    RegExpCompiler* compiler,
616                                    int filled_in,
617                                    bool not_at_start) {
618    return on_success()->GetQuickCheckDetails(
619        details, compiler, filled_in, not_at_start);
620  }
621  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
622                            BoyerMooreLookahead* bm, bool not_at_start);
623  ActionType action_type() { return action_type_; }
624  // TODO(erikcorry): We should allow some action nodes in greedy loops.
625  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
626
627 private:
628  union {
629    struct {
630      int reg;
631      int value;
632    } u_store_register;
633    struct {
634      int reg;
635    } u_increment_register;
636    struct {
637      int reg;
638      bool is_capture;
639    } u_position_register;
640    struct {
641      int stack_pointer_register;
642      int current_position_register;
643      int clear_register_count;
644      int clear_register_from;
645    } u_submatch;
646    struct {
647      int start_register;
648      int repetition_register;
649      int repetition_limit;
650    } u_empty_match_check;
651    struct {
652      int range_from;
653      int range_to;
654    } u_clear_captures;
655  } data_;
656  ActionNode(ActionType action_type, RegExpNode* on_success)
657      : SeqRegExpNode(on_success),
658        action_type_(action_type) { }
659  ActionType action_type_;
660  friend class DotPrinter;
661};
662
663
664class TextNode: public SeqRegExpNode {
665 public:
666  TextNode(ZoneList<TextElement>* elms, bool read_backward,
667           RegExpNode* on_success)
668      : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
669  TextNode(RegExpCharacterClass* that, bool read_backward,
670           RegExpNode* on_success)
671      : SeqRegExpNode(on_success),
672        elms_(new (zone()) ZoneList<TextElement>(1, zone())),
673        read_backward_(read_backward) {
674    elms_->Add(TextElement::CharClass(that), zone());
675  }
676  // Create TextNode for a single character class for the given ranges.
677  static TextNode* CreateForCharacterRanges(Zone* zone,
678                                            ZoneList<CharacterRange>* ranges,
679                                            bool read_backward,
680                                            RegExpNode* on_success);
681  // Create TextNode for a surrogate pair with a range given for the
682  // lead and the trail surrogate each.
683  static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
684                                          CharacterRange trail,
685                                          bool read_backward,
686                                          RegExpNode* on_success);
687  virtual void Accept(NodeVisitor* visitor);
688  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
689  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
690  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
691                                    RegExpCompiler* compiler,
692                                    int characters_filled_in,
693                                    bool not_at_start);
694  ZoneList<TextElement>* elements() { return elms_; }
695  bool read_backward() { return read_backward_; }
696  void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
697  virtual int GreedyLoopTextLength();
698  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
699      RegExpCompiler* compiler);
700  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
701                            BoyerMooreLookahead* bm, bool not_at_start);
702  void CalculateOffsets();
703  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
704
705 private:
706  enum TextEmitPassType {
707    NON_LATIN1_MATCH,            // Check for characters that can't match.
708    SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
709    NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
710    CASE_CHARACTER_MATCH,        // Case-independent single character check.
711    CHARACTER_CLASS_MATCH        // Character class.
712  };
713  static bool SkipPass(int pass, bool ignore_case);
714  static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
715  static const int kLastPass = CHARACTER_CLASS_MATCH;
716  void TextEmitPass(RegExpCompiler* compiler,
717                    TextEmitPassType pass,
718                    bool preloaded,
719                    Trace* trace,
720                    bool first_element_checked,
721                    int* checked_up_to);
722  int Length();
723  ZoneList<TextElement>* elms_;
724  bool read_backward_;
725};
726
727
728class AssertionNode: public SeqRegExpNode {
729 public:
730  enum AssertionType {
731    AT_END,
732    AT_START,
733    AT_BOUNDARY,
734    AT_NON_BOUNDARY,
735    AFTER_NEWLINE
736  };
737  static AssertionNode* AtEnd(RegExpNode* on_success) {
738    return new(on_success->zone()) AssertionNode(AT_END, on_success);
739  }
740  static AssertionNode* AtStart(RegExpNode* on_success) {
741    return new(on_success->zone()) AssertionNode(AT_START, on_success);
742  }
743  static AssertionNode* AtBoundary(RegExpNode* on_success) {
744    return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
745  }
746  static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
747    return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
748  }
749  static AssertionNode* AfterNewline(RegExpNode* on_success) {
750    return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
751  }
752  virtual void Accept(NodeVisitor* visitor);
753  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
754  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
755  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
756                                    RegExpCompiler* compiler,
757                                    int filled_in,
758                                    bool not_at_start);
759  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
760                            BoyerMooreLookahead* bm, bool not_at_start);
761  AssertionType assertion_type() { return assertion_type_; }
762
763 private:
764  void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
765  enum IfPrevious { kIsNonWord, kIsWord };
766  void BacktrackIfPrevious(RegExpCompiler* compiler,
767                           Trace* trace,
768                           IfPrevious backtrack_if_previous);
769  AssertionNode(AssertionType t, RegExpNode* on_success)
770      : SeqRegExpNode(on_success), assertion_type_(t) { }
771  AssertionType assertion_type_;
772};
773
774
775class BackReferenceNode: public SeqRegExpNode {
776 public:
777  BackReferenceNode(int start_reg, int end_reg, bool read_backward,
778                    RegExpNode* on_success)
779      : SeqRegExpNode(on_success),
780        start_reg_(start_reg),
781        end_reg_(end_reg),
782        read_backward_(read_backward) {}
783  virtual void Accept(NodeVisitor* visitor);
784  int start_register() { return start_reg_; }
785  int end_register() { return end_reg_; }
786  bool read_backward() { return read_backward_; }
787  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
788  virtual int EatsAtLeast(int still_to_find,
789                          int recursion_depth,
790                          bool not_at_start);
791  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
792                                    RegExpCompiler* compiler,
793                                    int characters_filled_in,
794                                    bool not_at_start) {
795    return;
796  }
797  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
798                            BoyerMooreLookahead* bm, bool not_at_start);
799
800 private:
801  int start_reg_;
802  int end_reg_;
803  bool read_backward_;
804};
805
806
807class EndNode: public RegExpNode {
808 public:
809  enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
810  EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
811  virtual void Accept(NodeVisitor* visitor);
812  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
813  virtual int EatsAtLeast(int still_to_find,
814                          int recursion_depth,
815                          bool not_at_start) { return 0; }
816  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
817                                    RegExpCompiler* compiler,
818                                    int characters_filled_in,
819                                    bool not_at_start) {
820    // Returning 0 from EatsAtLeast should ensure we never get here.
821    UNREACHABLE();
822  }
823  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
824                            BoyerMooreLookahead* bm, bool not_at_start) {
825    // Returning 0 from EatsAtLeast should ensure we never get here.
826    UNREACHABLE();
827  }
828
829 private:
830  Action action_;
831};
832
833
834class NegativeSubmatchSuccess: public EndNode {
835 public:
836  NegativeSubmatchSuccess(int stack_pointer_reg,
837                          int position_reg,
838                          int clear_capture_count,
839                          int clear_capture_start,
840                          Zone* zone)
841      : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
842        stack_pointer_register_(stack_pointer_reg),
843        current_position_register_(position_reg),
844        clear_capture_count_(clear_capture_count),
845        clear_capture_start_(clear_capture_start) { }
846  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
847
848 private:
849  int stack_pointer_register_;
850  int current_position_register_;
851  int clear_capture_count_;
852  int clear_capture_start_;
853};
854
855
856class Guard: public ZoneObject {
857 public:
858  enum Relation { LT, GEQ };
859  Guard(int reg, Relation op, int value)
860      : reg_(reg),
861        op_(op),
862        value_(value) { }
863  int reg() { return reg_; }
864  Relation op() { return op_; }
865  int value() { return value_; }
866
867 private:
868  int reg_;
869  Relation op_;
870  int value_;
871};
872
873
874class GuardedAlternative {
875 public:
876  explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { }
877  void AddGuard(Guard* guard, Zone* zone);
878  RegExpNode* node() { return node_; }
879  void set_node(RegExpNode* node) { node_ = node; }
880  ZoneList<Guard*>* guards() { return guards_; }
881
882 private:
883  RegExpNode* node_;
884  ZoneList<Guard*>* guards_;
885};
886
887
888class AlternativeGeneration;
889
890
891class ChoiceNode: public RegExpNode {
892 public:
893  explicit ChoiceNode(int expected_size, Zone* zone)
894      : RegExpNode(zone),
895        alternatives_(new(zone)
896                      ZoneList<GuardedAlternative>(expected_size, zone)),
897        table_(NULL),
898        not_at_start_(false),
899        being_calculated_(false) { }
900  virtual void Accept(NodeVisitor* visitor);
901  void AddAlternative(GuardedAlternative node) {
902    alternatives()->Add(node, zone());
903  }
904  ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
905  DispatchTable* GetTable(bool ignore_case);
906  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
907  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
908  int EatsAtLeastHelper(int still_to_find,
909                        int budget,
910                        RegExpNode* ignore_this_node,
911                        bool not_at_start);
912  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
913                                    RegExpCompiler* compiler,
914                                    int characters_filled_in,
915                                    bool not_at_start);
916  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
917                            BoyerMooreLookahead* bm, bool not_at_start);
918
919  bool being_calculated() { return being_calculated_; }
920  bool not_at_start() { return not_at_start_; }
921  void set_not_at_start() { not_at_start_ = true; }
922  void set_being_calculated(bool b) { being_calculated_ = b; }
923  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
924    return true;
925  }
926  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
927  virtual bool read_backward() { return false; }
928
929 protected:
930  int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
931  ZoneList<GuardedAlternative>* alternatives_;
932
933 private:
934  friend class DispatchTableConstructor;
935  friend class Analysis;
936  void GenerateGuard(RegExpMacroAssembler* macro_assembler,
937                     Guard* guard,
938                     Trace* trace);
939  int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
940  void EmitOutOfLineContinuation(RegExpCompiler* compiler,
941                                 Trace* trace,
942                                 GuardedAlternative alternative,
943                                 AlternativeGeneration* alt_gen,
944                                 int preload_characters,
945                                 bool next_expects_preload);
946  void SetUpPreLoad(RegExpCompiler* compiler,
947                    Trace* current_trace,
948                    PreloadState* preloads);
949  void AssertGuardsMentionRegisters(Trace* trace);
950  int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
951  Trace* EmitGreedyLoop(RegExpCompiler* compiler,
952                        Trace* trace,
953                        AlternativeGenerationList* alt_gens,
954                        PreloadState* preloads,
955                        GreedyLoopState* greedy_loop_state,
956                        int text_length);
957  void EmitChoices(RegExpCompiler* compiler,
958                   AlternativeGenerationList* alt_gens,
959                   int first_choice,
960                   Trace* trace,
961                   PreloadState* preloads);
962  DispatchTable* table_;
963  // If true, this node is never checked at the start of the input.
964  // Allows a new trace to start with at_start() set to false.
965  bool not_at_start_;
966  bool being_calculated_;
967};
968
969
970class NegativeLookaroundChoiceNode : public ChoiceNode {
971 public:
972  explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
973                                        GuardedAlternative then_do_this,
974                                        Zone* zone)
975      : ChoiceNode(2, zone) {
976    AddAlternative(this_must_fail);
977    AddAlternative(then_do_this);
978  }
979  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
980  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
981                                    RegExpCompiler* compiler,
982                                    int characters_filled_in,
983                                    bool not_at_start);
984  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
985                            BoyerMooreLookahead* bm, bool not_at_start) {
986    alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
987                                              not_at_start);
988    if (offset == 0) set_bm_info(not_at_start, bm);
989  }
990  // For a negative lookahead we don't emit the quick check for the
991  // alternative that is expected to fail.  This is because quick check code
992  // starts by loading enough characters for the alternative that takes fewest
993  // characters, but on a negative lookahead the negative branch did not take
994  // part in that calculation (EatsAtLeast) so the assumptions don't hold.
995  virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
996    return !is_first;
997  }
998  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
999};
1000
1001
1002class LoopChoiceNode: public ChoiceNode {
1003 public:
1004  LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1005      : ChoiceNode(2, zone),
1006        loop_node_(NULL),
1007        continue_node_(NULL),
1008        body_can_be_zero_length_(body_can_be_zero_length),
1009        read_backward_(read_backward) {}
1010  void AddLoopAlternative(GuardedAlternative alt);
1011  void AddContinueAlternative(GuardedAlternative alt);
1012  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1013  virtual int EatsAtLeast(int still_to_find,  int budget, bool not_at_start);
1014  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1015                                    RegExpCompiler* compiler,
1016                                    int characters_filled_in,
1017                                    bool not_at_start);
1018  virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1019                            BoyerMooreLookahead* bm, bool not_at_start);
1020  RegExpNode* loop_node() { return loop_node_; }
1021  RegExpNode* continue_node() { return continue_node_; }
1022  bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1023  virtual bool read_backward() { return read_backward_; }
1024  virtual void Accept(NodeVisitor* visitor);
1025  virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1026
1027 private:
1028  // AddAlternative is made private for loop nodes because alternatives
1029  // should not be added freely, we need to keep track of which node
1030  // goes back to the node itself.
1031  void AddAlternative(GuardedAlternative node) {
1032    ChoiceNode::AddAlternative(node);
1033  }
1034
1035  RegExpNode* loop_node_;
1036  RegExpNode* continue_node_;
1037  bool body_can_be_zero_length_;
1038  bool read_backward_;
1039};
1040
1041
1042// Improve the speed that we scan for an initial point where a non-anchored
1043// regexp can match by using a Boyer-Moore-like table. This is done by
1044// identifying non-greedy non-capturing loops in the nodes that eat any
1045// character one at a time.  For example in the middle of the regexp
1046// /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
1047// inserted at the start of any non-anchored regexp.
1048//
1049// When we have found such a loop we look ahead in the nodes to find the set of
1050// characters that can come at given distances. For example for the regexp
1051// /.?foo/ we know that there are at least 3 characters ahead of us, and the
1052// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1053// the lookahead info where the set of characters is reasonably constrained. In
1054// our example this is from index 1 to 2 (0 is not constrained). We can now
1055// look 3 characters ahead and if we don't find one of [f, o] (the union of
1056// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1057//
1058// For Unicode input strings we do the same, but modulo 128.
1059//
1060// We also look at the first string fed to the regexp and use that to get a hint
1061// of the character frequencies in the inputs. This affects the assessment of
1062// whether the set of characters is 'reasonably constrained'.
1063//
1064// We also have another lookahead mechanism (called quick check in the code),
1065// which uses a wide load of multiple characters followed by a mask and compare
1066// to determine whether a match is possible at this point.
1067enum ContainedInLattice {
1068  kNotYet = 0,
1069  kLatticeIn = 1,
1070  kLatticeOut = 2,
1071  kLatticeUnknown = 3  // Can also mean both in and out.
1072};
1073
1074
1075inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1076  return static_cast<ContainedInLattice>(a | b);
1077}
1078
1079
1080ContainedInLattice AddRange(ContainedInLattice a,
1081                            const int* ranges,
1082                            int ranges_size,
1083                            Interval new_range);
1084
1085
1086class BoyerMoorePositionInfo : public ZoneObject {
1087 public:
1088  explicit BoyerMoorePositionInfo(Zone* zone)
1089      : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1090        map_count_(0),
1091        w_(kNotYet),
1092        s_(kNotYet),
1093        d_(kNotYet),
1094        surrogate_(kNotYet) {
1095     for (int i = 0; i < kMapSize; i++) {
1096       map_->Add(false, zone);
1097     }
1098  }
1099
1100  bool& at(int i) { return map_->at(i); }
1101
1102  static const int kMapSize = 128;
1103  static const int kMask = kMapSize - 1;
1104
1105  int map_count() const { return map_count_; }
1106
1107  void Set(int character);
1108  void SetInterval(const Interval& interval);
1109  void SetAll();
1110  bool is_non_word() { return w_ == kLatticeOut; }
1111  bool is_word() { return w_ == kLatticeIn; }
1112
1113 private:
1114  ZoneList<bool>* map_;
1115  int map_count_;  // Number of set bits in the map.
1116  ContainedInLattice w_;  // The \w character class.
1117  ContainedInLattice s_;  // The \s character class.
1118  ContainedInLattice d_;  // The \d character class.
1119  ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
1120};
1121
1122
1123class BoyerMooreLookahead : public ZoneObject {
1124 public:
1125  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1126
1127  int length() { return length_; }
1128  int max_char() { return max_char_; }
1129  RegExpCompiler* compiler() { return compiler_; }
1130
1131  int Count(int map_number) {
1132    return bitmaps_->at(map_number)->map_count();
1133  }
1134
1135  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1136
1137  void Set(int map_number, int character) {
1138    if (character > max_char_) return;
1139    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1140    info->Set(character);
1141  }
1142
1143  void SetInterval(int map_number, const Interval& interval) {
1144    if (interval.from() > max_char_) return;
1145    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1146    if (interval.to() > max_char_) {
1147      info->SetInterval(Interval(interval.from(), max_char_));
1148    } else {
1149      info->SetInterval(interval);
1150    }
1151  }
1152
1153  void SetAll(int map_number) {
1154    bitmaps_->at(map_number)->SetAll();
1155  }
1156
1157  void SetRest(int from_map) {
1158    for (int i = from_map; i < length_; i++) SetAll(i);
1159  }
1160  void EmitSkipInstructions(RegExpMacroAssembler* masm);
1161
1162 private:
1163  // This is the value obtained by EatsAtLeast.  If we do not have at least this
1164  // many characters left in the sample string then the match is bound to fail.
1165  // Therefore it is OK to read a character this far ahead of the current match
1166  // point.
1167  int length_;
1168  RegExpCompiler* compiler_;
1169  // 0xff for Latin1, 0xffff for UTF-16.
1170  int max_char_;
1171  ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1172
1173  int GetSkipTable(int min_lookahead,
1174                   int max_lookahead,
1175                   Handle<ByteArray> boolean_skip_table);
1176  bool FindWorthwhileInterval(int* from, int* to);
1177  int FindBestInterval(
1178    int max_number_of_chars, int old_biggest_points, int* from, int* to);
1179};
1180
1181
1182// There are many ways to generate code for a node.  This class encapsulates
1183// the current way we should be generating.  In other words it encapsulates
1184// the current state of the code generator.  The effect of this is that we
1185// generate code for paths that the matcher can take through the regular
1186// expression.  A given node in the regexp can be code-generated several times
1187// as it can be part of several traces.  For example for the regexp:
1188// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1189// of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
1190// to match foo is generated only once (the traces have a common prefix).  The
1191// code to store the capture is deferred and generated (twice) after the places
1192// where baz has been matched.
1193class Trace {
1194 public:
1195  // A value for a property that is either known to be true, know to be false,
1196  // or not known.
1197  enum TriBool {
1198    UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1199  };
1200
1201  class DeferredAction {
1202   public:
1203    DeferredAction(ActionNode::ActionType action_type, int reg)
1204        : action_type_(action_type), reg_(reg), next_(NULL) { }
1205    DeferredAction* next() { return next_; }
1206    bool Mentions(int reg);
1207    int reg() { return reg_; }
1208    ActionNode::ActionType action_type() { return action_type_; }
1209   private:
1210    ActionNode::ActionType action_type_;
1211    int reg_;
1212    DeferredAction* next_;
1213    friend class Trace;
1214  };
1215
1216  class DeferredCapture : public DeferredAction {
1217   public:
1218    DeferredCapture(int reg, bool is_capture, Trace* trace)
1219        : DeferredAction(ActionNode::STORE_POSITION, reg),
1220          cp_offset_(trace->cp_offset()),
1221          is_capture_(is_capture) { }
1222    int cp_offset() { return cp_offset_; }
1223    bool is_capture() { return is_capture_; }
1224   private:
1225    int cp_offset_;
1226    bool is_capture_;
1227    void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1228  };
1229
1230  class DeferredSetRegister : public DeferredAction {
1231   public:
1232    DeferredSetRegister(int reg, int value)
1233        : DeferredAction(ActionNode::SET_REGISTER, reg),
1234          value_(value) { }
1235    int value() { return value_; }
1236   private:
1237    int value_;
1238  };
1239
1240  class DeferredClearCaptures : public DeferredAction {
1241   public:
1242    explicit DeferredClearCaptures(Interval range)
1243        : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1244          range_(range) { }
1245    Interval range() { return range_; }
1246   private:
1247    Interval range_;
1248  };
1249
1250  class DeferredIncrementRegister : public DeferredAction {
1251   public:
1252    explicit DeferredIncrementRegister(int reg)
1253        : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1254  };
1255
1256  Trace()
1257      : cp_offset_(0),
1258        actions_(NULL),
1259        backtrack_(NULL),
1260        stop_node_(NULL),
1261        loop_label_(NULL),
1262        characters_preloaded_(0),
1263        bound_checked_up_to_(0),
1264        flush_budget_(100),
1265        at_start_(UNKNOWN) { }
1266
1267  // End the trace.  This involves flushing the deferred actions in the trace
1268  // and pushing a backtrack location onto the backtrack stack.  Once this is
1269  // done we can start a new trace or go to one that has already been
1270  // generated.
1271  void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1272  int cp_offset() { return cp_offset_; }
1273  DeferredAction* actions() { return actions_; }
1274  // A trivial trace is one that has no deferred actions or other state that
1275  // affects the assumptions used when generating code.  There is no recorded
1276  // backtrack location in a trivial trace, so with a trivial trace we will
1277  // generate code that, on a failure to match, gets the backtrack location
1278  // from the backtrack stack rather than using a direct jump instruction.  We
1279  // always start code generation with a trivial trace and non-trivial traces
1280  // are created as we emit code for nodes or add to the list of deferred
1281  // actions in the trace.  The location of the code generated for a node using
1282  // a trivial trace is recorded in a label in the node so that gotos can be
1283  // generated to that code.
1284  bool is_trivial() {
1285    return backtrack_ == NULL &&
1286           actions_ == NULL &&
1287           cp_offset_ == 0 &&
1288           characters_preloaded_ == 0 &&
1289           bound_checked_up_to_ == 0 &&
1290           quick_check_performed_.characters() == 0 &&
1291           at_start_ == UNKNOWN;
1292  }
1293  TriBool at_start() { return at_start_; }
1294  void set_at_start(TriBool at_start) { at_start_ = at_start; }
1295  Label* backtrack() { return backtrack_; }
1296  Label* loop_label() { return loop_label_; }
1297  RegExpNode* stop_node() { return stop_node_; }
1298  int characters_preloaded() { return characters_preloaded_; }
1299  int bound_checked_up_to() { return bound_checked_up_to_; }
1300  int flush_budget() { return flush_budget_; }
1301  QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1302  bool mentions_reg(int reg);
1303  // Returns true if a deferred position store exists to the specified
1304  // register and stores the offset in the out-parameter.  Otherwise
1305  // returns false.
1306  bool GetStoredPosition(int reg, int* cp_offset);
1307  // These set methods and AdvanceCurrentPositionInTrace should be used only on
1308  // new traces - the intention is that traces are immutable after creation.
1309  void add_action(DeferredAction* new_action) {
1310    DCHECK(new_action->next_ == NULL);
1311    new_action->next_ = actions_;
1312    actions_ = new_action;
1313  }
1314  void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1315  void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1316  void set_loop_label(Label* label) { loop_label_ = label; }
1317  void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1318  void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1319  void set_flush_budget(int to) { flush_budget_ = to; }
1320  void set_quick_check_performed(QuickCheckDetails* d) {
1321    quick_check_performed_ = *d;
1322  }
1323  void InvalidateCurrentCharacter();
1324  void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1325
1326 private:
1327  int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1328  void PerformDeferredActions(RegExpMacroAssembler* macro,
1329                              int max_register,
1330                              const OutSet& affected_registers,
1331                              OutSet* registers_to_pop,
1332                              OutSet* registers_to_clear,
1333                              Zone* zone);
1334  void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1335                                int max_register,
1336                                const OutSet& registers_to_pop,
1337                                const OutSet& registers_to_clear);
1338  int cp_offset_;
1339  DeferredAction* actions_;
1340  Label* backtrack_;
1341  RegExpNode* stop_node_;
1342  Label* loop_label_;
1343  int characters_preloaded_;
1344  int bound_checked_up_to_;
1345  QuickCheckDetails quick_check_performed_;
1346  int flush_budget_;
1347  TriBool at_start_;
1348};
1349
1350
1351class GreedyLoopState {
1352 public:
1353  explicit GreedyLoopState(bool not_at_start);
1354
1355  Label* label() { return &label_; }
1356  Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1357
1358 private:
1359  Label label_;
1360  Trace counter_backtrack_trace_;
1361};
1362
1363
1364struct PreloadState {
1365  static const int kEatsAtLeastNotYetInitialized = -1;
1366  bool preload_is_current_;
1367  bool preload_has_checked_bounds_;
1368  int preload_characters_;
1369  int eats_at_least_;
1370  void init() {
1371    eats_at_least_ = kEatsAtLeastNotYetInitialized;
1372  }
1373};
1374
1375
1376class NodeVisitor {
1377 public:
1378  virtual ~NodeVisitor() { }
1379#define DECLARE_VISIT(Type)                                          \
1380  virtual void Visit##Type(Type##Node* that) = 0;
1381FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1382#undef DECLARE_VISIT
1383  virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1384};
1385
1386
1387// Node visitor used to add the start set of the alternatives to the
1388// dispatch table of a choice node.
1389class DispatchTableConstructor: public NodeVisitor {
1390 public:
1391  DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1392                           Zone* zone)
1393      : table_(table),
1394        choice_index_(-1),
1395        ignore_case_(ignore_case),
1396        zone_(zone) { }
1397
1398  void BuildTable(ChoiceNode* node);
1399
1400  void AddRange(CharacterRange range) {
1401    table()->AddRange(range, choice_index_, zone_);
1402  }
1403
1404  void AddInverse(ZoneList<CharacterRange>* ranges);
1405
1406#define DECLARE_VISIT(Type)                                          \
1407  virtual void Visit##Type(Type##Node* that);
1408FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1409#undef DECLARE_VISIT
1410
1411  DispatchTable* table() { return table_; }
1412  void set_choice_index(int value) { choice_index_ = value; }
1413
1414 protected:
1415  DispatchTable* table_;
1416  int choice_index_;
1417  bool ignore_case_;
1418  Zone* zone_;
1419};
1420
1421
1422// Assertion propagation moves information about assertions such as
1423// \b to the affected nodes.  For instance, in /.\b./ information must
1424// be propagated to the first '.' that whatever follows needs to know
1425// if it matched a word or a non-word, and to the second '.' that it
1426// has to check if it succeeds a word or non-word.  In this case the
1427// result will be something like:
1428//
1429//   +-------+        +------------+
1430//   |   .   |        |      .     |
1431//   +-------+  --->  +------------+
1432//   | word? |        | check word |
1433//   +-------+        +------------+
1434class Analysis: public NodeVisitor {
1435 public:
1436  Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
1437      : isolate_(isolate),
1438        flags_(flags),
1439        is_one_byte_(is_one_byte),
1440        error_message_(NULL) {}
1441  void EnsureAnalyzed(RegExpNode* node);
1442
1443#define DECLARE_VISIT(Type)                                          \
1444  virtual void Visit##Type(Type##Node* that);
1445FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1446#undef DECLARE_VISIT
1447  virtual void VisitLoopChoice(LoopChoiceNode* that);
1448
1449  bool has_failed() { return error_message_ != NULL; }
1450  const char* error_message() {
1451    DCHECK(error_message_ != NULL);
1452    return error_message_;
1453  }
1454  void fail(const char* error_message) {
1455    error_message_ = error_message;
1456  }
1457
1458  Isolate* isolate() const { return isolate_; }
1459
1460  bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
1461  bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
1462
1463 private:
1464  Isolate* isolate_;
1465  JSRegExp::Flags flags_;
1466  bool is_one_byte_;
1467  const char* error_message_;
1468
1469  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1470};
1471
1472
1473struct RegExpCompileData {
1474  RegExpCompileData()
1475    : tree(NULL),
1476      node(NULL),
1477      simple(true),
1478      contains_anchor(false),
1479      capture_count(0) { }
1480  RegExpTree* tree;
1481  RegExpNode* node;
1482  bool simple;
1483  bool contains_anchor;
1484  Handle<FixedArray> capture_name_map;
1485  Handle<String> error;
1486  int capture_count;
1487};
1488
1489
1490class RegExpEngine: public AllStatic {
1491 public:
1492  struct CompilationResult {
1493    CompilationResult(Isolate* isolate, const char* error_message)
1494        : error_message(error_message),
1495          code(isolate->heap()->the_hole_value()),
1496          num_registers(0) {}
1497    CompilationResult(Object* code, int registers)
1498        : error_message(NULL), code(code), num_registers(registers) {}
1499    const char* error_message;
1500    Object* code;
1501    int num_registers;
1502  };
1503
1504  static CompilationResult Compile(Isolate* isolate, Zone* zone,
1505                                   RegExpCompileData* input,
1506                                   JSRegExp::Flags flags,
1507                                   Handle<String> pattern,
1508                                   Handle<String> sample_subject,
1509                                   bool is_one_byte);
1510
1511  static bool TooMuchRegExpCode(Handle<String> pattern);
1512
1513  static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1514};
1515
1516
1517class RegExpResultsCache : public AllStatic {
1518 public:
1519  enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1520
1521  // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
1522  // On success, the returned result is guaranteed to be a COW-array.
1523  static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
1524                        FixedArray** last_match_out, ResultsCacheType type);
1525  // Attempt to add value_array to the cache specified by type.  On success,
1526  // value_array is turned into a COW-array.
1527  static void Enter(Isolate* isolate, Handle<String> key_string,
1528                    Handle<Object> key_pattern, Handle<FixedArray> value_array,
1529                    Handle<FixedArray> last_match_cache, ResultsCacheType type);
1530  static void Clear(FixedArray* cache);
1531  static const int kRegExpResultsCacheSize = 0x100;
1532
1533 private:
1534  static const int kArrayEntriesPerCacheEntry = 4;
1535  static const int kStringOffset = 0;
1536  static const int kPatternOffset = 1;
1537  static const int kArrayOffset = 2;
1538  static const int kLastMatchOffset = 3;
1539};
1540
1541}  // namespace internal
1542}  // namespace v8
1543
1544#endif  // V8_REGEXP_JSREGEXP_H_
1545