1// Copyright 2012 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_JSREGEXP_H_
29#define V8_JSREGEXP_H_
30
31#include "allocation.h"
32#include "assembler.h"
33#include "zone-inl.h"
34
35namespace v8 {
36namespace internal {
37
38class NodeVisitor;
39class RegExpCompiler;
40class RegExpMacroAssembler;
41class RegExpNode;
42class RegExpTree;
43class BoyerMooreLookahead;
44
45class RegExpImpl {
46 public:
47  // Whether V8 is compiled with native regexp support or not.
48  static bool UsesNativeRegExp() {
49#ifdef V8_INTERPRETED_REGEXP
50    return false;
51#else
52    return true;
53#endif
54  }
55
56  // Creates a regular expression literal in the old space.
57  // This function calls the garbage collector if necessary.
58  static Handle<Object> CreateRegExpLiteral(Handle<JSFunction> constructor,
59                                            Handle<String> pattern,
60                                            Handle<String> flags,
61                                            bool* has_pending_exception);
62
63  // Returns a string representation of a regular expression.
64  // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
65  // This function calls the garbage collector if necessary.
66  static Handle<String> ToString(Handle<Object> value);
67
68  // Parses the RegExp pattern and prepares the JSRegExp object with
69  // generic data and choice of implementation - as well as what
70  // the implementation wants to store in the data field.
71  // Returns false if compilation fails.
72  static Handle<Object> Compile(Handle<JSRegExp> re,
73                                Handle<String> pattern,
74                                Handle<String> flags);
75
76  // See ECMA-262 section 15.10.6.2.
77  // This function calls the garbage collector if necessary.
78  static Handle<Object> Exec(Handle<JSRegExp> regexp,
79                             Handle<String> subject,
80                             int index,
81                             Handle<JSArray> lastMatchInfo);
82
83  // Prepares a JSRegExp object with Irregexp-specific data.
84  static void IrregexpInitialize(Handle<JSRegExp> re,
85                                 Handle<String> pattern,
86                                 JSRegExp::Flags flags,
87                                 int capture_register_count);
88
89
90  static void AtomCompile(Handle<JSRegExp> re,
91                          Handle<String> pattern,
92                          JSRegExp::Flags flags,
93                          Handle<String> match_pattern);
94
95
96  static int AtomExecRaw(Handle<JSRegExp> regexp,
97                         Handle<String> subject,
98                         int index,
99                         int32_t* output,
100                         int output_size);
101
102
103  static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
104                                 Handle<String> subject,
105                                 int index,
106                                 Handle<JSArray> lastMatchInfo);
107
108  enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
109
110  // Prepare a RegExp for being executed one or more times (using
111  // IrregexpExecOnce) on the subject.
112  // This ensures that the regexp is compiled for the subject, and that
113  // the subject is flat.
114  // Returns the number of integer spaces required by IrregexpExecOnce
115  // as its "registers" argument.  If the regexp cannot be compiled,
116  // an exception is set as pending, and this function returns negative.
117  static int IrregexpPrepare(Handle<JSRegExp> regexp,
118                             Handle<String> subject);
119
120  // Execute a regular expression on the subject, starting from index.
121  // If matching succeeds, return the number of matches.  This can be larger
122  // than one in the case of global regular expressions.
123  // The captures and subcaptures are stored into the registers vector.
124  // If matching fails, returns RE_FAILURE.
125  // If execution fails, sets a pending exception and returns RE_EXCEPTION.
126  static int IrregexpExecRaw(Handle<JSRegExp> regexp,
127                             Handle<String> subject,
128                             int index,
129                             int32_t* output,
130                             int output_size);
131
132  // Execute an Irregexp bytecode pattern.
133  // On a successful match, the result is a JSArray containing
134  // captured positions.  On a failure, the result is the null value.
135  // Returns an empty handle in case of an exception.
136  static Handle<Object> IrregexpExec(Handle<JSRegExp> regexp,
137                                     Handle<String> subject,
138                                     int index,
139                                     Handle<JSArray> lastMatchInfo);
140
141  // Set last match info.  If match is NULL, then setting captures is omitted.
142  static Handle<JSArray> SetLastMatchInfo(Handle<JSArray> last_match_info,
143                                          Handle<String> subject,
144                                          int capture_count,
145                                          int32_t* match);
146
147
148  class GlobalCache {
149   public:
150    GlobalCache(Handle<JSRegExp> regexp,
151                Handle<String> subject,
152                bool is_global,
153                Isolate* isolate);
154
155    INLINE(~GlobalCache());
156
157    // Fetch the next entry in the cache for global regexp match results.
158    // This does not set the last match info.  Upon failure, NULL is returned.
159    // The cause can be checked with Result().  The previous
160    // result is still in available in memory when a failure happens.
161    INLINE(int32_t* FetchNext());
162
163    INLINE(int32_t* LastSuccessfulMatch());
164
165    INLINE(bool HasException()) { return num_matches_ < 0; }
166
167   private:
168    int num_matches_;
169    int max_matches_;
170    int current_match_index_;
171    int registers_per_match_;
172    // Pointer to the last set of captures.
173    int32_t* register_array_;
174    int register_array_size_;
175    Handle<JSRegExp> regexp_;
176    Handle<String> subject_;
177  };
178
179
180  // Array index in the lastMatchInfo array.
181  static const int kLastCaptureCount = 0;
182  static const int kLastSubject = 1;
183  static const int kLastInput = 2;
184  static const int kFirstCapture = 3;
185  static const int kLastMatchOverhead = 3;
186
187  // Direct offset into the lastMatchInfo array.
188  static const int kLastCaptureCountOffset =
189      FixedArray::kHeaderSize + kLastCaptureCount * kPointerSize;
190  static const int kLastSubjectOffset =
191      FixedArray::kHeaderSize + kLastSubject * kPointerSize;
192  static const int kLastInputOffset =
193      FixedArray::kHeaderSize + kLastInput * kPointerSize;
194  static const int kFirstCaptureOffset =
195      FixedArray::kHeaderSize + kFirstCapture * kPointerSize;
196
197  // Used to access the lastMatchInfo array.
198  static int GetCapture(FixedArray* array, int index) {
199    return Smi::cast(array->get(index + kFirstCapture))->value();
200  }
201
202  static void SetLastCaptureCount(FixedArray* array, int to) {
203    array->set(kLastCaptureCount, Smi::FromInt(to));
204  }
205
206  static void SetLastSubject(FixedArray* array, String* to) {
207    array->set(kLastSubject, to);
208  }
209
210  static void SetLastInput(FixedArray* array, String* to) {
211    array->set(kLastInput, to);
212  }
213
214  static void SetCapture(FixedArray* array, int index, int to) {
215    array->set(index + kFirstCapture, Smi::FromInt(to));
216  }
217
218  static int GetLastCaptureCount(FixedArray* array) {
219    return Smi::cast(array->get(kLastCaptureCount))->value();
220  }
221
222  // For acting on the JSRegExp data FixedArray.
223  static int IrregexpMaxRegisterCount(FixedArray* re);
224  static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
225  static int IrregexpNumberOfCaptures(FixedArray* re);
226  static int IrregexpNumberOfRegisters(FixedArray* re);
227  static ByteArray* IrregexpByteCode(FixedArray* re, bool is_ascii);
228  static Code* IrregexpNativeCode(FixedArray* re, bool is_ascii);
229
230  // Limit the space regexps take up on the heap.  In order to limit this we
231  // would like to keep track of the amount of regexp code on the heap.  This
232  // is not tracked, however.  As a conservative approximation we track the
233  // total regexp code compiled including code that has subsequently been freed
234  // and the total executable memory at any point.
235  static const int kRegExpExecutableMemoryLimit = 16 * MB;
236  static const int kRegWxpCompiledLimit = 1 * MB;
237
238 private:
239  static bool CompileIrregexp(
240      Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);
241  static inline bool EnsureCompiledIrregexp(
242      Handle<JSRegExp> re, Handle<String> sample_subject, bool is_ascii);
243};
244
245
246// Represents the location of one element relative to the intersection of
247// two sets. Corresponds to the four areas of a Venn diagram.
248enum ElementInSetsRelation {
249  kInsideNone = 0,
250  kInsideFirst = 1,
251  kInsideSecond = 2,
252  kInsideBoth = 3
253};
254
255
256// Represents code units in the range from from_ to to_, both ends are
257// inclusive.
258class CharacterRange {
259 public:
260  CharacterRange() : from_(0), to_(0) { }
261  // For compatibility with the CHECK_OK macro
262  CharacterRange(void* null) { ASSERT_EQ(NULL, null); }  //NOLINT
263  CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) { }
264  static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
265                             Zone* zone);
266  static Vector<const int> GetWordBounds();
267  static inline CharacterRange Singleton(uc16 value) {
268    return CharacterRange(value, value);
269  }
270  static inline CharacterRange Range(uc16 from, uc16 to) {
271    ASSERT(from <= to);
272    return CharacterRange(from, to);
273  }
274  static inline CharacterRange Everything() {
275    return CharacterRange(0, 0xFFFF);
276  }
277  bool Contains(uc16 i) { return from_ <= i && i <= to_; }
278  uc16 from() const { return from_; }
279  void set_from(uc16 value) { from_ = value; }
280  uc16 to() const { return to_; }
281  void set_to(uc16 value) { to_ = value; }
282  bool is_valid() { return from_ <= to_; }
283  bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
284  bool IsSingleton() { return (from_ == to_); }
285  void AddCaseEquivalents(ZoneList<CharacterRange>* ranges, bool is_ascii,
286                          Zone* zone);
287  static void Split(ZoneList<CharacterRange>* base,
288                    Vector<const int> overlay,
289                    ZoneList<CharacterRange>** included,
290                    ZoneList<CharacterRange>** excluded,
291                    Zone* zone);
292  // Whether a range list is in canonical form: Ranges ordered by from value,
293  // and ranges non-overlapping and non-adjacent.
294  static bool IsCanonical(ZoneList<CharacterRange>* ranges);
295  // Convert range list to canonical form. The characters covered by the ranges
296  // will still be the same, but no character is in more than one range, and
297  // adjacent ranges are merged. The resulting list may be shorter than the
298  // original, but cannot be longer.
299  static void Canonicalize(ZoneList<CharacterRange>* ranges);
300  // Negate the contents of a character range in canonical form.
301  static void Negate(ZoneList<CharacterRange>* src,
302                     ZoneList<CharacterRange>* dst,
303                     Zone* zone);
304  static const int kStartMarker = (1 << 24);
305  static const int kPayloadMask = (1 << 24) - 1;
306
307 private:
308  uc16 from_;
309  uc16 to_;
310};
311
312
313// A set of unsigned integers that behaves especially well on small
314// integers (< 32).  May do zone-allocation.
315class OutSet: public ZoneObject {
316 public:
317  OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
318  OutSet* Extend(unsigned value, Zone* zone);
319  bool Get(unsigned value);
320  static const unsigned kFirstLimit = 32;
321
322 private:
323  // Destructively set a value in this set.  In most cases you want
324  // to use Extend instead to ensure that only one instance exists
325  // that contains the same values.
326  void Set(unsigned value, Zone* zone);
327
328  // The successors are a list of sets that contain the same values
329  // as this set and the one more value that is not present in this
330  // set.
331  ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
332
333  OutSet(uint32_t first, ZoneList<unsigned>* remaining)
334      : first_(first), remaining_(remaining), successors_(NULL) { }
335  uint32_t first_;
336  ZoneList<unsigned>* remaining_;
337  ZoneList<OutSet*>* successors_;
338  friend class Trace;
339};
340
341
342// A mapping from integers, specified as ranges, to a set of integers.
343// Used for mapping character ranges to choices.
344class DispatchTable : public ZoneObject {
345 public:
346  explicit DispatchTable(Zone* zone) : tree_(zone) { }
347
348  class Entry {
349   public:
350    Entry() : from_(0), to_(0), out_set_(NULL) { }
351    Entry(uc16 from, uc16 to, OutSet* out_set)
352        : from_(from), to_(to), out_set_(out_set) { }
353    uc16 from() { return from_; }
354    uc16 to() { return to_; }
355    void set_to(uc16 value) { to_ = value; }
356    void AddValue(int value, Zone* zone) {
357      out_set_ = out_set_->Extend(value, zone);
358    }
359    OutSet* out_set() { return out_set_; }
360   private:
361    uc16 from_;
362    uc16 to_;
363    OutSet* out_set_;
364  };
365
366  class Config {
367   public:
368    typedef uc16 Key;
369    typedef Entry Value;
370    static const uc16 kNoKey;
371    static const Entry NoValue() { return Value(); }
372    static inline int Compare(uc16 a, uc16 b) {
373      if (a == b)
374        return 0;
375      else if (a < b)
376        return -1;
377      else
378        return 1;
379    }
380  };
381
382  void AddRange(CharacterRange range, int value, Zone* zone);
383  OutSet* Get(uc16 value);
384  void Dump();
385
386  template <typename Callback>
387  void ForEach(Callback* callback) {
388    return tree()->ForEach(callback);
389  }
390
391 private:
392  // There can't be a static empty set since it allocates its
393  // successors in a zone and caches them.
394  OutSet* empty() { return &empty_; }
395  OutSet empty_;
396  ZoneSplayTree<Config>* tree() { return &tree_; }
397  ZoneSplayTree<Config> tree_;
398};
399
400
401#define FOR_EACH_NODE_TYPE(VISIT)                                    \
402  VISIT(End)                                                         \
403  VISIT(Action)                                                      \
404  VISIT(Choice)                                                      \
405  VISIT(BackReference)                                               \
406  VISIT(Assertion)                                                   \
407  VISIT(Text)
408
409
410#define FOR_EACH_REG_EXP_TREE_TYPE(VISIT)                            \
411  VISIT(Disjunction)                                                 \
412  VISIT(Alternative)                                                 \
413  VISIT(Assertion)                                                   \
414  VISIT(CharacterClass)                                              \
415  VISIT(Atom)                                                        \
416  VISIT(Quantifier)                                                  \
417  VISIT(Capture)                                                     \
418  VISIT(Lookahead)                                                   \
419  VISIT(BackReference)                                               \
420  VISIT(Empty)                                                       \
421  VISIT(Text)
422
423
424#define FORWARD_DECLARE(Name) class RegExp##Name;
425FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
426#undef FORWARD_DECLARE
427
428
429class TextElement {
430 public:
431  enum TextType {UNINITIALIZED, ATOM, CHAR_CLASS};
432  TextElement() : text_type(UNINITIALIZED) { }
433  explicit TextElement(TextType t) : text_type(t), cp_offset(-1) { }
434  static TextElement Atom(RegExpAtom* atom);
435  static TextElement CharClass(RegExpCharacterClass* char_class);
436  int length();
437  TextType text_type;
438  union {
439    RegExpAtom* u_atom;
440    RegExpCharacterClass* u_char_class;
441  } data;
442  int cp_offset;
443};
444
445
446class Trace;
447
448
449struct NodeInfo {
450  NodeInfo()
451      : being_analyzed(false),
452        been_analyzed(false),
453        follows_word_interest(false),
454        follows_newline_interest(false),
455        follows_start_interest(false),
456        at_end(false),
457        visited(false),
458        replacement_calculated(false) { }
459
460  // Returns true if the interests and assumptions of this node
461  // matches the given one.
462  bool Matches(NodeInfo* that) {
463    return (at_end == that->at_end) &&
464           (follows_word_interest == that->follows_word_interest) &&
465           (follows_newline_interest == that->follows_newline_interest) &&
466           (follows_start_interest == that->follows_start_interest);
467  }
468
469  // Updates the interests of this node given the interests of the
470  // node preceding it.
471  void AddFromPreceding(NodeInfo* that) {
472    at_end |= that->at_end;
473    follows_word_interest |= that->follows_word_interest;
474    follows_newline_interest |= that->follows_newline_interest;
475    follows_start_interest |= that->follows_start_interest;
476  }
477
478  bool HasLookbehind() {
479    return follows_word_interest ||
480           follows_newline_interest ||
481           follows_start_interest;
482  }
483
484  // Sets the interests of this node to include the interests of the
485  // following node.
486  void AddFromFollowing(NodeInfo* that) {
487    follows_word_interest |= that->follows_word_interest;
488    follows_newline_interest |= that->follows_newline_interest;
489    follows_start_interest |= that->follows_start_interest;
490  }
491
492  void ResetCompilationState() {
493    being_analyzed = false;
494    been_analyzed = false;
495  }
496
497  bool being_analyzed: 1;
498  bool been_analyzed: 1;
499
500  // These bits are set of this node has to know what the preceding
501  // character was.
502  bool follows_word_interest: 1;
503  bool follows_newline_interest: 1;
504  bool follows_start_interest: 1;
505
506  bool at_end: 1;
507  bool visited: 1;
508  bool replacement_calculated: 1;
509};
510
511
512// Details of a quick mask-compare check that can look ahead in the
513// input stream.
514class QuickCheckDetails {
515 public:
516  QuickCheckDetails()
517      : characters_(0),
518        mask_(0),
519        value_(0),
520        cannot_match_(false) { }
521  explicit QuickCheckDetails(int characters)
522      : characters_(characters),
523        mask_(0),
524        value_(0),
525        cannot_match_(false) { }
526  bool Rationalize(bool ascii);
527  // Merge in the information from another branch of an alternation.
528  void Merge(QuickCheckDetails* other, int from_index);
529  // Advance the current position by some amount.
530  void Advance(int by, bool ascii);
531  void Clear();
532  bool cannot_match() { return cannot_match_; }
533  void set_cannot_match() { cannot_match_ = true; }
534  struct Position {
535    Position() : mask(0), value(0), determines_perfectly(false) { }
536    uc16 mask;
537    uc16 value;
538    bool determines_perfectly;
539  };
540  int characters() { return characters_; }
541  void set_characters(int characters) { characters_ = characters; }
542  Position* positions(int index) {
543    ASSERT(index >= 0);
544    ASSERT(index < characters_);
545    return positions_ + index;
546  }
547  uint32_t mask() { return mask_; }
548  uint32_t value() { return value_; }
549
550 private:
551  // How many characters do we have quick check information from.  This is
552  // the same for all branches of a choice node.
553  int characters_;
554  Position positions_[4];
555  // These values are the condensate of the above array after Rationalize().
556  uint32_t mask_;
557  uint32_t value_;
558  // If set to true, there is no way this quick check can match at all.
559  // E.g., if it requires to be at the start of the input, and isn't.
560  bool cannot_match_;
561};
562
563
564extern int kUninitializedRegExpNodePlaceHolder;
565
566
567class RegExpNode: public ZoneObject {
568 public:
569  explicit RegExpNode(Zone* zone)
570  : replacement_(NULL), trace_count_(0), zone_(zone) {
571    bm_info_[0] = bm_info_[1] = NULL;
572  }
573  virtual ~RegExpNode();
574  virtual void Accept(NodeVisitor* visitor) = 0;
575  // Generates a goto to this node or actually generates the code at this point.
576  virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
577  // How many characters must this node consume at a minimum in order to
578  // succeed.  If we have found at least 'still_to_find' characters that
579  // must be consumed there is no need to ask any following nodes whether
580  // they are sure to eat any more characters.  The not_at_start argument is
581  // used to indicate that we know we are not at the start of the input.  In
582  // this case anchored branches will always fail and can be ignored when
583  // determining how many characters are consumed on success.
584  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
585  // Emits some quick code that checks whether the preloaded characters match.
586  // Falls through on certain failure, jumps to the label on possible success.
587  // If the node cannot make a quick check it does nothing and returns false.
588  bool EmitQuickCheck(RegExpCompiler* compiler,
589                      Trace* trace,
590                      bool preload_has_checked_bounds,
591                      Label* on_possible_success,
592                      QuickCheckDetails* details_return,
593                      bool fall_through_on_failure);
594  // For a given number of characters this returns a mask and a value.  The
595  // next n characters are anded with the mask and compared with the value.
596  // A comparison failure indicates the node cannot match the next n characters.
597  // A comparison success indicates the node may match.
598  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
599                                    RegExpCompiler* compiler,
600                                    int characters_filled_in,
601                                    bool not_at_start) = 0;
602  static const int kNodeIsTooComplexForGreedyLoops = -1;
603  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
604  // Only returns the successor for a text node of length 1 that matches any
605  // character and that has no guards on it.
606  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
607      RegExpCompiler* compiler) {
608    return NULL;
609  }
610
611  // Collects information on the possible code units (mod 128) that can match if
612  // we look forward.  This is used for a Boyer-Moore-like string searching
613  // implementation.  TODO(erikcorry):  This should share more code with
614  // EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
615  // the number of nodes we are willing to look at in order to create this data.
616  static const int kRecursionBudget = 200;
617  virtual void FillInBMInfo(int offset,
618                            int budget,
619                            BoyerMooreLookahead* bm,
620                            bool not_at_start) {
621    UNREACHABLE();
622  }
623
624  // If we know that the input is ASCII then there are some nodes that can
625  // never match.  This method returns a node that can be substituted for
626  // itself, or NULL if the node can never match.
627  virtual RegExpNode* FilterASCII(int depth, bool ignore_case) { return this; }
628  // Helper for FilterASCII.
629  RegExpNode* replacement() {
630    ASSERT(info()->replacement_calculated);
631    return replacement_;
632  }
633  RegExpNode* set_replacement(RegExpNode* replacement) {
634    info()->replacement_calculated = true;
635    replacement_ =  replacement;
636    return replacement;  // For convenience.
637  }
638
639  // We want to avoid recalculating the lookahead info, so we store it on the
640  // node.  Only info that is for this node is stored.  We can tell that the
641  // info is for this node when offset == 0, so the information is calculated
642  // relative to this node.
643  void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
644    if (offset == 0) set_bm_info(not_at_start, bm);
645  }
646
647  Label* label() { return &label_; }
648  // If non-generic code is generated for a node (i.e. the node is not at the
649  // start of the trace) then it cannot be reused.  This variable sets a limit
650  // on how often we allow that to happen before we insist on starting a new
651  // trace and generating generic code for a node that can be reused by flushing
652  // the deferred actions in the current trace and generating a goto.
653  static const int kMaxCopiesCodeGenerated = 10;
654
655  NodeInfo* info() { return &info_; }
656
657  BoyerMooreLookahead* bm_info(bool not_at_start) {
658    return bm_info_[not_at_start ? 1 : 0];
659  }
660
661  Zone* zone() const { return zone_; }
662
663 protected:
664  enum LimitResult { DONE, CONTINUE };
665  RegExpNode* replacement_;
666
667  LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
668
669  void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
670    bm_info_[not_at_start ? 1 : 0] = bm;
671  }
672
673 private:
674  static const int kFirstCharBudget = 10;
675  Label label_;
676  NodeInfo info_;
677  // This variable keeps track of how many times code has been generated for
678  // this node (in different traces).  We don't keep track of where the
679  // generated code is located unless the code is generated at the start of
680  // a trace, in which case it is generic and can be reused by flushing the
681  // deferred operations in the current trace and generating a goto.
682  int trace_count_;
683  BoyerMooreLookahead* bm_info_[2];
684
685  Zone* zone_;
686};
687
688
689// A simple closed interval.
690class Interval {
691 public:
692  Interval() : from_(kNone), to_(kNone) { }
693  Interval(int from, int to) : from_(from), to_(to) { }
694  Interval Union(Interval that) {
695    if (that.from_ == kNone)
696      return *this;
697    else if (from_ == kNone)
698      return that;
699    else
700      return Interval(Min(from_, that.from_), Max(to_, that.to_));
701  }
702  bool Contains(int value) {
703    return (from_ <= value) && (value <= to_);
704  }
705  bool is_empty() { return from_ == kNone; }
706  int from() const { return from_; }
707  int to() const { return to_; }
708  static Interval Empty() { return Interval(); }
709  static const int kNone = -1;
710 private:
711  int from_;
712  int to_;
713};
714
715
716class SeqRegExpNode: public RegExpNode {
717 public:
718  explicit SeqRegExpNode(RegExpNode* on_success)
719      : RegExpNode(on_success->zone()), on_success_(on_success) { }
720  RegExpNode* on_success() { return on_success_; }
721  void set_on_success(RegExpNode* node) { on_success_ = node; }
722  virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
723  virtual void FillInBMInfo(int offset,
724                            int budget,
725                            BoyerMooreLookahead* bm,
726                            bool not_at_start) {
727    on_success_->FillInBMInfo(offset, budget - 1, bm, not_at_start);
728    if (offset == 0) set_bm_info(not_at_start, bm);
729  }
730
731 protected:
732  RegExpNode* FilterSuccessor(int depth, bool ignore_case);
733
734 private:
735  RegExpNode* on_success_;
736};
737
738
739class ActionNode: public SeqRegExpNode {
740 public:
741  enum ActionType {
742    SET_REGISTER,
743    INCREMENT_REGISTER,
744    STORE_POSITION,
745    BEGIN_SUBMATCH,
746    POSITIVE_SUBMATCH_SUCCESS,
747    EMPTY_MATCH_CHECK,
748    CLEAR_CAPTURES
749  };
750  static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
751  static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
752  static ActionNode* StorePosition(int reg,
753                                   bool is_capture,
754                                   RegExpNode* on_success);
755  static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
756  static ActionNode* BeginSubmatch(int stack_pointer_reg,
757                                   int position_reg,
758                                   RegExpNode* on_success);
759  static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
760                                             int restore_reg,
761                                             int clear_capture_count,
762                                             int clear_capture_from,
763                                             RegExpNode* on_success);
764  static ActionNode* EmptyMatchCheck(int start_register,
765                                     int repetition_register,
766                                     int repetition_limit,
767                                     RegExpNode* on_success);
768  virtual void Accept(NodeVisitor* visitor);
769  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
770  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
771  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
772                                    RegExpCompiler* compiler,
773                                    int filled_in,
774                                    bool not_at_start) {
775    return on_success()->GetQuickCheckDetails(
776        details, compiler, filled_in, not_at_start);
777  }
778  virtual void FillInBMInfo(int offset,
779                            int budget,
780                            BoyerMooreLookahead* bm,
781                            bool not_at_start);
782  ActionType action_type() { return action_type_; }
783  // TODO(erikcorry): We should allow some action nodes in greedy loops.
784  virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
785
786 private:
787  union {
788    struct {
789      int reg;
790      int value;
791    } u_store_register;
792    struct {
793      int reg;
794    } u_increment_register;
795    struct {
796      int reg;
797      bool is_capture;
798    } u_position_register;
799    struct {
800      int stack_pointer_register;
801      int current_position_register;
802      int clear_register_count;
803      int clear_register_from;
804    } u_submatch;
805    struct {
806      int start_register;
807      int repetition_register;
808      int repetition_limit;
809    } u_empty_match_check;
810    struct {
811      int range_from;
812      int range_to;
813    } u_clear_captures;
814  } data_;
815  ActionNode(ActionType action_type, RegExpNode* on_success)
816      : SeqRegExpNode(on_success),
817        action_type_(action_type) { }
818  ActionType action_type_;
819  friend class DotPrinter;
820};
821
822
823class TextNode: public SeqRegExpNode {
824 public:
825  TextNode(ZoneList<TextElement>* elms,
826           RegExpNode* on_success)
827      : SeqRegExpNode(on_success),
828        elms_(elms) { }
829  TextNode(RegExpCharacterClass* that,
830           RegExpNode* on_success)
831      : SeqRegExpNode(on_success),
832        elms_(new(zone()) ZoneList<TextElement>(1, zone())) {
833    elms_->Add(TextElement::CharClass(that), zone());
834  }
835  virtual void Accept(NodeVisitor* visitor);
836  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
837  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
838  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
839                                    RegExpCompiler* compiler,
840                                    int characters_filled_in,
841                                    bool not_at_start);
842  ZoneList<TextElement>* elements() { return elms_; }
843  void MakeCaseIndependent(bool is_ascii);
844  virtual int GreedyLoopTextLength();
845  virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
846      RegExpCompiler* compiler);
847  virtual void FillInBMInfo(int offset,
848                            int budget,
849                            BoyerMooreLookahead* bm,
850                            bool not_at_start);
851  void CalculateOffsets();
852  virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
853
854 private:
855  enum TextEmitPassType {
856    NON_ASCII_MATCH,             // Check for characters that can't match.
857    SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
858    NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
859    CASE_CHARACTER_MATCH,        // Case-independent single character check.
860    CHARACTER_CLASS_MATCH        // Character class.
861  };
862  static bool SkipPass(int pass, bool ignore_case);
863  static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
864  static const int kLastPass = CHARACTER_CLASS_MATCH;
865  void TextEmitPass(RegExpCompiler* compiler,
866                    TextEmitPassType pass,
867                    bool preloaded,
868                    Trace* trace,
869                    bool first_element_checked,
870                    int* checked_up_to);
871  int Length();
872  ZoneList<TextElement>* elms_;
873};
874
875
876class AssertionNode: public SeqRegExpNode {
877 public:
878  enum AssertionType {
879    AT_END,
880    AT_START,
881    AT_BOUNDARY,
882    AT_NON_BOUNDARY,
883    AFTER_NEWLINE
884  };
885  static AssertionNode* AtEnd(RegExpNode* on_success) {
886    return new(on_success->zone()) AssertionNode(AT_END, on_success);
887  }
888  static AssertionNode* AtStart(RegExpNode* on_success) {
889    return new(on_success->zone()) AssertionNode(AT_START, on_success);
890  }
891  static AssertionNode* AtBoundary(RegExpNode* on_success) {
892    return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
893  }
894  static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
895    return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
896  }
897  static AssertionNode* AfterNewline(RegExpNode* on_success) {
898    return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
899  }
900  virtual void Accept(NodeVisitor* visitor);
901  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
902  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
903  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
904                                    RegExpCompiler* compiler,
905                                    int filled_in,
906                                    bool not_at_start);
907  virtual void FillInBMInfo(int offset,
908                            int budget,
909                            BoyerMooreLookahead* bm,
910                            bool not_at_start);
911  AssertionType assertion_type() { return assertion_type_; }
912
913 private:
914  void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
915  enum IfPrevious { kIsNonWord, kIsWord };
916  void BacktrackIfPrevious(RegExpCompiler* compiler,
917                           Trace* trace,
918                           IfPrevious backtrack_if_previous);
919  AssertionNode(AssertionType t, RegExpNode* on_success)
920      : SeqRegExpNode(on_success), assertion_type_(t) { }
921  AssertionType assertion_type_;
922};
923
924
925class BackReferenceNode: public SeqRegExpNode {
926 public:
927  BackReferenceNode(int start_reg,
928                    int end_reg,
929                    RegExpNode* on_success)
930      : SeqRegExpNode(on_success),
931        start_reg_(start_reg),
932        end_reg_(end_reg) { }
933  virtual void Accept(NodeVisitor* visitor);
934  int start_register() { return start_reg_; }
935  int end_register() { return end_reg_; }
936  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
937  virtual int EatsAtLeast(int still_to_find,
938                          int recursion_depth,
939                          bool not_at_start);
940  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
941                                    RegExpCompiler* compiler,
942                                    int characters_filled_in,
943                                    bool not_at_start) {
944    return;
945  }
946  virtual void FillInBMInfo(int offset,
947                            int budget,
948                            BoyerMooreLookahead* bm,
949                            bool not_at_start);
950
951 private:
952  int start_reg_;
953  int end_reg_;
954};
955
956
957class EndNode: public RegExpNode {
958 public:
959  enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
960  explicit EndNode(Action action, Zone* zone)
961      : RegExpNode(zone), action_(action) { }
962  virtual void Accept(NodeVisitor* visitor);
963  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
964  virtual int EatsAtLeast(int still_to_find,
965                          int recursion_depth,
966                          bool not_at_start) { return 0; }
967  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
968                                    RegExpCompiler* compiler,
969                                    int characters_filled_in,
970                                    bool not_at_start) {
971    // Returning 0 from EatsAtLeast should ensure we never get here.
972    UNREACHABLE();
973  }
974  virtual void FillInBMInfo(int offset,
975                            int budget,
976                            BoyerMooreLookahead* bm,
977                            bool not_at_start) {
978    // Returning 0 from EatsAtLeast should ensure we never get here.
979    UNREACHABLE();
980  }
981
982 private:
983  Action action_;
984};
985
986
987class NegativeSubmatchSuccess: public EndNode {
988 public:
989  NegativeSubmatchSuccess(int stack_pointer_reg,
990                          int position_reg,
991                          int clear_capture_count,
992                          int clear_capture_start,
993                          Zone* zone)
994      : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
995        stack_pointer_register_(stack_pointer_reg),
996        current_position_register_(position_reg),
997        clear_capture_count_(clear_capture_count),
998        clear_capture_start_(clear_capture_start) { }
999  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1000
1001 private:
1002  int stack_pointer_register_;
1003  int current_position_register_;
1004  int clear_capture_count_;
1005  int clear_capture_start_;
1006};
1007
1008
1009class Guard: public ZoneObject {
1010 public:
1011  enum Relation { LT, GEQ };
1012  Guard(int reg, Relation op, int value)
1013      : reg_(reg),
1014        op_(op),
1015        value_(value) { }
1016  int reg() { return reg_; }
1017  Relation op() { return op_; }
1018  int value() { return value_; }
1019
1020 private:
1021  int reg_;
1022  Relation op_;
1023  int value_;
1024};
1025
1026
1027class GuardedAlternative {
1028 public:
1029  explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { }
1030  void AddGuard(Guard* guard, Zone* zone);
1031  RegExpNode* node() { return node_; }
1032  void set_node(RegExpNode* node) { node_ = node; }
1033  ZoneList<Guard*>* guards() { return guards_; }
1034
1035 private:
1036  RegExpNode* node_;
1037  ZoneList<Guard*>* guards_;
1038};
1039
1040
1041class AlternativeGeneration;
1042
1043
1044class ChoiceNode: public RegExpNode {
1045 public:
1046  explicit ChoiceNode(int expected_size, Zone* zone)
1047      : RegExpNode(zone),
1048        alternatives_(new(zone)
1049                      ZoneList<GuardedAlternative>(expected_size, zone)),
1050        table_(NULL),
1051        not_at_start_(false),
1052        being_calculated_(false) { }
1053  virtual void Accept(NodeVisitor* visitor);
1054  void AddAlternative(GuardedAlternative node) {
1055    alternatives()->Add(node, zone());
1056  }
1057  ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
1058  DispatchTable* GetTable(bool ignore_case);
1059  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1060  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1061  int EatsAtLeastHelper(int still_to_find,
1062                        int budget,
1063                        RegExpNode* ignore_this_node,
1064                        bool not_at_start);
1065  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1066                                    RegExpCompiler* compiler,
1067                                    int characters_filled_in,
1068                                    bool not_at_start);
1069  virtual void FillInBMInfo(int offset,
1070                            int budget,
1071                            BoyerMooreLookahead* bm,
1072                            bool not_at_start);
1073
1074  bool being_calculated() { return being_calculated_; }
1075  bool not_at_start() { return not_at_start_; }
1076  void set_not_at_start() { not_at_start_ = true; }
1077  void set_being_calculated(bool b) { being_calculated_ = b; }
1078  virtual bool try_to_emit_quick_check_for_alternative(int i) { return true; }
1079  virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
1080
1081 protected:
1082  int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
1083  ZoneList<GuardedAlternative>* alternatives_;
1084
1085 private:
1086  friend class DispatchTableConstructor;
1087  friend class Analysis;
1088  void GenerateGuard(RegExpMacroAssembler* macro_assembler,
1089                     Guard* guard,
1090                     Trace* trace);
1091  int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
1092  void EmitOutOfLineContinuation(RegExpCompiler* compiler,
1093                                 Trace* trace,
1094                                 GuardedAlternative alternative,
1095                                 AlternativeGeneration* alt_gen,
1096                                 int preload_characters,
1097                                 bool next_expects_preload);
1098  DispatchTable* table_;
1099  // If true, this node is never checked at the start of the input.
1100  // Allows a new trace to start with at_start() set to false.
1101  bool not_at_start_;
1102  bool being_calculated_;
1103};
1104
1105
1106class NegativeLookaheadChoiceNode: public ChoiceNode {
1107 public:
1108  explicit NegativeLookaheadChoiceNode(GuardedAlternative this_must_fail,
1109                                       GuardedAlternative then_do_this,
1110                                       Zone* zone)
1111      : ChoiceNode(2, zone) {
1112    AddAlternative(this_must_fail);
1113    AddAlternative(then_do_this);
1114  }
1115  virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1116  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1117                                    RegExpCompiler* compiler,
1118                                    int characters_filled_in,
1119                                    bool not_at_start);
1120  virtual void FillInBMInfo(int offset,
1121                            int budget,
1122                            BoyerMooreLookahead* bm,
1123                            bool not_at_start) {
1124    alternatives_->at(1).node()->FillInBMInfo(
1125        offset, budget - 1, bm, not_at_start);
1126    if (offset == 0) set_bm_info(not_at_start, bm);
1127  }
1128  // For a negative lookahead we don't emit the quick check for the
1129  // alternative that is expected to fail.  This is because quick check code
1130  // starts by loading enough characters for the alternative that takes fewest
1131  // characters, but on a negative lookahead the negative branch did not take
1132  // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1133  virtual bool try_to_emit_quick_check_for_alternative(int i) { return i != 0; }
1134  virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
1135};
1136
1137
1138class LoopChoiceNode: public ChoiceNode {
1139 public:
1140  explicit LoopChoiceNode(bool body_can_be_zero_length, Zone* zone)
1141      : ChoiceNode(2, zone),
1142        loop_node_(NULL),
1143        continue_node_(NULL),
1144        body_can_be_zero_length_(body_can_be_zero_length) { }
1145  void AddLoopAlternative(GuardedAlternative alt);
1146  void AddContinueAlternative(GuardedAlternative alt);
1147  virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1148  virtual int EatsAtLeast(int still_to_find,  int budget, bool not_at_start);
1149  virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1150                                    RegExpCompiler* compiler,
1151                                    int characters_filled_in,
1152                                    bool not_at_start);
1153  virtual void FillInBMInfo(int offset,
1154                            int budget,
1155                            BoyerMooreLookahead* bm,
1156                            bool not_at_start);
1157  RegExpNode* loop_node() { return loop_node_; }
1158  RegExpNode* continue_node() { return continue_node_; }
1159  bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1160  virtual void Accept(NodeVisitor* visitor);
1161  virtual RegExpNode* FilterASCII(int depth, bool ignore_case);
1162
1163 private:
1164  // AddAlternative is made private for loop nodes because alternatives
1165  // should not be added freely, we need to keep track of which node
1166  // goes back to the node itself.
1167  void AddAlternative(GuardedAlternative node) {
1168    ChoiceNode::AddAlternative(node);
1169  }
1170
1171  RegExpNode* loop_node_;
1172  RegExpNode* continue_node_;
1173  bool body_can_be_zero_length_;
1174};
1175
1176
1177// Improve the speed that we scan for an initial point where a non-anchored
1178// regexp can match by using a Boyer-Moore-like table. This is done by
1179// identifying non-greedy non-capturing loops in the nodes that eat any
1180// character one at a time.  For example in the middle of the regexp
1181// /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
1182// inserted at the start of any non-anchored regexp.
1183//
1184// When we have found such a loop we look ahead in the nodes to find the set of
1185// characters that can come at given distances. For example for the regexp
1186// /.?foo/ we know that there are at least 3 characters ahead of us, and the
1187// sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1188// the lookahead info where the set of characters is reasonably constrained. In
1189// our example this is from index 1 to 2 (0 is not constrained). We can now
1190// look 3 characters ahead and if we don't find one of [f, o] (the union of
1191// [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1192//
1193// For Unicode input strings we do the same, but modulo 128.
1194//
1195// We also look at the first string fed to the regexp and use that to get a hint
1196// of the character frequencies in the inputs. This affects the assessment of
1197// whether the set of characters is 'reasonably constrained'.
1198//
1199// We also have another lookahead mechanism (called quick check in the code),
1200// which uses a wide load of multiple characters followed by a mask and compare
1201// to determine whether a match is possible at this point.
1202enum ContainedInLattice {
1203  kNotYet = 0,
1204  kLatticeIn = 1,
1205  kLatticeOut = 2,
1206  kLatticeUnknown = 3  // Can also mean both in and out.
1207};
1208
1209
1210inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1211  return static_cast<ContainedInLattice>(a | b);
1212}
1213
1214
1215ContainedInLattice AddRange(ContainedInLattice a,
1216                            const int* ranges,
1217                            int ranges_size,
1218                            Interval new_range);
1219
1220
1221class BoyerMoorePositionInfo : public ZoneObject {
1222 public:
1223  explicit BoyerMoorePositionInfo(Zone* zone)
1224      : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1225        map_count_(0),
1226        w_(kNotYet),
1227        s_(kNotYet),
1228        d_(kNotYet),
1229        surrogate_(kNotYet) {
1230     for (int i = 0; i < kMapSize; i++) {
1231       map_->Add(false, zone);
1232     }
1233  }
1234
1235  bool& at(int i) { return map_->at(i); }
1236
1237  static const int kMapSize = 128;
1238  static const int kMask = kMapSize - 1;
1239
1240  int map_count() const { return map_count_; }
1241
1242  void Set(int character);
1243  void SetInterval(const Interval& interval);
1244  void SetAll();
1245  bool is_non_word() { return w_ == kLatticeOut; }
1246  bool is_word() { return w_ == kLatticeIn; }
1247
1248 private:
1249  ZoneList<bool>* map_;
1250  int map_count_;  // Number of set bits in the map.
1251  ContainedInLattice w_;  // The \w character class.
1252  ContainedInLattice s_;  // The \s character class.
1253  ContainedInLattice d_;  // The \d character class.
1254  ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
1255};
1256
1257
1258class BoyerMooreLookahead : public ZoneObject {
1259 public:
1260  BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1261
1262  int length() { return length_; }
1263  int max_char() { return max_char_; }
1264  RegExpCompiler* compiler() { return compiler_; }
1265
1266  int Count(int map_number) {
1267    return bitmaps_->at(map_number)->map_count();
1268  }
1269
1270  BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1271
1272  void Set(int map_number, int character) {
1273    if (character > max_char_) return;
1274    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1275    info->Set(character);
1276  }
1277
1278  void SetInterval(int map_number, const Interval& interval) {
1279    if (interval.from() > max_char_) return;
1280    BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1281    if (interval.to() > max_char_) {
1282      info->SetInterval(Interval(interval.from(), max_char_));
1283    } else {
1284      info->SetInterval(interval);
1285    }
1286  }
1287
1288  void SetAll(int map_number) {
1289    bitmaps_->at(map_number)->SetAll();
1290  }
1291
1292  void SetRest(int from_map) {
1293    for (int i = from_map; i < length_; i++) SetAll(i);
1294  }
1295  bool EmitSkipInstructions(RegExpMacroAssembler* masm);
1296
1297 private:
1298  // This is the value obtained by EatsAtLeast.  If we do not have at least this
1299  // many characters left in the sample string then the match is bound to fail.
1300  // Therefore it is OK to read a character this far ahead of the current match
1301  // point.
1302  int length_;
1303  RegExpCompiler* compiler_;
1304  // 0x7f for ASCII, 0xffff for UTF-16.
1305  int max_char_;
1306  ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1307
1308  int GetSkipTable(int min_lookahead,
1309                   int max_lookahead,
1310                   Handle<ByteArray> boolean_skip_table);
1311  bool FindWorthwhileInterval(int* from, int* to);
1312  int FindBestInterval(
1313    int max_number_of_chars, int old_biggest_points, int* from, int* to);
1314};
1315
1316
1317// There are many ways to generate code for a node.  This class encapsulates
1318// the current way we should be generating.  In other words it encapsulates
1319// the current state of the code generator.  The effect of this is that we
1320// generate code for paths that the matcher can take through the regular
1321// expression.  A given node in the regexp can be code-generated several times
1322// as it can be part of several traces.  For example for the regexp:
1323// /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1324// of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
1325// to match foo is generated only once (the traces have a common prefix).  The
1326// code to store the capture is deferred and generated (twice) after the places
1327// where baz has been matched.
1328class Trace {
1329 public:
1330  // A value for a property that is either known to be true, know to be false,
1331  // or not known.
1332  enum TriBool {
1333    UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1334  };
1335
1336  class DeferredAction {
1337   public:
1338    DeferredAction(ActionNode::ActionType action_type, int reg)
1339        : action_type_(action_type), reg_(reg), next_(NULL) { }
1340    DeferredAction* next() { return next_; }
1341    bool Mentions(int reg);
1342    int reg() { return reg_; }
1343    ActionNode::ActionType action_type() { return action_type_; }
1344   private:
1345    ActionNode::ActionType action_type_;
1346    int reg_;
1347    DeferredAction* next_;
1348    friend class Trace;
1349  };
1350
1351  class DeferredCapture : public DeferredAction {
1352   public:
1353    DeferredCapture(int reg, bool is_capture, Trace* trace)
1354        : DeferredAction(ActionNode::STORE_POSITION, reg),
1355          cp_offset_(trace->cp_offset()),
1356          is_capture_(is_capture) { }
1357    int cp_offset() { return cp_offset_; }
1358    bool is_capture() { return is_capture_; }
1359   private:
1360    int cp_offset_;
1361    bool is_capture_;
1362    void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1363  };
1364
1365  class DeferredSetRegister : public DeferredAction {
1366   public:
1367    DeferredSetRegister(int reg, int value)
1368        : DeferredAction(ActionNode::SET_REGISTER, reg),
1369          value_(value) { }
1370    int value() { return value_; }
1371   private:
1372    int value_;
1373  };
1374
1375  class DeferredClearCaptures : public DeferredAction {
1376   public:
1377    explicit DeferredClearCaptures(Interval range)
1378        : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1379          range_(range) { }
1380    Interval range() { return range_; }
1381   private:
1382    Interval range_;
1383  };
1384
1385  class DeferredIncrementRegister : public DeferredAction {
1386   public:
1387    explicit DeferredIncrementRegister(int reg)
1388        : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1389  };
1390
1391  Trace()
1392      : cp_offset_(0),
1393        actions_(NULL),
1394        backtrack_(NULL),
1395        stop_node_(NULL),
1396        loop_label_(NULL),
1397        characters_preloaded_(0),
1398        bound_checked_up_to_(0),
1399        flush_budget_(100),
1400        at_start_(UNKNOWN) { }
1401
1402  // End the trace.  This involves flushing the deferred actions in the trace
1403  // and pushing a backtrack location onto the backtrack stack.  Once this is
1404  // done we can start a new trace or go to one that has already been
1405  // generated.
1406  void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1407  int cp_offset() { return cp_offset_; }
1408  DeferredAction* actions() { return actions_; }
1409  // A trivial trace is one that has no deferred actions or other state that
1410  // affects the assumptions used when generating code.  There is no recorded
1411  // backtrack location in a trivial trace, so with a trivial trace we will
1412  // generate code that, on a failure to match, gets the backtrack location
1413  // from the backtrack stack rather than using a direct jump instruction.  We
1414  // always start code generation with a trivial trace and non-trivial traces
1415  // are created as we emit code for nodes or add to the list of deferred
1416  // actions in the trace.  The location of the code generated for a node using
1417  // a trivial trace is recorded in a label in the node so that gotos can be
1418  // generated to that code.
1419  bool is_trivial() {
1420    return backtrack_ == NULL &&
1421           actions_ == NULL &&
1422           cp_offset_ == 0 &&
1423           characters_preloaded_ == 0 &&
1424           bound_checked_up_to_ == 0 &&
1425           quick_check_performed_.characters() == 0 &&
1426           at_start_ == UNKNOWN;
1427  }
1428  TriBool at_start() { return at_start_; }
1429  void set_at_start(bool at_start) {
1430    at_start_ = at_start ? TRUE_VALUE : FALSE_VALUE;
1431  }
1432  Label* backtrack() { return backtrack_; }
1433  Label* loop_label() { return loop_label_; }
1434  RegExpNode* stop_node() { return stop_node_; }
1435  int characters_preloaded() { return characters_preloaded_; }
1436  int bound_checked_up_to() { return bound_checked_up_to_; }
1437  int flush_budget() { return flush_budget_; }
1438  QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1439  bool mentions_reg(int reg);
1440  // Returns true if a deferred position store exists to the specified
1441  // register and stores the offset in the out-parameter.  Otherwise
1442  // returns false.
1443  bool GetStoredPosition(int reg, int* cp_offset);
1444  // These set methods and AdvanceCurrentPositionInTrace should be used only on
1445  // new traces - the intention is that traces are immutable after creation.
1446  void add_action(DeferredAction* new_action) {
1447    ASSERT(new_action->next_ == NULL);
1448    new_action->next_ = actions_;
1449    actions_ = new_action;
1450  }
1451  void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1452  void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1453  void set_loop_label(Label* label) { loop_label_ = label; }
1454  void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1455  void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1456  void set_flush_budget(int to) { flush_budget_ = to; }
1457  void set_quick_check_performed(QuickCheckDetails* d) {
1458    quick_check_performed_ = *d;
1459  }
1460  void InvalidateCurrentCharacter();
1461  void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1462
1463 private:
1464  int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1465  void PerformDeferredActions(RegExpMacroAssembler* macro,
1466                              int max_register,
1467                              OutSet& affected_registers,
1468                              OutSet* registers_to_pop,
1469                              OutSet* registers_to_clear,
1470                              Zone* zone);
1471  void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1472                                int max_register,
1473                                OutSet& registers_to_pop,
1474                                OutSet& registers_to_clear);
1475  int cp_offset_;
1476  DeferredAction* actions_;
1477  Label* backtrack_;
1478  RegExpNode* stop_node_;
1479  Label* loop_label_;
1480  int characters_preloaded_;
1481  int bound_checked_up_to_;
1482  QuickCheckDetails quick_check_performed_;
1483  int flush_budget_;
1484  TriBool at_start_;
1485};
1486
1487
1488class NodeVisitor {
1489 public:
1490  virtual ~NodeVisitor() { }
1491#define DECLARE_VISIT(Type)                                          \
1492  virtual void Visit##Type(Type##Node* that) = 0;
1493FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1494#undef DECLARE_VISIT
1495  virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1496};
1497
1498
1499// Node visitor used to add the start set of the alternatives to the
1500// dispatch table of a choice node.
1501class DispatchTableConstructor: public NodeVisitor {
1502 public:
1503  DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1504                           Zone* zone)
1505      : table_(table),
1506        choice_index_(-1),
1507        ignore_case_(ignore_case),
1508        zone_(zone) { }
1509
1510  void BuildTable(ChoiceNode* node);
1511
1512  void AddRange(CharacterRange range) {
1513    table()->AddRange(range, choice_index_, zone_);
1514  }
1515
1516  void AddInverse(ZoneList<CharacterRange>* ranges);
1517
1518#define DECLARE_VISIT(Type)                                          \
1519  virtual void Visit##Type(Type##Node* that);
1520FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1521#undef DECLARE_VISIT
1522
1523  DispatchTable* table() { return table_; }
1524  void set_choice_index(int value) { choice_index_ = value; }
1525
1526 protected:
1527  DispatchTable* table_;
1528  int choice_index_;
1529  bool ignore_case_;
1530  Zone* zone_;
1531};
1532
1533
1534// Assertion propagation moves information about assertions such as
1535// \b to the affected nodes.  For instance, in /.\b./ information must
1536// be propagated to the first '.' that whatever follows needs to know
1537// if it matched a word or a non-word, and to the second '.' that it
1538// has to check if it succeeds a word or non-word.  In this case the
1539// result will be something like:
1540//
1541//   +-------+        +------------+
1542//   |   .   |        |      .     |
1543//   +-------+  --->  +------------+
1544//   | word? |        | check word |
1545//   +-------+        +------------+
1546class Analysis: public NodeVisitor {
1547 public:
1548  Analysis(bool ignore_case, bool is_ascii)
1549      : ignore_case_(ignore_case),
1550        is_ascii_(is_ascii),
1551        error_message_(NULL) { }
1552  void EnsureAnalyzed(RegExpNode* node);
1553
1554#define DECLARE_VISIT(Type)                                          \
1555  virtual void Visit##Type(Type##Node* that);
1556FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1557#undef DECLARE_VISIT
1558  virtual void VisitLoopChoice(LoopChoiceNode* that);
1559
1560  bool has_failed() { return error_message_ != NULL; }
1561  const char* error_message() {
1562    ASSERT(error_message_ != NULL);
1563    return error_message_;
1564  }
1565  void fail(const char* error_message) {
1566    error_message_ = error_message;
1567  }
1568
1569 private:
1570  bool ignore_case_;
1571  bool is_ascii_;
1572  const char* error_message_;
1573
1574  DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1575};
1576
1577
1578struct RegExpCompileData {
1579  RegExpCompileData()
1580    : tree(NULL),
1581      node(NULL),
1582      simple(true),
1583      contains_anchor(false),
1584      capture_count(0) { }
1585  RegExpTree* tree;
1586  RegExpNode* node;
1587  bool simple;
1588  bool contains_anchor;
1589  Handle<String> error;
1590  int capture_count;
1591};
1592
1593
1594class RegExpEngine: public AllStatic {
1595 public:
1596  struct CompilationResult {
1597    explicit CompilationResult(const char* error_message)
1598        : error_message(error_message),
1599          code(HEAP->the_hole_value()),
1600          num_registers(0) {}
1601    CompilationResult(Object* code, int registers)
1602      : error_message(NULL),
1603        code(code),
1604        num_registers(registers) {}
1605    const char* error_message;
1606    Object* code;
1607    int num_registers;
1608  };
1609
1610  static CompilationResult Compile(RegExpCompileData* input,
1611                                   bool ignore_case,
1612                                   bool global,
1613                                   bool multiline,
1614                                   Handle<String> pattern,
1615                                   Handle<String> sample_subject,
1616                                   bool is_ascii, Zone* zone);
1617
1618  static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1619};
1620
1621
1622} }  // namespace v8::internal
1623
1624#endif  // V8_JSREGEXP_H_
1625