12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2007 The RE2 Authors.  All Rights Reserved.
22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style
32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file.
42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiled representation of regular expressions.
62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// See regexp.h for the Regexp class, which represents a regular
72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// expression symbolically.
82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#ifndef RE2_PROG_H__
102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#define RE2_PROG_H__
112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h"
132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/re2.h"
142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 {
162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Simple fixed-size bitmap.
182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsontemplate<int Bits>
192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Bitmap {
202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public:
212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Bitmap() { Reset(); }
222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int Size() { return Bits; }
232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Reset() {
252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (int i = 0; i < Words; i++)
262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      w_[i] = 0;
272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool Get(int k) const {
292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return w_[k >> WordLog] & (1<<(k & 31));
302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Set(int k) {
322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    w_[k >> WordLog] |= 1<<(k & 31);
332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Clear(int k) {
352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    w_[k >> WordLog] &= ~(1<<(k & 31));
362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint32 Word(int i) const {
382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return w_[i];
392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private:
422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static const int WordLog = 5;
432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static const int Words = (Bits+31)/32;
442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint32 w_[Words];
452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Opcodes for Inst
502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum InstOp {
512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kInstAlt = 0,      // choose between out_ and out1_
522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kInstCapture,      // capturing parenthesis number cap_
552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kInstMatch,        // found a match!
572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kInstNop,          // no-op; occasionally unavoidable
582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kInstFail,         // never match; occasionally unavoidable
592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Bit flags for empty-width specials
622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum EmptyOp {
632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kEmptyEndLine          = 1<<1,      // $ - end of line
652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kEmptyBeginText        = 1<<2,      // \A - beginning of text
662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kEmptyEndText          = 1<<3,      // \z - end of text
672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kEmptyWordBoundary     = 1<<4,      // \b - word boundary
682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  kEmptyAllFlags         = (1<<6)-1,
702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Regexp;
732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass DFA;
752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct OneState;
762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiled form of regexp program.
782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Prog {
792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public:
802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Prog();
812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  ~Prog();
822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Single instruction in regexp program.
842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  class Inst {
852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson   public:
862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Inst() : out_opcode_(0), out1_(0) { }
872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Constructors per opcode
892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void InitAlt(uint32 out, uint32 out1);
902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void InitByteRange(int lo, int hi, int foldcase, uint32 out);
912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void InitCapture(int cap, uint32 out);
922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void InitEmptyWidth(EmptyOp empty, uint32 out);
932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void InitMatch(int id);
942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void InitNop(uint32 out);
952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void InitFail();
962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Getters
982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id(Prog* p) { return this - p->inst_; }
992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int out()     { return out_opcode_>>3; }
1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int out1()    { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    bool greedy(Prog *p) {
1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      DCHECK_EQ(opcode(), kInstAltMatch);
1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return p->inst(out())->opcode() == kInstByteRange;
1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Does this inst (an kInstByteRange) match c?
1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    inline bool Matches(int c) {
1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      DCHECK_EQ(opcode(), kInstByteRange);
1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (foldcase_ && 'A' <= c && c <= 'Z')
1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        c += 'a' - 'A';
1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return lo_ <= c && c <= hi_;
1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Returns string representation for debugging.
1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    string Dump();
1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Maximum instruction id.
1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // (Must fit in out_opcode_, and PatchList steals another bit.)
1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    static const int kMaxInst = (1<<28) - 1;
1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson   private:
1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void set_opcode(InstOp opcode) {
1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      out_opcode_ = (out()<<3) | opcode;
1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void set_out(int out) {
1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      out_opcode_ = (out<<3) | opcode();
1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    void set_out_opcode(int out, InstOp opcode) {
1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      out_opcode_ = (out<<3) | opcode;
1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    uint32 out_opcode_;  // 29 bits of out, 3 (low) bits opcode
1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    union {              // additional instruction arguments:
1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      uint32 out1_;      // opcode == kInstAlt
1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                         //   alternate next instruction
1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      int32 cap_;        // opcode == kInstCapture
1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                         //   Index of capture register (holds text
1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                         //   position recorded by capturing parentheses).
1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                         //   For \n (the submatch for the nth parentheses),
1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                         //   the left parenthesis captures into register 2*n
1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                         //   and the right one captures into register 2*n+1.
1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      int32 match_id_;   // opcode == kInstMatch
1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                         //   Match ID to identify this match (for re2::Set).
1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      struct {           // opcode == kInstByteRange
1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        uint8 lo_;       //   byte range is lo_-hi_ inclusive
1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        uint8 hi_;       //
1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        uint8 foldcase_; //   convert A-Z to a-z before checking range.
1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      };
1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      EmptyOp empty_;    // opcode == kInstEmptyWidth
1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                         //   empty_ is bitwise OR of kEmpty* flags above.
1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    };
1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    friend class Compiler;
1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    friend struct PatchList;
1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    friend class Prog;
1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    DISALLOW_EVIL_CONSTRUCTORS(Inst);
1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  };
1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Whether to anchor the search.
1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  enum Anchor {
1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    kUnanchored,  // match anywhere
1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    kAnchored,    // match only starting at beginning of text
1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  };
1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Kind of match to look for (for anchor != kFullMatch)
1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //
1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // kLongestMatch mode finds the overall longest
1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // match but still makes its submatch choices the way
1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Perl would, not in the way prescribed by POSIX.
1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // The POSIX rules are much more expensive to implement,
1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // and no one has needed them.
1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //
1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // kFullMatch is not strictly necessary -- we could use
1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // kLongestMatch and then check the length of the match -- but
1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // the matching code can run faster if it knows to consider only
1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // full matches.
1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  enum MatchKind {
1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    kFirstMatch,     // like Perl, PCRE
1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    kLongestMatch,   // like egrep or POSIX
1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    kFullMatch,      // match only entire text; implies anchor==kAnchored
1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    kManyMatch       // for SearchDFA, records set of matches
1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  };
1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Inst *inst(int id) { return &inst_[id]; }
1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int start() { return start_; }
2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int start_unanchored() { return start_unanchored_; }
2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_start(int start) { start_ = start; }
2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_start_unanchored(int start) { start_unanchored_ = start; }
2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int64 size() { return size_; }
2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool reversed() { return reversed_; }
2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_reversed(bool reversed) { reversed_ = reversed; }
2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int64 byte_inst_count() { return byte_inst_count_; }
2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const Bitmap<256>& byterange() { return byterange_; }
2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int64 dfa_mem() { return dfa_mem_; }
2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int flags() { return flags_; }
2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_flags(int flags) { flags_ = flags; }
2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool anchor_start() { return anchor_start_; }
2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_anchor_start(bool b) { anchor_start_ = b; }
2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool anchor_end() { return anchor_end_; }
2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void set_anchor_end(bool b) { anchor_end_ = b; }
2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int bytemap_range() { return bytemap_range_; }
2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const uint8* bytemap() { return bytemap_; }
2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns string representation of program for debugging.
2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string Dump();
2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string DumpUnanchored();
2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Record that at some point in the prog, the bytes in the range
2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // lo-hi (inclusive) are treated as different from bytes outside the range.
2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Tracking this lets the DFA collapse commonly-treated byte ranges
2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // when recording state pointers, greatly reducing its memory footprint.
2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void MarkByteRange(int lo, int hi);
2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns the set of kEmpty flags that are in effect at
2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // position p within context.
2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static uint32 EmptyFlags(const StringPiece& context, const char* p);
2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns whether byte c is a word character: ASCII only.
2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Used by the implementation of \b and \B.
2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // This is not right for Unicode, but:
2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //   - it's hard to get right in a byte-at-a-time matching world
2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //     (the DFA has only one-byte lookahead).
2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //   - even if the lookahead were possible, the Progs would be huge.
2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // This crude approximation is the same one PCRE uses.
2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static bool IsWordChar(uint8 c) {
2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return ('A' <= c && c <= 'Z') ||
2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson           ('a' <= c && c <= 'z') ||
2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson           ('0' <= c && c <= '9') ||
2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson           c == '_';
2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Execution engines.  They all search for the regexp (run the prog)
2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // in text, which is in the larger context (used for ^ $ \b etc).
2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Anchor and kind control the kind of search.
2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns true if match found, false if not.
2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // If match found, fills match[0..nmatch-1] with submatch info.
2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // match[0] is overall match, match[1] is first set of parens, etc.
2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // If a particular submatch is not matched during the regexp match,
2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // it is set to NULL.
2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //
2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Matching text == StringPiece(NULL, 0) is treated as any other empty
2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // string, but note that on return, it will not be possible to distinguish
2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // submatches that matched that empty string from submatches that didn't
2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // match anything.  Either way, match[i] == NULL.
2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Search using NFA: can find submatches but kind of slow.
2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool SearchNFA(const StringPiece& text, const StringPiece& context,
2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                 Anchor anchor, MatchKind kind,
2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                 StringPiece* match, int nmatch);
2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Search using DFA: much faster than NFA but only finds
2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // end of match and can use a lot more memory.
2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns whether a match was found.
2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // If the DFA runs out of memory, sets *failed to true and returns false.
2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // If matches != NULL and kind == kManyMatch and there is a match,
2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // SearchDFA fills matches with the match IDs of the final matching state.
2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool SearchDFA(const StringPiece& text, const StringPiece& context,
2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                 Anchor anchor, MatchKind kind,
2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                 StringPiece* match0, bool* failed,
2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                 vector<int>* matches);
2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Build the entire DFA for the given match kind.  FOR TESTING ONLY.
2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Usually the DFA is built out incrementally, as needed, which
2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // avoids lots of unnecessary work.  This function is useful only
2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // for testing purposes.  Returns number of states.
2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int BuildEntireDFA(MatchKind kind);
2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Compute byte map.
2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void ComputeByteMap();
2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Run peep-hole optimizer on program.
2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Optimize();
2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // One-pass NFA: only correct if IsOnePass() is true,
2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // but much faster than NFA (competitive with PCRE)
2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // for those expressions.
2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool IsOnePass();
2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool SearchOnePass(const StringPiece& text, const StringPiece& context,
2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                     Anchor anchor, MatchKind kind,
2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                     StringPiece* match, int nmatch);
2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Bit-state backtracking.  Fast on small cases but uses memory
2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // proportional to the product of the program size and the text size.
2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool SearchBitState(const StringPiece& text, const StringPiece& context,
3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                      Anchor anchor, MatchKind kind,
3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                      StringPiece* match, int nmatch);
3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static const int kMaxOnePassCapture = 5;  // $0 through $4
3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Backtracking search: the gold standard against which the other
3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // implementations are checked.  FOR TESTING ONLY.
3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // It allocates a ton of memory to avoid running forever.
3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // It is also recursive, so can't use in production (will overflow stacks).
3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // The name "Unsafe" here is supposed to be a flag that
3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // you should not be using this function.
3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool UnsafeSearchBacktrack(const StringPiece& text,
3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                             const StringPiece& context,
3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                             Anchor anchor, MatchKind kind,
3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                             StringPiece* match, int nmatch);
3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Computes range for any strings matching regexp. The min and max can in
3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // some cases be arbitrarily precise, so the caller gets to specify the
3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // maximum desired length of string returned.
3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //
3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // string s that is an anchored match for this regexp satisfies
3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //   min <= s && s <= max.
3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //
3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Note that PossibleMatchRange() will only consider the first copy of an
3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // infinitely repeated element (i.e., any regexp element followed by a '*' or
3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // '+' operator). Regexps with "{N}" constructions are not affected, as those
3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // do not compile down to infinite repetitions.
3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //
3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns true on success, false on error.
3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool PossibleMatchRange(string* min, string* max, int maxlen);
3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Compiles a collection of regexps to Prog.  Each regexp will have
3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // its own Match instruction recording the index in the vector.
3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                          Regexp* re);
3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private:
3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  friend class Compiler;
3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DFA* GetDFA(MatchKind kind);
3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool anchor_start_;       // regexp has explicit start anchor
3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool anchor_end_;         // regexp has explicit end anchor
3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool reversed_;           // whether program runs backward over input
3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool did_onepass_;        // has IsOnePass been called?
3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int start_;               // entry point for program
3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int start_unanchored_;    // unanchored entry point for program
3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int size_;                // number of instructions
3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int byte_inst_count_;     // number of kInstByteRange instructions
3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int bytemap_range_;       // bytemap_[x] < bytemap_range_
3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int flags_;               // regexp parse flags
3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int onepass_statesize_;   // byte size of each OneState* node
3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Inst* inst_;              // pointer to instruction array
3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Mutex dfa_mutex_;    // Protects dfa_first_, dfa_longest_
3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DFA* volatile dfa_first_;     // DFA cached for kFirstMatch
3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DFA* volatile dfa_longest_;   // DFA cached for kLongestMatch and kFullMatch
3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int64 dfa_mem_;      // Maximum memory for DFAs.
3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void (*delete_dfa_)(DFA* dfa);
3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Bitmap<256> byterange_;    // byterange.Get(x) true if x ends a
3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                             // commonly-treated byte range.
3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint8 bytemap_[256];       // map from input bytes to byte classes
3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint8 *unbytemap_;         // bytemap_[unbytemap_[x]] == x
3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint8* onepass_nodes_;     // data for OnePass nodes
3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  OneState* onepass_start_;  // start node for OnePass program
3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DISALLOW_EVIL_CONSTRUCTORS(Prog);
3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}  // namespace re2
3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#endif  // RE2_PROG_H__
377