15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2007 The RE2 Authors.  All Rights Reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compiled representation of regular expressions.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// See regexp.h for the Regexp class, which represents a regular
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// expression symbolically.
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef RE2_PROG_H__
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define RE2_PROG_H__
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/util.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/re2.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Simple fixed-size bitmap.
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)template<int Bits>
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Bitmap {
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Bitmap() { Reset(); }
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int Size() { return Bits; }
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Reset() {
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (int i = 0; i < Words; i++)
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      w_[i] = 0;
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool Get(int k) const {
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return w_[k >> WordLog] & (1<<(k & 31));
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Set(int k) {
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    w_[k >> WordLog] |= 1<<(k & 31);
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Clear(int k) {
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    w_[k >> WordLog] &= ~(1<<(k & 31));
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 Word(int i) const {
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return w_[i];
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int WordLog = 5;
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int Words = (Bits+31)/32;
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 w_[Words];
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Opcodes for Inst
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)enum InstOp {
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kInstAlt = 0,      // choose between out_ and out1_
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kInstCapture,      // capturing parenthesis number cap_
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kInstMatch,        // found a match!
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kInstNop,          // no-op; occasionally unavoidable
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kInstFail,         // never match; occasionally unavoidable
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Bit flags for empty-width specials
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)enum EmptyOp {
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kEmptyEndLine          = 1<<1,      // $ - end of line
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kEmptyBeginText        = 1<<2,      // \A - beginning of text
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kEmptyEndText          = 1<<3,      // \z - end of text
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kEmptyWordBoundary     = 1<<4,      // \b - word boundary
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kEmptyAllFlags         = (1<<6)-1,
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Regexp;
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class DFA;
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct OneState;
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compiled form of regexp program.
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Prog {
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog();
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~Prog();
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Single instruction in regexp program.
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  class Inst {
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   public:
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Inst() : out_opcode_(0), out1_(0) { }
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Constructors per opcode
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void InitAlt(uint32 out, uint32 out1);
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void InitByteRange(int lo, int hi, int foldcase, uint32 out);
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void InitCapture(int cap, uint32 out);
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void InitEmptyWidth(EmptyOp empty, uint32 out);
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void InitMatch(int id);
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void InitNop(uint32 out);
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void InitFail();
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Getters
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int id(Prog* p) { return this - p->inst_; }
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int out()     { return out_opcode_>>3; }
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int out1()    { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    bool greedy(Prog *p) {
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      DCHECK_EQ(opcode(), kInstAltMatch);
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return p->inst(out())->opcode() == kInstByteRange;
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Does this inst (an kInstByteRange) match c?
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inline bool Matches(int c) {
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      DCHECK_EQ(opcode(), kInstByteRange);
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (foldcase_ && 'A' <= c && c <= 'Z')
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        c += 'a' - 'A';
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return lo_ <= c && c <= hi_;
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Returns string representation for debugging.
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    string Dump();
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Maximum instruction id.
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // (Must fit in out_opcode_, and PatchList steals another bit.)
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    static const int kMaxInst = (1<<28) - 1;
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   private:
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void set_opcode(InstOp opcode) {
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      out_opcode_ = (out()<<3) | opcode;
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void set_out(int out) {
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      out_opcode_ = (out<<3) | opcode();
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void set_out_opcode(int out, InstOp opcode) {
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      out_opcode_ = (out<<3) | opcode;
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint32 out_opcode_;  // 29 bits of out, 3 (low) bits opcode
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    union {              // additional instruction arguments:
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      uint32 out1_;      // opcode == kInstAlt
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         //   alternate next instruction
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      int32 cap_;        // opcode == kInstCapture
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         //   Index of capture register (holds text
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         //   position recorded by capturing parentheses).
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         //   For \n (the submatch for the nth parentheses),
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         //   the left parenthesis captures into register 2*n
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         //   and the right one captures into register 2*n+1.
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      int32 match_id_;   // opcode == kInstMatch
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         //   Match ID to identify this match (for re2::Set).
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      struct {           // opcode == kInstByteRange
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        uint8 lo_;       //   byte range is lo_-hi_ inclusive
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        uint8 hi_;       //
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        uint8 foldcase_; //   convert A-Z to a-z before checking range.
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      };
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      EmptyOp empty_;    // opcode == kInstEmptyWidth
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         //   empty_ is bitwise OR of kEmpty* flags above.
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    };
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    friend class Compiler;
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    friend struct PatchList;
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    friend class Prog;
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DISALLOW_EVIL_CONSTRUCTORS(Inst);
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Whether to anchor the search.
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  enum Anchor {
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    kUnanchored,  // match anywhere
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    kAnchored,    // match only starting at beginning of text
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Kind of match to look for (for anchor != kFullMatch)
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // kLongestMatch mode finds the overall longest
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // match but still makes its submatch choices the way
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Perl would, not in the way prescribed by POSIX.
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The POSIX rules are much more expensive to implement,
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and no one has needed them.
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // kFullMatch is not strictly necessary -- we could use
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // kLongestMatch and then check the length of the match -- but
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the matching code can run faster if it knows to consider only
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // full matches.
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  enum MatchKind {
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    kFirstMatch,     // like Perl, PCRE
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    kLongestMatch,   // like egrep or POSIX
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    kFullMatch,      // match only entire text; implies anchor==kAnchored
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    kManyMatch       // for SearchDFA, records set of matches
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Inst *inst(int id) { return &inst_[id]; }
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int start() { return start_; }
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int start_unanchored() { return start_unanchored_; }
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void set_start(int start) { start_ = start; }
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void set_start_unanchored(int start) { start_unanchored_ = start; }
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int64 size() { return size_; }
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool reversed() { return reversed_; }
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void set_reversed(bool reversed) { reversed_ = reversed; }
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int64 byte_inst_count() { return byte_inst_count_; }
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const Bitmap<256>& byterange() { return byterange_; }
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int64 dfa_mem() { return dfa_mem_; }
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int flags() { return flags_; }
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void set_flags(int flags) { flags_ = flags; }
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool anchor_start() { return anchor_start_; }
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void set_anchor_start(bool b) { anchor_start_ = b; }
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool anchor_end() { return anchor_end_; }
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void set_anchor_end(bool b) { anchor_end_ = b; }
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int bytemap_range() { return bytemap_range_; }
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const uint8* bytemap() { return bytemap_; }
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns string representation of program for debugging.
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  string Dump();
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  string DumpUnanchored();
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record that at some point in the prog, the bytes in the range
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // lo-hi (inclusive) are treated as different from bytes outside the range.
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Tracking this lets the DFA collapse commonly-treated byte ranges
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // when recording state pointers, greatly reducing its memory footprint.
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void MarkByteRange(int lo, int hi);
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the set of kEmpty flags that are in effect at
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // position p within context.
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static uint32 EmptyFlags(const StringPiece& context, const char* p);
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns whether byte c is a word character: ASCII only.
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Used by the implementation of \b and \B.
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This is not right for Unicode, but:
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   - it's hard to get right in a byte-at-a-time matching world
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //     (the DFA has only one-byte lookahead).
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   - even if the lookahead were possible, the Progs would be huge.
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This crude approximation is the same one PCRE uses.
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static bool IsWordChar(uint8 c) {
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return ('A' <= c && c <= 'Z') ||
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           ('a' <= c && c <= 'z') ||
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           ('0' <= c && c <= '9') ||
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)           c == '_';
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Execution engines.  They all search for the regexp (run the prog)
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in text, which is in the larger context (used for ^ $ \b etc).
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Anchor and kind control the kind of search.
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns true if match found, false if not.
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If match found, fills match[0..nmatch-1] with submatch info.
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // match[0] is overall match, match[1] is first set of parens, etc.
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If a particular submatch is not matched during the regexp match,
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // it is set to NULL.
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Matching text == StringPiece(NULL, 0) is treated as any other empty
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // string, but note that on return, it will not be possible to distinguish
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // submatches that matched that empty string from submatches that didn't
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // match anything.  Either way, match[i] == NULL.
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Search using NFA: can find submatches but kind of slow.
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool SearchNFA(const StringPiece& text, const StringPiece& context,
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 Anchor anchor, MatchKind kind,
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 StringPiece* match, int nmatch);
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Search using DFA: much faster than NFA but only finds
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // end of match and can use a lot more memory.
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns whether a match was found.
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If the DFA runs out of memory, sets *failed to true and returns false.
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If matches != NULL and kind == kManyMatch and there is a match,
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // SearchDFA fills matches with the match IDs of the final matching state.
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool SearchDFA(const StringPiece& text, const StringPiece& context,
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 Anchor anchor, MatchKind kind,
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 StringPiece* match0, bool* failed,
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 vector<int>* matches);
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Build the entire DFA for the given match kind.  FOR TESTING ONLY.
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Usually the DFA is built out incrementally, as needed, which
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // avoids lots of unnecessary work.  This function is useful only
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // for testing purposes.  Returns number of states.
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int BuildEntireDFA(MatchKind kind);
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Compute byte map.
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void ComputeByteMap();
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Run peep-hole optimizer on program.
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Optimize();
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // One-pass NFA: only correct if IsOnePass() is true,
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // but much faster than NFA (competitive with PCRE)
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // for those expressions.
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool IsOnePass();
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool SearchOnePass(const StringPiece& text, const StringPiece& context,
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     Anchor anchor, MatchKind kind,
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     StringPiece* match, int nmatch);
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Bit-state backtracking.  Fast on small cases but uses memory
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // proportional to the product of the program size and the text size.
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool SearchBitState(const StringPiece& text, const StringPiece& context,
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      Anchor anchor, MatchKind kind,
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      StringPiece* match, int nmatch);
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kMaxOnePassCapture = 5;  // $0 through $4
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Backtracking search: the gold standard against which the other
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // implementations are checked.  FOR TESTING ONLY.
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // It allocates a ton of memory to avoid running forever.
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // It is also recursive, so can't use in production (will overflow stacks).
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The name "Unsafe" here is supposed to be a flag that
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // you should not be using this function.
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool UnsafeSearchBacktrack(const StringPiece& text,
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             const StringPiece& context,
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             Anchor anchor, MatchKind kind,
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             StringPiece* match, int nmatch);
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Computes range for any strings matching regexp. The min and max can in
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // some cases be arbitrarily precise, so the caller gets to specify the
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // maximum desired length of string returned.
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // string s that is an anchored match for this regexp satisfies
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   min <= s && s <= max.
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Note that PossibleMatchRange() will only consider the first copy of an
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // infinitely repeated element (i.e., any regexp element followed by a '*' or
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // '+' operator). Regexps with "{N}" constructions are not affected, as those
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // do not compile down to infinite repetitions.
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns true on success, false on error.
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool PossibleMatchRange(string* min, string* max, int maxlen);
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Compiles a collection of regexps to Prog.  Each regexp will have
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // its own Match instruction recording the index in the vector.
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          Regexp* re);
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  friend class Compiler;
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DFA* GetDFA(MatchKind kind);
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool anchor_start_;       // regexp has explicit start anchor
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool anchor_end_;         // regexp has explicit end anchor
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool reversed_;           // whether program runs backward over input
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool did_onepass_;        // has IsOnePass been called?
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int start_;               // entry point for program
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int start_unanchored_;    // unanchored entry point for program
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int size_;                // number of instructions
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int byte_inst_count_;     // number of kInstByteRange instructions
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int bytemap_range_;       // bytemap_[x] < bytemap_range_
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int flags_;               // regexp parse flags
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int onepass_statesize_;   // byte size of each OneState* node
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Inst* inst_;              // pointer to instruction array
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Mutex dfa_mutex_;    // Protects dfa_first_, dfa_longest_
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DFA* volatile dfa_first_;     // DFA cached for kFirstMatch
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DFA* volatile dfa_longest_;   // DFA cached for kLongestMatch and kFullMatch
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int64 dfa_mem_;      // Maximum memory for DFAs.
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void (*delete_dfa_)(DFA* dfa);
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Bitmap<256> byterange_;    // byterange.Get(x) true if x ends a
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                             // commonly-treated byte range.
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint8 bytemap_[256];       // map from input bytes to byte classes
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint8 *unbytemap_;         // bytemap_[unbytemap_[x]] == x
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint8* onepass_nodes_;     // data for OnePass nodes
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  OneState* onepass_start_;  // start node for OnePass program
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_EVIL_CONSTRUCTORS(Prog);
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace re2
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // RE2_PROG_H__
377