12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2007 The RE2 Authors. All Rights Reserved. 22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style 32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file. 42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiled representation of regular expressions. 62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// See regexp.h for the Regexp class, which represents a regular 72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// expression symbolically. 82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#ifndef RE2_PROG_H__ 102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#define RE2_PROG_H__ 112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h" 132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/re2.h" 142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 { 162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Simple fixed-size bitmap. 182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsontemplate<int Bits> 192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Bitmap { 202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Bitmap() { Reset(); } 222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int Size() { return Bits; } 232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void Reset() { 252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < Words; i++) 262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson w_[i] = 0; 272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool Get(int k) const { 292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return w_[k >> WordLog] & (1<<(k & 31)); 302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void Set(int k) { 322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson w_[k >> WordLog] |= 1<<(k & 31); 332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void Clear(int k) { 352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson w_[k >> WordLog] &= ~(1<<(k & 31)); 362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 Word(int i) const { 382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return w_[i]; 392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static const int WordLog = 5; 432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static const int Words = (Bits+31)/32; 442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 w_[Words]; 452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(Bitmap); 462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Opcodes for Inst 502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum InstOp { 512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kInstAlt = 0, // choose between out_ and out1_ 522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa. 532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] 542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kInstCapture, // capturing parenthesis number cap_ 552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ 562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kInstMatch, // found a match! 572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kInstNop, // no-op; occasionally unavoidable 582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kInstFail, // never match; occasionally unavoidable 592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Bit flags for empty-width specials 622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum EmptyOp { 632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kEmptyBeginLine = 1<<0, // ^ - beginning of line 642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kEmptyEndLine = 1<<1, // $ - end of line 652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kEmptyBeginText = 1<<2, // \A - beginning of text 662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kEmptyEndText = 1<<3, // \z - end of text 672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kEmptyWordBoundary = 1<<4, // \b - word boundary 682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kEmptyNonWordBoundary = 1<<5, // \B - not \b 692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kEmptyAllFlags = (1<<6)-1, 702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Regexp; 732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass DFA; 752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct OneState; 762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiled form of regexp program. 782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Prog { 792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog(); 812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ~Prog(); 822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Single instruction in regexp program. 842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson class Inst { 852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Inst() : out_opcode_(0), out1_(0) { } 872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Constructors per opcode 892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void InitAlt(uint32 out, uint32 out1); 902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void InitByteRange(int lo, int hi, int foldcase, uint32 out); 912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void InitCapture(int cap, uint32 out); 922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void InitEmptyWidth(EmptyOp empty, uint32 out); 932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void InitMatch(int id); 942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void InitNop(uint32 out); 952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void InitFail(); 962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Getters 982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id(Prog* p) { return this - p->inst_; } 992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } 1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int out() { return out_opcode_>>3; } 1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } 1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } 1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } 1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } 1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; } 1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } 1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } 1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool greedy(Prog *p) { 1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(opcode(), kInstAltMatch); 1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return p->inst(out())->opcode() == kInstByteRange; 1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Does this inst (an kInstByteRange) match c? 1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inline bool Matches(int c) { 1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(opcode(), kInstByteRange); 1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (foldcase_ && 'A' <= c && c <= 'Z') 1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c += 'a' - 'A'; 1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return lo_ <= c && c <= hi_; 1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns string representation for debugging. 1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string Dump(); 1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Maximum instruction id. 1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (Must fit in out_opcode_, and PatchList steals another bit.) 1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static const int kMaxInst = (1<<28) - 1; 1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_opcode(InstOp opcode) { 1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson out_opcode_ = (out()<<3) | opcode; 1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_out(int out) { 1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson out_opcode_ = (out<<3) | opcode(); 1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_out_opcode(int out, InstOp opcode) { 1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson out_opcode_ = (out<<3) | opcode; 1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode 1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson union { // additional instruction arguments: 1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 out1_; // opcode == kInstAlt 1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // alternate next instruction 1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int32 cap_; // opcode == kInstCapture 1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Index of capture register (holds text 1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // position recorded by capturing parentheses). 1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For \n (the submatch for the nth parentheses), 1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the left parenthesis captures into register 2*n 1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and the right one captures into register 2*n+1. 1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int32 match_id_; // opcode == kInstMatch 1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Match ID to identify this match (for re2::Set). 1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson struct { // opcode == kInstByteRange 1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint8 lo_; // byte range is lo_-hi_ inclusive 1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint8 hi_; // 1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint8 foldcase_; // convert A-Z to a-z before checking range. 1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson EmptyOp empty_; // opcode == kInstEmptyWidth 1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // empty_ is bitwise OR of kEmpty* flags above. 1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson friend class Compiler; 1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson friend struct PatchList; 1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson friend class Prog; 1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(Inst); 1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Whether to anchor the search. 1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson enum Anchor { 1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kUnanchored, // match anywhere 1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kAnchored, // match only starting at beginning of text 1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Kind of match to look for (for anchor != kFullMatch) 1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // kLongestMatch mode finds the overall longest 1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // match but still makes its submatch choices the way 1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Perl would, not in the way prescribed by POSIX. 1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The POSIX rules are much more expensive to implement, 1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and no one has needed them. 1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // kFullMatch is not strictly necessary -- we could use 1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // kLongestMatch and then check the length of the match -- but 1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the matching code can run faster if it knows to consider only 1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // full matches. 1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson enum MatchKind { 1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFirstMatch, // like Perl, PCRE 1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kLongestMatch, // like egrep or POSIX 1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFullMatch, // match only entire text; implies anchor==kAnchored 1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kManyMatch // for SearchDFA, records set of matches 1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Inst *inst(int id) { return &inst_[id]; } 1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int start() { return start_; } 2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int start_unanchored() { return start_unanchored_; } 2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_start(int start) { start_ = start; } 2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_start_unanchored(int start) { start_unanchored_ = start; } 2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 size() { return size_; } 2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool reversed() { return reversed_; } 2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_reversed(bool reversed) { reversed_ = reversed; } 2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 byte_inst_count() { return byte_inst_count_; } 2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const Bitmap<256>& byterange() { return byterange_; } 2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; } 2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 dfa_mem() { return dfa_mem_; } 2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int flags() { return flags_; } 2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_flags(int flags) { flags_ = flags; } 2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchor_start() { return anchor_start_; } 2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_anchor_start(bool b) { anchor_start_ = b; } 2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchor_end() { return anchor_end_; } 2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void set_anchor_end(bool b) { anchor_end_ = b; } 2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int bytemap_range() { return bytemap_range_; } 2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const uint8* bytemap() { return bytemap_; } 2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns string representation of program for debugging. 2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string Dump(); 2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string DumpUnanchored(); 2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Record that at some point in the prog, the bytes in the range 2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // lo-hi (inclusive) are treated as different from bytes outside the range. 2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Tracking this lets the DFA collapse commonly-treated byte ranges 2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // when recording state pointers, greatly reducing its memory footprint. 2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void MarkByteRange(int lo, int hi); 2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns the set of kEmpty flags that are in effect at 2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // position p within context. 2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static uint32 EmptyFlags(const StringPiece& context, const char* p); 2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns whether byte c is a word character: ASCII only. 2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Used by the implementation of \b and \B. 2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This is not right for Unicode, but: 2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // - it's hard to get right in a byte-at-a-time matching world 2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (the DFA has only one-byte lookahead). 2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // - even if the lookahead were possible, the Progs would be huge. 2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This crude approximation is the same one PCRE uses. 2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static bool IsWordChar(uint8 c) { 2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ('A' <= c && c <= 'Z') || 2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ('a' <= c && c <= 'z') || 2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ('0' <= c && c <= '9') || 2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c == '_'; 2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Execution engines. They all search for the regexp (run the prog) 2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in text, which is in the larger context (used for ^ $ \b etc). 2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Anchor and kind control the kind of search. 2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns true if match found, false if not. 2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If match found, fills match[0..nmatch-1] with submatch info. 2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // match[0] is overall match, match[1] is first set of parens, etc. 2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If a particular submatch is not matched during the regexp match, 2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // it is set to NULL. 2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Matching text == StringPiece(NULL, 0) is treated as any other empty 2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // string, but note that on return, it will not be possible to distinguish 2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // submatches that matched that empty string from submatches that didn't 2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // match anything. Either way, match[i] == NULL. 2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Search using NFA: can find submatches but kind of slow. 2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchNFA(const StringPiece& text, const StringPiece& context, 2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Anchor anchor, MatchKind kind, 2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* match, int nmatch); 2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Search using DFA: much faster than NFA but only finds 2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // end of match and can use a lot more memory. 2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns whether a match was found. 2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If the DFA runs out of memory, sets *failed to true and returns false. 2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If matches != NULL and kind == kManyMatch and there is a match, 2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // SearchDFA fills matches with the match IDs of the final matching state. 2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchDFA(const StringPiece& text, const StringPiece& context, 2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Anchor anchor, MatchKind kind, 2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* match0, bool* failed, 2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson vector<int>* matches); 2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Build the entire DFA for the given match kind. FOR TESTING ONLY. 2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Usually the DFA is built out incrementally, as needed, which 2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // avoids lots of unnecessary work. This function is useful only 2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // for testing purposes. Returns number of states. 2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int BuildEntireDFA(MatchKind kind); 2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Compute byte map. 2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void ComputeByteMap(); 2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Run peep-hole optimizer on program. 2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void Optimize(); 2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // One-pass NFA: only correct if IsOnePass() is true, 2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // but much faster than NFA (competitive with PCRE) 2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // for those expressions. 2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool IsOnePass(); 2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchOnePass(const StringPiece& text, const StringPiece& context, 2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Anchor anchor, MatchKind kind, 2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* match, int nmatch); 2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Bit-state backtracking. Fast on small cases but uses memory 2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // proportional to the product of the program size and the text size. 2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchBitState(const StringPiece& text, const StringPiece& context, 3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Anchor anchor, MatchKind kind, 3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* match, int nmatch); 3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static const int kMaxOnePassCapture = 5; // $0 through $4 3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Backtracking search: the gold standard against which the other 3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // implementations are checked. FOR TESTING ONLY. 3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // It allocates a ton of memory to avoid running forever. 3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // It is also recursive, so can't use in production (will overflow stacks). 3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The name "Unsafe" here is supposed to be a flag that 3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // you should not be using this function. 3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool UnsafeSearchBacktrack(const StringPiece& text, 3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& context, 3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Anchor anchor, MatchKind kind, 3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* match, int nmatch); 3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Computes range for any strings matching regexp. The min and max can in 3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // some cases be arbitrarily precise, so the caller gets to specify the 3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // maximum desired length of string returned. 3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any 3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // string s that is an anchored match for this regexp satisfies 3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // min <= s && s <= max. 3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Note that PossibleMatchRange() will only consider the first copy of an 3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // infinitely repeated element (i.e., any regexp element followed by a '*' or 3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // '+' operator). Regexps with "{N}" constructions are not affected, as those 3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // do not compile down to infinite repetitions. 3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns true on success, false on error. 3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PossibleMatchRange(string* min, string* max, int maxlen); 3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Compiles a collection of regexps to Prog. Each regexp will have 3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // its own Match instruction recording the index in the vector. 3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, 3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re); 3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson friend class Compiler; 3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DFA* GetDFA(MatchKind kind); 3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchor_start_; // regexp has explicit start anchor 3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchor_end_; // regexp has explicit end anchor 3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool reversed_; // whether program runs backward over input 3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool did_onepass_; // has IsOnePass been called? 3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int start_; // entry point for program 3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int start_unanchored_; // unanchored entry point for program 3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int size_; // number of instructions 3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int byte_inst_count_; // number of kInstByteRange instructions 3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int bytemap_range_; // bytemap_[x] < bytemap_range_ 3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int flags_; // regexp parse flags 3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int onepass_statesize_; // byte size of each OneState* node 3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Inst* inst_; // pointer to instruction array 3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_ 3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DFA* volatile dfa_first_; // DFA cached for kFirstMatch 3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch 3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 dfa_mem_; // Maximum memory for DFAs. 3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void (*delete_dfa_)(DFA* dfa); 3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Bitmap<256> byterange_; // byterange.Get(x) true if x ends a 3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // commonly-treated byte range. 3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint8 bytemap_[256]; // map from input bytes to byte classes 3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x 3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint8* onepass_nodes_; // data for OnePass nodes 3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson OneState* onepass_start_; // start node for OnePass program 3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(Prog); 3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} // namespace re2 3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#endif // RE2_PROG_H__ 377