12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2006-2007 The RE2 Authors. All Rights Reserved. 22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style 32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file. 42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Tested by search_test.cc. 62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Prog::SearchNFA, an NFA search. 82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This is an actual NFA like the theorists talk about, 92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// not the pseudo-NFA found in backtracking regexp implementations. 102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// IMPLEMENTATION 122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This algorithm is a variant of one that appeared in Rob Pike's sam editor, 142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// which is a variant of the one described in Thompson's 1968 CACM paper. 152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// See http://swtch.com/~rsc/regexp/ for various history. The main feature 162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// over the DFA implementation is that it tracks submatch boundaries. 172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// When the choice of submatch boundaries is ambiguous, this particular 192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// implementation makes the same choices that traditional backtracking 202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// implementations (in particular, Perl and PCRE) do. 212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential 222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// time in the length of the input. 232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Like Thompson's original machine and like the DFA implementation, this 252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// implementation notices a match only once it is one byte past it. 262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prog.h" 282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/regexp.h" 292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/sparse_array.h" 302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/sparse_set.h" 312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 { 332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass NFA { 352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson NFA(Prog* prog); 372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ~NFA(); 382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Searches for a matching string. 402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // * If anchored is true, only considers matches starting at offset. 412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Otherwise finds lefmost match at or after offset. 422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // * If longest is true, returns the longest match starting 432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // at the chosen start point. Otherwise returns the so-called 442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // left-biased match, the one traditional backtracking engines 452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (like Perl and PCRE) find. 462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Records submatch boundaries in submatch[1..nsubmatch-1]. 472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Submatch[0] is the entire match. When there is a choice in 482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // which text matches each subexpression, the submatch boundaries 492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // are chosen to match what a backtracking implementation would choose. 502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool Search(const StringPiece& text, const StringPiece& context, 512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored, bool longest, 522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* submatch, int nsubmatch); 532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static const int Debug = 0; 552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson struct Thread { 582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson union { 592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id; 602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Thread* next; // when on free list 612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char** capture; 632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // State for explicit stack in AddToThreadq. 662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson struct AddState { 672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id; // Inst to process 682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int j; 692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip 702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddState() 722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : id(0), j(-1), cap_j(NULL) {} 732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson explicit AddState(int id) 742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : id(id), j(-1), cap_j(NULL) {} 752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddState(int id, const char* cap_j, int j) 762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : id(id), j(j), cap_j(cap_j) {} 772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Threadq is a list of threads. The list is sorted by the order 802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in which Perl would explore that particular state -- the earlier 812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // choices appear earlier in the list. 822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson typedef SparseArray<Thread*> Threadq; 832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inline Thread* AllocThread(); 852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inline void FreeThread(Thread*); 862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Add r (or its children, following unlabeled arrows) 882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // to the workqueue q with associated capture info. 892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void AddToThreadq(Threadq* q, int id, int flag, 902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* p, const char** capture); 912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Run runq on byte c, appending new states to nextq. 932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Updates matched_ and match_ as new, better matches are found. 942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // p is position of the next byte (the one after c) 952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in the input string, used when processing capturing parens. 962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // flag is the bitwise or of Bol, Eol, etc., specifying whether 972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ^, $ and \b match the current input point (after c). 982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p); 992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns text version of capture information, for debugging. 1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string FormatCapture(const char** capture); 1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inline void CopyCapture(const char** dst, const char** src); 1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Computes whether all matches must begin with the same first 1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // byte, and if so, returns that byte. If not, returns -1. 1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int ComputeFirstByte(); 1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog* prog_; // underlying program 1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int start_; // start instruction in program 1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int ncapture_; // number of submatches to track 1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool longest_; // whether searching for longest match 1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool endmatch_; // whether match must end at text.end() 1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* btext_; // beginning of text being matched (for FormatSubmatch) 1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* etext_; // end of text being matched (for endmatch_) 1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Threadq q0_, q1_; // pre-allocated for Search. 1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char** match_; // best match so far 1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool matched_; // any match so far? 1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddState* astack_; // pre-allocated for AddToThreadq 1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nastack_; 1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int first_byte_; // required first byte for match, or -1 if none 1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Thread* free_threads_; // free list 1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(NFA); 1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonNFA::NFA(Prog* prog) { 1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_ = prog; 1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start_ = prog->start(); 1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ncapture_ = 0; 1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson longest_ = false; 1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson endmatch_ = false; 1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson btext_ = NULL; 1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson etext_ = NULL; 1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q0_.resize(prog_->size()); 1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q1_.resize(prog_->size()); 1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nastack_ = 2*prog_->size(); 1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson astack_ = new AddState[nastack_]; 1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson match_ = NULL; 1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched_ = false; 1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson free_threads_ = NULL; 1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson first_byte_ = ComputeFirstByte(); 1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonNFA::~NFA() { 1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] match_; 1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] astack_; 1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Thread* next; 1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Thread* t = free_threads_; t; t = next) { 1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson next = t->next; 1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] t->capture; 1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete t; 1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid NFA::FreeThread(Thread *t) { 1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t == NULL) 1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t->next = free_threads_; 1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson free_threads_ = t; 1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonNFA::Thread* NFA::AllocThread() { 1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Thread* t = free_threads_; 1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t == NULL) { 1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t = new Thread; 1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t->capture = new const char*[ncapture_]; 1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return t; 1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson free_threads_ = t->next; 1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return t; 1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid NFA::CopyCapture(const char** dst, const char** src) { 1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < ncapture_; i+=2) { 1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dst[i] = src[i]; 1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dst[i+1] = src[i+1]; 1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Follows all empty arrows from r and enqueues all the states reached. 1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. 1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The pointer p is the current input position, and m is the 1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// current set of match boundaries. 1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid NFA::AddToThreadq(Threadq* q, int id0, int flag, 1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* p, const char** capture) { 1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id0 == 0) 1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Astack_ is pre-allocated to avoid resize operations. 1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // It has room for 2*prog_->size() entries, which is enough: 1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Each inst in prog can be processed at most once, 1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // pushing at most two entries on stk. 1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nstk = 0; 1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddState* stk = astack_; 1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = AddState(id0); 1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (nstk > 0) { 2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_LE(nstk, nastack_); 2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const AddState& a = stk[--nstk]; 2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (a.j >= 0) 2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson capture[a.j] = a.cap_j; 2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = a.id; 2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id == 0) 2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (q->has_index(id)) { 2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (Debug) 2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str()); 2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Create entry in q no matter what. We might fill it in below, 2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // or we might not. Even if not, it is necessary to have it, 2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // so that we don't revisit r during the recursion. 2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q->set_new(id, NULL); 2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Thread** tp = &q->find(id)->second; 2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int j; 2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Thread* t; 2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = prog_->inst(id); 2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ip->opcode()) { 2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq"; 2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstFail: 2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAltMatch: 2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Save state; will pick up at next byte. 2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t = AllocThread(); 2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t->id = id; 2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CopyCapture(t->capture, capture); 2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *tp = t; 2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // fall through 2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAlt: 2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Explore alternatives. 2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = AddState(ip->out1()); 2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = AddState(ip->out()); 2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstNop: 2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Continue on. 2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = AddState(ip->out()); 2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstCapture: 2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((j=ip->cap()) < ncapture_) { 2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Push a dummy whose only job is to restore capture[j] 2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // once we finish exploring this possibility. 2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = AddState(0, capture[j], j); 2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Record capture. 2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson capture[j] = p; 2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = AddState(ip->out()); 2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: 2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstByteRange: 2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Save state; will pick up at next byte. 2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t = AllocThread(); 2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t->id = id; 2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CopyCapture(t->capture, capture); 2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *tp = t; 2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (Debug) 2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t); 2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstEmptyWidth: 2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Continue on if we have all the right flag bits. 2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->empty() & ~flag) 2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = AddState(ip->out()); 2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Run runq on byte c, appending new states to nextq. 2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Updates match as new, better matches are found. 2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// p is position of the byte c in the input string, 2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// used when processing capturing parens. 2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// flag is the bitwise or of Bol, Eol, etc., specifying whether 2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// ^, $ and \b match the current input point (after c). 2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Frees all the threads on runq. 2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If there is a shortcut to the end, returns that shortcut. 2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { 2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nextq->clear(); 2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { 2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Thread* t = i->second; 2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t == NULL) 2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (longest_) { 3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Can skip any threads started after our current best match. 3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (matched_ && match_[0] < t->capture[0]) { 3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson FreeThread(t); 3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = t->id; 3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = prog_->inst(id); 3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ip->opcode()) { 3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Should only see the values handled below. 3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step"; 3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstByteRange: 3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->Matches(c)) 3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToThreadq(nextq, ip->out(), flag, p+1, t->capture); 3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAltMatch: 3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i != runq->begin()) 3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The match is ours if we want it. 3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->greedy(prog_) || longest_) { 3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CopyCapture((const char**)match_, t->capture); 3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson FreeThread(t); 3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (++i; i != runq->end(); ++i) 3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson FreeThread(i->second); 3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson runq->clear(); 3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched_ = true; 3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->greedy(prog_)) 3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ip->out1(); 3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ip->out(); 3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: 3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (endmatch_ && p != etext_) 3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* old = t->capture[1]; // previous end pointer 3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t->capture[1] = p; 3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (longest_) { 3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Leftmost-longest mode: save this match only if 3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // it is either farther to the left or at the same 3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // point but longer than an existing match. 3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!matched_ || t->capture[0] < match_[0] || 3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (t->capture[0] == match_[0] && t->capture[1] > match_[1])) 3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CopyCapture((const char**)match_, t->capture); 3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Leftmost-biased mode: this match is by definition 3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // better than what we've already found (see next line). 3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CopyCapture((const char**)match_, t->capture); 3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Cut off the threads that can only find matches 3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // worse than the one we just found: don't run the 3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // rest of the current Threadq. 3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t->capture[0] = old; 3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson FreeThread(t); 3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (++i; i != runq->end(); ++i) 3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson FreeThread(i->second); 3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson runq->clear(); 3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched_ = true; 3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return 0; 3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson t->capture[0] = old; 3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched_ = true; 3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson FreeThread(t); 3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson runq->clear(); 3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return 0; 3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring NFA::FormatCapture(const char** capture) { 3792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string s; 3802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < ncapture_; i+=2) { 3822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (capture[i] == NULL) 3832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, "(?,?)"); 3842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else if (capture[i+1] == NULL) 3852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_)); 3862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 3872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, "(%d,%d)", 3882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (int)(capture[i] - btext_), 3892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (int)(capture[i+1] - btext_)); 3902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return s; 3922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns whether haystack contains needle's memory. 3952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool StringPieceContains(const StringPiece haystack, const StringPiece needle) { 3962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return haystack.begin() <= needle.begin() && 3972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson haystack.end() >= needle.end(); 3982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool NFA::Search(const StringPiece& text, const StringPiece& const_context, 4012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored, bool longest, 4022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* submatch, int nsubmatch) { 4032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (start_ == 0) 4042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 4052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece context = const_context; 4072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (context.begin() == NULL) 4082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson context = text; 4092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!StringPieceContains(context, text)) { 4112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(FATAL) << "Bad args: context does not contain text " 4122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << reinterpret_cast<const void*>(context.begin()) 4132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << "+" << context.size() << " " 4142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << reinterpret_cast<const void*>(text.begin()) 4152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << "+" << text.size(); 4162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 4172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prog_->anchor_start() && context.begin() != text.begin()) 4202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 4212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prog_->anchor_end() && context.end() != text.end()) 4222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 4232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson anchored |= prog_->anchor_start(); 4242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prog_->anchor_end()) { 4252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson longest = true; 4262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson endmatch_ = true; 4272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson etext_ = text.end(); 4282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (nsubmatch < 0) { 4312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; 4322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 4332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Save search parameters. 4362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ncapture_ = 2*nsubmatch; 4372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson longest_ = longest; 4382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (nsubmatch == 0) { 4402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We need to maintain match[0], both to distinguish the 4412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // longest match (if longest is true) and also to tell 4422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // whether we've seen any matches at all. 4432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ncapture_ = 2; 4442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson match_ = new const char*[ncapture_]; 4472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched_ = false; 4482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memset(match_, 0, ncapture_*sizeof match_[0]); 4492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For debugging prints. 4512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson btext_ = context.begin(); 4522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (Debug) { 4542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", 4552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson text.as_string().c_str(), context.as_string().c_str(), anchored, 4562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson longest); 4572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Set up search. 4602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Threadq* runq = &q0_; 4612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Threadq* nextq = &q1_; 4622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson runq->clear(); 4632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nextq->clear(); 4642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memset(&match_[0], 0, ncapture_*sizeof match_[0]); 4652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* bp = context.begin(); 4662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int c = -1; 4672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int wasword = 0; 4682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (text.begin() > context.begin()) { 4702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c = text.begin()[-1] & 0xFF; 4712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson wasword = Prog::IsWordChar(c); 4722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Loop over the text, stepping the machine. 4752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (const char* p = text.begin();; p++) { 4762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Check for empty-width specials. 4772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int flag = 0; 4782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ^ and \A 4802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p == context.begin()) 4812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag |= kEmptyBeginText | kEmptyBeginLine; 4822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else if (p <= context.end() && p[-1] == '\n') 4832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag |= kEmptyBeginLine; 4842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // $ and \z 4862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p == context.end()) 4872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag |= kEmptyEndText | kEmptyEndLine; 4882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else if (p < context.end() && p[0] == '\n') 4892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag |= kEmptyEndLine; 4902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // \b and \B 4922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int isword = 0; 4932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p < context.end()) 4942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson isword = Prog::IsWordChar(p[0] & 0xFF); 4952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (isword != wasword) 4972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag |= kEmptyWordBoundary; 4982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 4992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag |= kEmptyNonWordBoundary; 5002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (Debug) { 5022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword); 5032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { 5042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Thread* t = i->second; 5052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (t == NULL) 5062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 5072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, " %d%s", t->id, 5082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson FormatCapture((const char**)t->capture).c_str()); 5092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "\n"); 5112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Process previous character (waited until now to avoid 5142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // repeating the flag computation above). 5152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This is a no-op the first time around the loop, because 5162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // runq is empty. 5172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = Step(runq, nextq, c, flag, p-1); 5182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(runq->size(), 0); 5192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson swap(nextq, runq); 5202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nextq->clear(); 5212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id != 0) { 5222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We're done: full match ahead. 5232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson p = text.end(); 5242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (;;) { 5252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = prog_->inst(id); 5262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ip->opcode()) { 5272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 5282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode(); 5292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstCapture: 5322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson match_[ip->cap()] = p; 5332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out(); 5342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 5352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstNop: 5372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out(); 5382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 5392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: 5412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson match_[1] = p; 5422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched_ = true; 5432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstEmptyWidth: 5462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) { 5472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty(); 5482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out(); 5512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 5522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p > text.end()) 5592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Start a new thread if there have not been any matches. 5622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (No point in starting a new thread if there have been 5632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // matches, since it would be to the right of the match 5642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // we already found.) 5652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!matched_ && (!anchored || p == text.begin())) { 5662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If there's a required first byte for an unanchored search 5672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and we're not in the middle of any possible matches, 5682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // use memchr to search for the byte quickly. 5692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!anchored && first_byte_ >= 0 && runq->size() == 0 && 5702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson p < text.end() && (p[0] & 0xFF) != first_byte_) { 5712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson p = reinterpret_cast<const char*>(memchr(p, first_byte_, 5722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson text.end() - p)); 5732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p == NULL) { 5742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson p = text.end(); 5752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson isword = 0; 5762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 5772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson isword = Prog::IsWordChar(p[0] & 0xFF); 5782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag = Prog::EmptyFlags(context, p); 5802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Steal match storage (cleared but unused as of yet) 5832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // temporarily to hold match boundaries for new thread. 5842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson match_[0] = p; 5852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToThreadq(runq, start_, flag, p, match_); 5862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson match_[0] = NULL; 5872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If all the threads have died, stop early. 5902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (runq->size() == 0) { 5912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (Debug) 5922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "dead\n"); 5932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p == text.end()) 5972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c = 0; 5982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 5992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c = *p & 0xFF; 6002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson wasword = isword; 6012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Will run step(runq, nextq, c, ...) on next iteration. See above. 6032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) 6062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson FreeThread(i->second); 6072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (matched_) { 6092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < nsubmatch; i++) 6102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]); 6112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (Debug) 6122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "match (%d,%d)\n", 6132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static_cast<int>(match_[0] - btext_), 6142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static_cast<int>(match_[1] - btext_)); 6152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 6162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(1) << "No matches found"; 6182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 6192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Computes whether all successful matches have a common first byte, 6222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and if so, returns that byte. If not, returns -1. 6232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint NFA::ComputeFirstByte() { 6242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (start_ == 0) 6252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return -1; 6262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int b = -1; // first byte, not yet computed 6282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson typedef SparseSet Workq; 6302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Workq q(prog_->size()); 6312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q.insert(start_); 6322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Workq::iterator it = q.begin(); it != q.end(); ++it) { 6332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = *it; 6342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = prog_->inst(id); 6352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ip->opcode()) { 6362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 6372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; 6382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 6392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: 6412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The empty string matches: no first byte. 6422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return -1; 6432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstByteRange: 6452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Must match only a single byte 6462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->lo() != ip->hi()) 6472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return -1; 6482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') 6492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return -1; 6502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If we haven't seen any bytes yet, record it; 6512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // otherwise must match the one we saw before. 6522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (b == -1) 6532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson b = ip->lo(); 6542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else if (b != ip->lo()) 6552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return -1; 6562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 6572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstNop: 6592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstCapture: 6602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstEmptyWidth: 6612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Continue on. 6622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Ignore ip->empty() flags for kInstEmptyWidth 6632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in order to be as conservative as possible 6642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (assume all possible empty-width flags are true). 6652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->out()) 6662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q.insert(ip->out()); 6672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 6682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAlt: 6702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAltMatch: 6712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Explore alternatives. 6722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->out()) 6732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q.insert(ip->out()); 6742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->out1()) 6752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q.insert(ip->out1()); 6762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 6772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstFail: 6792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 6802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return b; 6832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool 6862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg::SearchNFA(const StringPiece& text, const StringPiece& context, 6872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Anchor anchor, MatchKind kind, 6882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* match, int nmatch) { 6892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (NFA::Debug) 6902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Dump(); 6912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson NFA nfa(this); 6932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece sp; 6942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind == kFullMatch) { 6952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson anchor = kAnchored; 6962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (nmatch == 0) { 6972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson match = &sp; 6982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nmatch = 1; 6992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) 7022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 7032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind == kFullMatch && match[0].end() != text.end()) 7042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 7052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 7062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 7072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} // namespace re2 7092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 710