12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2008 The RE2 Authors. All Rights Reserved. 22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style 32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file. 42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Tested by search_test.cc, exhaustive_test.cc, tester.cc 62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Prog::SearchBitState is a regular expression search with submatch 82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// tracking for small regular expressions and texts. Like 92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// testing/backtrack.cc, it allocates a bit vector with (length of 102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// text) * (length of prog) bits, to make sure it never explores the 112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// same (character position, instruction) state multiple times. This 122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// limits the search to run in time linear in the length of the text. 132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Unlike testing/backtrack.cc, SearchBitState is not recursive 152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// on the text. 162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// SearchBitState is a fast replacement for the NFA code on small 182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// regexps and texts when SearchOnePass cannot be used. 192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prog.h" 212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/regexp.h" 222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 { 242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct Job { 262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id; 272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int arg; 282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* p; 292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass BitState { 322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson explicit BitState(Prog* prog); 342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ~BitState(); 352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The usual Search prototype. 372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Can only call Search once per BitState. 382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool Search(const StringPiece& text, const StringPiece& context, 392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored, bool longest, 402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* submatch, int nsubmatch); 412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inline bool ShouldVisit(int id, const char* p); 442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void Push(int id, const char* p, int arg); 452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool GrowStack(); 462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool TrySearch(int id, const char* p); 472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Search parameters 492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog* prog_; // program being run 502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece text_; // text being searched 512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece context_; // greater context of text being searched 522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored_; // whether search is anchored at text.begin() 532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool longest_; // whether search wants leftmost-longest match 542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool endmatch_; // whether match must end at text.end() 552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece *submatch_; // submatches to fill in 562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nsubmatch_; // # of submatches to fill in 572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Search state 592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char** cap_; // capture registers 602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int ncap_; 612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static const int VisitedBits = 32; 632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked 642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nvisited_; // # of words in bitmap 652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Job *job_; // stack of text positions to explore 672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int njob_; 682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int maxjob_; 692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonBitState::BitState(Prog* prog) 722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : prog_(prog), 732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson anchored_(false), 742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson longest_(false), 752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson endmatch_(false), 762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson submatch_(NULL), 772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nsubmatch_(0), 782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cap_(NULL), 792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ncap_(0), 802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson visited_(NULL), 812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nvisited_(0), 822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson job_(NULL), 832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson njob_(0), 842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson maxjob_(0) { 852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonBitState::~BitState() { 882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] visited_; 892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] job_; 902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] cap_; 912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Should the search visit the pair ip, p? 942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If so, remember that it was visited so that the next time, 952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// we don't repeat the visit. 962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool BitState::ShouldVisit(int id, const char* p) { 972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint n = id * (text_.size() + 1) + (p - text_.begin()); 982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) 992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); 1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Grow the stack. 1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool BitState::GrowStack() { 1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // VLOG(0) << "Reallocate."; 1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson maxjob_ *= 2; 1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Job* newjob = new Job[maxjob_]; 1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memmove(newjob, job_, njob_*sizeof job_[0]); 1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] job_; 1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson job_ = newjob; 1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (njob_ >= maxjob_) { 1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Job stack overflow."; 1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Push the triple (id, p, arg) onto the stack, growing it if necessary. 1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid BitState::Push(int id, const char* p, int arg) { 1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (njob_ >= maxjob_) { 1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!GrowStack()) 1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int op = prog_->inst(id)->opcode(); 1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (op == kInstFail) 1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Only check ShouldVisit when arg == 0. 1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // When arg > 0, we are continuing a previous visit. 1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (arg == 0 && !ShouldVisit(id, p)) 1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Job* j = &job_[njob_++]; 1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson j->id = id; 1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson j->p = p; 1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson j->arg = arg; 1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Try a search from instruction id0 in state p0. 1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Return whether it succeeded. 1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool BitState::TrySearch(int id0, const char* p0) { 1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool matched = false; 1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* end = text_.end(); 1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson njob_ = 0; 1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Push(id0, p0, 0); 1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (njob_ > 0) { 1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pop job off stack. 1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson --njob_; 1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = job_[njob_].id; 1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* p = job_[njob_].p; 1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int arg = job_[njob_].arg; 1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Optimization: rather than push and pop, 1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // code that is going to Push and continue 1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the loop simply updates ip, p, and arg 1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and jumps to CheckAndLoop. We have to 1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // do the ShouldVisit check that Push 1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // would have, but we avoid the stack 1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // manipulation. 1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (0) { 1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CheckAndLoop: 1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ShouldVisit(id, p)) 1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Visit ip, p. 1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // VLOG(0) << "Job: " << ip->id() << " " 1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // << (p - text_.begin()) << " " << arg; 1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = prog_->inst(id); 1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ip->opcode()) { 1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstFail: 1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg; 1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAlt: 1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Cannot just 1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Push(ip->out1(), p, 0); 1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Push(ip->out(), p, 0); 1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If, during the processing of ip->out(), we encounter 1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ip->out1() via another path, we want to process it then. 1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pushing it here will inhibit that. Instead, re-push 1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ip with arg==1 as a reminder to push ip->out1() later. 1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (arg) { 1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 0: 1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Push(id, p, 1); // come back when we're done 1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out(); 1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto CheckAndLoop; 1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 1: 1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Finished ip->out(); try ip->out1(). 1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson arg = 0; 1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out1(); 1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto CheckAndLoop; 1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Bad arg in kInstCapture: " << arg; 1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAltMatch: 2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // One opcode is byte range; the other leads to match. 2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->greedy(prog_)) { 2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // out1 is the match 2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Push(ip->out1(), p, 0); 2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out1(); 2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson p = end; 2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto CheckAndLoop; 2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // out is the match - non-greedy 2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Push(ip->out(), end, 0); 2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out(); 2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto CheckAndLoop; 2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstByteRange: { 2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int c = -1; 2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p < end) 2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c = *p & 0xFF; 2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->Matches(c)) { 2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out(); 2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson p++; 2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto CheckAndLoop; 2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstCapture: 2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (arg) { 2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 0: 2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (0 <= ip->cap() && ip->cap() < ncap_) { 2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Capture p to register, but save old value. 2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Push(id, cap_[ip->cap()], 1); // come back when we're done 2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cap_[ip->cap()] = p; 2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Continue on. 2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out(); 2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto CheckAndLoop; 2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case 1: 2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Finished ip->out(); restore the old value. 2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cap_[ip->cap()] = p; 2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Bad arg in kInstCapture: " << arg; 2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstEmptyWidth: 2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->empty() & ~Prog::EmptyFlags(context_, p)) 2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out(); 2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto CheckAndLoop; 2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstNop: 2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = ip->out(); 2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson goto CheckAndLoop; 2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: { 2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (endmatch_ && p != text_.end()) 2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // VLOG(0) << "Found match."; 2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We found a match. If the caller doesn't care 2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // where the match is, no point going further. 2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (nsubmatch_ == 0) 2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Record best match so far. 2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Only need to check end point, because this entire 2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // call is only considering one start position. 2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched = true; 2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cap_[1] = p; 2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (submatch_[0].data() == NULL || 2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (longest_ && p > submatch_[0].end())) { 2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < nsubmatch_; i++) 2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); 2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If going for first match, we're done. 2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!longest_) 2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If we used the entire text, no longer match is possible. 2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p == text_.end()) 2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Otherwise, continue on in hope of a longer match. 2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return matched; 2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Search text (within context) for prog_. 2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool BitState::Search(const StringPiece& text, const StringPiece& context, 2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored, bool longest, 2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* submatch, int nsubmatch) { 2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Search parameters. 2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson text_ = text; 2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson context_ = context; 2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (context_.begin() == NULL) 3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson context_ = text; 3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prog_->anchor_start() && context_.begin() != text.begin()) 3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prog_->anchor_end() && context_.end() != text.end()) 3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson anchored_ = anchored || prog_->anchor_start(); 3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson longest_ = longest || prog_->anchor_end(); 3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson endmatch_ = prog_->anchor_end(); 3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson submatch_ = submatch; 3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nsubmatch_ = nsubmatch; 3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < nsubmatch_; i++) 3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson submatch_[i] = NULL; 3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Allocate scratch space. 3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits; 3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson visited_ = new uint32[nvisited_]; 3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memset(visited_, 0, nvisited_*sizeof visited_[0]); 3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // VLOG(0) << "nvisited_ = " << nvisited_; 3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ncap_ = 2*nsubmatch; 3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ncap_ < 2) 3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ncap_ = 2; 3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cap_ = new const char*[ncap_]; 3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memset(cap_, 0, ncap_*sizeof cap_[0]); 3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson maxjob_ = 256; 3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson job_ = new Job[maxjob_]; 3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Anchored search must start at text.begin(). 3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (anchored_) { 3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cap_[0] = text.begin(); 3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return TrySearch(prog_->start(), text.begin()); 3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Unanchored search, starting from each possible text position. 3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Notice that we have to try the empty string at the end of 3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the text, so the loop condition is p <= text.end(), not p < text.end(). 3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This looks like it's quadratic in the size of the text, 3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // but we are not clearing visited_ between calls to TrySearch, 3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // so no work is duplicated and it ends up still being linear. 3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (const char* p = text.begin(); p <= text.end(); p++) { 3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cap_[0] = p; 3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. 3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Bit-state search. 3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Prog::SearchBitState(const StringPiece& text, 3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& context, 3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Anchor anchor, 3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MatchKind kind, 3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* match, 3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nmatch) { 3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If full match, we ask for an anchored longest match 3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and then check that match[0] == text. 3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // So make sure match[0] exists. 3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece sp0; 3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind == kFullMatch) { 3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson anchor = kAnchored; 3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (nmatch < 1) { 3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson match = &sp0; 3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nmatch = 1; 3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Run the search. 3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson BitState b(this); 3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored = anchor == kAnchored; 3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool longest = kind != kFirstMatch; 3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!b.Search(text, context, anchored, longest, match, nmatch)) 3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind == kFullMatch && match[0].end() != text.end()) 3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} // namespace re2 379