15c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Copyright 2008 The RE2 Authors. All Rights Reserved. 25c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Use of this source code is governed by a BSD-style 35c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// license that can be found in the LICENSE file. 45c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 55c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Tested by search_test.cc, exhaustive_test.cc, tester.cc 65c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 75c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Prog::SearchBitState is a regular expression search with submatch 85c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// tracking for small regular expressions and texts. Like 95c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// testing/backtrack.cc, it allocates a bit vector with (length of 105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// text) * (length of prog) bits, to make sure it never explores the 115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// same (character position, instruction) state multiple times. This 125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// limits the search to run in time linear in the length of the text. 135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// 145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Unlike testing/backtrack.cc, SearchBitState is not recursive 155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// on the text. 165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// 175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// SearchBitState is a fast replacement for the NFA code on small 185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// regexps and texts when SearchOnePass cannot be used. 195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include "re2/prog.h" 215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include "re2/regexp.h" 225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)namespace re2 { 245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)struct Job { 265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int id; 275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int arg; 285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const char* p; 295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}; 3053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) 315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)class BitState { 325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) public: 335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) explicit BitState(Prog* prog); 345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ~BitState(); 3553e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) 3653e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) // The usual Search prototype. 3753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) // Can only call Search once per BitState. 3853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) bool Search(const StringPiece& text, const StringPiece& context, 3953e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) bool anchored, bool longest, 4053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) StringPiece* submatch, int nsubmatch); 4153e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) 4253e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) private: 4353e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) inline bool ShouldVisit(int id, const char* p); 4453e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) void Push(int id, const char* p, int arg); 4553e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) bool GrowStack(); 4653e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) bool TrySearch(int id, const char* p); 4753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) 4853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) // Search parameters 4953e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) Prog* prog_; // program being run 5053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) StringPiece text_; // text being searched 5153e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) StringPiece context_; // greater context of text being searched 5253e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) bool anchored_; // whether search is anchored at text.begin() 5353e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) bool longest_; // whether search wants leftmost-longest match 5453e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) bool endmatch_; // whether match must end at text.end() 5553e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) StringPiece *submatch_; // submatches to fill in 5653e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) int nsubmatch_; // # of submatches to fill in 5753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) 5853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) // Search state 5953e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) const char** cap_; // capture registers 6053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) int ncap_; 6153e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) 6253e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) static const int VisitedBits = 32; 6353e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked 6453e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) int nvisited_; // # of words in bitmap 65926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) 6653e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) Job *job_; // stack of text positions to explore 6753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) int njob_; 6853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) int maxjob_; 6953e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)}; 7053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) 7153e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)BitState::BitState(Prog* prog) 7253e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) : prog_(prog), 7353e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) anchored_(false), 745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) longest_(false), 755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) endmatch_(false), 765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) submatch_(NULL), 775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) nsubmatch_(0), 7853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) cap_(NULL), 795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ncap_(0), 805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) visited_(NULL), 815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) nvisited_(0), 825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) job_(NULL), 835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) njob_(0), 845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) maxjob_(0) { 855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)BitState::~BitState() { 885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) delete[] visited_; 895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) delete[] job_; 905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) delete[] cap_; 915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Should the search visit the pair ip, p? 945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// If so, remember that it was visited so that the next time, 955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// we don't repeat the visit. 965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool BitState::ShouldVisit(int id, const char* p) { 975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) uint n = id * (text_.size() + 1) + (p - text_.begin()); 985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) 995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 1005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); 1015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return true; 1025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 1035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Grow the stack. 1055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool BitState::GrowStack() { 1065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // VLOG(0) << "Reallocate."; 1075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) maxjob_ *= 2; 1085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) Job* newjob = new Job[maxjob_]; 1095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) memmove(newjob, job_, njob_*sizeof job_[0]); 1105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) delete[] job_; 1115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) job_ = newjob; 1125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (njob_ >= maxjob_) { 1135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) LOG(DFATAL) << "Job stack overflow."; 1145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 1155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return true; 1175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 1185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Push the triple (id, p, arg) onto the stack, growing it if necessary. 1205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void BitState::Push(int id, const char* p, int arg) { 1215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (njob_ >= maxjob_) { 1225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (!GrowStack()) 1235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return; 1245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int op = prog_->inst(id)->opcode(); 1265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (op == kInstFail) 1275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return; 1285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Only check ShouldVisit when arg == 0. 1305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // When arg > 0, we are continuing a previous visit. 1315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (arg == 0 && !ShouldVisit(id, p)) 1325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return; 1335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) Job* j = &job_[njob_++]; 1355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) j->id = id; 1365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) j->p = p; 137926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) j->arg = arg; 138926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)} 139926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) 14053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)// Try a search from instruction id0 in state p0. 1415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Return whether it succeeded. 1425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool BitState::TrySearch(int id0, const char* p0) { 1435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) bool matched = false; 1445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const char* end = text_.end(); 1455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) njob_ = 0; 146926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) Push(id0, p0, 0); 1475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (njob_ > 0) { 1485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Pop job off stack. 1495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) --njob_; 150926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) int id = job_[njob_].id; 1515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const char* p = job_[njob_].p; 152926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) int arg = job_[njob_].arg; 153926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) 154926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) // Optimization: rather than push and pop, 155926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) // code that is going to Push and continue 1565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // the loop simply updates ip, p, and arg 157926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) // and jumps to CheckAndLoop. We have to 158926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) // do the ShouldVisit check that Push 1595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // would have, but we avoid the stack 1605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // manipulation. 1615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (0) { 1625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) CheckAndLoop: 1635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (!ShouldVisit(id, p)) 1645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 1655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Visit ip, p. 1685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // VLOG(0) << "Job: " << ip->id() << " " 1695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // << (p - text_.begin()) << " " << arg; 1705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) Prog::Inst* ip = prog_->inst(id); 1715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) switch (ip->opcode()) { 172926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) case kInstFail: 1735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) default: 1745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg; 1755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 1765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case kInstAlt: 1785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Cannot just 1795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Push(ip->out1(), p, 0); 1805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Push(ip->out(), p, 0); 1815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // If, during the processing of ip->out(), we encounter 1825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // ip->out1() via another path, we want to process it then. 1835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Pushing it here will inhibit that. Instead, re-push 1845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // ip with arg==1 as a reminder to push ip->out1() later. 1855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) switch (arg) { 1865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case 0: 1875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) Push(id, p, 1); // come back when we're done 1885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) id = ip->out(); 1895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) goto CheckAndLoop; 1905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case 1: 1925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Finished ip->out(); try ip->out1(). 1935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) arg = 0; 1945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) id = ip->out1(); 1955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) goto CheckAndLoop; 1965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) LOG(DFATAL) << "Bad arg in kInstCapture: " << arg; 1985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 1995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case kInstAltMatch: 2015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // One opcode is byte range; the other leads to match. 2025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (ip->greedy(prog_)) { 2035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // out1 is the match 2045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) Push(ip->out1(), p, 0); 2055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) id = ip->out1(); 2065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) p = end; 2075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) goto CheckAndLoop; 2085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // out is the match - non-greedy 2105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) Push(ip->out(), end, 0); 2115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) id = ip->out(); 2125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) goto CheckAndLoop; 2135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case kInstByteRange: { 2155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int c = -1; 2165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (p < end) 2175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) c = *p & 0xFF; 2185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (ip->Matches(c)) { 2195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) id = ip->out(); 2205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) p++; 2215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) goto CheckAndLoop; 2225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 2245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case kInstCapture: 2275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) switch (arg) { 2285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case 0: 2295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (0 <= ip->cap() && ip->cap() < ncap_) { 2305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Capture p to register, but save old value. 2315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) Push(id, cap_[ip->cap()], 1); // come back when we're done 2325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) cap_[ip->cap()] = p; 2335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Continue on. 2355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) id = ip->out(); 2365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) goto CheckAndLoop; 2375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case 1: 2385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Finished ip->out(); restore the old value. 2395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) cap_[ip->cap()] = p; 2405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 2415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) LOG(DFATAL) << "Bad arg in kInstCapture: " << arg; 2435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 2445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case kInstEmptyWidth: 2465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (ip->empty() & ~Prog::EmptyFlags(context_, p)) 2475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 2485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) id = ip->out(); 2495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) goto CheckAndLoop; 2505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case kInstNop: 2525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) id = ip->out(); 2535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) goto CheckAndLoop; 2545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) case kInstMatch: { 2565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (endmatch_ && p != text_.end()) 2575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 2585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // VLOG(0) << "Found match."; 2605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // We found a match. If the caller doesn't care 2615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // where the match is, no point going further. 2625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (nsubmatch_ == 0) 2635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return true; 2645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Record best match so far. 2665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Only need to check end point, because this entire 2675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // call is only considering one start position. 2685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) matched = true; 2695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) cap_[1] = p; 2705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (submatch_[0].data() == NULL || 2715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) (longest_ && p > submatch_[0].end())) { 2725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) for (int i = 0; i < nsubmatch_; i++) 2735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); 274926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) } 275926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) 276926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) // If going for first match, we're done. 277926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) if (!longest_) 278926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles) return true; 2795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // If we used the entire text, no longer match is possible. 2815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (p == text_.end()) 2825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return true; 2835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Otherwise, continue on in hope of a longer match. 2855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) continue; 2865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return matched; 2905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 2915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Search text (within context) for prog_. 2935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool BitState::Search(const StringPiece& text, const StringPiece& context, 2945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) bool anchored, bool longest, 2955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) StringPiece* submatch, int nsubmatch) { 2965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Search parameters. 2975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) text_ = text; 2985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) context_ = context; 2995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (context_.begin() == NULL) 3005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) context_ = text; 3015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (prog_->anchor_start() && context_.begin() != text.begin()) 3025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 3035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (prog_->anchor_end() && context_.end() != text.end()) 3045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 3055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) anchored_ = anchored || prog_->anchor_start(); 3065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) longest_ = longest || prog_->anchor_end(); 3075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) endmatch_ = prog_->anchor_end(); 3085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) submatch_ = submatch; 3095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) nsubmatch_ = nsubmatch; 3105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) for (int i = 0; i < nsubmatch_; i++) 3115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) submatch_[i] = NULL; 3125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Allocate scratch space. 3145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits; 3155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) visited_ = new uint32[nvisited_]; 3165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) memset(visited_, 0, nvisited_*sizeof visited_[0]); 3175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // VLOG(0) << "nvisited_ = " << nvisited_; 3185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ncap_ = 2*nsubmatch; 3205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (ncap_ < 2) 3215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ncap_ = 2; 3225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) cap_ = new const char*[ncap_]; 3235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) memset(cap_, 0, ncap_*sizeof cap_[0]); 3245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) maxjob_ = 256; 3265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) job_ = new Job[maxjob_]; 3275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Anchored search must start at text.begin(). 3295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (anchored_) { 3305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) cap_[0] = text.begin(); 3315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return TrySearch(prog_->start(), text.begin()); 3325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Unanchored search, starting from each possible text position. 3355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Notice that we have to try the empty string at the end of 3365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // the text, so the loop condition is p <= text.end(), not p < text.end(). 3375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // This looks like it's quadratic in the size of the text, 3385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // but we are not clearing visited_ between calls to TrySearch, 3395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // so no work is duplicated and it ends up still being linear. 3405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) for (const char* p = text.begin(); p <= text.end(); p++) { 3415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) cap_[0] = p; 3425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. 3435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return true; 3445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 3465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 3475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Bit-state search. 3495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool Prog::SearchBitState(const StringPiece& text, 3505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const StringPiece& context, 3515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) Anchor anchor, 3525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) MatchKind kind, 3535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) StringPiece* match, 3545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int nmatch) { 3555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // If full match, we ask for an anchored longest match 3565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // and then check that match[0] == text. 3575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // So make sure match[0] exists. 3585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) StringPiece sp0; 3595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (kind == kFullMatch) { 3605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) anchor = kAnchored; 3615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (nmatch < 1) { 3625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) match = &sp0; 3635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) nmatch = 1; 3645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Run the search. 3685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) BitState b(this); 3695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) bool anchored = anchor == kAnchored; 3705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) bool longest = kind != kFirstMatch; 3715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (!b.Search(text, context, anchored, longest, match, nmatch)) 3725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 3735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (kind == kFullMatch && match[0].end() != text.end()) 3745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return false; 3755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) return true; 3765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 3775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} // namespace re2 3795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)