1// Copyright 2008 The RE2 Authors. All Rights Reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Tested by search_test.cc, exhaustive_test.cc, tester.cc 6// 7// Prog::BadSearchBacktrack is a backtracking regular expression search, 8// except that it remembers where it has been, trading a lot of 9// memory for a lot of time. It exists only for testing purposes. 10// 11// Let me repeat that. 12// 13// THIS CODE SHOULD NEVER BE USED IN PRODUCTION: 14// - It uses a ton of memory. 15// - It uses a ton of stack. 16// - It uses CHECK and LOG(FATAL). 17// - It implements unanchored search by repeated anchored search. 18// 19// On the other hand, it is very simple and a good reference 20// implementation for the more complicated regexp packages. 21// 22// In BUILD, this file is linked into the ":testing" library, 23// not the main library, in order to make it harder to pick up 24// accidentally. 25 26#include "util/util.h" 27#include "re2/prog.h" 28#include "re2/regexp.h" 29 30namespace re2 { 31 32// Backtracker holds the state for a backtracking search. 33// 34// Excluding the search parameters, the main search state 35// is just the "capture registers", which record, for the 36// current execution, the string position at which each 37// parenthesis was passed. cap_[0] and cap_[1] are the 38// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc. 39// 40// To avoid infinite loops during backtracking on expressions 41// like (a*)*, the visited_[] bitmap marks the (state, string-position) 42// pairs that have already been explored and are thus not worth 43// re-exploring if we get there via another path. Modern backtracking 44// libraries engineer their program representation differently, to make 45// such infinite loops possible to avoid without keeping a giant visited_ 46// bitmap, but visited_ works fine for a reference implementation 47// and it has the nice benefit of making the search run in linear time. 48class Backtracker { 49 public: 50 explicit Backtracker(Prog* prog); 51 ~Backtracker(); 52 53 bool Search(const StringPiece& text, const StringPiece& context, 54 bool anchored, bool longest, 55 StringPiece* submatch, int nsubmatch); 56 57 private: 58 // Explores from instruction ip at string position p looking for a match. 59 // Returns true if found (so that caller can stop trying other possibilities). 60 bool Visit(int id, const char* p); 61 62 // Search parameters 63 Prog* prog_; // program being run 64 StringPiece text_; // text being searched 65 StringPiece context_; // greater context of text being searched 66 bool anchored_; // whether search is anchored at text.begin() 67 bool longest_; // whether search wants leftmost-longest match 68 bool endmatch_; // whether search must end at text.end() 69 StringPiece *submatch_; // submatches to fill in 70 int nsubmatch_; // # of submatches to fill in 71 72 // Search state 73 const char* cap_[64]; // capture registers 74 uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked 75 int nvisited_; // # of words in bitmap 76}; 77 78Backtracker::Backtracker(Prog* prog) 79 : prog_(prog), 80 anchored_(false), 81 longest_(false), 82 endmatch_(false), 83 submatch_(NULL), 84 nsubmatch_(0), 85 visited_(NULL), 86 nvisited_(0) { 87} 88 89Backtracker::~Backtracker() { 90 delete[] visited_; 91} 92 93// Runs a backtracking search. 94bool Backtracker::Search(const StringPiece& text, const StringPiece& context, 95 bool anchored, bool longest, 96 StringPiece* submatch, int nsubmatch) { 97 text_ = text; 98 context_ = context; 99 if (context_.begin() == NULL) 100 context_ = text; 101 if (prog_->anchor_start() && text.begin() > context_.begin()) 102 return false; 103 if (prog_->anchor_end() && text.end() < context_.end()) 104 return false; 105 anchored_ = anchored | prog_->anchor_start(); 106 longest_ = longest | prog_->anchor_end(); 107 endmatch_ = prog_->anchor_end(); 108 submatch_ = submatch; 109 nsubmatch_ = nsubmatch; 110 CHECK(2*nsubmatch_ < arraysize(cap_)); 111 memset(cap_, 0, sizeof cap_); 112 113 // We use submatch_[0] for our own bookkeeping, 114 // so it had better exist. 115 StringPiece sp0; 116 if (nsubmatch < 1) { 117 submatch_ = &sp0; 118 nsubmatch_ = 1; 119 } 120 submatch_[0] = NULL; 121 122 // Allocate new visited_ bitmap -- size is proportional 123 // to text, so have to reallocate on each call to Search. 124 delete[] visited_; 125 nvisited_ = (prog_->size()*(text.size()+1) + 31)/32; 126 visited_ = new uint32[nvisited_]; 127 memset(visited_, 0, nvisited_*sizeof visited_[0]); 128 129 // Anchored search must start at text.begin(). 130 if (anchored_) { 131 cap_[0] = text.begin(); 132 return Visit(prog_->start(), text.begin()); 133 } 134 135 // Unanchored search, starting from each possible text position. 136 // Notice that we have to try the empty string at the end of 137 // the text, so the loop condition is p <= text.end(), not p < text.end(). 138 for (const char* p = text.begin(); p <= text.end(); p++) { 139 cap_[0] = p; 140 if (Visit(prog_->start(), p)) // Match must be leftmost; done. 141 return true; 142 } 143 return false; 144} 145 146// Explores from instruction ip at string position p looking for a match. 147// Return true if found (so that caller can stop trying other possibilities). 148bool Backtracker::Visit(int id, const char* p) { 149 // Check bitmap. If we've already explored from here, 150 // either it didn't match or it did but we're hoping for a better match. 151 // Either way, don't go down that road again. 152 CHECK(p <= text_.end()); 153 int n = id*(text_.size()+1) + (p - text_.begin()); 154 CHECK_LT(n/32, nvisited_); 155 if (visited_[n/32] & (1 << (n&31))) 156 return false; 157 visited_[n/32] |= 1 << (n&31); 158 159 // Pick out byte at current position. If at end of string, 160 // have to explore in hope of finishing a match. Use impossible byte -1. 161 int c = -1; 162 if (p < text_.end()) 163 c = *p & 0xFF; 164 165 Prog::Inst* ip = prog_->inst(id); 166 switch (ip->opcode()) { 167 default: 168 LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode(); 169 return false; // not reached 170 171 case kInstAlt: 172 case kInstAltMatch: 173 // Try both possible next states: out is preferred to out1. 174 if (Visit(ip->out(), p)) { 175 if (longest_) 176 Visit(ip->out1(), p); 177 return true; 178 } 179 return Visit(ip->out1(), p); 180 181 case kInstByteRange: 182 if (ip->Matches(c)) 183 return Visit(ip->out(), p+1); 184 return false; 185 186 case kInstCapture: 187 if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) { 188 // Capture p to register, but save old value. 189 const char* q = cap_[ip->cap()]; 190 cap_[ip->cap()] = p; 191 bool ret = Visit(ip->out(), p); 192 // Restore old value as we backtrack. 193 cap_[ip->cap()] = q; 194 return ret; 195 } 196 return Visit(ip->out(), p); 197 198 case kInstEmptyWidth: 199 if (ip->empty() & ~Prog::EmptyFlags(context_, p)) 200 return false; 201 return Visit(ip->out(), p); 202 203 case kInstNop: 204 return Visit(ip->out(), p); 205 206 case kInstMatch: 207 // We found a match. If it's the best so far, record the 208 // parameters in the caller's submatch_ array. 209 if (endmatch_ && p != context_.end()) 210 return false; 211 cap_[1] = p; 212 if (submatch_[0].data() == NULL || // First match so far ... 213 (longest_ && p > submatch_[0].end())) { // ... or better match 214 for (int i = 0; i < nsubmatch_; i++) 215 submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]); 216 } 217 return true; 218 219 case kInstFail: 220 return false; 221 } 222} 223 224// Runs a backtracking search. 225bool Prog::UnsafeSearchBacktrack(const StringPiece& text, 226 const StringPiece& context, 227 Anchor anchor, 228 MatchKind kind, 229 StringPiece* match, 230 int nmatch) { 231 // If full match, we ask for an anchored longest match 232 // and then check that match[0] == text. 233 // So make sure match[0] exists. 234 StringPiece sp0; 235 if (kind == kFullMatch) { 236 anchor = kAnchored; 237 if (nmatch < 1) { 238 match = &sp0; 239 nmatch = 1; 240 } 241 } 242 243 // Run the search. 244 Backtracker b(this); 245 bool anchored = anchor == kAnchored; 246 bool longest = kind != kFirstMatch; 247 if (!b.Search(text, context, anchored, longest, match, nmatch)) 248 return false; 249 if (kind == kFullMatch && match[0].end() != text.end()) 250 return false; 251 return true; 252} 253 254} // namespace re2 255