1// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Tested by search_test.cc, exhaustive_test.cc, tester.cc
6//
7// Prog::BadSearchBacktrack is a backtracking regular expression search,
8// except that it remembers where it has been, trading a lot of
9// memory for a lot of time. It exists only for testing purposes.
10//
11// Let me repeat that.
12//
13// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
14//   - It uses a ton of memory.
15//   - It uses a ton of stack.
16//   - It uses CHECK and LOG(FATAL).
17//   - It implements unanchored search by repeated anchored search.
18//
19// On the other hand, it is very simple and a good reference
20// implementation for the more complicated regexp packages.
21//
22// In BUILD, this file is linked into the ":testing" library,
23// not the main library, in order to make it harder to pick up
24// accidentally.
25
26#include "util/util.h"
27#include "re2/prog.h"
28#include "re2/regexp.h"
29
30namespace re2 {
31
32// Backtracker holds the state for a backtracking search.
33//
34// Excluding the search parameters, the main search state
35// is just the "capture registers", which record, for the
36// current execution, the string position at which each
37// parenthesis was passed.  cap_[0] and cap_[1] are the
38// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
39//
40// To avoid infinite loops during backtracking on expressions
41// like (a*)*, the visited_[] bitmap marks the (state, string-position)
42// pairs that have already been explored and are thus not worth
43// re-exploring if we get there via another path.  Modern backtracking
44// libraries engineer their program representation differently, to make
45// such infinite loops possible to avoid without keeping a giant visited_
46// bitmap, but visited_ works fine for a reference implementation
47// and it has the nice benefit of making the search run in linear time.
48class Backtracker {
49 public:
50  explicit Backtracker(Prog* prog);
51  ~Backtracker();
52
53  bool Search(const StringPiece& text, const StringPiece& context,
54              bool anchored, bool longest,
55              StringPiece* submatch, int nsubmatch);
56
57 private:
58  // Explores from instruction ip at string position p looking for a match.
59  // Returns true if found (so that caller can stop trying other possibilities).
60  bool Visit(int id, const char* p);
61
62  // Search parameters
63  Prog* prog_;              // program being run
64  StringPiece text_;        // text being searched
65  StringPiece context_;     // greater context of text being searched
66  bool anchored_;           // whether search is anchored at text.begin()
67  bool longest_;            // whether search wants leftmost-longest match
68  bool endmatch_;           // whether search must end at text.end()
69  StringPiece *submatch_;   // submatches to fill in
70  int nsubmatch_;           //   # of submatches to fill in
71
72  // Search state
73  const char* cap_[64];     // capture registers
74  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
75  int nvisited_;            //   # of words in bitmap
76};
77
78Backtracker::Backtracker(Prog* prog)
79  : prog_(prog),
80    anchored_(false),
81    longest_(false),
82    endmatch_(false),
83    submatch_(NULL),
84    nsubmatch_(0),
85    visited_(NULL),
86    nvisited_(0) {
87}
88
89Backtracker::~Backtracker() {
90  delete[] visited_;
91}
92
93// Runs a backtracking search.
94bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
95                         bool anchored, bool longest,
96                         StringPiece* submatch, int nsubmatch) {
97  text_ = text;
98  context_ = context;
99  if (context_.begin() == NULL)
100    context_ = text;
101  if (prog_->anchor_start() && text.begin() > context_.begin())
102    return false;
103  if (prog_->anchor_end() && text.end() < context_.end())
104    return false;
105  anchored_ = anchored | prog_->anchor_start();
106  longest_ = longest | prog_->anchor_end();
107  endmatch_ = prog_->anchor_end();
108  submatch_ = submatch;
109  nsubmatch_ = nsubmatch;
110  CHECK(2*nsubmatch_ < arraysize(cap_));
111  memset(cap_, 0, sizeof cap_);
112
113  // We use submatch_[0] for our own bookkeeping,
114  // so it had better exist.
115  StringPiece sp0;
116  if (nsubmatch < 1) {
117    submatch_ = &sp0;
118    nsubmatch_ = 1;
119  }
120  submatch_[0] = NULL;
121
122  // Allocate new visited_ bitmap -- size is proportional
123  // to text, so have to reallocate on each call to Search.
124  delete[] visited_;
125  nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
126  visited_ = new uint32[nvisited_];
127  memset(visited_, 0, nvisited_*sizeof visited_[0]);
128
129  // Anchored search must start at text.begin().
130  if (anchored_) {
131    cap_[0] = text.begin();
132    return Visit(prog_->start(), text.begin());
133  }
134
135  // Unanchored search, starting from each possible text position.
136  // Notice that we have to try the empty string at the end of
137  // the text, so the loop condition is p <= text.end(), not p < text.end().
138  for (const char* p = text.begin(); p <= text.end(); p++) {
139    cap_[0] = p;
140    if (Visit(prog_->start(), p))  // Match must be leftmost; done.
141      return true;
142  }
143  return false;
144}
145
146// Explores from instruction ip at string position p looking for a match.
147// Return true if found (so that caller can stop trying other possibilities).
148bool Backtracker::Visit(int id, const char* p) {
149  // Check bitmap.  If we've already explored from here,
150  // either it didn't match or it did but we're hoping for a better match.
151  // Either way, don't go down that road again.
152  CHECK(p <= text_.end());
153  int n = id*(text_.size()+1) + (p - text_.begin());
154  CHECK_LT(n/32, nvisited_);
155  if (visited_[n/32] & (1 << (n&31)))
156    return false;
157  visited_[n/32] |= 1 << (n&31);
158
159  // Pick out byte at current position.  If at end of string,
160  // have to explore in hope of finishing a match.  Use impossible byte -1.
161  int c = -1;
162  if (p < text_.end())
163    c = *p & 0xFF;
164
165  Prog::Inst* ip = prog_->inst(id);
166  switch (ip->opcode()) {
167    default:
168      LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
169      return false;  // not reached
170
171    case kInstAlt:
172    case kInstAltMatch:
173      // Try both possible next states: out is preferred to out1.
174      if (Visit(ip->out(), p)) {
175        if (longest_)
176          Visit(ip->out1(), p);
177        return true;
178      }
179      return Visit(ip->out1(), p);
180
181    case kInstByteRange:
182      if (ip->Matches(c))
183        return Visit(ip->out(), p+1);
184      return false;
185
186    case kInstCapture:
187      if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
188        // Capture p to register, but save old value.
189        const char* q = cap_[ip->cap()];
190        cap_[ip->cap()] = p;
191        bool ret = Visit(ip->out(), p);
192        // Restore old value as we backtrack.
193        cap_[ip->cap()] = q;
194        return ret;
195      }
196      return Visit(ip->out(), p);
197
198    case kInstEmptyWidth:
199      if (ip->empty() & ~Prog::EmptyFlags(context_, p))
200        return false;
201      return Visit(ip->out(), p);
202
203    case kInstNop:
204      return Visit(ip->out(), p);
205
206    case kInstMatch:
207      // We found a match.  If it's the best so far, record the
208      // parameters in the caller's submatch_ array.
209      if (endmatch_ && p != context_.end())
210        return false;
211      cap_[1] = p;
212      if (submatch_[0].data() == NULL ||           // First match so far ...
213          (longest_ && p > submatch_[0].end())) {  // ... or better match
214        for (int i = 0; i < nsubmatch_; i++)
215          submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
216      }
217      return true;
218
219    case kInstFail:
220      return false;
221  }
222}
223
224// Runs a backtracking search.
225bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
226                                 const StringPiece& context,
227                                 Anchor anchor,
228                                 MatchKind kind,
229                                 StringPiece* match,
230                                 int nmatch) {
231  // If full match, we ask for an anchored longest match
232  // and then check that match[0] == text.
233  // So make sure match[0] exists.
234  StringPiece sp0;
235  if (kind == kFullMatch) {
236    anchor = kAnchored;
237    if (nmatch < 1) {
238      match = &sp0;
239      nmatch = 1;
240    }
241  }
242
243  // Run the search.
244  Backtracker b(this);
245  bool anchored = anchor == kAnchored;
246  bool longest = kind != kFirstMatch;
247  if (!b.Search(text, context, anchored, longest, match, nmatch))
248    return false;
249  if (kind == kFullMatch && match[0].end() != text.end())
250    return false;
251  return true;
252}
253
254}  // namespace re2
255