bitstate.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Tested by search_test.cc, exhaustive_test.cc, tester.cc
6
7// Prog::SearchBitState is a regular expression search with submatch
8// tracking for small regular expressions and texts.  Like
9// testing/backtrack.cc, it allocates a bit vector with (length of
10// text) * (length of prog) bits, to make sure it never explores the
11// same (character position, instruction) state multiple times.  This
12// limits the search to run in time linear in the length of the text.
13//
14// Unlike testing/backtrack.cc, SearchBitState is not recursive
15// on the text.
16//
17// SearchBitState is a fast replacement for the NFA code on small
18// regexps and texts when SearchOnePass cannot be used.
19
20#include "re2/prog.h"
21#include "re2/regexp.h"
22
23namespace re2 {
24
25struct Job {
26  int id;
27  int arg;
28  const char* p;
29};
30
31class BitState {
32 public:
33  explicit BitState(Prog* prog);
34  ~BitState();
35
36  // The usual Search prototype.
37  // Can only call Search once per BitState.
38  bool Search(const StringPiece& text, const StringPiece& context,
39              bool anchored, bool longest,
40              StringPiece* submatch, int nsubmatch);
41
42 private:
43  inline bool ShouldVisit(int id, const char* p);
44  void Push(int id, const char* p, int arg);
45  bool GrowStack();
46  bool TrySearch(int id, const char* p);
47
48  // Search parameters
49  Prog* prog_;              // program being run
50  StringPiece text_;        // text being searched
51  StringPiece context_;     // greater context of text being searched
52  bool anchored_;           // whether search is anchored at text.begin()
53  bool longest_;            // whether search wants leftmost-longest match
54  bool endmatch_;           // whether match must end at text.end()
55  StringPiece *submatch_;   // submatches to fill in
56  int nsubmatch_;           //   # of submatches to fill in
57
58  // Search state
59  const char** cap_;        // capture registers
60  int ncap_;
61
62  static const int VisitedBits = 32;
63  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
64  int nvisited_;            //   # of words in bitmap
65
66  Job *job_;                // stack of text positions to explore
67  int njob_;
68  int maxjob_;
69};
70
71BitState::BitState(Prog* prog)
72  : prog_(prog),
73    anchored_(false),
74    longest_(false),
75    endmatch_(false),
76    submatch_(NULL),
77    nsubmatch_(0),
78    cap_(NULL),
79    ncap_(0),
80    visited_(NULL),
81    nvisited_(0),
82    job_(NULL),
83    njob_(0),
84    maxjob_(0) {
85}
86
87BitState::~BitState() {
88  delete[] visited_;
89  delete[] job_;
90  delete[] cap_;
91}
92
93// Should the search visit the pair ip, p?
94// If so, remember that it was visited so that the next time,
95// we don't repeat the visit.
96bool BitState::ShouldVisit(int id, const char* p) {
97  uint n = id * (text_.size() + 1) + (p - text_.begin());
98  if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
99    return false;
100  visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
101  return true;
102}
103
104// Grow the stack.
105bool BitState::GrowStack() {
106  // VLOG(0) << "Reallocate.";
107  maxjob_ *= 2;
108  Job* newjob = new Job[maxjob_];
109  memmove(newjob, job_, njob_*sizeof job_[0]);
110  delete[] job_;
111  job_ = newjob;
112  if (njob_ >= maxjob_) {
113    LOG(DFATAL) << "Job stack overflow.";
114    return false;
115  }
116  return true;
117}
118
119// Push the triple (id, p, arg) onto the stack, growing it if necessary.
120void BitState::Push(int id, const char* p, int arg) {
121  if (njob_ >= maxjob_) {
122    if (!GrowStack())
123      return;
124  }
125  int op = prog_->inst(id)->opcode();
126  if (op == kInstFail)
127    return;
128
129  // Only check ShouldVisit when arg == 0.
130  // When arg > 0, we are continuing a previous visit.
131  if (arg == 0 && !ShouldVisit(id, p))
132    return;
133
134  Job* j = &job_[njob_++];
135  j->id = id;
136  j->p = p;
137  j->arg = arg;
138}
139
140// Try a search from instruction id0 in state p0.
141// Return whether it succeeded.
142bool BitState::TrySearch(int id0, const char* p0) {
143  bool matched = false;
144  const char* end = text_.end();
145  njob_ = 0;
146  Push(id0, p0, 0);
147  while (njob_ > 0) {
148    // Pop job off stack.
149    --njob_;
150    int id = job_[njob_].id;
151    const char* p = job_[njob_].p;
152    int arg = job_[njob_].arg;
153
154    // Optimization: rather than push and pop,
155    // code that is going to Push and continue
156    // the loop simply updates ip, p, and arg
157    // and jumps to CheckAndLoop.  We have to
158    // do the ShouldVisit check that Push
159    // would have, but we avoid the stack
160    // manipulation.
161    if (0) {
162    CheckAndLoop:
163      if (!ShouldVisit(id, p))
164        continue;
165    }
166
167    // Visit ip, p.
168    // VLOG(0) << "Job: " << ip->id() << " "
169    //         << (p - text_.begin()) << " " << arg;
170    Prog::Inst* ip = prog_->inst(id);
171    switch (ip->opcode()) {
172      case kInstFail:
173      default:
174        LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
175        return false;
176
177      case kInstAlt:
178        // Cannot just
179        //   Push(ip->out1(), p, 0);
180        //   Push(ip->out(), p, 0);
181        // If, during the processing of ip->out(), we encounter
182        // ip->out1() via another path, we want to process it then.
183        // Pushing it here will inhibit that.  Instead, re-push
184        // ip with arg==1 as a reminder to push ip->out1() later.
185        switch (arg) {
186          case 0:
187            Push(id, p, 1);  // come back when we're done
188            id = ip->out();
189            goto CheckAndLoop;
190
191          case 1:
192            // Finished ip->out(); try ip->out1().
193            arg = 0;
194            id = ip->out1();
195            goto CheckAndLoop;
196        }
197        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
198        continue;
199
200      case kInstAltMatch:
201        // One opcode is byte range; the other leads to match.
202        if (ip->greedy(prog_)) {
203          // out1 is the match
204          Push(ip->out1(), p, 0);
205          id = ip->out1();
206          p = end;
207          goto CheckAndLoop;
208        }
209        // out is the match - non-greedy
210        Push(ip->out(), end, 0);
211        id = ip->out();
212        goto CheckAndLoop;
213
214      case kInstByteRange: {
215        int c = -1;
216        if (p < end)
217          c = *p & 0xFF;
218        if (ip->Matches(c)) {
219          id = ip->out();
220          p++;
221          goto CheckAndLoop;
222        }
223        continue;
224      }
225
226      case kInstCapture:
227        switch (arg) {
228          case 0:
229            if (0 <= ip->cap() && ip->cap() < ncap_) {
230              // Capture p to register, but save old value.
231              Push(id, cap_[ip->cap()], 1);  // come back when we're done
232              cap_[ip->cap()] = p;
233            }
234            // Continue on.
235            id = ip->out();
236            goto CheckAndLoop;
237          case 1:
238            // Finished ip->out(); restore the old value.
239            cap_[ip->cap()] = p;
240            continue;
241        }
242        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
243        continue;
244
245      case kInstEmptyWidth:
246        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
247          continue;
248        id = ip->out();
249        goto CheckAndLoop;
250
251      case kInstNop:
252        id = ip->out();
253        goto CheckAndLoop;
254
255      case kInstMatch: {
256        if (endmatch_ && p != text_.end())
257          continue;
258
259        // VLOG(0) << "Found match.";
260        // We found a match.  If the caller doesn't care
261        // where the match is, no point going further.
262        if (nsubmatch_ == 0)
263          return true;
264
265        // Record best match so far.
266        // Only need to check end point, because this entire
267        // call is only considering one start position.
268        matched = true;
269        cap_[1] = p;
270        if (submatch_[0].data() == NULL ||
271            (longest_ && p > submatch_[0].end())) {
272          for (int i = 0; i < nsubmatch_; i++)
273            submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
274        }
275
276        // If going for first match, we're done.
277        if (!longest_)
278          return true;
279
280        // If we used the entire text, no longer match is possible.
281        if (p == text_.end())
282          return true;
283
284        // Otherwise, continue on in hope of a longer match.
285        continue;
286      }
287    }
288  }
289  return matched;
290}
291
292// Search text (within context) for prog_.
293bool BitState::Search(const StringPiece& text, const StringPiece& context,
294                      bool anchored, bool longest,
295                      StringPiece* submatch, int nsubmatch) {
296  // Search parameters.
297  text_ = text;
298  context_ = context;
299  if (context_.begin() == NULL)
300    context_ = text;
301  if (prog_->anchor_start() && context_.begin() != text.begin())
302    return false;
303  if (prog_->anchor_end() && context_.end() != text.end())
304    return false;
305  anchored_ = anchored || prog_->anchor_start();
306  longest_ = longest || prog_->anchor_end();
307  endmatch_ = prog_->anchor_end();
308  submatch_ = submatch;
309  nsubmatch_ = nsubmatch;
310  for (int i = 0; i < nsubmatch_; i++)
311    submatch_[i] = NULL;
312
313  // Allocate scratch space.
314  nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
315  visited_ = new uint32[nvisited_];
316  memset(visited_, 0, nvisited_*sizeof visited_[0]);
317  // VLOG(0) << "nvisited_ = " << nvisited_;
318
319  ncap_ = 2*nsubmatch;
320  if (ncap_ < 2)
321    ncap_ = 2;
322  cap_ = new const char*[ncap_];
323  memset(cap_, 0, ncap_*sizeof cap_[0]);
324
325  maxjob_ = 256;
326  job_ = new Job[maxjob_];
327
328  // Anchored search must start at text.begin().
329  if (anchored_) {
330    cap_[0] = text.begin();
331    return TrySearch(prog_->start(), text.begin());
332  }
333
334  // Unanchored search, starting from each possible text position.
335  // Notice that we have to try the empty string at the end of
336  // the text, so the loop condition is p <= text.end(), not p < text.end().
337  // This looks like it's quadratic in the size of the text,
338  // but we are not clearing visited_ between calls to TrySearch,
339  // so no work is duplicated and it ends up still being linear.
340  for (const char* p = text.begin(); p <= text.end(); p++) {
341    cap_[0] = p;
342    if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
343      return true;
344  }
345  return false;
346}
347
348// Bit-state search.
349bool Prog::SearchBitState(const StringPiece& text,
350                          const StringPiece& context,
351                          Anchor anchor,
352                          MatchKind kind,
353                          StringPiece* match,
354                          int nmatch) {
355  // If full match, we ask for an anchored longest match
356  // and then check that match[0] == text.
357  // So make sure match[0] exists.
358  StringPiece sp0;
359  if (kind == kFullMatch) {
360    anchor = kAnchored;
361    if (nmatch < 1) {
362      match = &sp0;
363      nmatch = 1;
364    }
365  }
366
367  // Run the search.
368  BitState b(this);
369  bool anchored = anchor == kAnchored;
370  bool longest = kind != kFirstMatch;
371  if (!b.Search(text, context, anchored, longest, match, nmatch))
372    return false;
373  if (kind == kFullMatch && match[0].end() != text.end())
374    return false;
375  return true;
376}
377
378}  // namespace re2
379