12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2008 The RE2 Authors.  All Rights Reserved.
22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style
32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file.
42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Tested by search_test.cc, exhaustive_test.cc, tester.cc
62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Prog::SearchBitState is a regular expression search with submatch
82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// tracking for small regular expressions and texts.  Like
92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// testing/backtrack.cc, it allocates a bit vector with (length of
102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// text) * (length of prog) bits, to make sure it never explores the
112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// same (character position, instruction) state multiple times.  This
122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// limits the search to run in time linear in the length of the text.
132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Unlike testing/backtrack.cc, SearchBitState is not recursive
152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// on the text.
162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// SearchBitState is a fast replacement for the NFA code on small
182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// regexps and texts when SearchOnePass cannot be used.
192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prog.h"
212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/regexp.h"
222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 {
242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct Job {
262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int id;
272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int arg;
282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const char* p;
292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass BitState {
322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public:
332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  explicit BitState(Prog* prog);
342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  ~BitState();
352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // The usual Search prototype.
372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Can only call Search once per BitState.
382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool Search(const StringPiece& text, const StringPiece& context,
392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              bool anchored, bool longest,
402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              StringPiece* submatch, int nsubmatch);
412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private:
432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  inline bool ShouldVisit(int id, const char* p);
442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void Push(int id, const char* p, int arg);
452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool GrowStack();
462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool TrySearch(int id, const char* p);
472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Search parameters
492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Prog* prog_;              // program being run
502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  StringPiece text_;        // text being searched
512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  StringPiece context_;     // greater context of text being searched
522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool anchored_;           // whether search is anchored at text.begin()
532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool longest_;            // whether search wants leftmost-longest match
542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool endmatch_;           // whether match must end at text.end()
552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  StringPiece *submatch_;   // submatches to fill in
562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int nsubmatch_;           //   # of submatches to fill in
572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Search state
592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const char** cap_;        // capture registers
602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int ncap_;
612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static const int VisitedBits = 32;
632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int nvisited_;            //   # of words in bitmap
652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Job *job_;                // stack of text positions to explore
672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int njob_;
682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int maxjob_;
692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonBitState::BitState(Prog* prog)
722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  : prog_(prog),
732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    anchored_(false),
742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    longest_(false),
752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    endmatch_(false),
762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    submatch_(NULL),
772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    nsubmatch_(0),
782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    cap_(NULL),
792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    ncap_(0),
802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    visited_(NULL),
812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    nvisited_(0),
822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    job_(NULL),
832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    njob_(0),
842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    maxjob_(0) {
852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonBitState::~BitState() {
882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  delete[] visited_;
892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  delete[] job_;
902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  delete[] cap_;
912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Should the search visit the pair ip, p?
942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If so, remember that it was visited so that the next time,
952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// we don't repeat the visit.
962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool BitState::ShouldVisit(int id, const char* p) {
972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint n = id * (text_.size() + 1) + (p - text_.begin());
982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return true;
1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Grow the stack.
1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool BitState::GrowStack() {
1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // VLOG(0) << "Reallocate.";
1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  maxjob_ *= 2;
1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Job* newjob = new Job[maxjob_];
1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  memmove(newjob, job_, njob_*sizeof job_[0]);
1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  delete[] job_;
1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  job_ = newjob;
1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (njob_ >= maxjob_) {
1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    LOG(DFATAL) << "Job stack overflow.";
1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return true;
1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Push the triple (id, p, arg) onto the stack, growing it if necessary.
1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid BitState::Push(int id, const char* p, int arg) {
1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (njob_ >= maxjob_) {
1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (!GrowStack())
1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return;
1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int op = prog_->inst(id)->opcode();
1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (op == kInstFail)
1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return;
1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Only check ShouldVisit when arg == 0.
1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // When arg > 0, we are continuing a previous visit.
1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (arg == 0 && !ShouldVisit(id, p))
1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return;
1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Job* j = &job_[njob_++];
1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  j->id = id;
1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  j->p = p;
1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  j->arg = arg;
1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Try a search from instruction id0 in state p0.
1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Return whether it succeeded.
1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool BitState::TrySearch(int id0, const char* p0) {
1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool matched = false;
1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const char* end = text_.end();
1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  njob_ = 0;
1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Push(id0, p0, 0);
1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  while (njob_ > 0) {
1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Pop job off stack.
1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    --njob_;
1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id = job_[njob_].id;
1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    const char* p = job_[njob_].p;
1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int arg = job_[njob_].arg;
1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Optimization: rather than push and pop,
1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // code that is going to Push and continue
1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // the loop simply updates ip, p, and arg
1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // and jumps to CheckAndLoop.  We have to
1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // do the ShouldVisit check that Push
1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // would have, but we avoid the stack
1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // manipulation.
1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (0) {
1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    CheckAndLoop:
1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (!ShouldVisit(id, p))
1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        continue;
1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Visit ip, p.
1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // VLOG(0) << "Job: " << ip->id() << " "
1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    //         << (p - text_.begin()) << " " << arg;
1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prog::Inst* ip = prog_->inst(id);
1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    switch (ip->opcode()) {
1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstFail:
1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      default:
1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        return false;
1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstAlt:
1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Cannot just
1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        //   Push(ip->out1(), p, 0);
1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        //   Push(ip->out(), p, 0);
1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // If, during the processing of ip->out(), we encounter
1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // ip->out1() via another path, we want to process it then.
1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Pushing it here will inhibit that.  Instead, re-push
1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // ip with arg==1 as a reminder to push ip->out1() later.
1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        switch (arg) {
1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          case 0:
1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            Push(id, p, 1);  // come back when we're done
1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            id = ip->out();
1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            goto CheckAndLoop;
1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          case 1:
1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            // Finished ip->out(); try ip->out1().
1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            arg = 0;
1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            id = ip->out1();
1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            goto CheckAndLoop;
1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        continue;
1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstAltMatch:
2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // One opcode is byte range; the other leads to match.
2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->greedy(prog_)) {
2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // out1 is the match
2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          Push(ip->out1(), p, 0);
2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          id = ip->out1();
2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          p = end;
2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          goto CheckAndLoop;
2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // out is the match - non-greedy
2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        Push(ip->out(), end, 0);
2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        id = ip->out();
2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        goto CheckAndLoop;
2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstByteRange: {
2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        int c = -1;
2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (p < end)
2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          c = *p & 0xFF;
2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->Matches(c)) {
2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          id = ip->out();
2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          p++;
2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          goto CheckAndLoop;
2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        continue;
2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstCapture:
2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        switch (arg) {
2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          case 0:
2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            if (0 <= ip->cap() && ip->cap() < ncap_) {
2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              // Capture p to register, but save old value.
2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              Push(id, cap_[ip->cap()], 1);  // come back when we're done
2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              cap_[ip->cap()] = p;
2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            }
2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            // Continue on.
2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            id = ip->out();
2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            goto CheckAndLoop;
2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          case 1:
2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            // Finished ip->out(); restore the old value.
2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            cap_[ip->cap()] = p;
2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            continue;
2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        continue;
2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstEmptyWidth:
2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          continue;
2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        id = ip->out();
2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        goto CheckAndLoop;
2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstNop:
2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        id = ip->out();
2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        goto CheckAndLoop;
2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstMatch: {
2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (endmatch_ && p != text_.end())
2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          continue;
2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // VLOG(0) << "Found match.";
2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // We found a match.  If the caller doesn't care
2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // where the match is, no point going further.
2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (nsubmatch_ == 0)
2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          return true;
2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Record best match so far.
2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Only need to check end point, because this entire
2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // call is only considering one start position.
2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        matched = true;
2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        cap_[1] = p;
2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (submatch_[0].data() == NULL ||
2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            (longest_ && p > submatch_[0].end())) {
2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          for (int i = 0; i < nsubmatch_; i++)
2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // If going for first match, we're done.
2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (!longest_)
2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          return true;
2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // If we used the entire text, no longer match is possible.
2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (p == text_.end())
2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          return true;
2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Otherwise, continue on in hope of a longer match.
2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        continue;
2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return matched;
2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Search text (within context) for prog_.
2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool BitState::Search(const StringPiece& text, const StringPiece& context,
2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                      bool anchored, bool longest,
2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                      StringPiece* submatch, int nsubmatch) {
2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Search parameters.
2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  text_ = text;
2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  context_ = context;
2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (context_.begin() == NULL)
3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    context_ = text;
3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (prog_->anchor_start() && context_.begin() != text.begin())
3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (prog_->anchor_end() && context_.end() != text.end())
3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  anchored_ = anchored || prog_->anchor_start();
3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  longest_ = longest || prog_->anchor_end();
3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  endmatch_ = prog_->anchor_end();
3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  submatch_ = submatch;
3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  nsubmatch_ = nsubmatch;
3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < nsubmatch_; i++)
3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    submatch_[i] = NULL;
3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Allocate scratch space.
3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  visited_ = new uint32[nvisited_];
3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  memset(visited_, 0, nvisited_*sizeof visited_[0]);
3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // VLOG(0) << "nvisited_ = " << nvisited_;
3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  ncap_ = 2*nsubmatch;
3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (ncap_ < 2)
3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    ncap_ = 2;
3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  cap_ = new const char*[ncap_];
3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  memset(cap_, 0, ncap_*sizeof cap_[0]);
3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  maxjob_ = 256;
3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  job_ = new Job[maxjob_];
3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Anchored search must start at text.begin().
3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (anchored_) {
3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    cap_[0] = text.begin();
3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return TrySearch(prog_->start(), text.begin());
3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Unanchored search, starting from each possible text position.
3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Notice that we have to try the empty string at the end of
3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // the text, so the loop condition is p <= text.end(), not p < text.end().
3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // This looks like it's quadratic in the size of the text,
3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // but we are not clearing visited_ between calls to TrySearch,
3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // so no work is duplicated and it ends up still being linear.
3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (const char* p = text.begin(); p <= text.end(); p++) {
3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    cap_[0] = p;
3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return true;
3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return false;
3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Bit-state search.
3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Prog::SearchBitState(const StringPiece& text,
3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                          const StringPiece& context,
3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                          Anchor anchor,
3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                          MatchKind kind,
3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                          StringPiece* match,
3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                          int nmatch) {
3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // If full match, we ask for an anchored longest match
3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // and then check that match[0] == text.
3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // So make sure match[0] exists.
3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  StringPiece sp0;
3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (kind == kFullMatch) {
3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    anchor = kAnchored;
3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (nmatch < 1) {
3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      match = &sp0;
3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      nmatch = 1;
3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Run the search.
3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  BitState b(this);
3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool anchored = anchor == kAnchored;
3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool longest = kind != kFirstMatch;
3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (!b.Search(text, context, anchored, longest, match, nmatch))
3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (kind == kFullMatch && match[0].end() != text.end())
3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return true;
3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}  // namespace re2
379