15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2008 The RE2 Authors.  All Rights Reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Tested by search_test.cc, exhaustive_test.cc, tester.cc
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Prog::SearchBitState is a regular expression search with submatch
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// tracking for small regular expressions and texts.  Like
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// testing/backtrack.cc, it allocates a bit vector with (length of
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// text) * (length of prog) bits, to make sure it never explores the
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// same (character position, instruction) state multiple times.  This
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// limits the search to run in time linear in the length of the text.
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Unlike testing/backtrack.cc, SearchBitState is not recursive
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// on the text.
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SearchBitState is a fast replacement for the NFA code on small
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// regexps and texts when SearchOnePass cannot be used.
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/prog.h"
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/regexp.h"
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct Job {
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id;
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int arg;
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* p;
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class BitState {
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  explicit BitState(Prog* prog);
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~BitState();
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The usual Search prototype.
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Can only call Search once per BitState.
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool Search(const StringPiece& text, const StringPiece& context,
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              bool anchored, bool longest,
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              StringPiece* submatch, int nsubmatch);
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline bool ShouldVisit(int id, const char* p);
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Push(int id, const char* p, int arg);
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GrowStack();
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool TrySearch(int id, const char* p);
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Search parameters
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog* prog_;              // program being run
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  StringPiece text_;        // text being searched
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  StringPiece context_;     // greater context of text being searched
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool anchored_;           // whether search is anchored at text.begin()
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool longest_;            // whether search wants leftmost-longest match
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool endmatch_;           // whether match must end at text.end()
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  StringPiece *submatch_;   // submatches to fill in
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int nsubmatch_;           //   # of submatches to fill in
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Search state
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char** cap_;        // capture registers
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int ncap_;
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int VisitedBits = 32;
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int nvisited_;            //   # of words in bitmap
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Job *job_;                // stack of text positions to explore
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int njob_;
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int maxjob_;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BitState::BitState(Prog* prog)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  : prog_(prog),
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    anchored_(false),
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    longest_(false),
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    endmatch_(false),
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    submatch_(NULL),
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    nsubmatch_(0),
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    cap_(NULL),
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ncap_(0),
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    visited_(NULL),
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    nvisited_(0),
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    job_(NULL),
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    njob_(0),
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    maxjob_(0) {
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)BitState::~BitState() {
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete[] visited_;
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete[] job_;
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete[] cap_;
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Should the search visit the pair ip, p?
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If so, remember that it was visited so that the next time,
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we don't repeat the visit.
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool BitState::ShouldVisit(int id, const char* p) {
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint n = id * (text_.size() + 1) + (p - text_.begin());
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Grow the stack.
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool BitState::GrowStack() {
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // VLOG(0) << "Reallocate.";
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  maxjob_ *= 2;
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Job* newjob = new Job[maxjob_];
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  memmove(newjob, job_, njob_*sizeof job_[0]);
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete[] job_;
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  job_ = newjob;
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (njob_ >= maxjob_) {
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOG(DFATAL) << "Job stack overflow.";
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Push the triple (id, p, arg) onto the stack, growing it if necessary.
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void BitState::Push(int id, const char* p, int arg) {
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (njob_ >= maxjob_) {
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (!GrowStack())
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return;
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int op = prog_->inst(id)->opcode();
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (op == kInstFail)
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Only check ShouldVisit when arg == 0.
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // When arg > 0, we are continuing a previous visit.
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (arg == 0 && !ShouldVisit(id, p))
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Job* j = &job_[njob_++];
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  j->id = id;
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  j->p = p;
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  j->arg = arg;
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Try a search from instruction id0 in state p0.
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Return whether it succeeded.
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool BitState::TrySearch(int id0, const char* p0) {
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool matched = false;
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* end = text_.end();
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  njob_ = 0;
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Push(id0, p0, 0);
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (njob_ > 0) {
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Pop job off stack.
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    --njob_;
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int id = job_[njob_].id;
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const char* p = job_[njob_].p;
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int arg = job_[njob_].arg;
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Optimization: rather than push and pop,
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // code that is going to Push and continue
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // the loop simply updates ip, p, and arg
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // and jumps to CheckAndLoop.  We have to
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // do the ShouldVisit check that Push
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // would have, but we avoid the stack
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // manipulation.
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (0) {
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    CheckAndLoop:
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (!ShouldVisit(id, p))
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        continue;
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Visit ip, p.
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // VLOG(0) << "Job: " << ip->id() << " "
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //         << (p - text_.begin()) << " " << arg;
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Prog::Inst* ip = prog_->inst(id);
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    switch (ip->opcode()) {
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstFail:
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      default:
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return false;
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstAlt:
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Cannot just
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        //   Push(ip->out1(), p, 0);
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        //   Push(ip->out(), p, 0);
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // If, during the processing of ip->out(), we encounter
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // ip->out1() via another path, we want to process it then.
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Pushing it here will inhibit that.  Instead, re-push
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // ip with arg==1 as a reminder to push ip->out1() later.
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        switch (arg) {
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          case 0:
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            Push(id, p, 1);  // come back when we're done
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            id = ip->out();
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            goto CheckAndLoop;
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          case 1:
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            // Finished ip->out(); try ip->out1().
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            arg = 0;
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            id = ip->out1();
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            goto CheckAndLoop;
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        continue;
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstAltMatch:
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // One opcode is byte range; the other leads to match.
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->greedy(prog_)) {
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // out1 is the match
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          Push(ip->out1(), p, 0);
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          id = ip->out1();
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          p = end;
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          goto CheckAndLoop;
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // out is the match - non-greedy
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        Push(ip->out(), end, 0);
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        id = ip->out();
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        goto CheckAndLoop;
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstByteRange: {
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        int c = -1;
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (p < end)
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          c = *p & 0xFF;
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->Matches(c)) {
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          id = ip->out();
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          p++;
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          goto CheckAndLoop;
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        continue;
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstCapture:
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        switch (arg) {
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          case 0:
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            if (0 <= ip->cap() && ip->cap() < ncap_) {
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              // Capture p to register, but save old value.
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              Push(id, cap_[ip->cap()], 1);  // come back when we're done
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              cap_[ip->cap()] = p;
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            }
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            // Continue on.
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            id = ip->out();
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            goto CheckAndLoop;
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          case 1:
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            // Finished ip->out(); restore the old value.
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            cap_[ip->cap()] = p;
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            continue;
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        continue;
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstEmptyWidth:
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          continue;
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        id = ip->out();
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        goto CheckAndLoop;
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstNop:
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        id = ip->out();
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        goto CheckAndLoop;
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstMatch: {
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (endmatch_ && p != text_.end())
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          continue;
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // VLOG(0) << "Found match.";
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // We found a match.  If the caller doesn't care
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // where the match is, no point going further.
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (nsubmatch_ == 0)
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return true;
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Record best match so far.
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Only need to check end point, because this entire
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // call is only considering one start position.
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        matched = true;
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        cap_[1] = p;
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (submatch_[0].data() == NULL ||
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            (longest_ && p > submatch_[0].end())) {
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          for (int i = 0; i < nsubmatch_; i++)
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // If going for first match, we're done.
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (!longest_)
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return true;
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // If we used the entire text, no longer match is possible.
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (p == text_.end())
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return true;
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Otherwise, continue on in hope of a longer match.
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        continue;
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return matched;
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Search text (within context) for prog_.
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool BitState::Search(const StringPiece& text, const StringPiece& context,
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      bool anchored, bool longest,
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      StringPiece* submatch, int nsubmatch) {
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Search parameters.
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  text_ = text;
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  context_ = context;
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (context_.begin() == NULL)
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    context_ = text;
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (prog_->anchor_start() && context_.begin() != text.begin())
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (prog_->anchor_end() && context_.end() != text.end())
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  anchored_ = anchored || prog_->anchor_start();
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  longest_ = longest || prog_->anchor_end();
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  endmatch_ = prog_->anchor_end();
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  submatch_ = submatch;
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  nsubmatch_ = nsubmatch;
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < nsubmatch_; i++)
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    submatch_[i] = NULL;
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Allocate scratch space.
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  visited_ = new uint32[nvisited_];
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  memset(visited_, 0, nvisited_*sizeof visited_[0]);
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // VLOG(0) << "nvisited_ = " << nvisited_;
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ncap_ = 2*nsubmatch;
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (ncap_ < 2)
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ncap_ = 2;
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  cap_ = new const char*[ncap_];
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  memset(cap_, 0, ncap_*sizeof cap_[0]);
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  maxjob_ = 256;
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  job_ = new Job[maxjob_];
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Anchored search must start at text.begin().
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (anchored_) {
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    cap_[0] = text.begin();
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return TrySearch(prog_->start(), text.begin());
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Unanchored search, starting from each possible text position.
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Notice that we have to try the empty string at the end of
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the text, so the loop condition is p <= text.end(), not p < text.end().
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This looks like it's quadratic in the size of the text,
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // but we are not clearing visited_ between calls to TrySearch,
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // so no work is duplicated and it ends up still being linear.
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (const char* p = text.begin(); p <= text.end(); p++) {
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    cap_[0] = p;
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return true;
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Bit-state search.
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Prog::SearchBitState(const StringPiece& text,
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          const StringPiece& context,
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          Anchor anchor,
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          MatchKind kind,
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          StringPiece* match,
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          int nmatch) {
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If full match, we ask for an anchored longest match
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and then check that match[0] == text.
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // So make sure match[0] exists.
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  StringPiece sp0;
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (kind == kFullMatch) {
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    anchor = kAnchored;
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (nmatch < 1) {
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      match = &sp0;
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nmatch = 1;
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Run the search.
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  BitState b(this);
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool anchored = anchor == kAnchored;
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool longest = kind != kFirstMatch;
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!b.Search(text, context, anchored, longest, match, nmatch))
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (kind == kFullMatch && match[0].end() != text.end())
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace re2
379