15c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Copyright 2008 The RE2 Authors.  All Rights Reserved.
25c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Use of this source code is governed by a BSD-style
35c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// license that can be found in the LICENSE file.
45c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
55c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Tested by search_test.cc, exhaustive_test.cc, tester.cc
65c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
75c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Prog::SearchBitState is a regular expression search with submatch
85c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// tracking for small regular expressions and texts.  Like
95c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// testing/backtrack.cc, it allocates a bit vector with (length of
105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// text) * (length of prog) bits, to make sure it never explores the
115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// same (character position, instruction) state multiple times.  This
125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// limits the search to run in time linear in the length of the text.
135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)//
145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Unlike testing/backtrack.cc, SearchBitState is not recursive
155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// on the text.
165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)//
175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// SearchBitState is a fast replacement for the NFA code on small
185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// regexps and texts when SearchOnePass cannot be used.
195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include "re2/prog.h"
215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include "re2/regexp.h"
225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)namespace re2 {
245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)struct Job {
265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  int id;
275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  int arg;
285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  const char* p;
295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)};
3053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)
315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)class BitState {
325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) public:
335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  explicit BitState(Prog* prog);
345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  ~BitState();
3553e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)
3653e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  // The usual Search prototype.
3753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  // Can only call Search once per BitState.
3853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  bool Search(const StringPiece& text, const StringPiece& context,
3953e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)              bool anchored, bool longest,
4053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)              StringPiece* submatch, int nsubmatch);
4153e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)
4253e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles) private:
4353e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  inline bool ShouldVisit(int id, const char* p);
4453e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  void Push(int id, const char* p, int arg);
4553e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  bool GrowStack();
4653e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  bool TrySearch(int id, const char* p);
4753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)
4853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  // Search parameters
4953e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  Prog* prog_;              // program being run
5053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  StringPiece text_;        // text being searched
5153e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  StringPiece context_;     // greater context of text being searched
5253e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  bool anchored_;           // whether search is anchored at text.begin()
5353e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  bool longest_;            // whether search wants leftmost-longest match
5453e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  bool endmatch_;           // whether match must end at text.end()
5553e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  StringPiece *submatch_;   // submatches to fill in
5653e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  int nsubmatch_;           //   # of submatches to fill in
5753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)
5853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  // Search state
5953e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  const char** cap_;        // capture registers
6053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  int ncap_;
6153e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)
6253e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  static const int VisitedBits = 32;
6353e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
6453e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  int nvisited_;            //   # of words in bitmap
65926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)
6653e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  Job *job_;                // stack of text positions to explore
6753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  int njob_;
6853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  int maxjob_;
6953e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)};
7053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)
7153e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)BitState::BitState(Prog* prog)
7253e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)  : prog_(prog),
7353e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)    anchored_(false),
745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    longest_(false),
755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    endmatch_(false),
765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    submatch_(NULL),
775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    nsubmatch_(0),
7853e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)    cap_(NULL),
795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ncap_(0),
805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    visited_(NULL),
815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    nvisited_(0),
825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    job_(NULL),
835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    njob_(0),
845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    maxjob_(0) {
855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)BitState::~BitState() {
885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  delete[] visited_;
895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  delete[] job_;
905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  delete[] cap_;
915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Should the search visit the pair ip, p?
945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// If so, remember that it was visited so that the next time,
955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// we don't repeat the visit.
965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool BitState::ShouldVisit(int id, const char* p) {
975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  uint n = id * (text_.size() + 1) + (p - text_.begin());
985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return false;
1005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
1015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  return true;
1025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Grow the stack.
1055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool BitState::GrowStack() {
1065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // VLOG(0) << "Reallocate.";
1075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  maxjob_ *= 2;
1085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  Job* newjob = new Job[maxjob_];
1095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  memmove(newjob, job_, njob_*sizeof job_[0]);
1105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  delete[] job_;
1115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  job_ = newjob;
1125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (njob_ >= maxjob_) {
1135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    LOG(DFATAL) << "Job stack overflow.";
1145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return false;
1155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  }
1165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  return true;
1175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Push the triple (id, p, arg) onto the stack, growing it if necessary.
1205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void BitState::Push(int id, const char* p, int arg) {
1215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (njob_ >= maxjob_) {
1225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (!GrowStack())
1235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      return;
1245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  }
1255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  int op = prog_->inst(id)->opcode();
1265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (op == kInstFail)
1275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return;
1285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // Only check ShouldVisit when arg == 0.
1305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // When arg > 0, we are continuing a previous visit.
1315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (arg == 0 && !ShouldVisit(id, p))
1325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return;
1335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  Job* j = &job_[njob_++];
1355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  j->id = id;
1365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  j->p = p;
137926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)  j->arg = arg;
138926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)}
139926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)
14053e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)// Try a search from instruction id0 in state p0.
1415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Return whether it succeeded.
1425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool BitState::TrySearch(int id0, const char* p0) {
1435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  bool matched = false;
1445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  const char* end = text_.end();
1455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  njob_ = 0;
146926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)  Push(id0, p0, 0);
1475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  while (njob_ > 0) {
1485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // Pop job off stack.
1495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    --njob_;
150926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)    int id = job_[njob_].id;
1515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    const char* p = job_[njob_].p;
152926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)    int arg = job_[njob_].arg;
153926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)
154926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)    // Optimization: rather than push and pop,
155926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)    // code that is going to Push and continue
1565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // the loop simply updates ip, p, and arg
157926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)    // and jumps to CheckAndLoop.  We have to
158926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)    // do the ShouldVisit check that Push
1595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // would have, but we avoid the stack
1605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // manipulation.
1615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (0) {
1625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    CheckAndLoop:
1635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      if (!ShouldVisit(id, p))
1645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        continue;
1655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
1665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // Visit ip, p.
1685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // VLOG(0) << "Job: " << ip->id() << " "
1695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    //         << (p - text_.begin()) << " " << arg;
1705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    Prog::Inst* ip = prog_->inst(id);
1715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    switch (ip->opcode()) {
172926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)      case kInstFail:
1735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      default:
1745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
1755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        return false;
1765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      case kInstAlt:
1785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Cannot just
1795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        //   Push(ip->out1(), p, 0);
1805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        //   Push(ip->out(), p, 0);
1815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // If, during the processing of ip->out(), we encounter
1825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // ip->out1() via another path, we want to process it then.
1835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Pushing it here will inhibit that.  Instead, re-push
1845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // ip with arg==1 as a reminder to push ip->out1() later.
1855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        switch (arg) {
1865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          case 0:
1875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            Push(id, p, 1);  // come back when we're done
1885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            id = ip->out();
1895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            goto CheckAndLoop;
1905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          case 1:
1925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // Finished ip->out(); try ip->out1().
1935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            arg = 0;
1945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            id = ip->out1();
1955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            goto CheckAndLoop;
1965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
1975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
1985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        continue;
1995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      case kInstAltMatch:
2015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // One opcode is byte range; the other leads to match.
2025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (ip->greedy(prog_)) {
2035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          // out1 is the match
2045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          Push(ip->out1(), p, 0);
2055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          id = ip->out1();
2065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          p = end;
2075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          goto CheckAndLoop;
2085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // out is the match - non-greedy
2105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        Push(ip->out(), end, 0);
2115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        id = ip->out();
2125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        goto CheckAndLoop;
2135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      case kInstByteRange: {
2155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int c = -1;
2165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (p < end)
2175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          c = *p & 0xFF;
2185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (ip->Matches(c)) {
2195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          id = ip->out();
2205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          p++;
2215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          goto CheckAndLoop;
2225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        continue;
2245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      }
2255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      case kInstCapture:
2275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        switch (arg) {
2285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          case 0:
2295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            if (0 <= ip->cap() && ip->cap() < ncap_) {
2305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)              // Capture p to register, but save old value.
2315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)              Push(id, cap_[ip->cap()], 1);  // come back when we're done
2325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)              cap_[ip->cap()] = p;
2335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
2345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // Continue on.
2355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            id = ip->out();
2365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            goto CheckAndLoop;
2375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          case 1:
2385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // Finished ip->out(); restore the old value.
2395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            cap_[ip->cap()] = p;
2405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            continue;
2415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
2435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        continue;
2445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      case kInstEmptyWidth:
2465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
2475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          continue;
2485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        id = ip->out();
2495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        goto CheckAndLoop;
2505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      case kInstNop:
2525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        id = ip->out();
2535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        goto CheckAndLoop;
2545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      case kInstMatch: {
2565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (endmatch_ && p != text_.end())
2575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          continue;
2585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // VLOG(0) << "Found match.";
2605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // We found a match.  If the caller doesn't care
2615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // where the match is, no point going further.
2625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (nsubmatch_ == 0)
2635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          return true;
2645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Record best match so far.
2665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Only need to check end point, because this entire
2675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // call is only considering one start position.
2685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        matched = true;
2695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        cap_[1] = p;
2705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (submatch_[0].data() == NULL ||
2715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            (longest_ && p > submatch_[0].end())) {
2725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          for (int i = 0; i < nsubmatch_; i++)
2735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
274926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)        }
275926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)
276926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)        // If going for first match, we're done.
277926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)        if (!longest_)
278926b001d589ce2f10facb93dd4b87578ea35a855Torne (Richard Coles)          return true;
2795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // If we used the entire text, no longer match is possible.
2815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (p == text_.end())
2825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)          return true;
2835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Otherwise, continue on in hope of a longer match.
2855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        continue;
2865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      }
2875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
2885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  }
2895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  return matched;
2905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
2915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Search text (within context) for prog_.
2935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool BitState::Search(const StringPiece& text, const StringPiece& context,
2945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                      bool anchored, bool longest,
2955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                      StringPiece* submatch, int nsubmatch) {
2965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // Search parameters.
2975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  text_ = text;
2985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  context_ = context;
2995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (context_.begin() == NULL)
3005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    context_ = text;
3015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (prog_->anchor_start() && context_.begin() != text.begin())
3025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return false;
3035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (prog_->anchor_end() && context_.end() != text.end())
3045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return false;
3055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  anchored_ = anchored || prog_->anchor_start();
3065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  longest_ = longest || prog_->anchor_end();
3075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  endmatch_ = prog_->anchor_end();
3085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  submatch_ = submatch;
3095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  nsubmatch_ = nsubmatch;
3105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  for (int i = 0; i < nsubmatch_; i++)
3115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    submatch_[i] = NULL;
3125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // Allocate scratch space.
3145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
3155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  visited_ = new uint32[nvisited_];
3165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  memset(visited_, 0, nvisited_*sizeof visited_[0]);
3175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // VLOG(0) << "nvisited_ = " << nvisited_;
3185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  ncap_ = 2*nsubmatch;
3205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (ncap_ < 2)
3215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ncap_ = 2;
3225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  cap_ = new const char*[ncap_];
3235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  memset(cap_, 0, ncap_*sizeof cap_[0]);
3245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  maxjob_ = 256;
3265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  job_ = new Job[maxjob_];
3275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // Anchored search must start at text.begin().
3295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (anchored_) {
3305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    cap_[0] = text.begin();
3315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return TrySearch(prog_->start(), text.begin());
3325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  }
3335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // Unanchored search, starting from each possible text position.
3355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // Notice that we have to try the empty string at the end of
3365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // the text, so the loop condition is p <= text.end(), not p < text.end().
3375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // This looks like it's quadratic in the size of the text,
3385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // but we are not clearing visited_ between calls to TrySearch,
3395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // so no work is duplicated and it ends up still being linear.
3405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  for (const char* p = text.begin(); p <= text.end(); p++) {
3415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    cap_[0] = p;
3425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
3435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      return true;
3445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  }
3455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  return false;
3465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
3475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// Bit-state search.
3495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)bool Prog::SearchBitState(const StringPiece& text,
3505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                          const StringPiece& context,
3515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                          Anchor anchor,
3525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                          MatchKind kind,
3535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                          StringPiece* match,
3545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                          int nmatch) {
3555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // If full match, we ask for an anchored longest match
3565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // and then check that match[0] == text.
3575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // So make sure match[0] exists.
3585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  StringPiece sp0;
3595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (kind == kFullMatch) {
3605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    anchor = kAnchored;
3615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (nmatch < 1) {
3625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      match = &sp0;
3635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)      nmatch = 1;
3645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
3655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  }
3665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  // Run the search.
3685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  BitState b(this);
3695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  bool anchored = anchor == kAnchored;
3705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  bool longest = kind != kFirstMatch;
3715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (!b.Search(text, context, anchored, longest, match, nmatch))
3725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return false;
3735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  if (kind == kFullMatch && match[0].end() != text.end())
3745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    return false;
3755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)  return true;
3765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
3775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}  // namespace re2
3795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)