12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style
32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file.
42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Tested by search_test.cc.
62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Prog::SearchNFA, an NFA search.
82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This is an actual NFA like the theorists talk about,
92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// not the pseudo-NFA found in backtracking regexp implementations.
102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// IMPLEMENTATION
122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// which is a variant of the one described in Thompson's 1968 CACM paper.
152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// See http://swtch.com/~rsc/regexp/ for various history.  The main feature
162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// over the DFA implementation is that it tracks submatch boundaries.
172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// When the choice of submatch boundaries is ambiguous, this particular
192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// implementation makes the same choices that traditional backtracking
202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// implementations (in particular, Perl and PCRE) do.
212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// time in the length of the input.
232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson//
242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Like Thompson's original machine and like the DFA implementation, this
252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// implementation notices a match only once it is one byte past it.
262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prog.h"
282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/regexp.h"
292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/sparse_array.h"
302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/sparse_set.h"
312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 {
332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass NFA {
352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public:
362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  NFA(Prog* prog);
372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  ~NFA();
382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Searches for a matching string.
402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //   * If anchored is true, only considers matches starting at offset.
412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //     Otherwise finds lefmost match at or after offset.
422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //   * If longest is true, returns the longest match starting
432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //     at the chosen start point.  Otherwise returns the so-called
442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //     left-biased match, the one traditional backtracking engines
452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //     (like Perl and PCRE) find.
462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Records submatch boundaries in submatch[1..nsubmatch-1].
472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Submatch[0] is the entire match.  When there is a choice in
482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // which text matches each subexpression, the submatch boundaries
492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // are chosen to match what a backtracking implementation would choose.
502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool Search(const StringPiece& text, const StringPiece& context,
512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              bool anchored, bool longest,
522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              StringPiece* submatch, int nsubmatch);
532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  static const int Debug = 0;
552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private:
572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  struct Thread {
582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    union {
592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      int id;
602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      Thread* next;  // when on free list
612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    };
622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    const char** capture;
632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  };
642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // State for explicit stack in AddToThreadq.
662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  struct AddState {
672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id;           // Inst to process
682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int j;
692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing ip
702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    AddState()
722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      : id(0), j(-1), cap_j(NULL) {}
732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    explicit AddState(int id)
742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      : id(id), j(-1), cap_j(NULL) {}
752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    AddState(int id, const char* cap_j, int j)
762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      : id(id), j(j), cap_j(cap_j) {}
772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  };
782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Threadq is a list of threads.  The list is sorted by the order
802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // in which Perl would explore that particular state -- the earlier
812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // choices appear earlier in the list.
822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  typedef SparseArray<Thread*> Threadq;
832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  inline Thread* AllocThread();
852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  inline void FreeThread(Thread*);
862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Add r (or its children, following unlabeled arrows)
882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // to the workqueue q with associated capture info.
892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  void AddToThreadq(Threadq* q, int id, int flag,
902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                    const char* p, const char** capture);
912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Run runq on byte c, appending new states to nextq.
932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Updates matched_ and match_ as new, better matches are found.
942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // p is position of the next byte (the one after c)
952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // in the input string, used when processing capturing parens.
962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // flag is the bitwise or of Bol, Eol, etc., specifying whether
972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // ^, $ and \b match the current input point (after c).
982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Returns text version of capture information, for debugging.
1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string FormatCapture(const char** capture);
1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  inline void CopyCapture(const char** dst, const char** src);
1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Computes whether all matches must begin with the same first
1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // byte, and if so, returns that byte.  If not, returns -1.
1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int ComputeFirstByte();
1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Prog* prog_;          // underlying program
1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int start_;           // start instruction in program
1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int ncapture_;        // number of submatches to track
1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool longest_;        // whether searching for longest match
1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool endmatch_;       // whether match must end at text.end()
1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const char* btext_;   // beginning of text being matched (for FormatSubmatch)
1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const char* etext_;   // end of text being matched (for endmatch_)
1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Threadq q0_, q1_;     // pre-allocated for Search.
1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const char** match_;  // best match so far
1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bool matched_;        // any match so far?
1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  AddState* astack_;    // pre-allocated for AddToThreadq
1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int nastack_;
1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int first_byte_;      // required first byte for match, or -1 if none
1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Thread* free_threads_;  // free list
1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DISALLOW_EVIL_CONSTRUCTORS(NFA);
1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson};
1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonNFA::NFA(Prog* prog) {
1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  prog_ = prog;
1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  start_ = prog->start();
1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  ncapture_ = 0;
1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  longest_ = false;
1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  endmatch_ = false;
1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  btext_ = NULL;
1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  etext_ = NULL;
1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  q0_.resize(prog_->size());
1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  q1_.resize(prog_->size());
1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  nastack_ = 2*prog_->size();
1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  astack_ = new AddState[nastack_];
1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  match_ = NULL;
1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  matched_ = false;
1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  free_threads_ = NULL;
1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  first_byte_ = ComputeFirstByte();
1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonNFA::~NFA() {
1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  delete[] match_;
1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  delete[] astack_;
1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Thread* next;
1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (Thread* t = free_threads_; t; t = next) {
1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    next = t->next;
1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    delete[] t->capture;
1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    delete t;
1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid NFA::FreeThread(Thread *t) {
1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (t == NULL)
1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return;
1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  t->next = free_threads_;
1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  free_threads_ = t;
1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonNFA::Thread* NFA::AllocThread() {
1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Thread* t = free_threads_;
1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (t == NULL) {
1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    t = new Thread;
1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    t->capture = new const char*[ncapture_];
1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return t;
1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  free_threads_ = t->next;
1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return t;
1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid NFA::CopyCapture(const char** dst, const char** src) {
1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < ncapture_; i+=2) {
1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    dst[i] = src[i];
1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    dst[i+1] = src[i+1];
1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Follows all empty arrows from r and enqueues all the states reached.
1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The pointer p is the current input position, and m is the
1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// current set of match boundaries.
1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid NFA::AddToThreadq(Threadq* q, int id0, int flag,
1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                       const char* p, const char** capture) {
1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (id0 == 0)
1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return;
1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Astack_ is pre-allocated to avoid resize operations.
1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // It has room for 2*prog_->size() entries, which is enough:
1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Each inst in prog can be processed at most once,
1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // pushing at most two entries on stk.
1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int nstk = 0;
1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  AddState* stk = astack_;
1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  stk[nstk++] = AddState(id0);
1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  while (nstk > 0) {
2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    DCHECK_LE(nstk, nastack_);
2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    const AddState& a = stk[--nstk];
2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (a.j >= 0)
2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      capture[a.j] = a.cap_j;
2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id = a.id;
2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (id == 0)
2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (q->has_index(id)) {
2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (Debug)
2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        fprintf(stderr, "  [%d%s]\n", id, FormatCapture(capture).c_str());
2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Create entry in q no matter what.  We might fill it in below,
2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // or we might not.  Even if not, it is necessary to have it,
2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // so that we don't revisit r during the recursion.
2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    q->set_new(id, NULL);
2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Thread** tp = &q->find(id)->second;
2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int j;
2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Thread* t;
2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prog::Inst* ip = prog_->inst(id);
2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    switch (ip->opcode()) {
2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    default:
2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstFail:
2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstAltMatch:
2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Save state; will pick up at next byte.
2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      t = AllocThread();
2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      t->id = id;
2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      CopyCapture(t->capture, capture);
2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      *tp = t;
2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // fall through
2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstAlt:
2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Explore alternatives.
2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      stk[nstk++] = AddState(ip->out1());
2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      stk[nstk++] = AddState(ip->out());
2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstNop:
2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Continue on.
2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      stk[nstk++] = AddState(ip->out());
2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstCapture:
2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if ((j=ip->cap()) < ncapture_) {
2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Push a dummy whose only job is to restore capture[j]
2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // once we finish exploring this possibility.
2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        stk[nstk++] = AddState(0, capture[j], j);
2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Record capture.
2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        capture[j] = p;
2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      stk[nstk++] = AddState(ip->out());
2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstMatch:
2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstByteRange:
2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Save state; will pick up at next byte.
2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      t = AllocThread();
2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      t->id = id;
2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      CopyCapture(t->capture, capture);
2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      *tp = t;
2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (Debug)
2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstEmptyWidth:
2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Continue on if we have all the right flag bits.
2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (ip->empty() & ~flag)
2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      stk[nstk++] = AddState(ip->out());
2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Run runq on byte c, appending new states to nextq.
2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Updates match as new, better matches are found.
2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// p is position of the byte c in the input string,
2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// used when processing capturing parens.
2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// flag is the bitwise or of Bol, Eol, etc., specifying whether
2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// ^, $ and \b match the current input point (after c).
2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Frees all the threads on runq.
2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If there is a shortcut to the end, returns that shortcut.
2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  nextq->clear();
2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Thread* t = i->second;
2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (t == NULL)
2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (longest_) {
3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Can skip any threads started after our current best match.
3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (matched_ && match_[0] < t->capture[0]) {
3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        FreeThread(t);
3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        continue;
3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id = t->id;
3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prog::Inst* ip = prog_->inst(id);
3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    switch (ip->opcode()) {
3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      default:
3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Should only see the values handled below.
3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstByteRange:
3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->Matches(c))
3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstAltMatch:
3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (i != runq->begin())
3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          break;
3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // The match is ours if we want it.
3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->greedy(prog_) || longest_) {
3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          CopyCapture((const char**)match_, t->capture);
3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          FreeThread(t);
3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          for (++i; i != runq->end(); ++i)
3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            FreeThread(i->second);
3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          runq->clear();
3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          matched_ = true;
3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          if (ip->greedy(prog_))
3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            return ip->out1();
3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          return ip->out();
3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstMatch:
3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (endmatch_ && p != etext_)
3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          break;
3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        const char* old = t->capture[1];  // previous end pointer
3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        t->capture[1] = p;
3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (longest_) {
3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // Leftmost-longest mode: save this match only if
3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // it is either farther to the left or at the same
3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // point but longer than an existing match.
3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          if (!matched_ || t->capture[0] < match_[0] ||
3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            CopyCapture((const char**)match_, t->capture);
3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        } else {
3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // Leftmost-biased mode: this match is by definition
3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // better than what we've already found (see next line).
3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          CopyCapture((const char**)match_, t->capture);
3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // Cut off the threads that can only find matches
3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // worse than the one we just found: don't run the
3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // rest of the current Threadq.
3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          t->capture[0] = old;
3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          FreeThread(t);
3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          for (++i; i != runq->end(); ++i)
3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            FreeThread(i->second);
3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          runq->clear();
3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          matched_ = true;
3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          return 0;
3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        t->capture[0] = old;
3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        matched_ = true;
3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    FreeThread(t);
3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  runq->clear();
3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return 0;
3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring NFA::FormatCapture(const char** capture) {
3792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string s;
3802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < ncapture_; i+=2) {
3822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (capture[i] == NULL)
3832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      StringAppendF(&s, "(?,?)");
3842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    else if (capture[i+1] == NULL)
3852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
3862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    else
3872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      StringAppendF(&s, "(%d,%d)",
3882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                    (int)(capture[i] - btext_),
3892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                    (int)(capture[i+1] - btext_));
3902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return s;
3922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns whether haystack contains needle's memory.
3952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
3962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return haystack.begin() <= needle.begin() &&
3972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson         haystack.end() >= needle.end();
3982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool NFA::Search(const StringPiece& text, const StringPiece& const_context,
4012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            bool anchored, bool longest,
4022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            StringPiece* submatch, int nsubmatch) {
4032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (start_ == 0)
4042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
4052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  StringPiece context = const_context;
4072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (context.begin() == NULL)
4082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    context = text;
4092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (!StringPieceContains(context, text)) {
4112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    LOG(FATAL) << "Bad args: context does not contain text "
4122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                << reinterpret_cast<const void*>(context.begin())
4132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                << "+" << context.size() << " "
4142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                << reinterpret_cast<const void*>(text.begin())
4152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                << "+" << text.size();
4162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
4172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
4182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (prog_->anchor_start() && context.begin() != text.begin())
4202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
4212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (prog_->anchor_end() && context.end() != text.end())
4222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
4232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  anchored |= prog_->anchor_start();
4242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (prog_->anchor_end()) {
4252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    longest = true;
4262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    endmatch_ = true;
4272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    etext_ = text.end();
4282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
4292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (nsubmatch < 0) {
4312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
4322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
4332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
4342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Save search parameters.
4362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  ncapture_ = 2*nsubmatch;
4372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  longest_ = longest;
4382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (nsubmatch == 0) {
4402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // We need to maintain match[0], both to distinguish the
4412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // longest match (if longest is true) and also to tell
4422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // whether we've seen any matches at all.
4432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    ncapture_ = 2;
4442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
4452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  match_ = new const char*[ncapture_];
4472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  matched_ = false;
4482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  memset(match_, 0, ncapture_*sizeof match_[0]);
4492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // For debugging prints.
4512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  btext_ = context.begin();
4522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (Debug) {
4542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
4552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            text.as_string().c_str(), context.as_string().c_str(), anchored,
4562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            longest);
4572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
4582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Set up search.
4602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Threadq* runq = &q0_;
4612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Threadq* nextq = &q1_;
4622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  runq->clear();
4632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  nextq->clear();
4642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  memset(&match_[0], 0, ncapture_*sizeof match_[0]);
4652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const char* bp = context.begin();
4662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int c = -1;
4672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int wasword = 0;
4682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (text.begin() > context.begin()) {
4702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    c = text.begin()[-1] & 0xFF;
4712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    wasword = Prog::IsWordChar(c);
4722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
4732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Loop over the text, stepping the machine.
4752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (const char* p = text.begin();; p++) {
4762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Check for empty-width specials.
4772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int flag = 0;
4782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // ^ and \A
4802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (p == context.begin())
4812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      flag |= kEmptyBeginText | kEmptyBeginLine;
4822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    else if (p <= context.end() && p[-1] == '\n')
4832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      flag |= kEmptyBeginLine;
4842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // $ and \z
4862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (p == context.end())
4872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      flag |= kEmptyEndText | kEmptyEndLine;
4882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    else if (p < context.end() && p[0] == '\n')
4892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      flag |= kEmptyEndLine;
4902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // \b and \B
4922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int isword = 0;
4932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (p < context.end())
4942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      isword = Prog::IsWordChar(p[0] & 0xFF);
4952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
4962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (isword != wasword)
4972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      flag |= kEmptyWordBoundary;
4982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    else
4992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      flag |= kEmptyNonWordBoundary;
5002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (Debug) {
5022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
5032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
5042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        Thread* t = i->second;
5052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (t == NULL)
5062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          continue;
5072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        fprintf(stderr, " %d%s", t->id,
5082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                FormatCapture((const char**)t->capture).c_str());
5092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
5102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      fprintf(stderr, "\n");
5112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
5122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Process previous character (waited until now to avoid
5142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // repeating the flag computation above).
5152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // This is a no-op the first time around the loop, because
5162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // runq is empty.
5172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id = Step(runq, nextq, c, flag, p-1);
5182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    DCHECK_EQ(runq->size(), 0);
5192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    swap(nextq, runq);
5202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    nextq->clear();
5212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (id != 0) {
5222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // We're done: full match ahead.
5232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      p = text.end();
5242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      for (;;) {
5252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        Prog::Inst* ip = prog_->inst(id);
5262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        switch (ip->opcode()) {
5272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          default:
5282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
5292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            break;
5302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          case kInstCapture:
5322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            match_[ip->cap()] = p;
5332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            id = ip->out();
5342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            continue;
5352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          case kInstNop:
5372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            id = ip->out();
5382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            continue;
5392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          case kInstMatch:
5412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            match_[1] = p;
5422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            matched_ = true;
5432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            break;
5442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          case kInstEmptyWidth:
5462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
5472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
5482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              break;
5492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            }
5502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            id = ip->out();
5512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            continue;
5522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
5532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
5542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
5552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
5562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
5572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (p > text.end())
5592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
5602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Start a new thread if there have not been any matches.
5622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // (No point in starting a new thread if there have been
5632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // matches, since it would be to the right of the match
5642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // we already found.)
5652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (!matched_ && (!anchored || p == text.begin())) {
5662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // If there's a required first byte for an unanchored search
5672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // and we're not in the middle of any possible matches,
5682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // use memchr to search for the byte quickly.
5692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
5702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          p < text.end() && (p[0] & 0xFF) != first_byte_) {
5712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        p = reinterpret_cast<const char*>(memchr(p, first_byte_,
5722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                                                 text.end() - p));
5732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (p == NULL) {
5742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          p = text.end();
5752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          isword = 0;
5762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        } else {
5772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          isword = Prog::IsWordChar(p[0] & 0xFF);
5782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
5792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        flag = Prog::EmptyFlags(context, p);
5802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
5812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Steal match storage (cleared but unused as of yet)
5832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // temporarily to hold match boundaries for new thread.
5842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      match_[0] = p;
5852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      AddToThreadq(runq, start_, flag, p, match_);
5862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      match_[0] = NULL;
5872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
5882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // If all the threads have died, stop early.
5902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (runq->size() == 0) {
5912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (Debug)
5922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        fprintf(stderr, "dead\n");
5932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      break;
5942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
5952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
5962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (p == text.end())
5972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      c = 0;
5982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    else
5992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      c = *p & 0xFF;
6002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    wasword = isword;
6012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Will run step(runq, nextq, c, ...) on next iteration.  See above.
6032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
6042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
6062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    FreeThread(i->second);
6072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (matched_) {
6092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (int i = 0; i < nsubmatch; i++)
6102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
6112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (Debug)
6122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      fprintf(stderr, "match (%d,%d)\n",
6132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              static_cast<int>(match_[0] - btext_),
6142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson              static_cast<int>(match_[1] - btext_));
6152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return true;
6162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
6172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  VLOG(1) << "No matches found";
6182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return false;
6192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
6202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Computes whether all successful matches have a common first byte,
6222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and if so, returns that byte.  If not, returns -1.
6232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint NFA::ComputeFirstByte() {
6242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (start_ == 0)
6252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return -1;
6262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int b = -1;  // first byte, not yet computed
6282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  typedef SparseSet Workq;
6302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Workq q(prog_->size());
6312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  q.insert(start_);
6322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
6332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id = *it;
6342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prog::Inst* ip = prog_->inst(id);
6352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    switch (ip->opcode()) {
6362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      default:
6372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
6382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
6392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstMatch:
6412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // The empty string matches: no first byte.
6422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        return -1;
6432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstByteRange:
6452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Must match only a single byte
6462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->lo() != ip->hi())
6472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          return -1;
6482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
6492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          return -1;
6502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // If we haven't seen any bytes yet, record it;
6512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // otherwise must match the one we saw before.
6522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (b == -1)
6532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          b = ip->lo();
6542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        else if (b != ip->lo())
6552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          return -1;
6562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
6572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstNop:
6592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstCapture:
6602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstEmptyWidth:
6612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Continue on.
6622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Ignore ip->empty() flags for kInstEmptyWidth
6632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // in order to be as conservative as possible
6642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // (assume all possible empty-width flags are true).
6652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->out())
6662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          q.insert(ip->out());
6672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
6682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstAlt:
6702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstAltMatch:
6712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        // Explore alternatives.
6722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->out())
6732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          q.insert(ip->out());
6742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (ip->out1())
6752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          q.insert(ip->out1());
6762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
6772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstFail:
6792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
6802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
6812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
6822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return b;
6832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
6842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool
6862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg::SearchNFA(const StringPiece& text, const StringPiece& context,
6872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                Anchor anchor, MatchKind kind,
6882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                StringPiece* match, int nmatch) {
6892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (NFA::Debug)
6902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Dump();
6912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
6922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  NFA nfa(this);
6932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  StringPiece sp;
6942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (kind == kFullMatch) {
6952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    anchor = kAnchored;
6962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (nmatch == 0) {
6972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      match = &sp;
6982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      nmatch = 1;
6992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
7002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
7012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
7022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
7032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (kind == kFullMatch && match[0].end() != text.end())
7042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
7052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return true;
7062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
7072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
7082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}  // namespace re2
7092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
710