nfa.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Tested by search_test.cc.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Prog::SearchNFA, an NFA search.
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This is an actual NFA like the theorists talk about,
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// not the pseudo-NFA found in backtracking regexp implementations.
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// IMPLEMENTATION
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// which is a variant of the one described in Thompson's 1968 CACM paper.
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// See http://swtch.com/~rsc/regexp/ for various history.  The main feature
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// over the DFA implementation is that it tracks submatch boundaries.
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// When the choice of submatch boundaries is ambiguous, this particular
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// implementation makes the same choices that traditional backtracking
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// implementations (in particular, Perl and PCRE) do.
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// time in the length of the input.
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Like Thompson's original machine and like the DFA implementation, this
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// implementation notices a match only once it is one byte past it.
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/prog.h"
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/regexp.h"
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/sparse_array.h"
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/sparse_set.h"
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class NFA {
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NFA(Prog* prog);
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~NFA();
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Searches for a matching string.
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   * If anchored is true, only considers matches starting at offset.
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //     Otherwise finds lefmost match at or after offset.
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   * If longest is true, returns the longest match starting
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //     at the chosen start point.  Otherwise returns the so-called
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //     left-biased match, the one traditional backtracking engines
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //     (like Perl and PCRE) find.
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Records submatch boundaries in submatch[1..nsubmatch-1].
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Submatch[0] is the entire match.  When there is a choice in
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // which text matches each subexpression, the submatch boundaries
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // are chosen to match what a backtracking implementation would choose.
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool Search(const StringPiece& text, const StringPiece& context,
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              bool anchored, bool longest,
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              StringPiece* submatch, int nsubmatch);
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int Debug = 0;
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  struct Thread {
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    union {
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      int id;
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Thread* next;  // when on free list
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    };
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const char** capture;
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // State for explicit stack in AddToThreadq.
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  struct AddState {
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int id;           // Inst to process
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int j;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing ip
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    AddState()
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      : id(0), j(-1), cap_j(NULL) {}
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    explicit AddState(int id)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      : id(id), j(-1), cap_j(NULL) {}
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    AddState(int id, const char* cap_j, int j)
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      : id(id), j(j), cap_j(cap_j) {}
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Threadq is a list of threads.  The list is sorted by the order
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in which Perl would explore that particular state -- the earlier
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // choices appear earlier in the list.
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef SparseArray<Thread*> Threadq;
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline Thread* AllocThread();
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline void FreeThread(Thread*);
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Add id (or its children, following unlabeled arrows)
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to the workqueue q with associated capture info.
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void AddToThreadq(Threadq* q, int id, int flag,
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    const char* p, const char** capture);
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Run runq on byte c, appending new states to nextq.
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Updates matched_ and match_ as new, better matches are found.
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // p is position of the next byte (the one after c)
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in the input string, used when processing capturing parens.
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // flag is the bitwise or of Bol, Eol, etc., specifying whether
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ^, $ and \b match the current input point (after c).
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns text version of capture information, for debugging.
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  string FormatCapture(const char** capture);
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline void CopyCapture(const char** dst, const char** src);
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Computes whether all matches must begin with the same first
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // byte, and if so, returns that byte.  If not, returns -1.
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int ComputeFirstByte();
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog* prog_;          // underlying program
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int start_;           // start instruction in program
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int ncapture_;        // number of submatches to track
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool longest_;        // whether searching for longest match
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool endmatch_;       // whether match must end at text.end()
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* btext_;   // beginning of text being matched (for FormatSubmatch)
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* etext_;   // end of text being matched (for endmatch_)
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Threadq q0_, q1_;     // pre-allocated for Search.
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char** match_;  // best match so far
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool matched_;        // any match so far?
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  AddState* astack_;    // pre-allocated for AddToThreadq
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int nastack_;
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int first_byte_;      // required first byte for match, or -1 if none
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Thread* free_threads_;  // free list
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_EVIL_CONSTRUCTORS(NFA);
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)NFA::NFA(Prog* prog) {
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_ = prog;
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  start_ = prog->start();
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ncapture_ = 0;
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  longest_ = false;
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  endmatch_ = false;
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  btext_ = NULL;
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  etext_ = NULL;
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  q0_.resize(prog_->size());
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  q1_.resize(prog_->size());
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  nastack_ = 2*prog_->size();
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  astack_ = new AddState[nastack_];
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  match_ = NULL;
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  matched_ = false;
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  free_threads_ = NULL;
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  first_byte_ = ComputeFirstByte();
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)NFA::~NFA() {
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete[] match_;
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete[] astack_;
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Thread* next;
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (Thread* t = free_threads_; t; t = next) {
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    next = t->next;
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    delete[] t->capture;
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    delete t;
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void NFA::FreeThread(Thread *t) {
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (t == NULL)
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  t->next = free_threads_;
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  free_threads_ = t;
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)NFA::Thread* NFA::AllocThread() {
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Thread* t = free_threads_;
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (t == NULL) {
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    t = new Thread;
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    t->capture = new const char*[ncapture_];
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return t;
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  free_threads_ = t->next;
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return t;
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void NFA::CopyCapture(const char** dst, const char** src) {
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < ncapture_; i+=2) {
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    dst[i] = src[i];
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    dst[i+1] = src[i+1];
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Follows all empty arrows from id0 and enqueues all the states reached.
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The pointer p is the current input position, and m is the
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// current set of match boundaries.
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void NFA::AddToThreadq(Threadq* q, int id0, int flag,
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       const char* p, const char** capture) {
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (id0 == 0)
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Astack_ is pre-allocated to avoid resize operations.
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // It has room for 2*prog_->size() entries, which is enough:
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Each inst in prog can be processed at most once,
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // pushing at most two entries on stk.
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int nstk = 0;
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  AddState* stk = astack_;
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  stk[nstk++] = AddState(id0);
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (nstk > 0) {
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DCHECK_LE(nstk, nastack_);
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const AddState& a = stk[--nstk];
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (a.j >= 0)
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      capture[a.j] = a.cap_j;
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int id = a.id;
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (id == 0)
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (q->has_index(id)) {
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (Debug)
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        fprintf(stderr, "  [%d%s]\n", id, FormatCapture(capture).c_str());
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Create entry in q no matter what.  We might fill it in below,
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // or we might not.  Even if not, it is necessary to have it,
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // so that we don't revisit id0 during the recursion.
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    q->set_new(id, NULL);
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Thread** tp = &q->find(id)->second;
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int j;
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Thread* t;
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Prog::Inst* ip = prog_->inst(id);
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    switch (ip->opcode()) {
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    default:
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kInstFail:
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kInstAltMatch:
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Save state; will pick up at next byte.
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      t = AllocThread();
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      t->id = id;
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      CopyCapture(t->capture, capture);
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *tp = t;
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // fall through
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kInstAlt:
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Explore alternatives.
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      stk[nstk++] = AddState(ip->out1());
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      stk[nstk++] = AddState(ip->out());
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kInstNop:
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Continue on.
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      stk[nstk++] = AddState(ip->out());
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kInstCapture:
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if ((j=ip->cap()) < ncapture_) {
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Push a dummy whose only job is to restore capture[j]
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // once we finish exploring this possibility.
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        stk[nstk++] = AddState(0, capture[j], j);
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Record capture.
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        capture[j] = p;
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      stk[nstk++] = AddState(ip->out());
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kInstMatch:
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kInstByteRange:
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Save state; will pick up at next byte.
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      t = AllocThread();
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      t->id = id;
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      CopyCapture(t->capture, capture);
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *tp = t;
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (Debug)
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kInstEmptyWidth:
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Continue on if we have all the right flag bits.
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (ip->empty() & ~flag)
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      stk[nstk++] = AddState(ip->out());
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Run runq on byte c, appending new states to nextq.
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Updates match as new, better matches are found.
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// p is position of the byte c in the input string,
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// used when processing capturing parens.
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// flag is the bitwise or of Bol, Eol, etc., specifying whether
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ^, $ and \b match the current input point (after c).
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Frees all the threads on runq.
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If there is a shortcut to the end, returns that shortcut.
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  nextq->clear();
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Thread* t = i->second;
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (t == NULL)
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      continue;
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (longest_) {
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Can skip any threads started after our current best match.
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (matched_ && match_[0] < t->capture[0]) {
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        FreeThread(t);
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        continue;
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int id = t->id;
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Prog::Inst* ip = prog_->inst(id);
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    switch (ip->opcode()) {
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      default:
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Should only see the values handled below.
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstByteRange:
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->Matches(c))
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstAltMatch:
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (i != runq->begin())
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          break;
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // The match is ours if we want it.
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->greedy(prog_) || longest_) {
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          CopyCapture((const char**)match_, t->capture);
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          FreeThread(t);
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          for (++i; i != runq->end(); ++i)
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            FreeThread(i->second);
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          runq->clear();
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          matched_ = true;
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          if (ip->greedy(prog_))
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            return ip->out1();
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return ip->out();
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstMatch:
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (endmatch_ && p != etext_)
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          break;
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        const char* old = t->capture[1];  // previous end pointer
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        t->capture[1] = p;
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (longest_) {
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Leftmost-longest mode: save this match only if
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // it is either farther to the left or at the same
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // point but longer than an existing match.
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          if (!matched_ || t->capture[0] < match_[0] ||
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            CopyCapture((const char**)match_, t->capture);
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else {
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Leftmost-biased mode: this match is by definition
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // better than what we've already found (see next line).
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          CopyCapture((const char**)match_, t->capture);
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // Cut off the threads that can only find matches
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // worse than the one we just found: don't run the
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          // rest of the current Threadq.
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          t->capture[0] = old;
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          FreeThread(t);
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          for (++i; i != runq->end(); ++i)
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            FreeThread(i->second);
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          runq->clear();
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          matched_ = true;
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return 0;
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        t->capture[0] = old;
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        matched_ = true;
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    FreeThread(t);
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  runq->clear();
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return 0;
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)string NFA::FormatCapture(const char** capture) {
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  string s;
3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < ncapture_; i+=2) {
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (capture[i] == NULL)
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      StringAppendF(&s, "(?,?)");
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    else if (capture[i+1] == NULL)
3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    else
3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      StringAppendF(&s, "(%d,%d)",
3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    (int)(capture[i] - btext_),
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                    (int)(capture[i+1] - btext_));
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return s;
3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns whether haystack contains needle's memory.
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return haystack.begin() <= needle.begin() &&
3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)         haystack.end() >= needle.end();
3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            bool anchored, bool longest,
4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            StringPiece* submatch, int nsubmatch) {
4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (start_ == 0)
4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  StringPiece context = const_context;
4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (context.begin() == NULL)
4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    context = text;
4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!StringPieceContains(context, text)) {
4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOG(FATAL) << "Bad args: context does not contain text "
4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                << reinterpret_cast<const void*>(context.begin())
4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                << "+" << context.size() << " "
4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                << reinterpret_cast<const void*>(text.begin())
4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                << "+" << text.size();
4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (prog_->anchor_start() && context.begin() != text.begin())
4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (prog_->anchor_end() && context.end() != text.end())
4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  anchored |= prog_->anchor_start();
4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (prog_->anchor_end()) {
4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    longest = true;
4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    endmatch_ = true;
4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    etext_ = text.end();
4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (nsubmatch < 0) {
4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Save search parameters.
4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ncapture_ = 2*nsubmatch;
4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  longest_ = longest;
4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (nsubmatch == 0) {
4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We need to maintain match[0], both to distinguish the
4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // longest match (if longest is true) and also to tell
4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // whether we've seen any matches at all.
4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ncapture_ = 2;
4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  match_ = new const char*[ncapture_];
4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  matched_ = false;
4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  memset(match_, 0, ncapture_*sizeof match_[0]);
4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // For debugging prints.
4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  btext_ = context.begin();
4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (Debug) {
4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            text.as_string().c_str(), context.as_string().c_str(), anchored,
4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            longest);
4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set up search.
4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Threadq* runq = &q0_;
4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Threadq* nextq = &q1_;
4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  runq->clear();
4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  nextq->clear();
4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  memset(&match_[0], 0, ncapture_*sizeof match_[0]);
4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* bp = context.begin();
4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int c = -1;
4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int wasword = 0;
4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (text.begin() > context.begin()) {
4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    c = text.begin()[-1] & 0xFF;
4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    wasword = Prog::IsWordChar(c);
4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Loop over the text, stepping the machine.
4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (const char* p = text.begin();; p++) {
4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Check for empty-width specials.
4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int flag = 0;
4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // ^ and \A
4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (p == context.begin())
4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      flag |= kEmptyBeginText | kEmptyBeginLine;
4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    else if (p <= context.end() && p[-1] == '\n')
4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      flag |= kEmptyBeginLine;
4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // $ and \z
4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (p == context.end())
4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      flag |= kEmptyEndText | kEmptyEndLine;
4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    else if (p < context.end() && p[0] == '\n')
4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      flag |= kEmptyEndLine;
4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // \b and \B
4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int isword = 0;
4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (p < context.end())
4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      isword = Prog::IsWordChar(p[0] & 0xFF);
4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (isword != wasword)
4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      flag |= kEmptyWordBoundary;
4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    else
4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      flag |= kEmptyNonWordBoundary;
5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (Debug) {
5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        Thread* t = i->second;
5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (t == NULL)
5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          continue;
5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        fprintf(stderr, " %d%s", t->id,
5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                FormatCapture((const char**)t->capture).c_str());
5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      fprintf(stderr, "\n");
5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Process previous character (waited until now to avoid
5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // repeating the flag computation above).
5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // This is a no-op the first time around the loop, because
5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // runq is empty.
5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int id = Step(runq, nextq, c, flag, p-1);
5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DCHECK_EQ(runq->size(), 0);
5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    swap(nextq, runq);
5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    nextq->clear();
5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (id != 0) {
5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // We're done: full match ahead.
5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      p = text.end();
5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (;;) {
5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        Prog::Inst* ip = prog_->inst(id);
5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        switch (ip->opcode()) {
5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          default:
5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            break;
5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          case kInstCapture:
5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            match_[ip->cap()] = p;
5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            id = ip->out();
5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            continue;
5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          case kInstNop:
5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            id = ip->out();
5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            continue;
5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          case kInstMatch:
5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            match_[1] = p;
5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            matched_ = true;
5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            break;
5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          case kInstEmptyWidth:
5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              break;
5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            }
5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            id = ip->out();
5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            continue;
5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (p > text.end())
5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Start a new thread if there have not been any matches.
5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // (No point in starting a new thread if there have been
5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // matches, since it would be to the right of the match
5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // we already found.)
5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (!matched_ && (!anchored || p == text.begin())) {
5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // If there's a required first byte for an unanchored search
5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // and we're not in the middle of any possible matches,
5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // use memchr to search for the byte quickly.
5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          p < text.end() && (p[0] & 0xFF) != first_byte_) {
5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        p = reinterpret_cast<const char*>(memchr(p, first_byte_,
5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                                 text.end() - p));
5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (p == NULL) {
5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          p = text.end();
5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          isword = 0;
5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        } else {
5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          isword = Prog::IsWordChar(p[0] & 0xFF);
5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        flag = Prog::EmptyFlags(context, p);
5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Steal match storage (cleared but unused as of yet)
5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // temporarily to hold match boundaries for new thread.
5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      match_[0] = p;
5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      AddToThreadq(runq, start_, flag, p, match_);
5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      match_[0] = NULL;
5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // If all the threads have died, stop early.
5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (runq->size() == 0) {
5915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (Debug)
5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        fprintf(stderr, "dead\n");
5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (p == text.end())
5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      c = 0;
5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    else
5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      c = *p & 0xFF;
6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    wasword = isword;
6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Will run step(runq, nextq, c, ...) on next iteration.  See above.
6035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    FreeThread(i->second);
6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (matched_) {
6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (int i = 0; i < nsubmatch; i++)
6105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
6115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (Debug)
6125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      fprintf(stderr, "match (%d,%d)\n",
6135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              static_cast<int>(match_[0] - btext_),
6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              static_cast<int>(match_[1] - btext_));
6155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return true;
6165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  VLOG(1) << "No matches found";
6185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
6195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Computes whether all successful matches have a common first byte,
6225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and if so, returns that byte.  If not, returns -1.
6235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int NFA::ComputeFirstByte() {
6245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (start_ == 0)
6255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return -1;
6265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int b = -1;  // first byte, not yet computed
6285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef SparseSet Workq;
6305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Workq q(prog_->size());
6315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  q.insert(start_);
6325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
6335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int id = *it;
6345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Prog::Inst* ip = prog_->inst(id);
6355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    switch (ip->opcode()) {
6365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      default:
6375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
6385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
6395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstMatch:
6415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // The empty string matches: no first byte.
6425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return -1;
6435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstByteRange:
6455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Must match only a single byte
6465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->lo() != ip->hi())
6475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return -1;
6485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
6495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return -1;
6505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // If we haven't seen any bytes yet, record it;
6515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // otherwise must match the one we saw before.
6525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (b == -1)
6535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          b = ip->lo();
6545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        else if (b != ip->lo())
6555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return -1;
6565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
6575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstNop:
6595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstCapture:
6605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstEmptyWidth:
6615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Continue on.
6625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Ignore ip->empty() flags for kInstEmptyWidth
6635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // in order to be as conservative as possible
6645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // (assume all possible empty-width flags are true).
6655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->out())
6665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          q.insert(ip->out());
6675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
6685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstAlt:
6705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstAltMatch:
6715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // Explore alternatives.
6725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->out())
6735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          q.insert(ip->out());
6745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (ip->out1())
6755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          q.insert(ip->out1());
6765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
6775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      case kInstFail:
6795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        break;
6805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
6815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return b;
6835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool
6865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
6875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                Anchor anchor, MatchKind kind,
6885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                StringPiece* match, int nmatch) {
6895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (NFA::Debug)
6905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Dump();
6915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  NFA nfa(this);
6935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  StringPiece sp;
6945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (kind == kFullMatch) {
6955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    anchor = kAnchored;
6965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (nmatch == 0) {
6975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      match = &sp;
6985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nmatch = 1;
6995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
7005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
7015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
7025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
7035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (kind == kFullMatch && match[0].end() != text.end())
7045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
7055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
7065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
7075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace re2
7095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
710