1// Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Tested by search_test.cc.
6//
7// Prog::SearchNFA, an NFA search.
8// This is an actual NFA like the theorists talk about,
9// not the pseudo-NFA found in backtracking regexp implementations.
10//
11// IMPLEMENTATION
12//
13// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14// which is a variant of the one described in Thompson's 1968 CACM paper.
15// See http://swtch.com/~rsc/regexp/ for various history.  The main feature
16// over the DFA implementation is that it tracks submatch boundaries.
17//
18// When the choice of submatch boundaries is ambiguous, this particular
19// implementation makes the same choices that traditional backtracking
20// implementations (in particular, Perl and PCRE) do.
21// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22// time in the length of the input.
23//
24// Like Thompson's original machine and like the DFA implementation, this
25// implementation notices a match only once it is one byte past it.
26
27#include "re2/prog.h"
28#include "re2/regexp.h"
29#include "util/sparse_array.h"
30#include "util/sparse_set.h"
31
32namespace re2 {
33
34class NFA {
35 public:
36  NFA(Prog* prog);
37  ~NFA();
38
39  // Searches for a matching string.
40  //   * If anchored is true, only considers matches starting at offset.
41  //     Otherwise finds lefmost match at or after offset.
42  //   * If longest is true, returns the longest match starting
43  //     at the chosen start point.  Otherwise returns the so-called
44  //     left-biased match, the one traditional backtracking engines
45  //     (like Perl and PCRE) find.
46  // Records submatch boundaries in submatch[1..nsubmatch-1].
47  // Submatch[0] is the entire match.  When there is a choice in
48  // which text matches each subexpression, the submatch boundaries
49  // are chosen to match what a backtracking implementation would choose.
50  bool Search(const StringPiece& text, const StringPiece& context,
51              bool anchored, bool longest,
52              StringPiece* submatch, int nsubmatch);
53
54  static const int Debug = 0;
55
56 private:
57  struct Thread {
58    union {
59      int id;
60      Thread* next;  // when on free list
61    };
62    const char** capture;
63  };
64
65  // State for explicit stack in AddToThreadq.
66  struct AddState {
67    int id;           // Inst to process
68    int j;
69    const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing ip
70
71    AddState()
72      : id(0), j(-1), cap_j(NULL) {}
73    explicit AddState(int id)
74      : id(id), j(-1), cap_j(NULL) {}
75    AddState(int id, const char* cap_j, int j)
76      : id(id), j(j), cap_j(cap_j) {}
77  };
78
79  // Threadq is a list of threads.  The list is sorted by the order
80  // in which Perl would explore that particular state -- the earlier
81  // choices appear earlier in the list.
82  typedef SparseArray<Thread*> Threadq;
83
84  inline Thread* AllocThread();
85  inline void FreeThread(Thread*);
86
87  // Add id (or its children, following unlabeled arrows)
88  // to the workqueue q with associated capture info.
89  void AddToThreadq(Threadq* q, int id, int flag,
90                    const char* p, const char** capture);
91
92  // Run runq on byte c, appending new states to nextq.
93  // Updates matched_ and match_ as new, better matches are found.
94  // p is position of the next byte (the one after c)
95  // in the input string, used when processing capturing parens.
96  // flag is the bitwise or of Bol, Eol, etc., specifying whether
97  // ^, $ and \b match the current input point (after c).
98  inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
99
100  // Returns text version of capture information, for debugging.
101  string FormatCapture(const char** capture);
102
103  inline void CopyCapture(const char** dst, const char** src);
104
105  // Computes whether all matches must begin with the same first
106  // byte, and if so, returns that byte.  If not, returns -1.
107  int ComputeFirstByte();
108
109  Prog* prog_;          // underlying program
110  int start_;           // start instruction in program
111  int ncapture_;        // number of submatches to track
112  bool longest_;        // whether searching for longest match
113  bool endmatch_;       // whether match must end at text.end()
114  const char* btext_;   // beginning of text being matched (for FormatSubmatch)
115  const char* etext_;   // end of text being matched (for endmatch_)
116  Threadq q0_, q1_;     // pre-allocated for Search.
117  const char** match_;  // best match so far
118  bool matched_;        // any match so far?
119  AddState* astack_;    // pre-allocated for AddToThreadq
120  int nastack_;
121  int first_byte_;      // required first byte for match, or -1 if none
122
123  Thread* free_threads_;  // free list
124
125  DISALLOW_EVIL_CONSTRUCTORS(NFA);
126};
127
128NFA::NFA(Prog* prog) {
129  prog_ = prog;
130  start_ = prog->start();
131  ncapture_ = 0;
132  longest_ = false;
133  endmatch_ = false;
134  btext_ = NULL;
135  etext_ = NULL;
136  q0_.resize(prog_->size());
137  q1_.resize(prog_->size());
138  nastack_ = 2*prog_->size();
139  astack_ = new AddState[nastack_];
140  match_ = NULL;
141  matched_ = false;
142  free_threads_ = NULL;
143  first_byte_ = ComputeFirstByte();
144}
145
146NFA::~NFA() {
147  delete[] match_;
148  delete[] astack_;
149  Thread* next;
150  for (Thread* t = free_threads_; t; t = next) {
151    next = t->next;
152    delete[] t->capture;
153    delete t;
154  }
155}
156
157void NFA::FreeThread(Thread *t) {
158  if (t == NULL)
159    return;
160  t->next = free_threads_;
161  free_threads_ = t;
162}
163
164NFA::Thread* NFA::AllocThread() {
165  Thread* t = free_threads_;
166  if (t == NULL) {
167    t = new Thread;
168    t->capture = new const char*[ncapture_];
169    return t;
170  }
171  free_threads_ = t->next;
172  return t;
173}
174
175void NFA::CopyCapture(const char** dst, const char** src) {
176  for (int i = 0; i < ncapture_; i+=2) {
177    dst[i] = src[i];
178    dst[i+1] = src[i+1];
179  }
180}
181
182// Follows all empty arrows from id0 and enqueues all the states reached.
183// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
184// The pointer p is the current input position, and m is the
185// current set of match boundaries.
186void NFA::AddToThreadq(Threadq* q, int id0, int flag,
187                       const char* p, const char** capture) {
188  if (id0 == 0)
189    return;
190
191  // Astack_ is pre-allocated to avoid resize operations.
192  // It has room for 2*prog_->size() entries, which is enough:
193  // Each inst in prog can be processed at most once,
194  // pushing at most two entries on stk.
195
196  int nstk = 0;
197  AddState* stk = astack_;
198  stk[nstk++] = AddState(id0);
199
200  while (nstk > 0) {
201    DCHECK_LE(nstk, nastack_);
202    const AddState& a = stk[--nstk];
203    if (a.j >= 0)
204      capture[a.j] = a.cap_j;
205
206    int id = a.id;
207    if (id == 0)
208      continue;
209    if (q->has_index(id)) {
210      if (Debug)
211        fprintf(stderr, "  [%d%s]\n", id, FormatCapture(capture).c_str());
212      continue;
213    }
214
215    // Create entry in q no matter what.  We might fill it in below,
216    // or we might not.  Even if not, it is necessary to have it,
217    // so that we don't revisit id0 during the recursion.
218    q->set_new(id, NULL);
219
220    Thread** tp = &q->find(id)->second;
221    int j;
222    Thread* t;
223    Prog::Inst* ip = prog_->inst(id);
224    switch (ip->opcode()) {
225    default:
226      LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
227      break;
228
229    case kInstFail:
230      break;
231
232    case kInstAltMatch:
233      // Save state; will pick up at next byte.
234      t = AllocThread();
235      t->id = id;
236      CopyCapture(t->capture, capture);
237      *tp = t;
238      // fall through
239
240    case kInstAlt:
241      // Explore alternatives.
242      stk[nstk++] = AddState(ip->out1());
243      stk[nstk++] = AddState(ip->out());
244      break;
245
246    case kInstNop:
247      // Continue on.
248      stk[nstk++] = AddState(ip->out());
249      break;
250
251    case kInstCapture:
252      if ((j=ip->cap()) < ncapture_) {
253        // Push a dummy whose only job is to restore capture[j]
254        // once we finish exploring this possibility.
255        stk[nstk++] = AddState(0, capture[j], j);
256
257        // Record capture.
258        capture[j] = p;
259      }
260      stk[nstk++] = AddState(ip->out());
261      break;
262
263    case kInstMatch:
264    case kInstByteRange:
265      // Save state; will pick up at next byte.
266      t = AllocThread();
267      t->id = id;
268      CopyCapture(t->capture, capture);
269      *tp = t;
270      if (Debug)
271        fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
272      break;
273
274    case kInstEmptyWidth:
275      // Continue on if we have all the right flag bits.
276      if (ip->empty() & ~flag)
277        break;
278      stk[nstk++] = AddState(ip->out());
279      break;
280    }
281  }
282}
283
284// Run runq on byte c, appending new states to nextq.
285// Updates match as new, better matches are found.
286// p is position of the byte c in the input string,
287// used when processing capturing parens.
288// flag is the bitwise or of Bol, Eol, etc., specifying whether
289// ^, $ and \b match the current input point (after c).
290// Frees all the threads on runq.
291// If there is a shortcut to the end, returns that shortcut.
292int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
293  nextq->clear();
294
295  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
296    Thread* t = i->second;
297    if (t == NULL)
298      continue;
299
300    if (longest_) {
301      // Can skip any threads started after our current best match.
302      if (matched_ && match_[0] < t->capture[0]) {
303        FreeThread(t);
304        continue;
305      }
306    }
307
308    int id = t->id;
309    Prog::Inst* ip = prog_->inst(id);
310
311    switch (ip->opcode()) {
312      default:
313        // Should only see the values handled below.
314        LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
315        break;
316
317      case kInstByteRange:
318        if (ip->Matches(c))
319          AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
320        break;
321
322      case kInstAltMatch:
323        if (i != runq->begin())
324          break;
325        // The match is ours if we want it.
326        if (ip->greedy(prog_) || longest_) {
327          CopyCapture((const char**)match_, t->capture);
328          FreeThread(t);
329          for (++i; i != runq->end(); ++i)
330            FreeThread(i->second);
331          runq->clear();
332          matched_ = true;
333          if (ip->greedy(prog_))
334            return ip->out1();
335          return ip->out();
336        }
337        break;
338
339      case kInstMatch:
340        if (endmatch_ && p != etext_)
341          break;
342
343        const char* old = t->capture[1];  // previous end pointer
344        t->capture[1] = p;
345        if (longest_) {
346          // Leftmost-longest mode: save this match only if
347          // it is either farther to the left or at the same
348          // point but longer than an existing match.
349          if (!matched_ || t->capture[0] < match_[0] ||
350              (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
351            CopyCapture((const char**)match_, t->capture);
352        } else {
353          // Leftmost-biased mode: this match is by definition
354          // better than what we've already found (see next line).
355          CopyCapture((const char**)match_, t->capture);
356
357          // Cut off the threads that can only find matches
358          // worse than the one we just found: don't run the
359          // rest of the current Threadq.
360          t->capture[0] = old;
361          FreeThread(t);
362          for (++i; i != runq->end(); ++i)
363            FreeThread(i->second);
364          runq->clear();
365          matched_ = true;
366          return 0;
367        }
368        t->capture[0] = old;
369        matched_ = true;
370        break;
371    }
372    FreeThread(t);
373  }
374  runq->clear();
375  return 0;
376}
377
378string NFA::FormatCapture(const char** capture) {
379  string s;
380
381  for (int i = 0; i < ncapture_; i+=2) {
382    if (capture[i] == NULL)
383      StringAppendF(&s, "(?,?)");
384    else if (capture[i+1] == NULL)
385      StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
386    else
387      StringAppendF(&s, "(%d,%d)",
388                    (int)(capture[i] - btext_),
389                    (int)(capture[i+1] - btext_));
390  }
391  return s;
392}
393
394// Returns whether haystack contains needle's memory.
395static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
396  return haystack.begin() <= needle.begin() &&
397         haystack.end() >= needle.end();
398}
399
400bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
401            bool anchored, bool longest,
402            StringPiece* submatch, int nsubmatch) {
403  if (start_ == 0)
404    return false;
405
406  StringPiece context = const_context;
407  if (context.begin() == NULL)
408    context = text;
409
410  if (!StringPieceContains(context, text)) {
411    LOG(FATAL) << "Bad args: context does not contain text "
412                << reinterpret_cast<const void*>(context.begin())
413                << "+" << context.size() << " "
414                << reinterpret_cast<const void*>(text.begin())
415                << "+" << text.size();
416    return false;
417  }
418
419  if (prog_->anchor_start() && context.begin() != text.begin())
420    return false;
421  if (prog_->anchor_end() && context.end() != text.end())
422    return false;
423  anchored |= prog_->anchor_start();
424  if (prog_->anchor_end()) {
425    longest = true;
426    endmatch_ = true;
427    etext_ = text.end();
428  }
429
430  if (nsubmatch < 0) {
431    LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
432    return false;
433  }
434
435  // Save search parameters.
436  ncapture_ = 2*nsubmatch;
437  longest_ = longest;
438
439  if (nsubmatch == 0) {
440    // We need to maintain match[0], both to distinguish the
441    // longest match (if longest is true) and also to tell
442    // whether we've seen any matches at all.
443    ncapture_ = 2;
444  }
445
446  match_ = new const char*[ncapture_];
447  matched_ = false;
448  memset(match_, 0, ncapture_*sizeof match_[0]);
449
450  // For debugging prints.
451  btext_ = context.begin();
452
453  if (Debug) {
454    fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
455            text.as_string().c_str(), context.as_string().c_str(), anchored,
456            longest);
457  }
458
459  // Set up search.
460  Threadq* runq = &q0_;
461  Threadq* nextq = &q1_;
462  runq->clear();
463  nextq->clear();
464  memset(&match_[0], 0, ncapture_*sizeof match_[0]);
465  const char* bp = context.begin();
466  int c = -1;
467  int wasword = 0;
468
469  if (text.begin() > context.begin()) {
470    c = text.begin()[-1] & 0xFF;
471    wasword = Prog::IsWordChar(c);
472  }
473
474  // Loop over the text, stepping the machine.
475  for (const char* p = text.begin();; p++) {
476    // Check for empty-width specials.
477    int flag = 0;
478
479    // ^ and \A
480    if (p == context.begin())
481      flag |= kEmptyBeginText | kEmptyBeginLine;
482    else if (p <= context.end() && p[-1] == '\n')
483      flag |= kEmptyBeginLine;
484
485    // $ and \z
486    if (p == context.end())
487      flag |= kEmptyEndText | kEmptyEndLine;
488    else if (p < context.end() && p[0] == '\n')
489      flag |= kEmptyEndLine;
490
491    // \b and \B
492    int isword = 0;
493    if (p < context.end())
494      isword = Prog::IsWordChar(p[0] & 0xFF);
495
496    if (isword != wasword)
497      flag |= kEmptyWordBoundary;
498    else
499      flag |= kEmptyNonWordBoundary;
500
501    if (Debug) {
502      fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
503      for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
504        Thread* t = i->second;
505        if (t == NULL)
506          continue;
507        fprintf(stderr, " %d%s", t->id,
508                FormatCapture((const char**)t->capture).c_str());
509      }
510      fprintf(stderr, "\n");
511    }
512
513    // Process previous character (waited until now to avoid
514    // repeating the flag computation above).
515    // This is a no-op the first time around the loop, because
516    // runq is empty.
517    int id = Step(runq, nextq, c, flag, p-1);
518    DCHECK_EQ(runq->size(), 0);
519    swap(nextq, runq);
520    nextq->clear();
521    if (id != 0) {
522      // We're done: full match ahead.
523      p = text.end();
524      for (;;) {
525        Prog::Inst* ip = prog_->inst(id);
526        switch (ip->opcode()) {
527          default:
528            LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
529            break;
530
531          case kInstCapture:
532            match_[ip->cap()] = p;
533            id = ip->out();
534            continue;
535
536          case kInstNop:
537            id = ip->out();
538            continue;
539
540          case kInstMatch:
541            match_[1] = p;
542            matched_ = true;
543            break;
544
545          case kInstEmptyWidth:
546            if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
547              LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
548              break;
549            }
550            id = ip->out();
551            continue;
552        }
553        break;
554      }
555      break;
556    }
557
558    if (p > text.end())
559      break;
560
561    // Start a new thread if there have not been any matches.
562    // (No point in starting a new thread if there have been
563    // matches, since it would be to the right of the match
564    // we already found.)
565    if (!matched_ && (!anchored || p == text.begin())) {
566      // If there's a required first byte for an unanchored search
567      // and we're not in the middle of any possible matches,
568      // use memchr to search for the byte quickly.
569      if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
570          p < text.end() && (p[0] & 0xFF) != first_byte_) {
571        p = reinterpret_cast<const char*>(memchr(p, first_byte_,
572                                                 text.end() - p));
573        if (p == NULL) {
574          p = text.end();
575          isword = 0;
576        } else {
577          isword = Prog::IsWordChar(p[0] & 0xFF);
578        }
579        flag = Prog::EmptyFlags(context, p);
580      }
581
582      // Steal match storage (cleared but unused as of yet)
583      // temporarily to hold match boundaries for new thread.
584      match_[0] = p;
585      AddToThreadq(runq, start_, flag, p, match_);
586      match_[0] = NULL;
587    }
588
589    // If all the threads have died, stop early.
590    if (runq->size() == 0) {
591      if (Debug)
592        fprintf(stderr, "dead\n");
593      break;
594    }
595
596    if (p == text.end())
597      c = 0;
598    else
599      c = *p & 0xFF;
600    wasword = isword;
601
602    // Will run step(runq, nextq, c, ...) on next iteration.  See above.
603  }
604
605  for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
606    FreeThread(i->second);
607
608  if (matched_) {
609    for (int i = 0; i < nsubmatch; i++)
610      submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
611    if (Debug)
612      fprintf(stderr, "match (%d,%d)\n",
613              static_cast<int>(match_[0] - btext_),
614              static_cast<int>(match_[1] - btext_));
615    return true;
616  }
617  VLOG(1) << "No matches found";
618  return false;
619}
620
621// Computes whether all successful matches have a common first byte,
622// and if so, returns that byte.  If not, returns -1.
623int NFA::ComputeFirstByte() {
624  if (start_ == 0)
625    return -1;
626
627  int b = -1;  // first byte, not yet computed
628
629  typedef SparseSet Workq;
630  Workq q(prog_->size());
631  q.insert(start_);
632  for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
633    int id = *it;
634    Prog::Inst* ip = prog_->inst(id);
635    switch (ip->opcode()) {
636      default:
637        LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
638        break;
639
640      case kInstMatch:
641        // The empty string matches: no first byte.
642        return -1;
643
644      case kInstByteRange:
645        // Must match only a single byte
646        if (ip->lo() != ip->hi())
647          return -1;
648        if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
649          return -1;
650        // If we haven't seen any bytes yet, record it;
651        // otherwise must match the one we saw before.
652        if (b == -1)
653          b = ip->lo();
654        else if (b != ip->lo())
655          return -1;
656        break;
657
658      case kInstNop:
659      case kInstCapture:
660      case kInstEmptyWidth:
661        // Continue on.
662        // Ignore ip->empty() flags for kInstEmptyWidth
663        // in order to be as conservative as possible
664        // (assume all possible empty-width flags are true).
665        if (ip->out())
666          q.insert(ip->out());
667        break;
668
669      case kInstAlt:
670      case kInstAltMatch:
671        // Explore alternatives.
672        if (ip->out())
673          q.insert(ip->out());
674        if (ip->out1())
675          q.insert(ip->out1());
676        break;
677
678      case kInstFail:
679        break;
680    }
681  }
682  return b;
683}
684
685bool
686Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
687                Anchor anchor, MatchKind kind,
688                StringPiece* match, int nmatch) {
689  if (NFA::Debug)
690    Dump();
691
692  NFA nfa(this);
693  StringPiece sp;
694  if (kind == kFullMatch) {
695    anchor = kAnchored;
696    if (nmatch == 0) {
697      match = &sp;
698      nmatch = 1;
699    }
700  }
701  if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
702    return false;
703  if (kind == kFullMatch && match[0].end() != text.end())
704    return false;
705  return true;
706}
707
708}  // namespace re2
709
710