onepass.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Tested by search_test.cc.
6//
7// Prog::SearchOnePass is an efficient implementation of
8// regular expression search with submatch tracking for
9// what I call "one-pass regular expressions".  (An alternate
10// name might be "backtracking-free regular expressions".)
11//
12// One-pass regular expressions have the property that
13// at each input byte during an anchored match, there may be
14// multiple alternatives but only one can proceed for any
15// given input byte.
16//
17// For example, the regexp /x*yx*/ is one-pass: you read
18// x's until a y, then you read the y, then you keep reading x's.
19// At no point do you have to guess what to do or back up
20// and try a different guess.
21//
22// On the other hand, /x*x/ is not one-pass: when you're
23// looking at an input "x", it's not clear whether you should
24// use it to extend the x* or as the final x.
25//
26// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
27// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
28//
29// A simple intuition for identifying one-pass regular expressions
30// is that it's always immediately obvious when a repetition ends.
31// It must also be immediately obvious which branch of an | to take:
32//
33// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
34//
35// The NFA-based search in nfa.cc does some bookkeeping to
36// avoid the need for backtracking and its associated exponential blowup.
37// But if we have a one-pass regular expression, there is no
38// possibility of backtracking, so there is no need for the
39// extra bookkeeping.  Hence, this code.
40//
41// On a one-pass regular expression, the NFA code in nfa.cc
42// runs at about 1/20 of the backtracking-based PCRE speed.
43// In contrast, the code in this file runs at about the same
44// speed as PCRE.
45//
46// One-pass regular expressions get used a lot when RE is
47// used for parsing simple strings, so it pays off to
48// notice them and handle them efficiently.
49//
50// See also Anne Brüggemann-Klein and Derick Wood,
51// "One-unambiguous regular languages", Information and Computation 142(2).
52
53#include <string.h>
54#include <map>
55#include "util/util.h"
56#include "util/arena.h"
57#include "util/sparse_set.h"
58#include "re2/prog.h"
59#include "re2/stringpiece.h"
60
61namespace re2 {
62
63static const int Debug = 0;
64
65// The key insight behind this implementation is that the
66// non-determinism in an NFA for a one-pass regular expression
67// is contained.  To explain what that means, first a
68// refresher about what regular expression programs look like
69// and how the usual NFA execution runs.
70//
71// In a regular expression program, only the kInstByteRange
72// instruction processes an input byte c and moves on to the
73// next byte in the string (it does so if c is in the given range).
74// The kInstByteRange instructions correspond to literal characters
75// and character classes in the regular expression.
76//
77// The kInstAlt instructions are used as wiring to connect the
78// kInstByteRange instructions together in interesting ways when
79// implementing | + and *.
80// The kInstAlt instruction forks execution, like a goto that
81// jumps to ip->out() and ip->out1() in parallel.  Each of the
82// resulting computation paths is called a thread.
83//
84// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
85// are interesting in their own right but like kInstAlt they don't
86// advance the input pointer.  Only kInstByteRange does.
87//
88// The automaton execution in nfa.cc runs all the possible
89// threads of execution in lock-step over the input.  To process
90// a particular byte, each thread gets run until it either dies
91// or finds a kInstByteRange instruction matching the byte.
92// If the latter happens, the thread stops just past the
93// kInstByteRange instruction (at ip->out()) and waits for
94// the other threads to finish processing the input byte.
95// Then, once all the threads have processed that input byte,
96// the whole process repeats.  The kInstAlt state instruction
97// might create new threads during input processing, but no
98// matter what, all the threads stop after a kInstByteRange
99// and wait for the other threads to "catch up".
100// Running in lock step like this ensures that the NFA reads
101// the input string only once.
102//
103// Each thread maintains its own set of capture registers
104// (the string positions at which it executed the kInstCapture
105// instructions corresponding to capturing parentheses in the
106// regular expression).  Repeated copying of the capture registers
107// is the main performance bottleneck in the NFA implementation.
108//
109// A regular expression program is "one-pass" if, no matter what
110// the input string, there is only one thread that makes it
111// past a kInstByteRange instruction at each input byte.  This means
112// that there is in some sense only one active thread throughout
113// the execution.  Other threads might be created during the
114// processing of an input byte, but they are ephemeral: only one
115// thread is left to start processing the next input byte.
116// This is what I meant above when I said the non-determinism
117// was "contained".
118//
119// To execute a one-pass regular expression program, we can build
120// a DFA (no non-determinism) that has at most as many states as
121// the NFA (compare this to the possibly exponential number of states
122// in the general case).  Each state records, for each possible
123// input byte, the next state along with the conditions required
124// before entering that state -- empty-width flags that must be true
125// and capture operations that must be performed.  It also records
126// whether a set of conditions required to finish a match at that
127// point in the input rather than process the next byte.
128
129// A state in the one-pass NFA (aka DFA) - just an array of actions.
130struct OneState;
131
132// A state in the one-pass NFA - just an array of actions indexed
133// by the bytemap_[] of the next input byte.  (The bytemap
134// maps next input bytes into equivalence classes, to reduce
135// the memory footprint.)
136struct OneState {
137  uint32 matchcond;   // conditions to match right now.
138  uint32 action[1];
139};
140
141// The uint32 conditions in the action are a combination of
142// condition and capture bits and the next state.  The bottom 16 bits
143// are the condition and capture bits, and the top 16 are the index of
144// the next state.
145//
146// Bits 0-5 are the empty-width flags from prog.h.
147// Bit 6 is kMatchWins, which means the match takes
148// priority over moving to next in a first-match search.
149// The remaining bits mark capture registers that should
150// be set to the current input position.  The capture bits
151// start at index 2, since the search loop can take care of
152// cap[0], cap[1] (the overall match position).
153// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
154// No input position can satisfy both kEmptyWordBoundary
155// and kEmptyNonWordBoundary, so we can use that as a sentinel
156// instead of needing an extra bit.
157
158static const int    kIndexShift    = 16;  // number of bits below index
159static const int    kEmptyShift   = 6;  // number of empty flags in prog.h
160static const int    kRealCapShift = kEmptyShift + 1;
161static const int    kRealMaxCap   = (kIndexShift - kRealCapShift) / 2 * 2;
162
163// Parameters used to skip over cap[0], cap[1].
164static const int    kCapShift     = kRealCapShift - 2;
165static const int    kMaxCap       = kRealMaxCap + 2;
166
167static const uint32 kMatchWins    = 1 << kEmptyShift;
168static const uint32 kCapMask      = ((1 << kRealMaxCap) - 1) << kRealCapShift;
169
170static const uint32 kImpossible   = kEmptyWordBoundary | kEmptyNonWordBoundary;
171
172// Check, at compile time, that prog.h agrees with math above.
173// This function is never called.
174void OnePass_Checks() {
175  COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
176                 kEmptyShift_disagrees_with_kEmptyAllFlags);
177  // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
178  COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
179                 kMaxCap_disagrees_with_kMaxOnePassCapture);
180}
181
182static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
183  uint32 satisfied = Prog::EmptyFlags(context, p);
184  if (cond & kEmptyAllFlags & ~satisfied)
185    return false;
186  return true;
187}
188
189// Apply the capture bits in cond, saving p to the appropriate
190// locations in cap[].
191static void ApplyCaptures(uint32 cond, const char* p,
192                          const char** cap, int ncap) {
193  for (int i = 2; i < ncap; i++)
194    if (cond & (1 << kCapShift << i))
195      cap[i] = p;
196}
197
198// Compute a node pointer.
199// Basically (OneState*)(nodes + statesize*nodeindex)
200// but the version with the C++ casts overflows 80 characters (and is ugly).
201static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
202                                    int nodeindex) {
203  return reinterpret_cast<OneState*>(
204    const_cast<uint8*>(nodes + statesize*nodeindex));
205}
206
207bool Prog::SearchOnePass(const StringPiece& text,
208                         const StringPiece& const_context,
209                         Anchor anchor, MatchKind kind,
210                         StringPiece* match, int nmatch) {
211  if (anchor != kAnchored && kind != kFullMatch) {
212    LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
213    return false;
214  }
215
216  // Make sure we have at least cap[1],
217  // because we use it to tell if we matched.
218  int ncap = 2*nmatch;
219  if (ncap < 2)
220    ncap = 2;
221
222  const char* cap[kMaxCap];
223  for (int i = 0; i < ncap; i++)
224    cap[i] = NULL;
225
226  const char* matchcap[kMaxCap];
227  for (int i = 0; i < ncap; i++)
228    matchcap[i] = NULL;
229
230  StringPiece context = const_context;
231  if (context.begin() == NULL)
232    context = text;
233  if (anchor_start() && context.begin() != text.begin())
234    return false;
235  if (anchor_end() && context.end() != text.end())
236    return false;
237  if (anchor_end())
238    kind = kFullMatch;
239
240  // State and act are marked volatile to
241  // keep the compiler from re-ordering the
242  // memory accesses walking over the NFA.
243  // This is worth about 5%.
244  volatile OneState* state = onepass_start_;
245  volatile uint8* nodes = onepass_nodes_;
246  volatile uint32 statesize = onepass_statesize_;
247  uint8* bytemap = bytemap_;
248  const char* bp = text.begin();
249  const char* ep = text.end();
250  const char* p;
251  bool matched = false;
252  matchcap[0] = bp;
253  cap[0] = bp;
254  uint32 nextmatchcond = state->matchcond;
255  for (p = bp; p < ep; p++) {
256    int c = bytemap[*p & 0xFF];
257    uint32 matchcond = nextmatchcond;
258    uint32 cond = state->action[c];
259
260    // Determine whether we can reach act->next.
261    // If so, advance state and nextmatchcond.
262    if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
263      uint32 nextindex = cond >> kIndexShift;
264      state = IndexToNode(nodes, statesize, nextindex);
265      nextmatchcond = state->matchcond;
266    } else {
267      state = NULL;
268      nextmatchcond = kImpossible;
269    }
270
271    // This code section is carefully tuned.
272    // The goto sequence is about 10% faster than the
273    // obvious rewrite as a large if statement in the
274    // ASCIIMatchRE2 and DotMatchRE2 benchmarks.
275
276    // Saving the match capture registers is expensive.
277    // Is this intermediate match worth thinking about?
278
279    // Not if we want a full match.
280    if (kind == kFullMatch)
281      goto skipmatch;
282
283    // Not if it's impossible.
284    if (matchcond == kImpossible)
285      goto skipmatch;
286
287    // Not if the possible match is beaten by the certain
288    // match at the next byte.  When this test is useless
289    // (e.g., HTTPPartialMatchRE2) it slows the loop by
290    // about 10%, but when it avoids work (e.g., DotMatchRE2),
291    // it cuts the loop execution by about 45%.
292    if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
293      goto skipmatch;
294
295    // Finally, the match conditions must be satisfied.
296    if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
297      for (int i = 2; i < 2*nmatch; i++)
298        matchcap[i] = cap[i];
299      if (nmatch > 1 && (matchcond & kCapMask))
300        ApplyCaptures(matchcond, p, matchcap, ncap);
301      matchcap[1] = p;
302      matched = true;
303
304      // If we're in longest match mode, we have to keep
305      // going and see if we find a longer match.
306      // In first match mode, we can stop if the match
307      // takes priority over the next state for this input byte.
308      // That bit is per-input byte and thus in cond, not matchcond.
309      if (kind == kFirstMatch && (cond & kMatchWins))
310        goto done;
311    }
312
313  skipmatch:
314    if (state == NULL)
315      goto done;
316    if ((cond & kCapMask) && nmatch > 1)
317      ApplyCaptures(cond, p, cap, ncap);
318  }
319
320  // Look for match at end of input.
321  {
322    uint32 matchcond = state->matchcond;
323    if (matchcond != kImpossible &&
324        ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
325      if (nmatch > 1 && (matchcond & kCapMask))
326        ApplyCaptures(matchcond, p, cap, ncap);
327      for (int i = 2; i < ncap; i++)
328        matchcap[i] = cap[i];
329      matchcap[1] = p;
330      matched = true;
331    }
332  }
333
334done:
335  if (!matched)
336    return false;
337  for (int i = 0; i < nmatch; i++)
338    match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]);
339  return true;
340}
341
342
343// Analysis to determine whether a given regexp program is one-pass.
344
345// If ip is not on workq, adds ip to work queue and returns true.
346// If ip is already on work queue, does nothing and returns false.
347// If ip is NULL, does nothing and returns true (pretends to add it).
348typedef SparseSet Instq;
349static bool AddQ(Instq *q, int id) {
350  if (id == 0)
351    return true;
352  if (q->contains(id))
353    return false;
354  q->insert(id);
355  return true;
356}
357
358struct InstCond {
359  int id;
360  uint32 cond;
361};
362
363// Returns whether this is a one-pass program; that is,
364// returns whether it is safe to use SearchOnePass on this program.
365// These conditions must be true for any instruction ip:
366//
367//   (1) for any other Inst nip, there is at most one input-free
368//       path from ip to nip.
369//   (2) there is at most one kInstByte instruction reachable from
370//       ip that matches any particular byte c.
371//   (3) there is at most one input-free path from ip to a kInstMatch
372//       instruction.
373//
374// This is actually just a conservative approximation: it might
375// return false when the answer is true, when kInstEmptyWidth
376// instructions are involved.
377// Constructs and saves corresponding one-pass NFA on success.
378bool Prog::IsOnePass() {
379  if (did_onepass_)
380    return onepass_start_ != NULL;
381  did_onepass_ = true;
382
383  if (start() == 0)  // no match
384    return false;
385
386  // Steal memory for the one-pass NFA from the overall DFA budget.
387  // Willing to use at most 1/4 of the DFA budget (heuristic).
388  // Limit max node count to 65000 as a conservative estimate to
389  // avoid overflowing 16-bit node index in encoding.
390  int maxnodes = 2 + byte_inst_count_;
391  int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
392  if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
393    return false;
394
395  // Flood the graph starting at the start state, and check
396  // that in each reachable state, each possible byte leads
397  // to a unique next state.
398  int size = this->size();
399  InstCond *stack = new InstCond[size];
400
401  int* nodebyid = new int[size];  // indexed by ip
402  memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
403
404  uint8* nodes = new uint8[maxnodes*statesize];
405  uint8* nodep = nodes;
406
407  Instq tovisit(size), workq(size);
408  AddQ(&tovisit, start());
409  nodebyid[start()] = 0;
410  nodep += statesize;
411  int nalloc = 1;
412  for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
413    int id = *it;
414    int nodeindex = nodebyid[id];
415    OneState* node = IndexToNode(nodes, statesize, nodeindex);
416
417    // Flood graph using manual stack, filling in actions as found.
418    // Default is none.
419    for (int b = 0; b < bytemap_range_; b++)
420      node->action[b] = kImpossible;
421    node->matchcond = kImpossible;
422
423    workq.clear();
424    bool matched = false;
425    int nstack = 0;
426    stack[nstack].id = id;
427    stack[nstack++].cond = 0;
428    while (nstack > 0) {
429      int id = stack[--nstack].id;
430      Prog::Inst* ip = inst(id);
431      uint32 cond = stack[nstack].cond;
432      switch (ip->opcode()) {
433        case kInstAltMatch:
434          // TODO(rsc): Ignoring kInstAltMatch optimization.
435          // Should implement it in this engine, but it's subtle.
436          // Fall through.
437        case kInstAlt:
438          // If already on work queue, (1) is violated: bail out.
439          if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
440            goto fail;
441          stack[nstack].id = ip->out1();
442          stack[nstack++].cond = cond;
443          stack[nstack].id = ip->out();
444          stack[nstack++].cond = cond;
445          break;
446
447        case kInstByteRange: {
448          int nextindex = nodebyid[ip->out()];
449          if (nextindex == -1) {
450            if (nalloc >= maxnodes) {
451              if (Debug)
452                LOG(ERROR)
453                  << StringPrintf("Not OnePass: hit node limit %d > %d",
454                                  nalloc, maxnodes);
455              goto fail;
456            }
457            nextindex = nalloc;
458            nodep += statesize;
459            nodebyid[ip->out()] = nextindex;
460            nalloc++;
461            AddQ(&tovisit, ip->out());
462          }
463          if (matched)
464            cond |= kMatchWins;
465          for (int c = ip->lo(); c <= ip->hi(); c++) {
466            int b = bytemap_[c];
467            c = unbytemap_[b];  // last c in byte class
468            uint32 act = node->action[b];
469            uint32 newact = (nextindex << kIndexShift) | cond;
470            if ((act & kImpossible) == kImpossible) {
471              node->action[b] = newact;
472            } else if (act != newact) {
473              if (Debug) {
474                LOG(ERROR)
475                  << StringPrintf("Not OnePass: conflict on byte "
476                                  "%#x at state %d",
477                                  c, *it);
478              }
479              goto fail;
480            }
481          }
482          if (ip->foldcase()) {
483            Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
484            Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
485            for (int c = lo; c <= hi; c++) {
486              int b = bytemap_[c];
487              c = unbytemap_[b];  // last c in class
488              uint32 act = node->action[b];
489              uint32 newact = (nextindex << kIndexShift) | cond;
490              if ((act & kImpossible) == kImpossible) {
491                node->action[b] = newact;
492              } else if (act != newact) {
493                if (Debug) {
494                  LOG(ERROR)
495                    << StringPrintf("Not OnePass: conflict on byte "
496                                    "%#x at state %d",
497                                    c, *it);
498                }
499                goto fail;
500              }
501            }
502          }
503          break;
504        }
505
506        case kInstCapture:
507          if (ip->cap() < kMaxCap)
508            cond |= (1 << kCapShift) << ip->cap();
509          goto QueueEmpty;
510
511        case kInstEmptyWidth:
512          cond |= ip->empty();
513          goto QueueEmpty;
514
515        case kInstNop:
516        QueueEmpty:
517          // kInstCapture and kInstNop always proceed to ip->out().
518          // kInstEmptyWidth only sometimes proceeds to ip->out(),
519          // but as a conservative approximation we assume it always does.
520          // We could be a little more precise by looking at what c
521          // is, but that seems like overkill.
522
523          // If already on work queue, (1) is violated: bail out.
524          if (!AddQ(&workq, ip->out())) {
525            if (Debug) {
526              LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
527                                         " %d -> %d\n",
528                                         *it, ip->out());
529            }
530            goto fail;
531          }
532          stack[nstack].id = ip->out();
533          stack[nstack++].cond = cond;
534          break;
535
536        case kInstMatch:
537          if (matched) {
538            // (3) is violated
539            if (Debug) {
540              LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
541                                         " from %d\n", *it);
542            }
543            goto fail;
544          }
545          matched = true;
546          node->matchcond = cond;
547          break;
548
549        case kInstFail:
550          break;
551      }
552    }
553  }
554
555  if (Debug) {  // For debugging, dump one-pass NFA to LOG(ERROR).
556    string dump = "prog dump:\n" + Dump() + "node dump\n";
557    map<int, int> idmap;
558    for (int i = 0; i < size; i++)
559      if (nodebyid[i] != -1)
560        idmap[nodebyid[i]] = i;
561
562    StringAppendF(&dump, "byte ranges:\n");
563    int i = 0;
564    for (int b = 0; b < bytemap_range_; b++) {
565      int lo = i;
566      while (bytemap_[i] == b)
567        i++;
568      StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
569    }
570
571    for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
572      int id = *it;
573      int nodeindex = nodebyid[id];
574      if (nodeindex == -1)
575      	continue;
576      OneState* node = IndexToNode(nodes, statesize, nodeindex);
577      string s;
578      StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
579                    nodeindex, id, node->matchcond);
580      for (int i = 0; i < bytemap_range_; i++) {
581        if ((node->action[i] & kImpossible) == kImpossible)
582          continue;
583        StringAppendF(&dump, "  %d cond %#x -> %d id=%d\n",
584                      i, node->action[i] & 0xFFFF,
585                      node->action[i] >> kIndexShift,
586                      idmap[node->action[i] >> kIndexShift]);
587      }
588    }
589    LOG(ERROR) << dump;
590  }
591
592  // Overallocated earlier; cut down to actual size.
593  nodep = new uint8[nalloc*statesize];
594  memmove(nodep, nodes, nalloc*statesize);
595  delete[] nodes;
596  nodes = nodep;
597
598  onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
599  onepass_nodes_ = nodes;
600  onepass_statesize_ = statesize;
601  dfa_mem_ -= nalloc*statesize;
602
603  delete[] stack;
604  delete[] nodebyid;
605  return true;
606
607fail:
608  delete[] stack;
609  delete[] nodebyid;
610  delete[] nodes;
611  return false;
612}
613
614}  // namespace re2
615