1// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Helper class for traversing Regexps without recursion.
6// Clients should declare their own subclasses that override
7// the PreVisit and PostVisit methods, which are called before
8// and after visiting the subexpressions.
9
10// Not quite the Visitor pattern, because (among other things)
11// the Visitor pattern is recursive.
12
13#ifndef RE2_WALKER_INL_H__
14#define RE2_WALKER_INL_H__
15
16#include "re2/regexp.h"
17
18namespace re2 {
19
20template<typename T> struct WalkState;
21
22template<typename T> class Regexp::Walker {
23 public:
24  Walker();
25  virtual ~Walker();
26
27  // Virtual method called before visiting re's children.
28  // PreVisit passes ownership of its return value to its caller.
29  // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
30  // and passed to the child PreVisits and PostVisits as parent_arg.
31  // At the top-most Regexp, parent_arg is arg passed to walk.
32  // If PreVisit sets *stop to true, the walk does not recurse
33  // into the children.  Instead it behaves as though the return
34  // value from PreVisit is the return value from PostVisit.
35  // The default PreVisit returns parent_arg.
36  virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
37
38  // Virtual method called after visiting re's children.
39  // The pre_arg is the T that PreVisit returned.
40  // The child_args is a vector of the T that the child PostVisits returned.
41  // PostVisit takes ownership of pre_arg.
42  // PostVisit takes ownership of the Ts
43  // in *child_args, but not the vector itself.
44  // PostVisit passes ownership of its return value
45  // to its caller.
46  // The default PostVisit simply returns pre_arg.
47  virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
48                      T* child_args, int nchild_args);
49
50  // Virtual method called to copy a T,
51  // when Walk notices that more than one child is the same re.
52  virtual T Copy(T arg);
53
54  // Virtual method called to do a "quick visit" of the re,
55  // but not its children.  Only called once the visit budget
56  // has been used up and we're trying to abort the walk
57  // as quickly as possible.  Should return a value that
58  // makes sense for the parent PostVisits still to be run.
59  // This function is (hopefully) only called by
60  // WalkExponential, but must be implemented by all clients,
61  // just in case.
62  virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
63
64  // Walks over a regular expression.
65  // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
66  // Returns the T returned by PostVisit on re.
67  T Walk(Regexp* re, T top_arg);
68
69  // Like Walk, but doesn't use Copy.  This can lead to
70  // exponential runtimes on cross-linked Regexps like the
71  // ones generated by Simplify.  To help limit this,
72  // at most max_visits nodes will be visited and then
73  // the walk will be cut off early.
74  // If the walk *is* cut off early, ShortVisit(re)
75  // will be called on regexps that cannot be fully
76  // visited rather than calling PreVisit/PostVisit.
77  T WalkExponential(Regexp* re, T top_arg, int max_visits);
78
79  // Clears the stack.  Should never be necessary, since
80  // Walk always enters and exits with an empty stack.
81  // Logs DFATAL if stack is not already clear.
82  void Reset();
83
84  // Returns whether walk was cut off.
85  bool stopped_early() { return stopped_early_; }
86
87 private:
88  // Walk state for the entire traversal.
89  stack<WalkState<T> >* stack_;
90  bool stopped_early_;
91  int max_visits_;
92
93  T WalkInternal(Regexp* re, T top_arg, bool use_copy);
94
95  DISALLOW_EVIL_CONSTRUCTORS(Walker);
96};
97
98template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
99                                                   T parent_arg,
100                                                   bool* stop) {
101  return parent_arg;
102}
103
104template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
105                                                    T parent_arg,
106                                                    T pre_arg,
107                                                    T* child_args,
108                                                    int nchild_args) {
109  return pre_arg;
110}
111
112template<typename T> T Regexp::Walker<T>::Copy(T arg) {
113  return arg;
114}
115
116// State about a single level in the traversal.
117template<typename T> struct WalkState {
118  WalkState<T>(Regexp* re, T parent)
119    : re(re),
120      n(-1),
121      parent_arg(parent),
122      child_args(NULL) { }
123
124  Regexp* re;  // The regexp
125  int n;  // The index of the next child to process; -1 means need to PreVisit
126  T parent_arg;  // Accumulated arguments.
127  T pre_arg;
128  T child_arg;  // One-element buffer for child_args.
129  T* child_args;
130};
131
132template<typename T> Regexp::Walker<T>::Walker() {
133  stack_ = new stack<WalkState<T> >;
134  stopped_early_ = false;
135}
136
137template<typename T> Regexp::Walker<T>::~Walker() {
138  Reset();
139  delete stack_;
140}
141
142// Clears the stack.  Should never be necessary, since
143// Walk always enters and exits with an empty stack.
144// Logs DFATAL if stack is not already clear.
145template<typename T> void Regexp::Walker<T>::Reset() {
146  if (stack_ && stack_->size() > 0) {
147    LOG(DFATAL) << "Stack not empty.";
148    while (stack_->size() > 0) {
149      delete stack_->top().child_args;
150      stack_->pop();
151    }
152  }
153}
154
155template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
156                                                       bool use_copy) {
157  Reset();
158
159  if (re == NULL) {
160    LOG(DFATAL) << "Walk NULL";
161    return top_arg;
162  }
163
164  stack_->push(WalkState<T>(re, top_arg));
165
166  WalkState<T>* s;
167  for (;;) {
168    T t;
169    s = &stack_->top();
170    Regexp* re = s->re;
171    switch (s->n) {
172      case -1: {
173        if (--max_visits_ < 0) {
174          stopped_early_ = true;
175          t = ShortVisit(re, s->parent_arg);
176          break;
177        }
178        bool stop = false;
179        s->pre_arg = PreVisit(re, s->parent_arg, &stop);
180        if (stop) {
181          t = s->pre_arg;
182          break;
183        }
184        s->n = 0;
185        s->child_args = NULL;
186        if (re->nsub_ == 1)
187          s->child_args = &s->child_arg;
188        else if (re->nsub_ > 1)
189          s->child_args = new T[re->nsub_];
190        // Fall through.
191      }
192      default: {
193        if (re->nsub_ > 0) {
194          Regexp** sub = re->sub();
195          if (s->n < re->nsub_) {
196            if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
197              s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
198              s->n++;
199            } else {
200              stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
201            }
202            continue;
203          }
204        }
205
206        t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
207        if (re->nsub_ > 1)
208          delete[] s->child_args;
209        break;
210      }
211    }
212
213    // We've finished stack_->top().
214    // Update next guy down.
215    stack_->pop();
216    if (stack_->size() == 0)
217      return t;
218    s = &stack_->top();
219    if (s->child_args != NULL)
220      s->child_args[s->n] = t;
221    else
222      s->child_arg = t;
223    s->n++;
224  }
225}
226
227template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
228  // Without the exponential walking behavior,
229  // this budget should be more than enough for any
230  // regexp, and yet not enough to get us in trouble
231  // as far as CPU time.
232  max_visits_ = 1000000;
233  return WalkInternal(re, top_arg, true);
234}
235
236template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
237                                                          int max_visits) {
238  max_visits_ = max_visits;
239  return WalkInternal(re, top_arg, false);
240}
241
242}  // namespace re2
243
244#endif  // RE2_WALKER_INL_H__
245