1// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Rewrite POSIX and other features in re
6// to use simple extended regular expression features.
7// Also sort and simplify character classes.
8
9#include "util/util.h"
10#include "re2/regexp.h"
11#include "re2/walker-inl.h"
12
13namespace re2 {
14
15// Parses the regexp src and then simplifies it and sets *dst to the
16// string representation of the simplified form.  Returns true on success.
17// Returns false and sets *error (if error != NULL) on error.
18bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
19                            string* dst,
20                            RegexpStatus* status) {
21  Regexp* re = Parse(src, flags, status);
22  if (re == NULL)
23    return false;
24  Regexp* sre = re->Simplify();
25  re->Decref();
26  if (sre == NULL) {
27    // Should not happen, since Simplify never fails.
28    LOG(ERROR) << "Simplify failed on " << src;
29    if (status) {
30      status->set_code(kRegexpInternalError);
31      status->set_error_arg(src);
32    }
33    return false;
34  }
35  *dst = sre->ToString();
36  sre->Decref();
37  return true;
38}
39
40// Assuming the simple_ flags on the children are accurate,
41// is this Regexp* simple?
42bool Regexp::ComputeSimple() {
43  Regexp** subs;
44  switch (op_) {
45    case kRegexpNoMatch:
46    case kRegexpEmptyMatch:
47    case kRegexpLiteral:
48    case kRegexpLiteralString:
49    case kRegexpBeginLine:
50    case kRegexpEndLine:
51    case kRegexpBeginText:
52    case kRegexpWordBoundary:
53    case kRegexpNoWordBoundary:
54    case kRegexpEndText:
55    case kRegexpAnyChar:
56    case kRegexpAnyByte:
57    case kRegexpHaveMatch:
58      return true;
59    case kRegexpConcat:
60    case kRegexpAlternate:
61      // These are simple as long as the subpieces are simple.
62      subs = sub();
63      for (int i = 0; i < nsub_; i++)
64        if (!subs[i]->simple_)
65          return false;
66      return true;
67    case kRegexpCharClass:
68      // Simple as long as the char class is not empty, not full.
69      if (ccb_ != NULL)
70        return !ccb_->empty() && !ccb_->full();
71      return !cc_->empty() && !cc_->full();
72    case kRegexpCapture:
73      subs = sub();
74      return subs[0]->simple_;
75    case kRegexpStar:
76    case kRegexpPlus:
77    case kRegexpQuest:
78      subs = sub();
79      if (!subs[0]->simple_)
80        return false;
81      switch (subs[0]->op_) {
82        case kRegexpStar:
83        case kRegexpPlus:
84        case kRegexpQuest:
85        case kRegexpEmptyMatch:
86        case kRegexpNoMatch:
87          return false;
88        default:
89          break;
90      }
91      return true;
92    case kRegexpRepeat:
93      return false;
94  }
95  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
96  return false;
97}
98
99// Walker subclass used by Simplify.
100// The simplify walk is purely post-recursive: given the simplified children,
101// PostVisit creates the simplified result.
102// The child_args are simplified Regexp*s.
103class SimplifyWalker : public Regexp::Walker<Regexp*> {
104 public:
105  SimplifyWalker() {}
106  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
107  virtual Regexp* PostVisit(Regexp* re,
108                            Regexp* parent_arg,
109                            Regexp* pre_arg,
110                            Regexp** child_args, int nchild_args);
111  virtual Regexp* Copy(Regexp* re);
112  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
113
114 private:
115  // These functions are declared inside SimplifyWalker so that
116  // they can edit the private fields of the Regexps they construct.
117
118  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
119  // Caller must Decref return value when done with it.
120  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
121
122  // Simplifies the expression re{min,max} in terms of *, +, and ?.
123  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
124  // Caller must Decref return value when done with it.
125  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
126                                Regexp::ParseFlags parse_flags);
127
128  // Simplifies a character class by expanding any named classes
129  // into rune ranges.  Does not edit re.  Does not consume ref to re.
130  // Caller must Decref return value when done with it.
131  static Regexp* SimplifyCharClass(Regexp* re);
132
133  DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
134};
135
136// Simplifies a regular expression, returning a new regexp.
137// The new regexp uses traditional Unix egrep features only,
138// plus the Perl (?:) non-capturing parentheses.
139// Otherwise, no POSIX or Perl additions.  The new regexp
140// captures exactly the same subexpressions (with the same indices)
141// as the original.
142// Does not edit current object.
143// Caller must Decref() return value when done with it.
144
145Regexp* Regexp::Simplify() {
146  if (simple_)
147    return Incref();
148  SimplifyWalker w;
149  return w.Walk(this, NULL);
150}
151
152#define Simplify DontCallSimplify  // Avoid accidental recursion
153
154Regexp* SimplifyWalker::Copy(Regexp* re) {
155  return re->Incref();
156}
157
158Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
159  // This should never be called, since we use Walk and not
160  // WalkExponential.
161  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
162  return re->Incref();
163}
164
165Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
166  if (re->simple_) {
167    *stop = true;
168    return re->Incref();
169  }
170  return NULL;
171}
172
173Regexp* SimplifyWalker::PostVisit(Regexp* re,
174                                  Regexp* parent_arg,
175                                  Regexp* pre_arg,
176                                  Regexp** child_args,
177                                  int nchild_args) {
178  switch (re->op()) {
179    case kRegexpNoMatch:
180    case kRegexpEmptyMatch:
181    case kRegexpLiteral:
182    case kRegexpLiteralString:
183    case kRegexpBeginLine:
184    case kRegexpEndLine:
185    case kRegexpBeginText:
186    case kRegexpWordBoundary:
187    case kRegexpNoWordBoundary:
188    case kRegexpEndText:
189    case kRegexpAnyChar:
190    case kRegexpAnyByte:
191    case kRegexpHaveMatch:
192      // All these are always simple.
193      re->simple_ = true;
194      return re->Incref();
195
196    case kRegexpConcat:
197    case kRegexpAlternate: {
198      // These are simple as long as the subpieces are simple.
199      // Two passes to avoid allocation in the common case.
200      bool changed = false;
201      Regexp** subs = re->sub();
202      for (int i = 0; i < re->nsub_; i++) {
203        Regexp* sub = subs[i];
204        Regexp* newsub = child_args[i];
205        if (newsub != sub) {
206          changed = true;
207          break;
208        }
209      }
210      if (!changed) {
211        for (int i = 0; i < re->nsub_; i++) {
212          Regexp* newsub = child_args[i];
213          newsub->Decref();
214        }
215        re->simple_ = true;
216        return re->Incref();
217      }
218      Regexp* nre = new Regexp(re->op(), re->parse_flags());
219      nre->AllocSub(re->nsub_);
220      Regexp** nre_subs = nre->sub();
221      for (int i = 0; i <re->nsub_; i++)
222        nre_subs[i] = child_args[i];
223      nre->simple_ = true;
224      return nre;
225    }
226
227    case kRegexpCapture: {
228      Regexp* newsub = child_args[0];
229      if (newsub == re->sub()[0]) {
230        newsub->Decref();
231        re->simple_ = true;
232        return re->Incref();
233      }
234      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
235      nre->AllocSub(1);
236      nre->sub()[0] = newsub;
237      nre->cap_ = re->cap_;
238      nre->simple_ = true;
239      return nre;
240    }
241
242    case kRegexpStar:
243    case kRegexpPlus:
244    case kRegexpQuest: {
245      Regexp* newsub = child_args[0];
246      // Special case: repeat the empty string as much as
247      // you want, but it's still the empty string.
248      if (newsub->op() == kRegexpEmptyMatch)
249        return newsub;
250
251      // These are simple as long as the subpiece is simple.
252      if (newsub == re->sub()[0]) {
253        newsub->Decref();
254        re->simple_ = true;
255        return re->Incref();
256      }
257
258      // These are also idempotent if flags are constant.
259      if (re->op() == newsub->op() &&
260          re->parse_flags() == newsub->parse_flags())
261        return newsub;
262
263      Regexp* nre = new Regexp(re->op(), re->parse_flags());
264      nre->AllocSub(1);
265      nre->sub()[0] = newsub;
266      nre->simple_ = true;
267      return nre;
268    }
269
270    case kRegexpRepeat: {
271      Regexp* newsub = child_args[0];
272      // Special case: repeat the empty string as much as
273      // you want, but it's still the empty string.
274      if (newsub->op() == kRegexpEmptyMatch)
275        return newsub;
276
277      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
278                                   re->parse_flags());
279      newsub->Decref();
280      nre->simple_ = true;
281      return nre;
282    }
283
284    case kRegexpCharClass: {
285      Regexp* nre = SimplifyCharClass(re);
286      nre->simple_ = true;
287      return nre;
288    }
289  }
290
291  LOG(ERROR) << "Simplify case not handled: " << re->op();
292  return re->Incref();
293}
294
295// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
296// Returns a new Regexp, handing the ref to the caller.
297Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
298                                Regexp::ParseFlags parse_flags) {
299  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
300  re->AllocSub(2);
301  Regexp** subs = re->sub();
302  subs[0] = re1;
303  subs[1] = re2;
304  return re;
305}
306
307// Simplifies the expression re{min,max} in terms of *, +, and ?.
308// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
309// Caller must Decref return value when done with it.
310// The result will *not* necessarily have the right capturing parens
311// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
312// but in the Regexp* representation, both (x) are marked as $1.
313Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
314                                       Regexp::ParseFlags f) {
315  // x{n,} means at least n matches of x.
316  if (max == -1) {
317    // Special case: x{0,} is x*
318    if (min == 0)
319      return Regexp::Star(re->Incref(), f);
320
321    // Special case: x{1,} is x+
322    if (min == 1)
323      return Regexp::Plus(re->Incref(), f);
324
325    // General case: x{4,} is xxxx+
326    Regexp* nre = new Regexp(kRegexpConcat, f);
327    nre->AllocSub(min);
328    VLOG(1) << "Simplify " << min;
329    Regexp** nre_subs = nre->sub();
330    for (int i = 0; i < min-1; i++)
331      nre_subs[i] = re->Incref();
332    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
333    return nre;
334  }
335
336  // Special case: (x){0} matches only empty string.
337  if (min == 0 && max == 0)
338    return new Regexp(kRegexpEmptyMatch, f);
339
340  // Special case: x{1} is just x.
341  if (min == 1 && max == 1)
342    return re->Incref();
343
344  // General case: x{n,m} means n copies of x and m copies of x?.
345  // The machine will do less work if we nest the final m copies,
346  // so that x{2,5} = xx(x(x(x)?)?)?
347
348  // Build leading prefix: xx.  Capturing only on the last one.
349  Regexp* nre = NULL;
350  if (min > 0) {
351    nre = new Regexp(kRegexpConcat, f);
352    nre->AllocSub(min);
353    Regexp** nre_subs = nre->sub();
354    for (int i = 0; i < min; i++)
355      nre_subs[i] = re->Incref();
356  }
357
358  // Build and attach suffix: (x(x(x)?)?)?
359  if (max > min) {
360    Regexp* suf = Regexp::Quest(re->Incref(), f);
361    for (int i = min+1; i < max; i++)
362      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
363    if (nre == NULL)
364      nre = suf;
365    else
366      nre = Concat2(nre, suf, f);
367  }
368
369  if (nre == NULL) {
370    // Some degenerate case, like min > max, or min < max < 0.
371    // This shouldn't happen, because the parser rejects such regexps.
372    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
373    return new Regexp(kRegexpNoMatch, f);
374  }
375
376  return nre;
377}
378
379// Simplifies a character class.
380// Caller must Decref return value when done with it.
381Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
382  CharClass* cc = re->cc();
383
384  // Special cases
385  if (cc->empty())
386    return new Regexp(kRegexpNoMatch, re->parse_flags());
387  if (cc->full())
388    return new Regexp(kRegexpAnyChar, re->parse_flags());
389
390  return re->Incref();
391}
392
393}  // namespace re2
394