1// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Determine whether this library should match PCRE exactly
6// for a particular Regexp.  (If so, the testing framework can
7// check that it does.)
8//
9// This library matches PCRE except in these cases:
10//   * the regexp contains a repetition of an empty string,
11//     like (a*)* or (a*)+.  In this case, PCRE will treat
12//     the repetition sequence as ending with an empty string,
13//     while this library does not.
14//   * Perl and PCRE differ on whether \v matches \n.
15//     For historical reasons, this library implements the Perl behavior.
16//   * Perl and PCRE allow $ in one-line mode to match either the very
17//     end of the text or just before a \n at the end of the text.
18//     This library requires it to match only the end of the text.
19//   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20//     match the end of the text if the last character is a \n.
21//     This library does allow it.
22//
23// Regexp::MimicsPCRE checks for any of these conditions.
24
25#include "util/util.h"
26#include "re2/regexp.h"
27#include "re2/walker-inl.h"
28
29namespace re2 {
30
31// Returns whether re might match an empty string.
32static bool CanBeEmptyString(Regexp *re);
33
34// Walker class to compute whether library handles a regexp
35// exactly as PCRE would.  See comment at top for conditions.
36
37class PCREWalker : public Regexp::Walker<bool> {
38 public:
39  PCREWalker() {}
40  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
41                 int nchild_args);
42
43  bool ShortVisit(Regexp* re, bool a) {
44    // Should never be called: we use Walk not WalkExponential.
45    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
46    return a;
47  }
48};
49
50// Called after visiting each of re's children and accumulating
51// the return values in child_args.  So child_args contains whether
52// this library mimics PCRE for those subexpressions.
53bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
54                           bool* child_args, int nchild_args) {
55  // If children failed, so do we.
56  for (int i = 0; i < nchild_args; i++)
57    if (!child_args[i])
58      return false;
59
60  // Otherwise look for other reasons to fail.
61  switch (re->op()) {
62    // Look for repeated empty string.
63    case kRegexpStar:
64    case kRegexpPlus:
65    case kRegexpQuest:
66      if (CanBeEmptyString(re->sub()[0]))
67        return false;
68      break;
69    case kRegexpRepeat:
70      if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
71        return false;
72      break;
73
74    // Look for \v
75    case kRegexpLiteral:
76      if (re->rune() == '\v')
77        return false;
78      break;
79
80    // Look for $ in single-line mode.
81    case kRegexpEndText:
82    case kRegexpEmptyMatch:
83      if (re->parse_flags() & Regexp::WasDollar)
84        return false;
85      break;
86
87    // Look for ^ in multi-line mode.
88    case kRegexpBeginLine:
89      // No condition: in single-line mode ^ becomes kRegexpBeginText.
90      return false;
91
92    default:
93      break;
94  }
95
96  // Not proven guilty.
97  return true;
98}
99
100// Returns whether this regexp's behavior will mimic PCRE's exactly.
101bool Regexp::MimicsPCRE() {
102  PCREWalker w;
103  return w.Walk(this, true);
104}
105
106
107// Walker class to compute whether a Regexp can match an empty string.
108// It is okay to overestimate.  For example, \b\B cannot match an empty
109// string, because \b and \B are mutually exclusive, but this isn't
110// that smart and will say it can.  Spurious empty strings
111// will reduce the number of regexps we sanity check against PCRE,
112// but they won't break anything.
113
114class EmptyStringWalker : public Regexp::Walker<bool> {
115 public:
116  EmptyStringWalker() { }
117  bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
118                 bool* child_args, int nchild_args);
119
120  bool ShortVisit(Regexp* re, bool a) {
121    // Should never be called: we use Walk not WalkExponential.
122    LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
123    return a;
124  }
125
126 private:
127  DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
128};
129
130// Called after visiting re's children.  child_args contains the return
131// value from each of the children's PostVisits (i.e., whether each child
132// can match an empty string).  Returns whether this clause can match an
133// empty string.
134bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
135                                  bool* child_args, int nchild_args) {
136  switch (re->op()) {
137    case kRegexpNoMatch:               // never empty
138    case kRegexpLiteral:
139    case kRegexpAnyChar:
140    case kRegexpAnyByte:
141    case kRegexpCharClass:
142    case kRegexpLiteralString:
143      return false;
144
145    case kRegexpEmptyMatch:            // always empty
146    case kRegexpBeginLine:             // always empty, when they match
147    case kRegexpEndLine:
148    case kRegexpNoWordBoundary:
149    case kRegexpWordBoundary:
150    case kRegexpBeginText:
151    case kRegexpEndText:
152    case kRegexpStar:                  // can always be empty
153    case kRegexpQuest:
154    case kRegexpHaveMatch:
155      return true;
156
157    case kRegexpConcat:                // can be empty if all children can
158      for (int i = 0; i < nchild_args; i++)
159        if (!child_args[i])
160          return false;
161      return true;
162
163    case kRegexpAlternate:             // can be empty if any child can
164      for (int i = 0; i < nchild_args; i++)
165        if (child_args[i])
166          return true;
167      return false;
168
169    case kRegexpPlus:                  // can be empty if the child can
170    case kRegexpCapture:
171      return child_args[0];
172
173    case kRegexpRepeat:                // can be empty if child can or is x{0}
174      return child_args[0] || re->min() == 0;
175  }
176  return false;
177}
178
179// Returns whether re can match an empty string.
180static bool CanBeEmptyString(Regexp* re) {
181  EmptyStringWalker w;
182  return w.Walk(re, true);
183}
184
185}  // namespace re2
186