1// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Format a regular expression structure as a string.
6// Tested by parse_test.cc
7
8#include "util/util.h"
9#include "re2/regexp.h"
10#include "re2/walker-inl.h"
11
12namespace re2 {
13
14enum {
15  PrecAtom,
16  PrecUnary,
17  PrecConcat,
18  PrecAlternate,
19  PrecEmpty,
20  PrecParen,
21  PrecToplevel,
22};
23
24// Helper function.  See description below.
25static void AppendCCRange(string* t, Rune lo, Rune hi);
26
27// Walker to generate string in s_.
28// The arg pointers are actually integers giving the
29// context precedence.
30// The child_args are always NULL.
31class ToStringWalker : public Regexp::Walker<int> {
32 public:
33  explicit ToStringWalker(string* t) : t_(t) {}
34
35  virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
36  virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
37                        int* child_args, int nchild_args);
38  virtual int ShortVisit(Regexp* re, int parent_arg) {
39    return 0;
40  }
41
42 private:
43  string* t_;  // The string the walker appends to.
44
45  DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
46};
47
48string Regexp::ToString() {
49  string t;
50  ToStringWalker w(&t);
51  w.WalkExponential(this, PrecToplevel, 100000);
52  if (w.stopped_early())
53    t += " [truncated]";
54  return t;
55}
56
57#define ToString DontCallToString  // Avoid accidental recursion.
58
59// Visits re before children are processed.
60// Appends ( if needed and passes new precedence to children.
61int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
62  int prec = parent_arg;
63  int nprec = PrecAtom;
64
65  switch (re->op()) {
66    case kRegexpNoMatch:
67    case kRegexpEmptyMatch:
68    case kRegexpLiteral:
69    case kRegexpAnyChar:
70    case kRegexpAnyByte:
71    case kRegexpBeginLine:
72    case kRegexpEndLine:
73    case kRegexpBeginText:
74    case kRegexpEndText:
75    case kRegexpWordBoundary:
76    case kRegexpNoWordBoundary:
77    case kRegexpCharClass:
78    case kRegexpHaveMatch:
79      nprec = PrecAtom;
80      break;
81
82    case kRegexpConcat:
83    case kRegexpLiteralString:
84      if (prec < PrecConcat)
85        t_->append("(?:");
86      nprec = PrecConcat;
87      break;
88
89    case kRegexpAlternate:
90      if (prec < PrecAlternate)
91        t_->append("(?:");
92      nprec = PrecAlternate;
93      break;
94
95    case kRegexpCapture:
96      t_->append("(");
97      if (re->name()) {
98        t_->append("?P<");
99        t_->append(*re->name());
100        t_->append(">");
101      }
102      nprec = PrecParen;
103      break;
104
105    case kRegexpStar:
106    case kRegexpPlus:
107    case kRegexpQuest:
108    case kRegexpRepeat:
109      if (prec < PrecUnary)
110        t_->append("(?:");
111      // The subprecedence here is PrecAtom instead of PrecUnary
112      // because PCRE treats two unary ops in a row as a parse error.
113      nprec = PrecAtom;
114      break;
115  }
116
117  return nprec;
118}
119
120static void AppendLiteral(string *t, Rune r, bool foldcase) {
121  if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
122    t->append(1, '\\');
123    t->append(1, r);
124  } else if (foldcase && 'a' <= r && r <= 'z') {
125    if ('a' <= r && r <= 'z')
126      r += 'A' - 'a';
127    t->append(1, '[');
128    t->append(1, r);
129    t->append(1, r + 'a' - 'A');
130    t->append(1, ']');
131  } else {
132    AppendCCRange(t, r, r);
133  }
134}
135
136// Visits re after children are processed.
137// For childless regexps, all the work is done here.
138// For regexps with children, append any unary suffixes or ).
139int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
140                              int* child_args, int nchild_args) {
141  int prec = parent_arg;
142  switch (re->op()) {
143    case kRegexpNoMatch:
144      // There's no simple symbol for "no match", but
145      // [^0-Runemax] excludes everything.
146      t_->append("[^\\x00-\\x{10ffff}]");
147      break;
148
149    case kRegexpEmptyMatch:
150      // Append (?:) to make empty string visible,
151      // unless this is already being parenthesized.
152      if (prec < PrecEmpty)
153        t_->append("(?:)");
154      break;
155
156    case kRegexpLiteral:
157      AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
158      break;
159
160    case kRegexpLiteralString:
161      for (int i = 0; i < re->nrunes(); i++)
162        AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
163      if (prec < PrecConcat)
164        t_->append(")");
165      break;
166
167    case kRegexpConcat:
168      if (prec < PrecConcat)
169        t_->append(")");
170      break;
171
172    case kRegexpAlternate:
173      // Clumsy but workable: the children all appended |
174      // at the end of their strings, so just remove the last one.
175      if ((*t_)[t_->size()-1] == '|')
176        t_->erase(t_->size()-1);
177      else
178        LOG(DFATAL) << "Bad final char: " << t_;
179      if (prec < PrecAlternate)
180        t_->append(")");
181      break;
182
183    case kRegexpStar:
184      t_->append("*");
185      if (re->parse_flags() & Regexp::NonGreedy)
186        t_->append("?");
187      if (prec < PrecUnary)
188        t_->append(")");
189      break;
190
191    case kRegexpPlus:
192      t_->append("+");
193      if (re->parse_flags() & Regexp::NonGreedy)
194        t_->append("?");
195      if (prec < PrecUnary)
196        t_->append(")");
197      break;
198
199    case kRegexpQuest:
200      t_->append("?");
201      if (re->parse_flags() & Regexp::NonGreedy)
202        t_->append("?");
203      if (prec < PrecUnary)
204        t_->append(")");
205      break;
206
207    case kRegexpRepeat:
208      if (re->max() == -1)
209        t_->append(StringPrintf("{%d,}", re->min()));
210      else if (re->min() == re->max())
211        t_->append(StringPrintf("{%d}", re->min()));
212      else
213        t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
214      if (re->parse_flags() & Regexp::NonGreedy)
215        t_->append("?");
216      if (prec < PrecUnary)
217        t_->append(")");
218      break;
219
220    case kRegexpAnyChar:
221      t_->append(".");
222      break;
223
224    case kRegexpAnyByte:
225      t_->append("\\C");
226      break;
227
228    case kRegexpBeginLine:
229      t_->append("^");
230      break;
231
232    case kRegexpEndLine:
233      t_->append("$");
234      break;
235
236    case kRegexpBeginText:
237      t_->append("(?-m:^)");
238      break;
239
240    case kRegexpEndText:
241      if (re->parse_flags() & Regexp::WasDollar)
242        t_->append("(?-m:$)");
243      else
244        t_->append("\\z");
245      break;
246
247    case kRegexpWordBoundary:
248      t_->append("\\b");
249      break;
250
251    case kRegexpNoWordBoundary:
252      t_->append("\\B");
253      break;
254
255    case kRegexpCharClass: {
256      if (re->cc()->size() == 0) {
257        t_->append("[^\\x00-\\x{10ffff}]");
258        break;
259      }
260      t_->append("[");
261      // Heuristic: show class as negated if it contains the
262      // non-character 0xFFFE.
263      CharClass* cc = re->cc();
264      if (cc->Contains(0xFFFE)) {
265        cc = cc->Negate();
266        t_->append("^");
267      }
268      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
269        AppendCCRange(t_, i->lo, i->hi);
270      if (cc != re->cc())
271        cc->Delete();
272      t_->append("]");
273      break;
274    }
275
276    case kRegexpCapture:
277      t_->append(")");
278      break;
279
280    case kRegexpHaveMatch:
281      // There's no syntax accepted by the parser to generate
282      // this node (it is generated by RE2::Set) so make something
283      // up that is readable but won't compile.
284      t_->append("(?HaveMatch:%d)", re->match_id());
285      break;
286  }
287
288  // If the parent is an alternation, append the | for it.
289  if (prec == PrecAlternate)
290    t_->append("|");
291
292  return 0;
293}
294
295// Appends a rune for use in a character class to the string t.
296static void AppendCCChar(string* t, Rune r) {
297  if (0x20 <= r && r <= 0x7E) {
298    if (strchr("[]^-\\", r))
299      t->append("\\");
300    t->append(1, r);
301    return;
302  }
303  switch (r) {
304    default:
305      break;
306
307    case '\r':
308      t->append("\\r");
309      return;
310
311    case '\t':
312      t->append("\\t");
313      return;
314
315    case '\n':
316      t->append("\\n");
317      return;
318
319    case '\f':
320      t->append("\\f");
321      return;
322  }
323
324  if (r < 0x100) {
325    StringAppendF(t, "\\x%02x", static_cast<int>(r));
326    return;
327  }
328  StringAppendF(t, "\\x{%x}", static_cast<int>(r));
329}
330
331static void AppendCCRange(string* t, Rune lo, Rune hi) {
332  if (lo > hi)
333    return;
334  AppendCCChar(t, lo);
335  if (lo < hi) {
336    t->append("-");
337    AppendCCChar(t, hi);
338  }
339}
340
341}  // namespace re2
342