1// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Test simplify.cc.
6
7#include <string>
8#include <vector>
9#include "util/test.h"
10#include "re2/regexp.h"
11
12namespace re2 {
13
14struct Test {
15  const char* regexp;
16  const char* simplified;
17};
18
19static Test tests[] = {
20  // Already-simple constructs
21  { "a", "a" },
22  { "ab", "ab" },
23  { "a|b", "[a-b]" },
24  { "ab|cd", "ab|cd" },
25  { "(ab)*", "(ab)*" },
26  { "(ab)+", "(ab)+" },
27  { "(ab)?", "(ab)?" },
28  { ".", "." },
29  { "^", "^" },
30  { "$", "$" },
31  { "[ac]", "[ac]" },
32  { "[^ac]", "[^ac]" },
33
34  // Posix character classes
35  { "[[:alnum:]]", "[0-9A-Za-z]" },
36  { "[[:alpha:]]", "[A-Za-z]" },
37  { "[[:blank:]]", "[\\t ]" },
38  { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
39  { "[[:digit:]]", "[0-9]" },
40  { "[[:graph:]]", "[!-~]" },
41  { "[[:lower:]]", "[a-z]" },
42  { "[[:print:]]", "[ -~]" },
43  { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
44  { "[[:space:]]" , "[\\t-\\r ]" },
45  { "[[:upper:]]", "[A-Z]" },
46  { "[[:xdigit:]]", "[0-9A-Fa-f]" },
47
48  // Perl character classes
49  { "\\d", "[0-9]" },
50  { "\\s", "[\\t-\\n\\f-\\r ]" },
51  { "\\w", "[0-9A-Z_a-z]" },
52  { "\\D", "[^0-9]" },
53  { "\\S", "[^\\t-\\n\\f-\\r ]" },
54  { "\\W", "[^0-9A-Z_a-z]" },
55  { "[\\d]", "[0-9]" },
56  { "[\\s]", "[\\t-\\n\\f-\\r ]" },
57  { "[\\w]", "[0-9A-Z_a-z]" },
58  { "[\\D]", "[^0-9]" },
59  { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
60  { "[\\W]", "[^0-9A-Z_a-z]" },
61
62  // Posix repetitions
63  { "a{1}", "a" },
64  { "a{2}", "aa" },
65  { "a{5}", "aaaaa" },
66  { "a{0,1}", "a?" },
67  // The next three are illegible because Simplify inserts (?:)
68  // parens instead of () parens to avoid creating extra
69  // captured subexpressions.  The comments show a version fewer parens.
70  { "(a){0,2}",                   "(?:(a)(a)?)?"     },  //       (aa?)?
71  { "(a){0,4}",       "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  //   (a(a(aa?)?)?)?
72  { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  // aa(a(a(aa?)?)?)?
73  { "a{0,2}",           "(?:aa?)?"     },  //       (aa?)?
74  { "a{0,4}",   "(?:a(?:a(?:aa?)?)?)?" },  //   (a(a(aa?)?)?)?
75  { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" },  // aa(a(a(aa?)?)?)?
76  { "a{0,}", "a*" },
77  { "a{1,}", "a+" },
78  { "a{2,}", "aa+" },
79  { "a{5,}", "aaaaa+" },
80
81  // Test that operators simplify their arguments.
82  // (Simplify used to not simplify arguments to a {} repeat.)
83  { "(?:a{1,}){1,}", "a+" },
84  { "(a{1,}b{1,})", "(a+b+)" },
85  { "a{1,}|b{1,}", "a+|b+" },
86  { "(?:a{1,})*", "(?:a+)*" },
87  { "(?:a{1,})+", "a+" },
88  { "(?:a{1,})?", "(?:a+)?" },
89  { "a{0}", "" },
90
91  // Character class simplification
92  { "[ab]", "[a-b]" },
93  { "[a-za-za-z]", "[a-z]" },
94  { "[A-Za-zA-Za-z]", "[A-Za-z]" },
95  { "[ABCDEFGH]", "[A-H]" },
96  { "[AB-CD-EF-GH]", "[A-H]" },
97  { "[W-ZP-XE-R]", "[E-Z]" },
98  { "[a-ee-gg-m]", "[a-m]" },
99  { "[a-ea-ha-m]", "[a-m]" },
100  { "[a-ma-ha-e]", "[a-m]" },
101  { "[a-zA-Z0-9 -~]", "[ -~]" },
102
103  // Empty character classes
104  { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
105
106  // Full character classes
107  { "[[:cntrl:][:^cntrl:]]", "." },
108
109  // Unicode case folding.
110  { "(?i)A", "[Aa]" },
111  { "(?i)a", "[Aa]" },
112  { "(?i)K", "[Kk\\x{212a}]" },
113  { "(?i)k", "[Kk\\x{212a}]" },
114  { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
115  { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
116  { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
117  { "(?i)[\\x00-\\x{10ffff}]", "." },
118
119  // Empty string as a regular expression.
120  // Empty string must be preserved inside parens in order
121  // to make submatches work right, so these are less
122  // interesting than they used to be.  ToString inserts
123  // explicit (?:) in place of non-parenthesized empty strings,
124  // to make them easier to spot for other parsers.
125  { "(a|b|)", "([a-b]|(?:))" },
126  { "(|)", "()" },
127  { "a()", "a()" },
128  { "(()|())", "(()|())" },
129  { "(a|)", "(a|(?:))" },
130  { "ab()cd()", "ab()cd()" },
131  { "()", "()" },
132  { "()*", "()*" },
133  { "()+", "()+" },
134  { "()?" , "()?" },
135  { "(){0}", "" },
136  { "(){1}", "()" },
137  { "(){1,}", "()+" },
138  { "(){0,2}", "(?:()()?)?" },
139};
140
141TEST(TestSimplify, SimpleRegexps) {
142  for (int i = 0; i < arraysize(tests); i++) {
143    RegexpStatus status;
144    VLOG(1) << "Testing " << tests[i].regexp;
145    Regexp* re = Regexp::Parse(tests[i].regexp,
146                               Regexp::MatchNL | (Regexp::LikePerl &
147                                                  ~Regexp::OneLine),
148                               &status);
149    CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
150    Regexp* sre = re->Simplify();
151    CHECK(sre != NULL);
152
153    // Check that already-simple regexps don't allocate new ones.
154    if (strcmp(tests[i].regexp, tests[i].simplified) == 0) {
155      CHECK(re == sre) << " " << tests[i].regexp
156        << " " << re->ToString() << " " << sre->ToString();
157    }
158
159    EXPECT_EQ(tests[i].simplified, sre->ToString())
160      << " " << tests[i].regexp << " " << sre->Dump();
161
162    re->Decref();
163    sre->Decref();
164  }
165}
166
167}  // namespace re2
168