1// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Test parse.cc, dump.cc, and tostring.cc.
6
7#include <string>
8#include <vector>
9#include "util/test.h"
10#include "re2/regexp.h"
11
12namespace re2 {
13
14static const Regexp::ParseFlags TestZeroFlags = Regexp::ParseFlags(1<<30);
15
16struct Test {
17  const char* regexp;
18  const char* parse;
19  Regexp::ParseFlags flags;
20};
21
22static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
23                                       Regexp::PerlX |
24                                       Regexp::PerlClasses |
25                                       Regexp::UnicodeGroups;
26
27static Test tests[] = {
28  // Base cases
29  { "a", "lit{a}" },
30  { "a.", "cat{lit{a}dot{}}" },
31  { "a.b", "cat{lit{a}dot{}lit{b}}" },
32  { "ab", "str{ab}" },
33  { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
34  { "abc", "str{abc}" },
35  { "a|^", "alt{lit{a}bol{}}" },
36  { "a|b", "cc{0x61-0x62}" },
37  { "(a)", "cap{lit{a}}" },
38  { "(a)|b", "alt{cap{lit{a}}lit{b}}" },
39  { "a*", "star{lit{a}}" },
40  { "a+", "plus{lit{a}}" },
41  { "a?", "que{lit{a}}" },
42  { "a{2}", "rep{2,2 lit{a}}" },
43  { "a{2,3}", "rep{2,3 lit{a}}" },
44  { "a{2,}", "rep{2,-1 lit{a}}" },
45  { "a*?", "nstar{lit{a}}" },
46  { "a+?", "nplus{lit{a}}" },
47  { "a??", "nque{lit{a}}" },
48  { "a{2}?", "nrep{2,2 lit{a}}" },
49  { "a{2,3}?", "nrep{2,3 lit{a}}" },
50  { "a{2,}?", "nrep{2,-1 lit{a}}" },
51  { "", "emp{}" },
52  { "|", "emp{}" },  // alt{emp{}emp{}} but got factored
53  { "|x|", "alt{emp{}lit{x}emp{}}" },
54  { ".", "dot{}" },
55  { "^", "bol{}" },
56  { "$", "eol{}" },
57  { "\\|", "lit{|}" },
58  { "\\(", "lit{(}" },
59  { "\\)", "lit{)}" },
60  { "\\*", "lit{*}" },
61  { "\\+", "lit{+}" },
62  { "\\?", "lit{?}" },
63  { "{", "lit{{}" },
64  { "}", "lit{}}" },
65  { "\\.", "lit{.}" },
66  { "\\^", "lit{^}" },
67  { "\\$", "lit{$}" },
68  { "\\\\", "lit{\\}" },
69  { "[ace]", "cc{0x61 0x63 0x65}" },
70  { "[abc]", "cc{0x61-0x63}" },
71  { "[a-z]", "cc{0x61-0x7a}" },
72  { "[a]", "lit{a}" },
73  { "\\-", "lit{-}" },
74  { "-", "lit{-}" },
75  { "\\_", "lit{_}" },
76
77  // Posix and Perl extensions
78  { "[[:lower:]]", "cc{0x61-0x7a}" },
79  { "[a-z]", "cc{0x61-0x7a}" },
80  { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
81  { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
82  { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
83  { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
84  { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
85  { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
86  { "\\d", "cc{0x30-0x39}" },
87  { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
88  { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
89  { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
90  { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
91  { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
92  { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
93  { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
94  { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
95  { "\\C", "byte{}" },
96
97  // Unicode, negatives, and a double negative.
98  { "\\p{Braille}", "cc{0x2800-0x28ff}" },
99  { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
100  { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
101  { "\\P{^Braille}", "cc{0x2800-0x28ff}" },
102
103  // More interesting regular expressions.
104  { "a{,2}", "str{a{,2}}" },
105  { "\\.\\^\\$\\\\", "str{.^$\\}" },
106  { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
107  { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
108  { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" },  // utf-8
109  { "a*{", "cat{star{lit{a}}lit{{}}" },
110
111  // Test precedences
112  { "(?:ab)*", "star{str{ab}}" },
113  { "(ab)*", "star{cap{str{ab}}}" },
114  { "ab|cd", "alt{str{ab}str{cd}}" },
115  { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
116
117  // Test flattening.
118  { "(?:a)", "lit{a}" },
119  { "(?:ab)(?:cd)", "str{abcd}" },
120  { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
121  { "a|.", "dot{}" },
122  { ".|a", "dot{}" },
123
124  // Test Perl quoted literals
125  { "\\Q+|*?{[\\E", "str{+|*?{[}" },
126  { "\\Q+\\E+", "plus{lit{+}}" },
127  { "\\Q\\\\E", "lit{\\}" },
128  { "\\Q\\\\\\E", "str{\\\\}" },
129
130  // Test Perl \A and \z
131  { "(?m)^", "bol{}" },
132  { "(?m)$", "eol{}" },
133  { "(?-m)^", "bot{}" },
134  { "(?-m)$", "eot{}" },
135  { "(?m)\\A", "bot{}" },
136  { "(?m)\\z", "eot{\\z}" },
137  { "(?-m)\\A", "bot{}" },
138  { "(?-m)\\z", "eot{\\z}" },
139
140  // Test named captures
141  { "(?P<name>a)", "cap{name:lit{a}}" },
142
143  // Case-folded literals
144  { "[Aa]", "litfold{a}" },
145
146  // Strings
147  { "abcde", "str{abcde}" },
148  { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
149
150  // Reported bug involving \n leaking in despite use of NeverNL.
151  { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
152  { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
153  { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
154  { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
155  { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
156  { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
157  { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
158  { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
159  { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
160  { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
161  { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
162  { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
163  { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
164  { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
165  { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
166  { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
167  { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
168  { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
169  { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
170  { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
171  { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
172  { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
173  { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
174  { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
175  { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
176  { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
177  { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
178  { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
179  { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
180    Regexp::PerlClasses },
181  { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
182    Regexp::PerlClasses | Regexp::FoldCase },
183  { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
184    Regexp::PerlClasses | Regexp::NeverNL },
185  { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
186    Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
187  { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
188    Regexp::PerlClasses },
189  { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
190    Regexp::PerlClasses | Regexp::FoldCase },
191  { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
192    Regexp::PerlClasses | Regexp::NeverNL },
193  { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
194    Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
195};
196
197bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
198  return Regexp::Equal(a, b);
199}
200
201void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
202               const string& title) {
203  Regexp** re = new Regexp*[ntests];
204  for (int i = 0; i < ntests; i++) {
205    RegexpStatus status;
206    Regexp::ParseFlags f = flags;
207    if (tests[i].flags != 0) {
208      f = tests[i].flags & ~TestZeroFlags;
209    }
210    re[i] = Regexp::Parse(tests[i].regexp, f, &status);
211    CHECK(re[i] != NULL) << " " << tests[i].regexp << " "
212                         << status.Text();
213    string s = re[i]->Dump();
214    EXPECT_EQ(string(tests[i].parse), s) << "Regexp: " << tests[i].regexp
215      << "\nparse: " << tests[i].parse << " s: " << s << " flag=" << f;
216  }
217
218  for (int i = 0; i < ntests; i++) {
219    for (int j = 0; j < ntests; j++) {
220      EXPECT_EQ(string(tests[i].parse) == tests[j].parse,
221                RegexpEqualTestingOnly(re[i], re[j]))
222        << "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
223    }
224  }
225
226  for (int i = 0; i < ntests; i++)
227    re[i]->Decref();
228  delete[] re;
229}
230
231// Test that regexps parse to expected structures.
232TEST(TestParse, SimpleRegexps) {
233  TestParse(tests, arraysize(tests), kTestFlags, "simple");
234}
235
236Test foldcase_tests[] = {
237  { "AbCdE", "strfold{abcde}" },
238  { "[Aa]", "litfold{a}" },
239  { "a", "litfold{a}" },
240
241  // 0x17F is an old English long s (looks like an f) and folds to s.
242  // 0x212A is the Kelvin symbol and folds to k.
243  { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" },  // [Aa][A-z...]
244  { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
245  { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
246};
247
248// Test that parsing with FoldCase works.
249TEST(TestParse, FoldCase) {
250  TestParse(foldcase_tests, arraysize(foldcase_tests), Regexp::FoldCase, "foldcase");
251}
252
253Test literal_tests[] = {
254  { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
255};
256
257// Test that parsing with Literal works.
258TEST(TestParse, Literal) {
259  TestParse(literal_tests, arraysize(literal_tests), Regexp::Literal, "literal");
260}
261
262Test matchnl_tests[] = {
263  { ".", "dot{}" },
264  { "\n", "lit{\n}" },
265  { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
266  { "[a\\n]", "cc{0xa 0x61}" },
267};
268
269// Test that parsing with MatchNL works.
270// (Also tested above during simple cases.)
271TEST(TestParse, MatchNL) {
272  TestParse(matchnl_tests, arraysize(matchnl_tests), Regexp::MatchNL, "with MatchNL");
273}
274
275Test nomatchnl_tests[] = {
276  { ".", "cc{0-0x9 0xb-0x10ffff}" },
277  { "\n", "lit{\n}" },
278  { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
279  { "[a\\n]", "cc{0xa 0x61}" },
280};
281
282// Test that parsing without MatchNL works.
283TEST(TestParse, NoMatchNL) {
284  TestParse(nomatchnl_tests, arraysize(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
285}
286
287Test prefix_tests[] = {
288  { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
289  { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
290  { "abc|abd|aef|bcx|bcy",
291    "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
292      "cat{str{bc}cc{0x78-0x79}}}" },
293  { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
294  { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
295  { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
296  { "(?:xx|yy)c|(?:xx|yy)d",
297    "cat{alt{str{xx}str{yy}}cc{0x63-0x64}}" },
298  { "x{2}|x{2}[0-9]",
299    "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
300  { "x{2}y|x{2}[0-9]y",
301    "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
302};
303
304// Test that prefix factoring works.
305TEST(TestParse, Prefix) {
306  TestParse(prefix_tests, arraysize(prefix_tests), Regexp::PerlX, "prefix");
307}
308
309// Invalid regular expressions
310const char* badtests[] = {
311  "(",
312  ")",
313  "(a",
314  "(a|b|",
315  "(a|b",
316  "[a-z",
317  "([a-z)",
318  "x{1001}",
319  "\xff",      // Invalid UTF-8
320  "[\xff]",
321  "[\\\xff]",
322  "\\\xff",
323  "(?P<name>a",
324  "(?P<name>",
325  "(?P<name",
326  "(?P<x y>a)",
327  "(?P<>a)",
328  "[a-Z]",
329  "(?i)[a-Z]",
330  "a{100000}",
331  "a{100000,}",
332};
333
334// Valid in Perl, bad in POSIX
335const char* only_perl[] = {
336 "[a-b-c]",
337 "\\Qabc\\E",
338 "\\Q*+?{[\\E",
339 "\\Q\\\\E",
340 "\\Q\\\\\\E",
341 "\\Q\\\\\\\\E",
342 "\\Q\\\\\\\\\\E",
343 "(?:a)",
344 "(?P<name>a)",
345};
346
347// Valid in POSIX, bad in Perl.
348const char* only_posix[] = {
349  "a++",
350  "a**",
351  "a?*",
352  "a+*",
353  "a{1}*",
354};
355
356// Test that parser rejects bad regexps.
357TEST(TestParse, InvalidRegexps) {
358  for (int i = 0; i < arraysize(badtests); i++) {
359    CHECK(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
360      << " " << badtests[i];
361    CHECK(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
362      << " " << badtests[i];
363  }
364  for (int i = 0; i < arraysize(only_posix); i++) {
365    CHECK(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
366      << " " << only_posix[i];
367    Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
368    CHECK(re) << " " << only_posix[i];
369    re->Decref();
370  }
371  for (int i = 0; i < arraysize(only_perl); i++) {
372    CHECK(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
373      << " " << only_perl[i];
374    Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
375    CHECK(re) << " " << only_perl[i];
376    re->Decref();
377  }
378}
379
380// Test that ToString produces original regexp or equivalent one.
381TEST(TestToString, EquivalentParse) {
382  for (int i = 0; i < arraysize(tests); i++) {
383    RegexpStatus status;
384    Regexp::ParseFlags f = kTestFlags;
385    if (tests[i].flags != 0) {
386      f = tests[i].flags & ~TestZeroFlags;
387    }
388    Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
389    CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
390    string s = re->Dump();
391    EXPECT_EQ(string(tests[i].parse), s) << " " << tests[i].regexp << " " << string(tests[i].parse) << " " << s;
392    string t = re->ToString();
393    if (t != tests[i].regexp) {
394      // If ToString didn't return the original regexp,
395      // it must have found one with fewer parens.
396      // Unfortunately we can't check the length here, because
397      // ToString produces "\\{" for a literal brace,
398      // but "{" is a shorter equivalent.
399      // CHECK_LT(t.size(), strlen(tests[i].regexp))
400      //     << " t=" << t << " regexp=" << tests[i].regexp;
401
402      // Test that if we parse the new regexp we get the same structure.
403      Regexp* nre = Regexp::Parse(t, Regexp::MatchNL | Regexp::PerlX, &status);
404      CHECK(nre != NULL) << " reparse " << t << " " << status.Text();
405      string ss = nre->Dump();
406      string tt = nre->ToString();
407      if (s != ss || t != tt)
408        LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
409      EXPECT_EQ(s, ss);
410      EXPECT_EQ(t, tt);
411      nre->Decref();
412    }
413    re->Decref();
414  }
415}
416
417// Test that capture error args are correct.
418TEST(NamedCaptures, ErrorArgs) {
419  RegexpStatus status;
420  Regexp* re;
421
422  re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
423  EXPECT_TRUE(re == NULL);
424  EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
425  EXPECT_EQ(status.error_arg(), "(?P<name");
426
427  re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
428  EXPECT_TRUE(re == NULL);
429  EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
430  EXPECT_EQ(status.error_arg(), "(?P<space bar>");
431}
432
433}  // namespace re2
434