10d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Copyright 2006 The RE2 Authors.  All Rights Reserved.
20d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Use of this source code is governed by a BSD-style
30d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// license that can be found in the LICENSE file.
40d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
50d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test simplify.cc.
60d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
70d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include <string>
80d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include <vector>
90d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "util/test.h"
100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/regexp.h"
110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinnamespace re2 {
130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstruct Test {
150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  const char* regexp;
160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  const char* simplified;
170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin};
180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstatic Test tests[] = {
200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Already-simple constructs
210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a", "a" },
220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "ab", "ab" },
230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a|b", "[a-b]" },
240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "ab|cd", "ab|cd" },
250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(ab)*", "(ab)*" },
260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(ab)+", "(ab)+" },
270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(ab)?", "(ab)?" },
280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { ".", "." },
290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "^", "^" },
300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "$", "$" },
310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[ac]", "[ac]" },
320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[^ac]", "[^ac]" },
330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Posix character classes
350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:alnum:]]", "[0-9A-Za-z]" },
360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:alpha:]]", "[A-Za-z]" },
370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:blank:]]", "[\\t ]" },
380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:digit:]]", "[0-9]" },
400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:graph:]]", "[!-~]" },
410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:lower:]]", "[a-z]" },
420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:print:]]", "[ -~]" },
430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:space:]]" , "[\\t-\\r ]" },
450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:upper:]]", "[A-Z]" },
460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:xdigit:]]", "[0-9A-Fa-f]" },
470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Perl character classes
490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "\\d", "[0-9]" },
500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "\\s", "[\\t-\\n\\f-\\r ]" },
510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "\\w", "[0-9A-Z_a-z]" },
520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "\\D", "[^0-9]" },
530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "\\S", "[^\\t-\\n\\f-\\r ]" },
540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "\\W", "[^0-9A-Z_a-z]" },
550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[\\d]", "[0-9]" },
560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[\\s]", "[\\t-\\n\\f-\\r ]" },
570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[\\w]", "[0-9A-Z_a-z]" },
580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[\\D]", "[^0-9]" },
590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[\\W]", "[^0-9A-Z_a-z]" },
610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Posix repetitions
630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{1}", "a" },
640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{2}", "aa" },
650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{5}", "aaaaa" },
660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{0,1}", "a?" },
670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // The next three are illegible because Simplify inserts (?:)
680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // parens instead of () parens to avoid creating extra
690d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // captured subexpressions.  The comments show a version fewer parens.
700d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(a){0,2}",                   "(?:(a)(a)?)?"     },  //       (aa?)?
710d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(a){0,4}",       "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  //   (a(a(aa?)?)?)?
720d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" },  // aa(a(a(aa?)?)?)?
730d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{0,2}",           "(?:aa?)?"     },  //       (aa?)?
740d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{0,4}",   "(?:a(?:a(?:aa?)?)?)?" },  //   (a(a(aa?)?)?)?
750d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" },  // aa(a(a(aa?)?)?)?
760d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{0,}", "a*" },
770d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{1,}", "a+" },
780d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{2,}", "aa+" },
790d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{5,}", "aaaaa+" },
800d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
810d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Test that operators simplify their arguments.
820d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // (Simplify used to not simplify arguments to a {} repeat.)
830d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?:a{1,}){1,}", "a+" },
840d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(a{1,}b{1,})", "(a+b+)" },
850d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{1,}|b{1,}", "a+|b+" },
860d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?:a{1,})*", "(?:a+)*" },
870d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?:a{1,})+", "a+" },
880d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?:a{1,})?", "(?:a+)?" },
890d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a{0}", "" },
900d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
910d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Character class simplification
920d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[ab]", "[a-b]" },
930d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[a-za-za-z]", "[a-z]" },
940d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[A-Za-zA-Za-z]", "[A-Za-z]" },
950d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[ABCDEFGH]", "[A-H]" },
960d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[AB-CD-EF-GH]", "[A-H]" },
970d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[W-ZP-XE-R]", "[E-Z]" },
980d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[a-ee-gg-m]", "[a-m]" },
990d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[a-ea-ha-m]", "[a-m]" },
1000d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[a-ma-ha-e]", "[a-m]" },
1010d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[a-zA-Z0-9 -~]", "[ -~]" },
1020d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
1030d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Empty character classes
1040d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
1050d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
1060d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Full character classes
1070d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "[[:cntrl:][:^cntrl:]]", "." },
1080d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
1090d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Unicode case folding.
1100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?i)A", "[Aa]" },
1110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?i)a", "[Aa]" },
1120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?i)K", "[Kk\\x{212a}]" },
1130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?i)k", "[Kk\\x{212a}]" },
1140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
1150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
1160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
1170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(?i)[\\x00-\\x{10ffff}]", "." },
1180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
1190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Empty string as a regular expression.
1200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // Empty string must be preserved inside parens in order
1210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // to make submatches work right, so these are less
1220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // interesting than they used to be.  ToString inserts
1230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // explicit (?:) in place of non-parenthesized empty strings,
1240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  // to make them easier to spot for other parsers.
1250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(a|b|)", "([a-b]|(?:))" },
1260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(|)", "()" },
1270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "a()", "a()" },
1280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(()|())", "(()|())" },
1290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(a|)", "(a|(?:))" },
1300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "ab()cd()", "ab()cd()" },
1310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "()", "()" },
1320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "()*", "()*" },
1330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "()+", "()+" },
1340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "()?" , "()?" },
1350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(){0}", "" },
1360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(){1}", "()" },
1370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(){1,}", "()+" },
1380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  { "(){0,2}", "(?:()()?)?" },
1390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin};
1400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
1410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(TestSimplify, SimpleRegexps) {
1420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  for (int i = 0; i < arraysize(tests); i++) {
1430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    RegexpStatus status;
1440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    VLOG(1) << "Testing " << tests[i].regexp;
1450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    Regexp* re = Regexp::Parse(tests[i].regexp,
1460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                               Regexp::MatchNL | (Regexp::LikePerl &
1470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                                                  ~Regexp::OneLine),
1480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                               &status);
1490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
1500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    Regexp* sre = re->Simplify();
1510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    CHECK(sre != NULL);
1520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
1530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    // Check that already-simple regexps don't allocate new ones.
1540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    if (strcmp(tests[i].regexp, tests[i].simplified) == 0) {
1550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin      CHECK(re == sre) << " " << tests[i].regexp
1560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin        << " " << re->ToString() << " " << sre->ToString();
1570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    }
1580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
1590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    EXPECT_EQ(tests[i].simplified, sre->ToString())
1600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin      << " " << tests[i].regexp << " " << sre->Dump();
1610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
1620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    re->Decref();
1630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    sre->Decref();
1640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  }
1650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
1660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
1670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}  // namespace re2
168