10d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Copyright 2006 The RE2 Authors. All Rights Reserved. 20d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Use of this source code is governed by a BSD-style 30d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// license that can be found in the LICENSE file. 40d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 50d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test simplify.cc. 60d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 70d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include <string> 80d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include <vector> 90d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "util/test.h" 100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/regexp.h" 110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinnamespace re2 { 130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstruct Test { 150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin const char* regexp; 160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin const char* simplified; 170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}; 180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstatic Test tests[] = { 200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Already-simple constructs 210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a", "a" }, 220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "ab", "ab" }, 230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a|b", "[a-b]" }, 240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "ab|cd", "ab|cd" }, 250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(ab)*", "(ab)*" }, 260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(ab)+", "(ab)+" }, 270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(ab)?", "(ab)?" }, 280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { ".", "." }, 290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "^", "^" }, 300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "$", "$" }, 310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[ac]", "[ac]" }, 320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[^ac]", "[^ac]" }, 330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Posix character classes 350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:alnum:]]", "[0-9A-Za-z]" }, 360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:alpha:]]", "[A-Za-z]" }, 370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:blank:]]", "[\\t ]" }, 380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" }, 390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:digit:]]", "[0-9]" }, 400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:graph:]]", "[!-~]" }, 410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:lower:]]", "[a-z]" }, 420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:print:]]", "[ -~]" }, 430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:punct:]]", "[!-/:-@\\[-`{-~]" }, 440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:space:]]" , "[\\t-\\r ]" }, 450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:upper:]]", "[A-Z]" }, 460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:xdigit:]]", "[0-9A-Fa-f]" }, 470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Perl character classes 490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "\\d", "[0-9]" }, 500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "\\s", "[\\t-\\n\\f-\\r ]" }, 510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "\\w", "[0-9A-Z_a-z]" }, 520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "\\D", "[^0-9]" }, 530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "\\S", "[^\\t-\\n\\f-\\r ]" }, 540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "\\W", "[^0-9A-Z_a-z]" }, 550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[\\d]", "[0-9]" }, 560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[\\s]", "[\\t-\\n\\f-\\r ]" }, 570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[\\w]", "[0-9A-Z_a-z]" }, 580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[\\D]", "[^0-9]" }, 590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[\\S]", "[^\\t-\\n\\f-\\r ]" }, 600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[\\W]", "[^0-9A-Z_a-z]" }, 610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Posix repetitions 630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{1}", "a" }, 640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{2}", "aa" }, 650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{5}", "aaaaa" }, 660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{0,1}", "a?" }, 670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // The next three are illegible because Simplify inserts (?:) 680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // parens instead of () parens to avoid creating extra 690d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // captured subexpressions. The comments show a version fewer parens. 700d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)? 710d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)? 720d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)? 730d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{0,2}", "(?:aa?)?" }, // (aa?)? 740d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)? 750d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)? 760d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{0,}", "a*" }, 770d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{1,}", "a+" }, 780d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{2,}", "aa+" }, 790d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{5,}", "aaaaa+" }, 800d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 810d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Test that operators simplify their arguments. 820d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // (Simplify used to not simplify arguments to a {} repeat.) 830d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?:a{1,}){1,}", "a+" }, 840d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(a{1,}b{1,})", "(a+b+)" }, 850d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{1,}|b{1,}", "a+|b+" }, 860d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?:a{1,})*", "(?:a+)*" }, 870d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?:a{1,})+", "a+" }, 880d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?:a{1,})?", "(?:a+)?" }, 890d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a{0}", "" }, 900d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 910d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Character class simplification 920d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[ab]", "[a-b]" }, 930d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[a-za-za-z]", "[a-z]" }, 940d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[A-Za-zA-Za-z]", "[A-Za-z]" }, 950d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[ABCDEFGH]", "[A-H]" }, 960d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[AB-CD-EF-GH]", "[A-H]" }, 970d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[W-ZP-XE-R]", "[E-Z]" }, 980d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[a-ee-gg-m]", "[a-m]" }, 990d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[a-ea-ha-m]", "[a-m]" }, 1000d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[a-ma-ha-e]", "[a-m]" }, 1010d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[a-zA-Z0-9 -~]", "[ -~]" }, 1020d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1030d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Empty character classes 1040d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" }, 1050d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1060d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Full character classes 1070d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "[[:cntrl:][:^cntrl:]]", "." }, 1080d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1090d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Unicode case folding. 1100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?i)A", "[Aa]" }, 1110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?i)a", "[Aa]" }, 1120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?i)K", "[Kk\\x{212a}]" }, 1130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?i)k", "[Kk\\x{212a}]" }, 1140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?i)\\x{212a}", "[Kk\\x{212a}]" }, 1150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" }, 1160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" }, 1170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(?i)[\\x00-\\x{10ffff}]", "." }, 1180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Empty string as a regular expression. 1200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Empty string must be preserved inside parens in order 1210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // to make submatches work right, so these are less 1220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // interesting than they used to be. ToString inserts 1230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // explicit (?:) in place of non-parenthesized empty strings, 1240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // to make them easier to spot for other parsers. 1250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(a|b|)", "([a-b]|(?:))" }, 1260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(|)", "()" }, 1270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "a()", "a()" }, 1280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(()|())", "(()|())" }, 1290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(a|)", "(a|(?:))" }, 1300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "ab()cd()", "ab()cd()" }, 1310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "()", "()" }, 1320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "()*", "()*" }, 1330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "()+", "()+" }, 1340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "()?" , "()?" }, 1350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(){0}", "" }, 1360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(){1}", "()" }, 1370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(){1,}", "()+" }, 1380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin { "(){0,2}", "(?:()()?)?" }, 1390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}; 1400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(TestSimplify, SimpleRegexps) { 1420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 0; i < arraysize(tests); i++) { 1430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin RegexpStatus status; 1440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin VLOG(1) << "Testing " << tests[i].regexp; 1450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin Regexp* re = Regexp::Parse(tests[i].regexp, 1460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin Regexp::MatchNL | (Regexp::LikePerl & 1470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ~Regexp::OneLine), 1480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin &status); 1490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text(); 1500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin Regexp* sre = re->Simplify(); 1510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin CHECK(sre != NULL); 1520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Check that already-simple regexps don't allocate new ones. 1540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (strcmp(tests[i].regexp, tests[i].simplified) == 0) { 1550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin CHECK(re == sre) << " " << tests[i].regexp 1560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin << " " << re->ToString() << " " << sre->ToString(); 1570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin EXPECT_EQ(tests[i].simplified, sre->ToString()) 1600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin << " " << tests[i].regexp << " " << sre->Dump(); 1610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin re->Decref(); 1630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin sre->Decref(); 1640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 1660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} // namespace re2 168