10d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Copyright 2008 The RE2 Authors. All Rights Reserved. 20d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Use of this source code is governed by a BSD-style 30d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// license that can be found in the LICENSE file. 40d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 50d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Exhaustive testing of regular expression matching. 60d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 70d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Each test picks an alphabet (e.g., "abc"), a maximum string length, 80d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// a maximum regular expression length, and a maximum number of letters 90d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// that can appear in the regular expression. Given these parameters, 100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// it tries every possible regular expression and string, verifying that 110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// the NFA, DFA, and a trivial backtracking implementation agree about 120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// the location of the match. 130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include <stdlib.h> 150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include <stdio.h> 160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#ifndef LOGGING 180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#define LOGGING 0 190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#endif 200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "util/test.h" 220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/testing/exhaustive_tester.h" 230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/testing/tester.h" 240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinDEFINE_bool(show_regexps, false, "show regexps during testing"); 260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinDEFINE_int32(max_bad_regexp_inputs, 1, 280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "Stop testing a regular expression after finding this many " 290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin "strings that break it."); 300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Compiled in debug mode, the usual tests run for over an hour. 320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Have to cut it down to make the unit test machines happy. 330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinDEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode."); 340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinnamespace re2 { 360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstatic char* escape(const StringPiece& sp) { 380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin static char buf[512]; 390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin char* p = buf; 400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin *p++ = '\"'; 410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 0; i < sp.size(); i++) { 420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if(p+5 >= buf+sizeof buf) 430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin LOG(FATAL) << "ExhaustiveTester escape: too long"; 440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if(sp[i] == '\\' || sp[i] == '\"') { 450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin *p++ = '\\'; 460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin *p++ = sp[i]; 470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } else if(sp[i] == '\n') { 480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin *p++ = '\\'; 490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin *p++ = 'n'; 500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } else { 510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin *p++ = sp[i]; 520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin *p++ = '\"'; 550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin *p = '\0'; 560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return buf; 570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinstatic void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) { 600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (!re.Match(input, 0, input.size(), anchor, m, n)) { 610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("-"); 620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return; 630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 0; i < n; i++) { 650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (i > 0) 660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf(" "); 670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (m[i].begin() == NULL) 680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("-"); 690d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin else 700d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("%d-%d", static_cast<int>(m[i].begin() - input.begin()), static_cast<int>(m[i].end() - input.begin())); 710d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 720d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 730d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 740d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Processes a single generated regexp. 750d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Compiles it using Regexp interface and PCRE, and then 760d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// checks that NFA, DFA, and PCRE all return the same results. 770d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinvoid ExhaustiveTester::HandleRegexp(const string& const_regexp) { 780d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin regexps_++; 790d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin string regexp = const_regexp; 800d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (!topwrapper_.empty()) 810d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str()); 820d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 830d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (FLAGS_show_regexps) { 840d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("\r%s", regexp.c_str()); 850d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin fflush(stdout); 860d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 870d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 880d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (LOGGING) { 890d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Write out test cases and answers for use in testing 900d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // other implementations, such as Go's regexp package. 910d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (randomstrings_) 920d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin LOG(ERROR) << "Cannot log with random strings."; 930d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (regexps_ == 1) { // first 940d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("strings\n"); 950d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin strgen_.Reset(); 960d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin while (strgen_.HasNext()) 970d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("%s\n", escape(strgen_.Next())); 980d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("regexps\n"); 990d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1000d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("%s\n", escape(regexp)); 1010d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1020d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin RE2 re(regexp); 1030d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin RE2::Options longest; 1040d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin longest.set_longest_match(true); 1050d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin RE2 relongest(regexp, longest); 1060d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin int ngroup = re.NumberOfCapturingGroups()+1; 1070d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin StringPiece* group = new StringPiece[ngroup]; 1080d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1090d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin strgen_.Reset(); 1100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin while (strgen_.HasNext()) { 1110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin StringPiece input = strgen_.Next(); 1120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup); 1130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf(";"); 1140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin PrintResult(re, input, RE2::UNANCHORED, group, ngroup); 1150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf(";"); 1160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup); 1170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf(";"); 1180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup); 1190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("\n"); 1200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin delete[] group; 1220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return; 1230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin Tester tester(regexp); 1260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (tester.error()) 1270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return; 1280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin strgen_.Reset(); 1300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin strgen_.GenerateNULL(); 1310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (randomstrings_) 1320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin strgen_.Random(stringseed_, stringcount_); 1330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin int bad_inputs = 0; 1340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin while (strgen_.HasNext()) { 1350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin tests_++; 1360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (!tester.TestInput(strgen_.Next())) { 1370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin failures_++; 1380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (++bad_inputs >= FLAGS_max_bad_regexp_inputs) 1390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin break; 1400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 1430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Runs an exhaustive test on the given parameters. 1450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinvoid ExhaustiveTest(int maxatoms, int maxops, 1460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin const vector<string>& alphabet, 1470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin const vector<string>& ops, 1480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin int maxstrlen, const vector<string>& stralphabet, 1490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin const string& wrapper, 1500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin const string& topwrapper) { 1510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (DEBUG_MODE && FLAGS_quick_debug_mode) { 1520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (maxatoms > 1) 1530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin maxatoms--; 1540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (maxops > 1) 1550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin maxops--; 1560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (maxstrlen > 1) 1570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin maxstrlen--; 1580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ExhaustiveTester t(maxatoms, maxops, alphabet, ops, 1600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin maxstrlen, stralphabet, wrapper, 1610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin topwrapper); 1620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin t.Generate(); 1630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (!LOGGING) { 1640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin printf("%d regexps, %d tests, %d failures [%d/%d str]\n", 1650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size()); 1660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin EXPECT_EQ(0, t.failures()); 1680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 1690d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1700d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Runs an exhaustive test using the given parameters and 1710d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// the basic egrep operators. 1720d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinvoid EgrepTest(int maxatoms, int maxops, const string& alphabet, 1730d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin int maxstrlen, const string& stralphabet, 1740d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin const string& wrapper) { 1750d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" }; 1760d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1770d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 0; i < arraysize(tops); i++) { 1780d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ExhaustiveTest(maxatoms, maxops, 1790d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin Split("", alphabet), 1800d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin RegexpGenerator::EgrepOps(), 1810d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin maxstrlen, 1820d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin Split("", stralphabet), 1830d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin wrapper, 1840d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin tops[i]); 1850d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1860d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 1870d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 1880d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} // namespace re2 189