10d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Copyright 2008 The RE2 Authors. All Rights Reserved. 20d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Use of this source code is governed by a BSD-style 30d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// license that can be found in the LICENSE file. 40d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 50d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Exhaustive testing of regular expression matching. 60d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 70d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "util/test.h" 80d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/re2.h" 90d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/testing/exhaustive_tester.h" 100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinDECLARE_string(regexp_engines); 120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinnamespace re2 { 140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test empty string matches (aka "(?:)") 160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(EmptyString, Exhaustive) { 170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ExhaustiveTest(2, 2, Split(" ", "(?:) a"), 180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin RegexpGenerator::EgrepOps(), 190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 5, Split("", "ab"), "", ""); 200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test escaped versions of regexp syntax. 230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(Punctuation, Literals) { 240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<string> alphabet = Explode("()*+?{}[]\\^$."); 250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<string> escaped = alphabet; 260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 0; i < escaped.size(); i++) 270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin escaped[i] = "\\" + escaped[i]; 280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(), 290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 2, alphabet, "", ""); 300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test ^ $ . \A \z in presence of line endings. 330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Have to wrap the empty-width ones in (?:) so that 340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// they can be repeated -- PCRE rejects ^* but allows (?:^)* 350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(LineEnds, Exhaustive) { 360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"), 370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin RegexpGenerator::EgrepOps(), 380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 4, Explode("ab\n"), "", ""); 390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} 400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test what does and does not match \n. 420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// This would be a good test, except that PCRE seems to have a bug: 430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// in single-byte character set mode (the default), 440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// [^a] matches \n, but in UTF-8 mode it does not. 450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// So when we run the test, the tester complains that 460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// we don't agree with PCRE, but it's PCRE that is at fault. 470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// For what it's worth, Perl gets this right (matches 480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// regardless of whether UTF-8 input is selected): 490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// 500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// #!/usr/bin/perl 510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// use POSIX qw(locale_h); 520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// print "matches in latin1\n" if "\n" =~ /[^a]/; 530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// setlocale("en_US.utf8"); 540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// print "matches in utf8\n" if "\n" =~ /[^a]/; 550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// 560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// The rule chosen for RE2 is that by default, like Perl, 570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// dot does not match \n but negated character classes [^a] do. 580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// (?s) will allow dot to match \n; there is no way in RE2 590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// to stop [^a] from matching \n, though the underlying library 600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// provides a mechanism, and RE2 could add new syntax if needed. 610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// 620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// TEST(Newlines, Exhaustive) { 630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// vector<string> empty_vector; 640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"), 650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// RegexpGenerator::EgrepOps(), 660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// 4, Explode("a\n"), ""); 670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// } 680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 690d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin} // namespace re2 700d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 71