15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2008 The RE2 Authors. All Rights Reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Exhaustive testing of regular expression matching. 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/test.h" 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/re2.h" 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/testing/exhaustive_tester.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DECLARE_string(regexp_engines); 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Test empty string matches (aka "(?:)") 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(EmptyString, Exhaustive) { 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExhaustiveTest(2, 2, Split(" ", "(?:) a"), 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpGenerator::EgrepOps(), 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5, Split("", "ab"), "", ""); 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Test escaped versions of regexp syntax. 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Punctuation, Literals) { 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<string> alphabet = Explode("()*+?{}[]\\^$."); 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<string> escaped = alphabet; 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < escaped.size(); i++) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) escaped[i] = "\\" + escaped[i]; 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(), 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2, alphabet, "", ""); 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Test ^ $ . \A \z in presence of line endings. 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Have to wrap the empty-width ones in (?:) so that 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// they can be repeated -- PCRE rejects ^* but allows (?:^)* 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(LineEnds, Exhaustive) { 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"), 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpGenerator::EgrepOps(), 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4, Explode("ab\n"), "", ""); 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Test what does and does not match \n. 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This would be a good test, except that PCRE seems to have a bug: 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in single-byte character set mode (the default), 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// [^a] matches \n, but in UTF-8 mode it does not. 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// So when we run the test, the tester complains that 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we don't agree with PCRE, but it's PCRE that is at fault. 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For what it's worth, Perl gets this right (matches 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// regardless of whether UTF-8 input is selected): 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// #!/usr/bin/perl 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// use POSIX qw(locale_h); 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// print "matches in latin1\n" if "\n" =~ /[^a]/; 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// setlocale("en_US.utf8"); 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// print "matches in utf8\n" if "\n" =~ /[^a]/; 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The rule chosen for RE2 is that by default, like Perl, 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// dot does not match \n but negated character classes [^a] do. 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (?s) will allow dot to match \n; there is no way in RE2 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to stop [^a] from matching \n, though the underlying library 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// provides a mechanism, and RE2 could add new syntax if needed. 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// TEST(Newlines, Exhaustive) { 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// vector<string> empty_vector; 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"), 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// RegexpGenerator::EgrepOps(), 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 4, Explode("a\n"), ""); 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// } 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace re2 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 71