15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2008 The RE2 Authors.  All Rights Reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Exhaustive testing of regular expression matching.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/test.h"
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/re2.h"
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/testing/exhaustive_tester.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DECLARE_string(regexp_engines);
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Test empty string matches (aka "(?:)")
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(EmptyString, Exhaustive) {
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ExhaustiveTest(2, 2, Split(" ", "(?:) a"),
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 RegexpGenerator::EgrepOps(),
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 5, Split("", "ab"), "", "");
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Test escaped versions of regexp syntax.
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(Punctuation, Literals) {
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  vector<string> alphabet = Explode("()*+?{}[]\\^$.");
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  vector<string> escaped = alphabet;
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < escaped.size(); i++)
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    escaped[i] = "\\" + escaped[i];
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 2, alphabet, "", "");
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Test ^ $ . \A \z in presence of line endings.
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Have to wrap the empty-width ones in (?:) so that
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// they can be repeated -- PCRE rejects ^* but allows (?:^)*
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)TEST(LineEnds, Exhaustive) {
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"),
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 RegexpGenerator::EgrepOps(),
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 4, Explode("ab\n"), "", "");
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Test what does and does not match \n.
// This would be a good test, except that PCRE seems to have a bug:
// in single-byte character set mode (the default),
// [^a] matches \n, but in UTF-8 mode it does not.
// So when we run the test, the tester complains that
// we don't agree with PCRE, but it's PCRE that is at fault.
// For what it's worth, Perl gets this right (matches
// regardless of whether UTF-8 input is selected):
//
//     #!/usr/bin/perl
//     use POSIX qw(locale_h);
//     print "matches in latin1\n" if "\n" =~ /[^a]/;
//     setlocale("en_US.utf8");
//     print "matches in utf8\n" if "\n" =~ /[^a]/;
//
// The rule chosen for RE2 is that by default, like Perl,
// dot does not match \n but negated character classes [^a] do.
// (?s) will allow dot to match \n; there is no way in RE2
// to stop [^a] from matching \n, though the underlying library
// provides a mechanism, and RE2 could add new syntax if needed.
//
// TEST(Newlines, Exhaustive) {
//   vector<string> empty_vector;
//   ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
//                  RegexpGenerator::EgrepOps(),
//                  4, Explode("a\n"), "");
// }
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace re2
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
71