10d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Copyright 2008 The RE2 Authors.  All Rights Reserved.
20d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Use of this source code is governed by a BSD-style
30d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// license that can be found in the LICENSE file.
40d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
50d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Exhaustive testing of regular expression matching.
60d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
70d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "util/test.h"
80d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/re2.h"
90d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#include "re2/testing/exhaustive_tester.h"
100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
// Declares the string flag regexp_engines for use in this translation unit.
// NOTE(review): nothing visible in this file reads the flag directly --
// presumably the exhaustive tester consults it to pick engines; confirm.
110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinDECLARE_string(regexp_engines);
120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkinnamespace re2 {
140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test empty string matches (aka "(?:)")
160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(EmptyString, Exhaustive) {
170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  ExhaustiveTest(2, 2, Split(" ", "(?:) a"),
180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                 RegexpGenerator::EgrepOps(),
190d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                 5, Split("", "ab"), "", "");
200d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
210d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
220d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test escaped versions of regexp syntax.
230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(Punctuation, Literals) {
240d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  vector<string> alphabet = Explode("()*+?{}[]\\^$.");
250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  vector<string> escaped = alphabet;
260d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  for (int i = 0; i < escaped.size(); i++)
270d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin    escaped[i] = "\\" + escaped[i];
280d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
290d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                 2, alphabet, "", "");
300d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Test ^ $ . \A \z in presence of line endings.
330d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// Have to wrap the empty-width ones in (?:) so that
340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin// they can be repeated -- PCRE rejects ^* but allows (?:^)*
350d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander GutkinTEST(LineEnds, Exhaustive) {
360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin  ExhaustiveTest(2, 2, Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)"),
370d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                 RegexpGenerator::EgrepOps(),
380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin                 4, Explode("ab\n"), "", "");
390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}
400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
// Test what does and does not match \n.
// This would be a good test, except that PCRE seems to have a bug:
// in single-byte character set mode (the default),
// [^a] matches \n, but in UTF-8 mode it does not.
// So when we run the test, the tester complains that
// we don't agree with PCRE, even though it is PCRE that is at fault.
// For what it's worth, Perl gets this right ([^a] matches \n
// regardless of whether UTF-8 input is selected):
//
//     #!/usr/bin/perl
//     use POSIX qw(locale_h);
//     print "matches in latin1\n" if "\n" =~ /[^a]/;
//     setlocale(LC_ALL, "en_US.utf8");
//     print "matches in utf8\n" if "\n" =~ /[^a]/;
//
// The rule chosen for RE2 is that by default, like Perl,
// dot does not match \n but negated character classes such as [^a] do.
// (?s) will allow dot to match \n; there is no way in RE2
// to stop [^a] from matching \n, though the underlying library
// provides a mechanism, and RE2 could add new syntax if needed.
//
// TEST(Newlines, Exhaustive) {
//   vector<string> empty_vector;
//   ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
//                  RegexpGenerator::EgrepOps(),
//                  4, Explode("a\n"), "", "");
// }
680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
690d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin}  // namespace re2
700d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin
71