1// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5#include <vector> 6#include "util/test.h" 7#include "re2/prog.h" 8#include "re2/re2.h" 9#include "re2/regexp.h" 10#include "re2/testing/regexp_generator.h" 11#include "re2/testing/string_generator.h" 12 13namespace re2 { 14 15// Test that C++ strings are compared as uint8s, not int8s. 16// PossibleMatchRange doesn't depend on this, but callers probably will. 17TEST(CplusplusStrings, EightBit) { 18 string s = "\x70"; 19 string t = "\xA0"; 20 EXPECT_LT(s, t); 21} 22 23struct PrefixTest { 24 const char* regexp; 25 int maxlen; 26 const char* min; 27 const char* max; 28}; 29 30static PrefixTest tests[] = { 31 { "", 10, "", "", }, 32 { "Abcdef", 10, "Abcdef", "Abcdef" }, 33 { "abc(def|ghi)", 10, "abcdef", "abcghi" }, 34 { "a+hello", 10, "aa", "ahello" }, 35 { "a*hello", 10, "a", "hello" }, 36 { "def|abc", 10, "abc", "def" }, 37 { "a(b)(c)[d]", 10, "abcd", "abcd" }, 38 { "ab(cab|cat)", 10, "abcab", "abcat" }, 39 { "ab(cab|ca)x", 10, "abcabx", "abcax" }, 40 { "(ab|x)(c|de)", 10, "abc", "xde" }, 41 { "(ab|x)?(c|z)?", 10, "", "z" }, 42 { "[^\\s\\S]", 10, "", "" }, 43 { "(abc)+", 5, "abc", "abcac" }, 44 { "(abc)+", 2, "ab", "ac" }, 45 { "(abc)+", 1, "a", "b" }, 46 { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, 47 { "a*", 10, "", "ab" }, 48 49 { "(?i)Abcdef", 10, "ABCDEF", "abcdef" }, 50 { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" }, 51 { "(?i)a+hello", 10, "AA", "ahello" }, 52 { "(?i)a*hello", 10, "A", "hello" }, 53 { "(?i)def|abc", 10, "ABC", "def" }, 54 { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" }, 55 { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" }, 56 { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" }, 57 { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" }, 58 { "(?i)(ab|x)?(c|z)?", 10, "", "z" }, 59 { "(?i)[^\\s\\S]", 10, "", "" }, 60 { "(?i)(abc)+", 5, "ABC", "abcac" }, 61 { "(?i)(abc)+", 2, "AB", "ac" }, 62 { "(?i)(abc)+", 1, "A", "b" }, 63 { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, 64 { "(?i)a*", 10, "", "ab" }, 65 { "(?i)A*", 10, "", "ab" }, 66 67 { "\\AAbcdef", 10, "Abcdef", "Abcdef" }, 68 { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" }, 69 { "\\Aa+hello", 10, "aa", "ahello" }, 70 { "\\Aa*hello", 10, "a", "hello" }, 71 { "\\Adef|abc", 10, "abc", "def" }, 72 { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" }, 73 { "\\Aab(cab|cat)", 10, "abcab", "abcat" }, 74 { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" }, 75 { "\\A(ab|x)(c|de)", 10, "abc", "xde" }, 76 { "\\A(ab|x)?(c|z)?", 10, "", "z" }, 77 { "\\A[^\\s\\S]", 10, "", "" }, 78 { "\\A(abc)+", 5, "abc", "abcac" }, 79 { "\\A(abc)+", 2, "ab", "ac" }, 80 { "\\A(abc)+", 1, "a", "b" }, 81 { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" }, 82 { "\\Aa*", 10, "", "ab" }, 83 84 { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" }, 85 { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" }, 86 { "(?i)\\Aa+hello", 10, "AA", "ahello" }, 87 { "(?i)\\Aa*hello", 10, "A", "hello" }, 88 { "(?i)\\Adef|abc", 10, "ABC", "def" }, 89 { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" }, 90 { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" }, 91 { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" }, 92 { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" }, 93 { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" }, 94 { "(?i)\\A[^\\s\\S]", 10, "", "" }, 95 { "(?i)\\A(abc)+", 5, "ABC", "abcac" }, 96 { "(?i)\\A(abc)+", 2, "AB", "ac" }, 97 { "(?i)\\A(abc)+", 1, "A", "b" }, 98 { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" }, 99 { "(?i)\\Aa*", 10, "", "ab" }, 100 { "(?i)\\AA*", 10, "", "ab" }, 101}; 102 103TEST(PossibleMatchRange, HandWritten) { 104 for (int i = 0; i < arraysize(tests); i++) { 105 for (int j = 0; j < 2; j++) { 106 const PrefixTest& t = tests[i]; 107 string min, max; 108 if (j == 0) { 109 LOG(INFO) << "Checking regexp=" << CEscape(t.regexp); 110 Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); 111 CHECK(re); 112 Prog* prog = re->CompileToProg(0); 113 CHECK(prog); 114 CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen)) 115 << " " << t.regexp; 116 delete prog; 117 re->Decref(); 118 } else { 119 CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen)); 120 } 121 EXPECT_EQ(t.min, min) << t.regexp; 122 EXPECT_EQ(t.max, max) << t.regexp; 123 } 124 } 125} 126 127// Test cases where PossibleMatchRange should return false. 128TEST(PossibleMatchRange, Failures) { 129 string min, max; 130 131 // Fails because no room to write max. 132 EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0)); 133 134 // Fails because there is no max -- any non-empty string matches 135 // or begins a match. Have to use Latin-1 input, because there 136 // are no valid UTF-8 strings beginning with byte 0xFF. 137 EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1). 138 PossibleMatchRange(&min, &max, 10)) 139 << "min=" << CEscape(min) << ", max=" << CEscape(max); 140 EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1). 141 PossibleMatchRange(&min, &max, 10)) 142 << "min=" << CEscape(min) << ", max=" << CEscape(max); 143 EXPECT_FALSE(RE2(".+hello", RE2::Latin1). 144 PossibleMatchRange(&min, &max, 10)) 145 << "min=" << CEscape(min) << ", max=" << CEscape(max); 146 EXPECT_FALSE(RE2(".*hello", RE2::Latin1). 147 PossibleMatchRange(&min, &max, 10)) 148 << "min=" << CEscape(min) << ", max=" << CEscape(max); 149 EXPECT_FALSE(RE2(".*", RE2::Latin1). 150 PossibleMatchRange(&min, &max, 10)) 151 << "min=" << CEscape(min) << ", max=" << CEscape(max); 152 EXPECT_FALSE(RE2("\\C*"). 153 PossibleMatchRange(&min, &max, 10)) 154 << "min=" << CEscape(min) << ", max=" << CEscape(max); 155 156 // Fails because it's a malformed regexp. 157 EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10)) 158 << "min=" << CEscape(min) << ", max=" << CEscape(max); 159} 160 161// Exhaustive test: generate all regexps within parameters, 162// then generate all strings of a given length over a given alphabet, 163// then check that the prefix information agrees with whether 164// the regexp matches each of the strings. 165class PossibleMatchTester : public RegexpGenerator { 166 public: 167 PossibleMatchTester(int maxatoms, 168 int maxops, 169 const vector<string>& alphabet, 170 const vector<string>& ops, 171 int maxstrlen, 172 const vector<string>& stralphabet) 173 : RegexpGenerator(maxatoms, maxops, alphabet, ops), 174 strgen_(maxstrlen, stralphabet), 175 regexps_(0), tests_(0) { } 176 177 int regexps() { return regexps_; } 178 int tests() { return tests_; } 179 180 // Needed for RegexpGenerator interface. 181 void HandleRegexp(const string& regexp); 182 183 private: 184 StringGenerator strgen_; 185 186 int regexps_; // Number of HandleRegexp calls 187 int tests_; // Number of regexp tests. 188 189 DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester); 190}; 191 192// Processes a single generated regexp. 193// Checks that all accepted strings agree with the prefix range. 194void PossibleMatchTester::HandleRegexp(const string& regexp) { 195 regexps_++; 196 197 VLOG(3) << CEscape(regexp); 198 199 RE2 re(regexp, RE2::Latin1); 200 CHECK_EQ(re.error(), ""); 201 202 string min, max; 203 if(!re.PossibleMatchRange(&min, &max, 10)) { 204 // There's no good max for "\\C*". Can't use strcmp 205 // because sometimes it gets embedded in more 206 // complicated expressions. 207 if(strstr(regexp.c_str(), "\\C*")) 208 return; 209 LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp); 210 } 211 212 strgen_.Reset(); 213 while (strgen_.HasNext()) { 214 const StringPiece& s = strgen_.Next(); 215 tests_++; 216 if (!RE2::FullMatch(s, re)) 217 continue; 218 CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max; 219 CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min; 220 } 221} 222 223TEST(PossibleMatchRange, Exhaustive) { 224 int natom = 3; 225 int noperator = 3; 226 int stringlen = 5; 227 if (DEBUG_MODE) { 228 natom = 2; 229 noperator = 3; 230 stringlen = 3; 231 } 232 PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"), 233 RegexpGenerator::EgrepOps(), 234 stringlen, Explode("ab4")); 235 t.Generate(); 236 LOG(INFO) << t.regexps() << " regexps, " 237 << t.tests() << " tests"; 238} 239 240} // namespace re2 241