1// Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include <vector>
6#include "util/test.h"
7#include "re2/prog.h"
8#include "re2/re2.h"
9#include "re2/regexp.h"
10#include "re2/testing/regexp_generator.h"
11#include "re2/testing/string_generator.h"
12
13namespace re2 {
14
15// Test that C++ strings are compared as uint8s, not int8s.
16// PossibleMatchRange doesn't depend on this, but callers probably will.
17TEST(CplusplusStrings, EightBit) {
18  string s = "\x70";
19  string t = "\xA0";
20  EXPECT_LT(s, t);
21}
22
23struct PrefixTest {
24  const char* regexp;
25  int maxlen;
26  const char* min;
27  const char* max;
28};
29
30static PrefixTest tests[] = {
31  { "",                  10,  "",           "",        },
32  { "Abcdef",            10,  "Abcdef",     "Abcdef"   },
33  { "abc(def|ghi)",      10,  "abcdef",     "abcghi"   },
34  { "a+hello",           10,  "aa",         "ahello"   },
35  { "a*hello",           10,  "a",          "hello"    },
36  { "def|abc",           10,  "abc",        "def"      },
37  { "a(b)(c)[d]",        10,  "abcd",       "abcd"     },
38  { "ab(cab|cat)",       10,  "abcab",      "abcat"    },
39  { "ab(cab|ca)x",       10,  "abcabx",     "abcax"    },
40  { "(ab|x)(c|de)",      10,  "abc",        "xde"      },
41  { "(ab|x)?(c|z)?",     10,  "",           "z"        },
42  { "[^\\s\\S]",         10,  "",           ""         },
43  { "(abc)+",             5,  "abc",        "abcac"    },
44  { "(abc)+",             2,  "ab",         "ac"       },
45  { "(abc)+",             1,  "a",          "b"        },
46  { "[a\xC3\xA1]",        4,  "a",          "\xC3\xA1" },
47  { "a*",                10,  "",           "ab"       },
48
49  { "(?i)Abcdef",        10,  "ABCDEF",     "abcdef"   },
50  { "(?i)abc(def|ghi)",  10,  "ABCDEF",     "abcghi"   },
51  { "(?i)a+hello",       10,  "AA",         "ahello"   },
52  { "(?i)a*hello",       10,  "A",          "hello"    },
53  { "(?i)def|abc",       10,  "ABC",        "def"      },
54  { "(?i)a(b)(c)[d]",    10,  "ABCD",       "abcd"     },
55  { "(?i)ab(cab|cat)",   10,  "ABCAB",      "abcat"    },
56  { "(?i)ab(cab|ca)x",   10,  "ABCABX",     "abcax"    },
57  { "(?i)(ab|x)(c|de)",  10,  "ABC",        "xde"      },
58  { "(?i)(ab|x)?(c|z)?", 10,  "",           "z"        },
59  { "(?i)[^\\s\\S]",     10,  "",           ""         },
60  { "(?i)(abc)+",         5,  "ABC",        "abcac"    },
61  { "(?i)(abc)+",         2,  "AB",         "ac"       },
62  { "(?i)(abc)+",         1,  "A",          "b"        },
63  { "(?i)[a\xC3\xA1]",    4,  "A",          "\xC3\xA1" },
64  { "(?i)a*",            10,  "",           "ab"       },
65  { "(?i)A*",            10,  "",           "ab"       },
66
67  { "\\AAbcdef",         10,  "Abcdef",     "Abcdef"   },
68  { "\\Aabc(def|ghi)",   10,  "abcdef",     "abcghi"   },
69  { "\\Aa+hello",        10,  "aa",         "ahello"   },
70  { "\\Aa*hello",        10,  "a",          "hello"    },
71  { "\\Adef|abc",        10,  "abc",        "def"      },
72  { "\\Aa(b)(c)[d]",     10,  "abcd",       "abcd"     },
73  { "\\Aab(cab|cat)",    10,  "abcab",      "abcat"    },
74  { "\\Aab(cab|ca)x",    10,  "abcabx",     "abcax"    },
75  { "\\A(ab|x)(c|de)",   10,  "abc",        "xde"      },
76  { "\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
77  { "\\A[^\\s\\S]",      10,  "",           ""         },
78  { "\\A(abc)+",          5,  "abc",        "abcac"    },
79  { "\\A(abc)+",          2,  "ab",         "ac"       },
80  { "\\A(abc)+",          1,  "a",          "b"        },
81  { "\\A[a\xC3\xA1]",     4,  "a",          "\xC3\xA1" },
82  { "\\Aa*",             10,  "",           "ab"       },
83
84  { "(?i)\\AAbcdef",         10,  "ABCDEF",     "abcdef"   },
85  { "(?i)\\Aabc(def|ghi)",   10,  "ABCDEF",     "abcghi"   },
86  { "(?i)\\Aa+hello",        10,  "AA",         "ahello"   },
87  { "(?i)\\Aa*hello",        10,  "A",          "hello"    },
88  { "(?i)\\Adef|abc",        10,  "ABC",        "def"      },
89  { "(?i)\\Aa(b)(c)[d]",     10,  "ABCD",       "abcd"     },
90  { "(?i)\\Aab(cab|cat)",    10,  "ABCAB",      "abcat"    },
91  { "(?i)\\Aab(cab|ca)x",    10,  "ABCABX",     "abcax"    },
92  { "(?i)\\A(ab|x)(c|de)",   10,  "ABC",        "xde"      },
93  { "(?i)\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
94  { "(?i)\\A[^\\s\\S]",      10,  "",           ""         },
95  { "(?i)\\A(abc)+",          5,  "ABC",        "abcac"    },
96  { "(?i)\\A(abc)+",          2,  "AB",         "ac"       },
97  { "(?i)\\A(abc)+",          1,  "A",          "b"        },
98  { "(?i)\\A[a\xC3\xA1]",     4,  "A",          "\xC3\xA1" },
99  { "(?i)\\Aa*",             10,  "",           "ab"       },
100  { "(?i)\\AA*",             10,  "",           "ab"       },
101};
102
103TEST(PossibleMatchRange, HandWritten) {
104  for (int i = 0; i < arraysize(tests); i++) {
105    for (int j = 0; j < 2; j++) {
106      const PrefixTest& t = tests[i];
107      string min, max;
108      if (j == 0) {
109        LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
110        Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
111        CHECK(re);
112        Prog* prog = re->CompileToProg(0);
113        CHECK(prog);
114        CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen))
115          << " " << t.regexp;
116        delete prog;
117        re->Decref();
118      } else {
119        CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
120      }
121      EXPECT_EQ(t.min, min) << t.regexp;
122      EXPECT_EQ(t.max, max) << t.regexp;
123    }
124  }
125}
126
127// Test cases where PossibleMatchRange should return false.
128TEST(PossibleMatchRange, Failures) {
129  string min, max;
130
131  // Fails because no room to write max.
132  EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
133
134  // Fails because there is no max -- any non-empty string matches
135  // or begins a match.  Have to use Latin-1 input, because there
136  // are no valid UTF-8 strings beginning with byte 0xFF.
137  EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
138               PossibleMatchRange(&min, &max, 10))
139    << "min=" << CEscape(min) << ", max=" << CEscape(max);
140  EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
141               PossibleMatchRange(&min, &max, 10))
142    << "min=" << CEscape(min) << ", max=" << CEscape(max);
143  EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
144               PossibleMatchRange(&min, &max, 10))
145    << "min=" << CEscape(min) << ", max=" << CEscape(max);
146  EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
147               PossibleMatchRange(&min, &max, 10))
148    << "min=" << CEscape(min) << ", max=" << CEscape(max);
149  EXPECT_FALSE(RE2(".*", RE2::Latin1).
150               PossibleMatchRange(&min, &max, 10))
151    << "min=" << CEscape(min) << ", max=" << CEscape(max);
152  EXPECT_FALSE(RE2("\\C*").
153               PossibleMatchRange(&min, &max, 10))
154    << "min=" << CEscape(min) << ", max=" << CEscape(max);
155
156  // Fails because it's a malformed regexp.
157  EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
158    << "min=" << CEscape(min) << ", max=" << CEscape(max);
159}
160
161// Exhaustive test: generate all regexps within parameters,
162// then generate all strings of a given length over a given alphabet,
163// then check that the prefix information agrees with whether
164// the regexp matches each of the strings.
165class PossibleMatchTester : public RegexpGenerator {
166 public:
167  PossibleMatchTester(int maxatoms,
168                      int maxops,
169                      const vector<string>& alphabet,
170                      const vector<string>& ops,
171                      int maxstrlen,
172                      const vector<string>& stralphabet)
173    : RegexpGenerator(maxatoms, maxops, alphabet, ops),
174      strgen_(maxstrlen, stralphabet),
175      regexps_(0), tests_(0) { }
176
177  int regexps()  { return regexps_; }
178  int tests()    { return tests_; }
179
180  // Needed for RegexpGenerator interface.
181  void HandleRegexp(const string& regexp);
182
183 private:
184  StringGenerator strgen_;
185
186  int regexps_;   // Number of HandleRegexp calls
187  int tests_;     // Number of regexp tests.
188
189  DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester);
190};
191
192// Processes a single generated regexp.
193// Checks that all accepted strings agree with the prefix range.
194void PossibleMatchTester::HandleRegexp(const string& regexp) {
195  regexps_++;
196
197  VLOG(3) << CEscape(regexp);
198
199  RE2 re(regexp, RE2::Latin1);
200  CHECK_EQ(re.error(), "");
201
202  string min, max;
203  if(!re.PossibleMatchRange(&min, &max, 10)) {
204    // There's no good max for "\\C*".  Can't use strcmp
205    // because sometimes it gets embedded in more
206    // complicated expressions.
207    if(strstr(regexp.c_str(), "\\C*"))
208      return;
209    LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
210  }
211
212  strgen_.Reset();
213  while (strgen_.HasNext()) {
214    const StringPiece& s = strgen_.Next();
215    tests_++;
216    if (!RE2::FullMatch(s, re))
217      continue;
218    CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max;
219    CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min;
220  }
221}
222
223TEST(PossibleMatchRange, Exhaustive) {
224  int natom = 3;
225  int noperator = 3;
226  int stringlen = 5;
227  if (DEBUG_MODE) {
228    natom = 2;
229    noperator = 3;
230    stringlen = 3;
231  }
232  PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
233                 RegexpGenerator::EgrepOps(),
234                 stringlen, Explode("ab4"));
235  t.Generate();
236  LOG(INFO) << t.regexps() << " regexps, "
237            << t.tests() << " tests";
238}
239
240}  // namespace re2
241