filtered_re2_test.cc revision 0d4c52358a1af421705c54bd8a9fdd8a30558a2e
1// Copyright 2009 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "util/test.h"
6#include "re2/filtered_re2.h"
7#include "re2/re2.h"
8
9DECLARE_int32(filtered_re2_min_atom_len); // From prefilter_tree.cc
10
11namespace re2 {
12
13struct FilterTestVars {
14  vector<string> atoms;
15  vector<int> atom_indices;
16  vector<int> matches;
17  RE2::Options opts;
18  FilteredRE2 f;
19};
20
21TEST(FilteredRE2Test, EmptyTest) {
22  FilterTestVars v;
23  v.f.AllMatches("foo", v.atom_indices, &v.matches);
24  EXPECT_EQ(0, v.matches.size());
25}
26
27TEST(FilteredRE2Test, SmallOrTest) {
28  FLAGS_filtered_re2_min_atom_len = 4;
29
30  FilterTestVars v;
31  int id;
32  v.f.Add("(foo|bar)", v.opts, &id);
33
34  v.f.Compile(&v.atoms);
35  EXPECT_EQ(0, v.atoms.size());
36
37  v.f.AllMatches("lemurs bar", v.atom_indices, &v.matches);
38  EXPECT_EQ(1, v.matches.size());
39  EXPECT_EQ(id, v.matches[0]);
40}
41
42TEST(FilteredRE2Test, SmallLatinTest) {
43  FLAGS_filtered_re2_min_atom_len = 3;
44  FilterTestVars v;
45  int id;
46
47  v.opts.set_utf8(false);
48  v.f.Add("\xde\xadQ\xbe\xef", v.opts, &id);
49  v.f.Compile(&v.atoms);
50  EXPECT_EQ(1, v.atoms.size());
51  EXPECT_EQ(v.atoms[0], "\xde\xadq\xbe\xef");
52
53  v.atom_indices.push_back(0);
54  v.f.AllMatches("foo\xde\xadQ\xbe\xeflemur", v.atom_indices, &v.matches);
55  EXPECT_EQ(1, v.matches.size());
56  EXPECT_EQ(id, v.matches[0]);
57}
58
59struct AtomTest {
60  const char* testname;
61  // If any test needs more than this many regexps or atoms, increase
62  // the size of the corresponding array.
63  const char* regexps[20];
64  const char* atoms[20];
65};
66
67AtomTest atom_tests[] = {
68  {
69    // This test checks to make sure empty patterns are allowed.
70    "CheckEmptyPattern",
71    {""},
72    {}
73  }, {
74    // This test checks that all atoms of length greater than min length
75    // are found, and no atoms that are of smaller length are found.
76    "AllAtomsGtMinLengthFound", {
77      "(abc123|def456|ghi789).*mnop[x-z]+",
78      "abc..yyy..zz",
79      "mnmnpp[a-z]+PPP"
80    }, {
81      "abc123",
82      "def456",
83      "ghi789",
84      "mnop",
85      "abc",
86      "yyy",
87      "mnmnpp",
88      "ppp"
89    }
90  }, {
91    // Test to make sure that any atoms that have another atom as a
92    // substring in an OR are removed; that is, only the shortest
93    // substring is kept.
94    "SubstrAtomRemovesSuperStrInOr", {
95      "(abc123|abc|ghi789|abc1234).*[x-z]+",
96      "abcd..yyy..yyyzzz",
97      "mnmnpp[a-z]+PPP"
98    }, {
99      "abc",
100      "ghi789",
101      "abcd",
102      "yyy",
103      "yyyzzz",
104      "mnmnpp",
105      "ppp"
106    }
107  }, {
108    // Test character class expansion.
109    "CharClassExpansion", {
110      "m[a-c][d-f]n.*[x-z]+",
111      "[x-y]bcde[ab]"
112    }, {
113      "madn", "maen", "mafn",
114      "mbdn", "mben", "mbfn",
115      "mcdn", "mcen", "mcfn",
116      "xbcdea", "xbcdeb",
117      "ybcdea", "ybcdeb"
118    }
119  }, {
120    // Test upper/lower of non-ASCII.
121    "UnicodeLower", {
122      "(?i)ΔδΠϖπΣςσ",
123      "ΛΜΝΟΠ",
124      "ψρστυ",
125    }, {
126      "δδπππσσσ",
127      "λμνοπ",
128      "ψρστυ",
129    },
130  },
131};
132
133void AddRegexpsAndCompile(const char* regexps[],
134                          int n,
135                          struct FilterTestVars* v) {
136  for (int i = 0; i < n; i++) {
137    int id;
138    v->f.Add(regexps[i], v->opts, &id);
139  }
140  v->f.Compile(&v->atoms);
141}
142
143bool CheckExpectedAtoms(const char* atoms[],
144                        int n,
145                        const char* testname,
146                        struct FilterTestVars* v) {
147  vector<string> expected;
148  for (int i = 0; i < n; i++)
149    expected.push_back(atoms[i]);
150
151  bool pass = expected.size() == v->atoms.size();
152
153  sort(v->atoms.begin(), v->atoms.end());
154  sort(expected.begin(), expected.end());
155  for (int i = 0; pass && i < n; i++)
156      pass = pass && expected[i] == v->atoms[i];
157
158  if (!pass) {
159    LOG(WARNING) << "Failed " << testname;
160    LOG(WARNING) << "Expected #atoms = " << expected.size();
161    for (int i = 0; i < expected.size(); i++)
162      LOG(WARNING) << expected[i];
163    LOG(WARNING) << "Found #atoms = " << v->atoms.size();
164    for (int i = 0; i < v->atoms.size(); i++)
165      LOG(WARNING) << v->atoms[i];
166  }
167
168  return pass;
169}
170
171TEST(FilteredRE2Test, AtomTests) {
172  FLAGS_filtered_re2_min_atom_len = 3;
173
174  int nfail = 0;
175  for (int i = 0; i < arraysize(atom_tests); i++) {
176    FilterTestVars v;
177    AtomTest* t = &atom_tests[i];
178    int natom, nregexp;
179    for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
180      if (t->regexps[nregexp] == NULL)
181        break;
182    for (natom = 0; natom < arraysize(t->atoms); natom++)
183      if (t->atoms[natom] == NULL)
184        break;
185    AddRegexpsAndCompile(t->regexps, nregexp, &v);
186    if (!CheckExpectedAtoms(t->atoms, natom, t->testname, &v))
187      nfail++;
188  }
189  EXPECT_EQ(0, nfail);
190}
191
192void FindAtomIndices(const vector<string> atoms,
193                     const vector<string> matched_atoms,
194                     vector<int>* atom_indices) {
195  atom_indices->clear();
196  for (int i = 0; i < matched_atoms.size(); i++) {
197    int j = 0;
198    for (; j < atoms.size(); j++) {
199      if (matched_atoms[i] == atoms[j]) {
200        atom_indices->push_back(j);
201        break;
202      }
203      EXPECT_LT(j, atoms.size());
204    }
205  }
206}
207
208TEST(FilteredRE2Test, MatchEmptyPattern) {
209  FLAGS_filtered_re2_min_atom_len = 3;
210
211  FilterTestVars v;
212  AtomTest* t = &atom_tests[0];
213  // We are using the regexps used in one of the atom tests
214  // for this test. Adding the EXPECT here to make sure
215  // the index we use for the test is for the correct test.
216  EXPECT_EQ("CheckEmptyPattern", string(t->testname));
217  int nregexp;
218  for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
219    if (t->regexps[nregexp] == NULL)
220      break;
221  AddRegexpsAndCompile(t->regexps, nregexp, &v);
222  string text = "0123";
223  vector<int> atom_ids;
224  vector<int> matching_regexps;
225  EXPECT_EQ(0, v.f.FirstMatch(text, atom_ids));
226}
227
228TEST(FilteredRE2Test, MatchTests) {
229  FLAGS_filtered_re2_min_atom_len = 3;
230
231  FilterTestVars v;
232  AtomTest* t = &atom_tests[2];
233  // We are using the regexps used in one of the atom tests
234  // for this test.
235  EXPECT_EQ("SubstrAtomRemovesSuperStrInOr", string(t->testname));
236  int nregexp;
237  for (nregexp = 0; nregexp < arraysize(t->regexps); nregexp++)
238    if (t->regexps[nregexp] == NULL)
239      break;
240  AddRegexpsAndCompile(t->regexps, nregexp, &v);
241
242  string text = "abc121212xyz";
243  // atoms = abc
244  vector<int> atom_ids;
245  vector<string> atoms;
246  atoms.push_back("abc");
247  FindAtomIndices(v.atoms, atoms, &atom_ids);
248  vector<int> matching_regexps;
249  v.f.AllMatches(text, atom_ids, &matching_regexps);
250  EXPECT_EQ(1, matching_regexps.size());
251
252  text = "abc12312yyyzzz";
253  atoms.clear();
254  atoms.push_back("abc");
255  atoms.push_back("yyy");
256  atoms.push_back("yyyzzz");
257  FindAtomIndices(v.atoms, atoms, &atom_ids);
258  v.f.AllMatches(text, atom_ids, &matching_regexps);
259  EXPECT_EQ(1, matching_regexps.size());
260
261  text = "abcd12yyy32yyyzzz";
262  atoms.clear();
263  atoms.push_back("abc");
264  atoms.push_back("abcd");
265  atoms.push_back("yyy");
266  atoms.push_back("yyyzzz");
267  FindAtomIndices(v.atoms, atoms, &atom_ids);
268  LOG(INFO) << "S: " << atom_ids.size();
269  for (int i = 0; i < atom_ids.size(); i++)
270    LOG(INFO) << "i: " << i << " : " << atom_ids[i];
271  v.f.AllMatches(text, atom_ids, &matching_regexps);
272  EXPECT_EQ(2, matching_regexps.size());
273}
274
275}  //  namespace re2
276