1// Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "util/test.h"
6#include "util/thread.h"
7#include "re2/prog.h"
8#include "re2/re2.h"
9#include "re2/regexp.h"
10#include "re2/testing/regexp_generator.h"
11#include "re2/testing/string_generator.h"
12
13DECLARE_bool(re2_dfa_bail_when_slow);
14
15DEFINE_int32(size, 8, "log2(number of DFA nodes)");
16DEFINE_int32(repeat, 2, "Repetition count.");
17DEFINE_int32(threads, 4, "number of threads");
18
19namespace re2 {
20
21// Check that multithreaded access to DFA class works.
22
23// Helper thread: builds entire DFA for prog.
24class BuildThread : public Thread {
25 public:
26  BuildThread(Prog* prog) : prog_(prog) {}
27  virtual void Run() {
28    CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
29  }
30
31 private:
32  Prog* prog_;
33};
34
35TEST(Multithreaded, BuildEntireDFA) {
36  // Create regexp with 2^FLAGS_size states in DFA.
37  string s = "a";
38  for (int i = 0; i < FLAGS_size; i++)
39    s += "[ab]";
40  s += "b";
41
42  // Check that single-threaded code works.
43  {
44    //LOG(INFO) << s;
45    Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
46    CHECK(re);
47    Prog* prog = re->CompileToProg(0);
48    CHECK(prog);
49    BuildThread* t = new BuildThread(prog);
50    t->SetJoinable(true);
51    t->Start();
52    t->Join();
53    delete t;
54    delete prog;
55    re->Decref();
56  }
57
58  // Build the DFA simultaneously in a bunch of threads.
59  for (int i = 0; i < FLAGS_repeat; i++) {
60    Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
61    CHECK(re);
62    Prog* prog = re->CompileToProg(0);
63    CHECK(prog);
64
65    vector<BuildThread*> threads;
66    for (int j = 0; j < FLAGS_threads; j++) {
67      BuildThread *t = new BuildThread(prog);
68      t->SetJoinable(true);
69      threads.push_back(t);
70    }
71    for (int j = 0; j < FLAGS_threads; j++)
72      threads[j]->Start();
73    for (int j = 0; j < FLAGS_threads; j++) {
74      threads[j]->Join();
75      delete threads[j];
76    }
77
78    // One more compile, to make sure everything is okay.
79    prog->BuildEntireDFA(Prog::kFirstMatch);
80    delete prog;
81    re->Decref();
82  }
83}
84
85// Check that DFA size requirements are followed.
86// BuildEntireDFA will, like SearchDFA, stop building out
87// the DFA once the memory limits are reached.
88TEST(SingleThreaded, BuildEntireDFA) {
89  // Create regexp with 2^30 states in DFA.
90  string s = "a";
91  for (int i = 0; i < 30; i++)
92    s += "[ab]";
93  s += "b";
94
95  //LOG(INFO) << s;
96  Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
97  CHECK(re);
98  int max = 24;
99  for (int i = 17; i < max; i++) {
100    int limit = 1<<i;
101    int usage;
102    //int progusage, dfamem;
103    {
104      testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
105      Prog* prog = re->CompileToProg(limit);
106      CHECK(prog);
107      //progusage = m.HeapGrowth();
108      //dfamem = prog->dfa_mem();
109      prog->BuildEntireDFA(Prog::kFirstMatch);
110      prog->BuildEntireDFA(Prog::kLongestMatch);
111      usage = m.HeapGrowth();
112      delete prog;
113    }
114    if (!UsingMallocCounter)
115      continue;
116    //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n",
117    //                          limit, progusage, dfamem, usage);
118    CHECK_GT(usage, limit*9/10);
119    CHECK_LT(usage, limit + (16<<10));  // 16kB of slop okay
120  }
121  re->Decref();
122}
123
124// Generates and returns a string over binary alphabet {0,1} that contains
125// all possible binary sequences of length n as subsequences.  The obvious
126// brute force method would generate a string of length n * 2^n, but this
127// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
128// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
129// Such a string is useful for testing a DFA.  If you have a DFA
130// where distinct last n bytes implies distinct states, then running on a
131// DeBruijn string causes the DFA to need to create a new state at every
132// position in the input, never reusing any states until it gets to the
133// end of the string.  This is the worst possible case for DFA execution.
134static string DeBruijnString(int n) {
135  CHECK_LT(n, 8*sizeof(int));
136  CHECK_GT(n, 0);
137
138  vector<bool> did(1<<n);
139  for (int i = 0; i < 1<<n; i++)
140    did[i] = false;
141
142  string s;
143  for (int i = 0; i < n-1; i++)
144    s.append("0");
145  int bits = 0;
146  int mask = (1<<n) - 1;
147  for (int i = 0; i < (1<<n); i++) {
148    bits <<= 1;
149    bits &= mask;
150    if (!did[bits|1]) {
151      bits |= 1;
152      s.append("1");
153    } else {
154      s.append("0");
155    }
156    CHECK(!did[bits]);
157    did[bits] = true;
158  }
159  return s;
160}
161
162// Test that the DFA gets the right result even if it runs
163// out of memory during a search.  The regular expression
164// 0[01]{n}$ matches a binary string of 0s and 1s only if
165// the (n+1)th-to-last character is a 0.  Matching this in
166// a single forward pass (as done by the DFA) requires
167// keeping one bit for each of the last n+1 characters
168// (whether each was a 0), or 2^(n+1) possible states.
169// If we run this regexp to search in a string that contains
170// every possible n-character binary string as a substring,
171// then it will have to run through at least 2^n states.
172// States are big data structures -- certainly more than 1 byte --
173// so if the DFA can search correctly while staying within a
174// 2^n byte limit, it must be handling out-of-memory conditions
175// gracefully.
176TEST(SingleThreaded, SearchDFA) {
177  // Choice of n is mostly arbitrary, except that:
178  //   * making n too big makes the test run for too long.
179  //   * making n too small makes the DFA refuse to run,
180  //     because it has so little memory compared to the program size.
181  // Empirically, n = 18 is a good compromise between the two.
182  const int n = 18;
183
184  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
185                             Regexp::LikePerl, NULL);
186  CHECK(re);
187
188  // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
189  // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
190  string no_match = DeBruijnString(n);
191  string match = no_match + "0";
192
193  // The De Bruijn string is the worst case input for this regexp.
194  // By default, the DFA will notice that it is flushing its cache
195  // too frequently and will bail out early, so that RE2 can use the
196  // NFA implementation instead.  (The DFA loses its speed advantage
197  // if it can't get a good cache hit rate.)
198  // Tell the DFA to trudge along instead.
199  FLAGS_re2_dfa_bail_when_slow = false;
200
201  int64 usage;
202  int64 peak_usage;
203  {
204    testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
205    Prog* prog = re->CompileToProg(1<<n);
206    CHECK(prog);
207    for (int i = 0; i < 10; i++) {
208      bool matched, failed = false;
209      matched = prog->SearchDFA(match, NULL,
210                                Prog::kUnanchored, Prog::kFirstMatch,
211                                NULL, &failed, NULL);
212      CHECK(!failed);
213      CHECK(matched);
214      matched = prog->SearchDFA(no_match, NULL,
215                                Prog::kUnanchored, Prog::kFirstMatch,
216                                NULL, &failed, NULL);
217      CHECK(!failed);
218      CHECK(!matched);
219    }
220    usage = m.HeapGrowth();
221    peak_usage = m.PeakHeapGrowth();
222    delete prog;
223  }
224  re->Decref();
225
226  if (!UsingMallocCounter)
227    return;
228  //LOG(INFO) << "usage " << usage << " " << peak_usage;
229  CHECK_LT(usage, 1<<n);
230  CHECK_LT(peak_usage, 1<<n);
231}
232
233// Helper thread: searches for match, which should match,
234// and no_match, which should not.
235class SearchThread : public Thread {
236 public:
237  SearchThread(Prog* prog, const StringPiece& match,
238               const StringPiece& no_match)
239    : prog_(prog), match_(match), no_match_(no_match) {}
240
241  virtual void Run() {
242    for (int i = 0; i < 2; i++) {
243      bool matched, failed = false;
244      matched = prog_->SearchDFA(match_, NULL,
245                                 Prog::kUnanchored, Prog::kFirstMatch,
246                                 NULL, &failed, NULL);
247      CHECK(!failed);
248      CHECK(matched);
249      matched = prog_->SearchDFA(no_match_, NULL,
250                                 Prog::kUnanchored, Prog::kFirstMatch,
251                                 NULL, &failed, NULL);
252      CHECK(!failed);
253      CHECK(!matched);
254    }
255  }
256
257 private:
258  Prog* prog_;
259  StringPiece match_;
260  StringPiece no_match_;
261};
262
263TEST(Multithreaded, SearchDFA) {
264  // Same as single-threaded test above.
265  const int n = 18;
266  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
267                             Regexp::LikePerl, NULL);
268  CHECK(re);
269  string no_match = DeBruijnString(n);
270  string match = no_match + "0";
271  FLAGS_re2_dfa_bail_when_slow = false;
272
273  // Check that single-threaded code works.
274  {
275    Prog* prog = re->CompileToProg(1<<n);
276    CHECK(prog);
277    SearchThread* t = new SearchThread(prog, match, no_match);
278    t->SetJoinable(true);
279    t->Start();
280    t->Join();
281    delete t;
282    delete prog;
283  }
284
285  // Run the search simultaneously in a bunch of threads.
286  // Reuse same flags for Multithreaded.BuildDFA above.
287  for (int i = 0; i < FLAGS_repeat; i++) {
288    //LOG(INFO) << "Search " << i;
289    Prog* prog = re->CompileToProg(1<<n);
290    CHECK(prog);
291
292    vector<SearchThread*> threads;
293    for (int j = 0; j < FLAGS_threads; j++) {
294      SearchThread *t = new SearchThread(prog, match, no_match);
295      t->SetJoinable(true);
296      threads.push_back(t);
297    }
298    for (int j = 0; j < FLAGS_threads; j++)
299      threads[j]->Start();
300    for (int j = 0; j < FLAGS_threads; j++) {
301      threads[j]->Join();
302      delete threads[j];
303    }
304    delete prog;
305  }
306  re->Decref();
307}
308
309struct ReverseTest {
310  const char *regexp;
311  const char *text;
312  bool match;
313};
314
315// Test that reverse DFA handles anchored/unanchored correctly.
316// It's in the DFA interface but not used by RE2.
317ReverseTest reverse_tests[] = {
318  { "\\A(a|b)", "abc", true },
319  { "(a|b)\\z", "cba", true },
320  { "\\A(a|b)", "cba", false },
321  { "(a|b)\\z", "abc", false },
322};
323
324TEST(DFA, ReverseMatch) {
325  int nfail = 0;
326  for (int i = 0; i < arraysize(reverse_tests); i++) {
327    const ReverseTest& t = reverse_tests[i];
328    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
329    CHECK(re);
330    Prog *prog = re->CompileToReverseProg(0);
331    CHECK(prog);
332    bool failed = false;
333    bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL);
334    if (matched != t.match) {
335      LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
336      nfail++;
337    }
338    delete prog;
339    re->Decref();
340  }
341  EXPECT_EQ(nfail, 0);
342}
343
344}  // namespace re2
345