1// Copyright 2006-2008 The RE2 Authors. All Rights Reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5#include "util/test.h" 6#include "util/thread.h" 7#include "re2/prog.h" 8#include "re2/re2.h" 9#include "re2/regexp.h" 10#include "re2/testing/regexp_generator.h" 11#include "re2/testing/string_generator.h" 12 13DECLARE_bool(re2_dfa_bail_when_slow); 14 15DEFINE_int32(size, 8, "log2(number of DFA nodes)"); 16DEFINE_int32(repeat, 2, "Repetition count."); 17DEFINE_int32(threads, 4, "number of threads"); 18 19namespace re2 { 20 21// Check that multithreaded access to DFA class works. 22 23// Helper thread: builds entire DFA for prog. 24class BuildThread : public Thread { 25 public: 26 BuildThread(Prog* prog) : prog_(prog) {} 27 virtual void Run() { 28 CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch)); 29 } 30 31 private: 32 Prog* prog_; 33}; 34 35TEST(Multithreaded, BuildEntireDFA) { 36 // Create regexp with 2^FLAGS_size states in DFA. 37 string s = "a"; 38 for (int i = 0; i < FLAGS_size; i++) 39 s += "[ab]"; 40 s += "b"; 41 42 // Check that single-threaded code works. 43 { 44 //LOG(INFO) << s; 45 Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); 46 CHECK(re); 47 Prog* prog = re->CompileToProg(0); 48 CHECK(prog); 49 BuildThread* t = new BuildThread(prog); 50 t->SetJoinable(true); 51 t->Start(); 52 t->Join(); 53 delete t; 54 delete prog; 55 re->Decref(); 56 } 57 58 // Build the DFA simultaneously in a bunch of threads. 59 for (int i = 0; i < FLAGS_repeat; i++) { 60 Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); 61 CHECK(re); 62 Prog* prog = re->CompileToProg(0); 63 CHECK(prog); 64 65 vector<BuildThread*> threads; 66 for (int j = 0; j < FLAGS_threads; j++) { 67 BuildThread *t = new BuildThread(prog); 68 t->SetJoinable(true); 69 threads.push_back(t); 70 } 71 for (int j = 0; j < FLAGS_threads; j++) 72 threads[j]->Start(); 73 for (int j = 0; j < FLAGS_threads; j++) { 74 threads[j]->Join(); 75 delete threads[j]; 76 } 77 78 // One more compile, to make sure everything is okay. 79 prog->BuildEntireDFA(Prog::kFirstMatch); 80 delete prog; 81 re->Decref(); 82 } 83} 84 85// Check that DFA size requirements are followed. 86// BuildEntireDFA will, like SearchDFA, stop building out 87// the DFA once the memory limits are reached. 88TEST(SingleThreaded, BuildEntireDFA) { 89 // Create regexp with 2^30 states in DFA. 90 string s = "a"; 91 for (int i = 0; i < 30; i++) 92 s += "[ab]"; 93 s += "b"; 94 95 //LOG(INFO) << s; 96 Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL); 97 CHECK(re); 98 int max = 24; 99 for (int i = 17; i < max; i++) { 100 int limit = 1<<i; 101 int usage; 102 //int progusage, dfamem; 103 { 104 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY); 105 Prog* prog = re->CompileToProg(limit); 106 CHECK(prog); 107 //progusage = m.HeapGrowth(); 108 //dfamem = prog->dfa_mem(); 109 prog->BuildEntireDFA(Prog::kFirstMatch); 110 prog->BuildEntireDFA(Prog::kLongestMatch); 111 usage = m.HeapGrowth(); 112 delete prog; 113 } 114 if (!UsingMallocCounter) 115 continue; 116 //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n", 117 // limit, progusage, dfamem, usage); 118 CHECK_GT(usage, limit*9/10); 119 CHECK_LT(usage, limit + (16<<10)); // 16kB of slop okay 120 } 121 re->Decref(); 122} 123 124// Generates and returns a string over binary alphabet {0,1} that contains 125// all possible binary sequences of length n as subsequences. The obvious 126// brute force method would generate a string of length n * 2^n, but this 127// generates a string of length n + 2^n - 1 called a De Bruijn cycle. 128// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17. 129// Such a string is useful for testing a DFA. If you have a DFA 130// where distinct last n bytes implies distinct states, then running on a 131// DeBruijn string causes the DFA to need to create a new state at every 132// position in the input, never reusing any states until it gets to the 133// end of the string. This is the worst possible case for DFA execution. 134static string DeBruijnString(int n) { 135 CHECK_LT(n, 8*sizeof(int)); 136 CHECK_GT(n, 0); 137 138 vector<bool> did(1<<n); 139 for (int i = 0; i < 1<<n; i++) 140 did[i] = false; 141 142 string s; 143 for (int i = 0; i < n-1; i++) 144 s.append("0"); 145 int bits = 0; 146 int mask = (1<<n) - 1; 147 for (int i = 0; i < (1<<n); i++) { 148 bits <<= 1; 149 bits &= mask; 150 if (!did[bits|1]) { 151 bits |= 1; 152 s.append("1"); 153 } else { 154 s.append("0"); 155 } 156 CHECK(!did[bits]); 157 did[bits] = true; 158 } 159 return s; 160} 161 162// Test that the DFA gets the right result even if it runs 163// out of memory during a search. The regular expression 164// 0[01]{n}$ matches a binary string of 0s and 1s only if 165// the (n+1)th-to-last character is a 0. Matching this in 166// a single forward pass (as done by the DFA) requires 167// keeping one bit for each of the last n+1 characters 168// (whether each was a 0), or 2^(n+1) possible states. 169// If we run this regexp to search in a string that contains 170// every possible n-character binary string as a substring, 171// then it will have to run through at least 2^n states. 172// States are big data structures -- certainly more than 1 byte -- 173// so if the DFA can search correctly while staying within a 174// 2^n byte limit, it must be handling out-of-memory conditions 175// gracefully. 176TEST(SingleThreaded, SearchDFA) { 177 // Choice of n is mostly arbitrary, except that: 178 // * making n too big makes the test run for too long. 179 // * making n too small makes the DFA refuse to run, 180 // because it has so little memory compared to the program size. 181 // Empirically, n = 18 is a good compromise between the two. 182 const int n = 18; 183 184 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n), 185 Regexp::LikePerl, NULL); 186 CHECK(re); 187 188 // The De Bruijn string for n ends with a 1 followed by n 0s in a row, 189 // which is not a match for 0[01]{n}$. Adding one more 0 is a match. 190 string no_match = DeBruijnString(n); 191 string match = no_match + "0"; 192 193 // The De Bruijn string is the worst case input for this regexp. 194 // By default, the DFA will notice that it is flushing its cache 195 // too frequently and will bail out early, so that RE2 can use the 196 // NFA implementation instead. (The DFA loses its speed advantage 197 // if it can't get a good cache hit rate.) 198 // Tell the DFA to trudge along instead. 199 FLAGS_re2_dfa_bail_when_slow = false; 200 201 int64 usage; 202 int64 peak_usage; 203 { 204 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY); 205 Prog* prog = re->CompileToProg(1<<n); 206 CHECK(prog); 207 for (int i = 0; i < 10; i++) { 208 bool matched, failed = false; 209 matched = prog->SearchDFA(match, NULL, 210 Prog::kUnanchored, Prog::kFirstMatch, 211 NULL, &failed, NULL); 212 CHECK(!failed); 213 CHECK(matched); 214 matched = prog->SearchDFA(no_match, NULL, 215 Prog::kUnanchored, Prog::kFirstMatch, 216 NULL, &failed, NULL); 217 CHECK(!failed); 218 CHECK(!matched); 219 } 220 usage = m.HeapGrowth(); 221 peak_usage = m.PeakHeapGrowth(); 222 delete prog; 223 } 224 re->Decref(); 225 226 if (!UsingMallocCounter) 227 return; 228 //LOG(INFO) << "usage " << usage << " " << peak_usage; 229 CHECK_LT(usage, 1<<n); 230 CHECK_LT(peak_usage, 1<<n); 231} 232 233// Helper thread: searches for match, which should match, 234// and no_match, which should not. 235class SearchThread : public Thread { 236 public: 237 SearchThread(Prog* prog, const StringPiece& match, 238 const StringPiece& no_match) 239 : prog_(prog), match_(match), no_match_(no_match) {} 240 241 virtual void Run() { 242 for (int i = 0; i < 2; i++) { 243 bool matched, failed = false; 244 matched = prog_->SearchDFA(match_, NULL, 245 Prog::kUnanchored, Prog::kFirstMatch, 246 NULL, &failed, NULL); 247 CHECK(!failed); 248 CHECK(matched); 249 matched = prog_->SearchDFA(no_match_, NULL, 250 Prog::kUnanchored, Prog::kFirstMatch, 251 NULL, &failed, NULL); 252 CHECK(!failed); 253 CHECK(!matched); 254 } 255 } 256 257 private: 258 Prog* prog_; 259 StringPiece match_; 260 StringPiece no_match_; 261}; 262 263TEST(Multithreaded, SearchDFA) { 264 // Same as single-threaded test above. 265 const int n = 18; 266 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n), 267 Regexp::LikePerl, NULL); 268 CHECK(re); 269 string no_match = DeBruijnString(n); 270 string match = no_match + "0"; 271 FLAGS_re2_dfa_bail_when_slow = false; 272 273 // Check that single-threaded code works. 274 { 275 Prog* prog = re->CompileToProg(1<<n); 276 CHECK(prog); 277 SearchThread* t = new SearchThread(prog, match, no_match); 278 t->SetJoinable(true); 279 t->Start(); 280 t->Join(); 281 delete t; 282 delete prog; 283 } 284 285 // Run the search simultaneously in a bunch of threads. 286 // Reuse same flags for Multithreaded.BuildDFA above. 287 for (int i = 0; i < FLAGS_repeat; i++) { 288 //LOG(INFO) << "Search " << i; 289 Prog* prog = re->CompileToProg(1<<n); 290 CHECK(prog); 291 292 vector<SearchThread*> threads; 293 for (int j = 0; j < FLAGS_threads; j++) { 294 SearchThread *t = new SearchThread(prog, match, no_match); 295 t->SetJoinable(true); 296 threads.push_back(t); 297 } 298 for (int j = 0; j < FLAGS_threads; j++) 299 threads[j]->Start(); 300 for (int j = 0; j < FLAGS_threads; j++) { 301 threads[j]->Join(); 302 delete threads[j]; 303 } 304 delete prog; 305 } 306 re->Decref(); 307} 308 309struct ReverseTest { 310 const char *regexp; 311 const char *text; 312 bool match; 313}; 314 315// Test that reverse DFA handles anchored/unanchored correctly. 316// It's in the DFA interface but not used by RE2. 317ReverseTest reverse_tests[] = { 318 { "\\A(a|b)", "abc", true }, 319 { "(a|b)\\z", "cba", true }, 320 { "\\A(a|b)", "cba", false }, 321 { "(a|b)\\z", "abc", false }, 322}; 323 324TEST(DFA, ReverseMatch) { 325 int nfail = 0; 326 for (int i = 0; i < arraysize(reverse_tests); i++) { 327 const ReverseTest& t = reverse_tests[i]; 328 Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL); 329 CHECK(re); 330 Prog *prog = re->CompileToReverseProg(0); 331 CHECK(prog); 332 bool failed = false; 333 bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL); 334 if (matched != t.match) { 335 LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match; 336 nfail++; 337 } 338 delete prog; 339 re->Decref(); 340 } 341 EXPECT_EQ(nfail, 0); 342} 343 344} // namespace re2 345