1// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression engine tester -- test all the implementations against each other.
6
7#include "util/util.h"
8#include "util/flags.h"
9#include "re2/testing/tester.h"
10#include "re2/prog.h"
11#include "re2/re2.h"
12#include "re2/regexp.h"
13
14DEFINE_bool(dump_prog, false, "dump regexp program");
15DEFINE_bool(log_okay, false, "log successful runs");
16DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
17
18DEFINE_int32(max_regexp_failures, 100,
19             "maximum number of regexp test failures (-1 = unlimited)");
20
21DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
22
23namespace re2 {
24
25enum {
26  kMaxSubmatch = 1+16,  // $0...$16
27};
28
29const char* engine_types[kEngineMax] = {
30  "Backtrack",
31  "NFA",
32  "DFA",
33  "DFA1",
34  "OnePass",
35  "BitState",
36  "RE2",
37  "RE2a",
38  "RE2b",
39  "PCRE",
40};
41
42// Returns the name string for the type t.
43static string EngineString(Engine t) {
44  if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
45    return StringPrintf("type%d", static_cast<int>(t));
46  }
47  return engine_types[t];
48}
49
50// Returns bit mask of engines to use.
51static uint32 Engines() {
52  static uint32 cached_engines;
53  static bool did_parse;
54
55  if (did_parse)
56    return cached_engines;
57
58  if (FLAGS_regexp_engines.empty()) {
59    cached_engines = ~0;
60  } else {
61    for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
62      if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
63        cached_engines |= 1<<i;
64  }
65
66  if (cached_engines == 0)
67    LOG(INFO) << "Warning: no engines enabled.";
68  if (!UsingPCRE)
69    cached_engines &= ~(1<<kEnginePCRE);
70  for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
71    if (cached_engines & (1<<i))
72      LOG(INFO) << EngineString(i) << " enabled";
73  }
74  did_parse = true;
75  return cached_engines;
76}
77
78// The result of running a match.
79struct TestInstance::Result {
80  bool skipped;         // test skipped: wasn't applicable
81  bool matched;         // found a match
82  bool untrusted;       // don't really trust the answer
83  bool have_submatch;   // computed all submatch info
84  bool have_submatch0;  // computed just submatch[0]
85  StringPiece submatch[kMaxSubmatch];
86};
87
88typedef TestInstance::Result Result;
89
90// Formats a single capture range s in text in the form (a,b)
91// where a and b are the starting and ending offsets of s in text.
92static string FormatCapture(const StringPiece& text, const StringPiece& s) {
93  if (s.begin() == NULL)
94    return "(?,?)";
95  return StringPrintf("(%d,%d)",
96                      static_cast<int>(s.begin() - text.begin()),
97                      static_cast<int>(s.end() - text.begin()));
98}
99
100// Returns whether text contains non-ASCII (>= 0x80) bytes.
101static bool NonASCII(const StringPiece& text) {
102  for (int i = 0; i < text.size(); i++)
103    if ((uint8)text[i] >= 0x80)
104      return true;
105  return false;
106}
107
108// Returns string representation of match kind.
109static string FormatKind(Prog::MatchKind kind) {
110  switch (kind) {
111    case Prog::kFullMatch:
112      return "full match";
113    case Prog::kLongestMatch:
114      return "longest match";
115    case Prog::kFirstMatch:
116      return "first match";
117    case Prog::kManyMatch:
118      return "many match";
119  }
120  return "???";
121}
122
123// Returns string representation of anchor kind.
124static string FormatAnchor(Prog::Anchor anchor) {
125  switch (anchor) {
126    case Prog::kAnchored:
127      return "anchored";
128    case Prog::kUnanchored:
129      return "unanchored";
130  }
131  return "???";
132}
133
134struct ParseMode {
135  Regexp::ParseFlags parse_flags;
136  string desc;
137};
138
139static const Regexp::ParseFlags single_line =
140  Regexp::LikePerl;
141static const Regexp::ParseFlags multi_line =
142  static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
143
144static ParseMode parse_modes[] = {
145  { single_line,                   "single-line"          },
146  { single_line|Regexp::Latin1,    "single-line, latin1"  },
147  { multi_line,                    "multiline"            },
148  { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
149  { multi_line|Regexp::Latin1,     "multiline, latin1"    },
150};
151
152static string FormatMode(Regexp::ParseFlags flags) {
153  for (int i = 0; i < arraysize(parse_modes); i++)
154    if (parse_modes[i].parse_flags == flags)
155      return parse_modes[i].desc;
156  return StringPrintf("%#x", static_cast<uint>(flags));
157}
158
159// Constructs and saves all the matching engines that
160// will be required for the given tests.
161TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
162                           Regexp::ParseFlags flags)
163  : regexp_str_(regexp_str),
164    kind_(kind),
165    flags_(flags),
166    error_(false),
167    regexp_(NULL),
168    num_captures_(0),
169    prog_(NULL),
170    rprog_(NULL),
171    re_(NULL),
172    re2_(NULL) {
173
174  VLOG(1) << CEscape(regexp_str);
175
176  // Compile regexp to prog.
177  // Always required - needed for backtracking (reference implementation).
178  RegexpStatus status;
179  regexp_ = Regexp::Parse(regexp_str, flags, &status);
180  if (regexp_ == NULL) {
181    LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
182              << " mode: " << FormatMode(flags);
183    error_ = true;
184    return;
185  }
186  num_captures_ = regexp_->NumCaptures();
187  prog_ = regexp_->CompileToProg(0);
188  if (prog_ == NULL) {
189    LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
190    error_ = true;
191    return;
192  }
193  if (FLAGS_dump_prog) {
194    LOG(INFO) << "Prog for "
195              << " regexp "
196              << CEscape(regexp_str_)
197              << " (" << FormatKind(kind_)
198              << ", " << FormatMode(flags_)
199              << ")\n"
200              << prog_->Dump();
201  }
202
203  // Compile regexp to reversed prog.  Only needed for DFA engines.
204  if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
205    rprog_ = regexp_->CompileToReverseProg(0);
206    if (rprog_ == NULL) {
207      LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
208      error_ = true;
209      return;
210    }
211    if (FLAGS_dump_rprog)
212      LOG(INFO) << rprog_->Dump();
213  }
214
215  // Create re string that will be used for RE and RE2.
216  string re = regexp_str.as_string();
217  // Accomodate flags.
218  // Regexp::Latin1 will be accomodated below.
219  if (!(flags & Regexp::OneLine))
220    re = "(?m)" + re;
221  if (flags & Regexp::NonGreedy)
222    re = "(?U)" + re;
223  if (flags & Regexp::DotNL)
224    re = "(?s)" + re;
225
226  // Compile regexp to RE2.
227  if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
228    RE2::Options options;
229    if (flags & Regexp::Latin1)
230      options.set_encoding(RE2::Options::EncodingLatin1);
231    if (kind_ == Prog::kLongestMatch)
232      options.set_longest_match(true);
233    re2_ = new RE2(re, options);
234    if (!re2_->error().empty()) {
235      LOG(INFO) << "Cannot RE2: " << CEscape(re);
236      error_ = true;
237      return;
238    }
239  }
240
241  // Compile regexp to RE.
242  // PCRE as exposed by the RE interface isn't always usable.
243  // 1. It disagrees about handling of empty-string reptitions
244  //    like matching (a*)* against "b".  PCRE treats the (a*) as
245  //    occurring once, while we treat it as occurring not at all.
246  // 2. It treats $ as this weird thing meaning end of string
247  //    or before the \n at the end of the string.
248  // 3. It doesn't implement POSIX leftmost-longest matching.
249  // MimicsPCRE() detects 1 and 2.
250  if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
251      kind_ != Prog::kLongestMatch) {
252    PCRE_Options o;
253    o.set_option(PCRE::UTF8);
254    if (flags & Regexp::Latin1)
255      o.set_option(PCRE::None);
256    // PCRE has interface bug keeping us from finding $0, so
257    // add one more layer of parens.
258    re_ = new PCRE("("+re+")", o);
259    if (!re_->error().empty()) {
260      LOG(INFO) << "Cannot PCRE: " << CEscape(re);
261      error_ = true;
262      return;
263    }
264  }
265}
266
267TestInstance::~TestInstance() {
268  if (regexp_)
269    regexp_->Decref();
270  delete prog_;
271  delete rprog_;
272  delete re_;
273  delete re2_;
274}
275
276// Runs a single search using the named engine type.
277// This interface hides all the irregularities of the various
278// engine interfaces from the rest of this file.
279void TestInstance::RunSearch(Engine type,
280                             const StringPiece& orig_text,
281                             const StringPiece& orig_context,
282                             Prog::Anchor anchor,
283                             Result *result) {
284  memset(result, 0, sizeof *result);
285  if (regexp_ == NULL) {
286    result->skipped = true;
287    return;
288  }
289  int nsubmatch = 1 + num_captures_;  // NumCaptures doesn't count $0
290  if (nsubmatch > kMaxSubmatch)
291    nsubmatch = kMaxSubmatch;
292
293  StringPiece text = orig_text;
294  StringPiece context = orig_context;
295
296  switch (type) {
297    default:
298      LOG(FATAL) << "Bad RunSearch type: " << (int)type;
299
300    case kEngineBacktrack:
301      if (prog_ == NULL) {
302        result->skipped = true;
303        break;
304      }
305      result->matched =
306        prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
307                                     result->submatch, nsubmatch);
308      result->have_submatch = true;
309      break;
310
311    case kEngineNFA:
312      if (prog_ == NULL) {
313        result->skipped = true;
314        break;
315      }
316      result->matched =
317        prog_->SearchNFA(text, context, anchor, kind_,
318                        result->submatch, nsubmatch);
319      result->have_submatch = true;
320      break;
321
322    case kEngineDFA:
323      if (prog_ == NULL) {
324        result->skipped = true;
325        break;
326      }
327      result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
328                                         &result->skipped, NULL);
329      break;
330
331    case kEngineDFA1:
332      if (prog_ == NULL || rprog_ == NULL) {
333        result->skipped = true;
334        break;
335      }
336      result->matched =
337        prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
338                         &result->skipped, NULL);
339      // If anchored, no need for second run,
340      // but do it anyway to find more bugs.
341      if (result->matched) {
342        if (!rprog_->SearchDFA(result->submatch[0], context,
343                               Prog::kAnchored, Prog::kLongestMatch,
344                               result->submatch,
345                               &result->skipped, NULL)) {
346          LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
347                     << " on " << CEscape(text);
348          result->matched = false;
349        }
350      }
351      result->have_submatch0 = true;
352      break;
353
354    case kEngineOnePass:
355      if (prog_ == NULL ||
356          anchor == Prog::kUnanchored ||
357          !prog_->IsOnePass() ||
358          nsubmatch > Prog::kMaxOnePassCapture) {
359        result->skipped = true;
360        break;
361      }
362      result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
363                                      result->submatch, nsubmatch);
364      result->have_submatch = true;
365      break;
366
367    case kEngineBitState:
368      if (prog_ == NULL) {
369        result->skipped = true;
370        break;
371      }
372      result->matched = prog_->SearchBitState(text, context, anchor, kind_,
373                                              result->submatch, nsubmatch);
374      result->have_submatch = true;
375      break;
376
377    case kEngineRE2:
378    case kEngineRE2a:
379    case kEngineRE2b: {
380      if (!re2_ || text.end() != context.end()) {
381        result->skipped = true;
382        break;
383      }
384
385      RE2::Anchor re_anchor;
386      if (anchor == Prog::kAnchored)
387        re_anchor = RE2::ANCHOR_START;
388      else
389        re_anchor = RE2::UNANCHORED;
390      if (kind_ == Prog::kFullMatch)
391        re_anchor = RE2::ANCHOR_BOTH;
392
393      result->matched = re2_->Match(context,
394                                    text.begin() - context.begin(),
395                                    text.end() - context.begin(),
396                                    re_anchor, result->submatch, nsubmatch);
397      result->have_submatch = nsubmatch > 0;
398      break;
399    }
400
401    case kEnginePCRE: {
402      if (!re_ || text.begin() != context.begin() ||
403          text.end() != context.end()) {
404        result->skipped = true;
405        break;
406      }
407
408      const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
409      PCRE::Arg *a = new PCRE::Arg[nsubmatch];
410      for (int i = 0; i < nsubmatch; i++) {
411        a[i] = PCRE::Arg(&result->submatch[i]);
412        argptr[i] = &a[i];
413      }
414      int consumed;
415      PCRE::Anchor pcre_anchor;
416      if (anchor == Prog::kAnchored)
417        pcre_anchor = PCRE::ANCHOR_START;
418      else
419        pcre_anchor = PCRE::UNANCHORED;
420      if (kind_ == Prog::kFullMatch)
421        pcre_anchor = PCRE::ANCHOR_BOTH;
422      re_->ClearHitLimit();
423      result->matched =
424        re_->DoMatch(text,
425                     pcre_anchor,
426                     &consumed,
427                     argptr, nsubmatch);
428      if (re_->HitLimit()) {
429        result->untrusted = true;
430        delete[] argptr;
431        delete[] a;
432        break;
433      }
434      result->have_submatch = true;
435
436      // Work around RE interface bug: PCRE returns -1 as the
437      // offsets for an unmatched subexpression, and RE should
438      // turn that into StringPiece(NULL) but in fact it uses
439      // StringPiece(text.begin() - 1, 0).  Oops.
440      for (int i = 0; i < nsubmatch; i++)
441        if (result->submatch[i].begin() == text.begin() - 1)
442          result->submatch[i] = NULL;
443      delete[] argptr;
444      delete[] a;
445      break;
446    }
447  }
448
449  if (!result->matched)
450    memset(result->submatch, 0, sizeof result->submatch);
451}
452
453// Checks whether r is okay given that correct is the right answer.
454// Specifically, r's answers have to match (but it doesn't have to
455// claim to have all the answers).
456static bool ResultOkay(const Result& r, const Result& correct) {
457  if (r.skipped)
458    return true;
459  if (r.matched != correct.matched)
460    return false;
461  if (r.have_submatch || r.have_submatch0) {
462    for (int i = 0; i < kMaxSubmatch; i++) {
463      if (correct.submatch[i].begin() != r.submatch[i].begin() ||
464          correct.submatch[i].size() != r.submatch[i].size())
465        return false;
466      if (!r.have_submatch)
467        break;
468    }
469  }
470  return true;
471}
472
473// Runs a single test.
474bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
475                           Prog::Anchor anchor) {
476  // Backtracking is the gold standard.
477  Result correct;
478  RunSearch(kEngineBacktrack, text, context, anchor, &correct);
479  if (correct.skipped) {
480    if (regexp_ == NULL)
481      return true;
482    LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
483               << " " << FormatMode(flags_);
484    return false;
485  }
486  VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
487          << " text " << CEscape(text)
488          << " (" << FormatKind(kind_)
489          << ", " << FormatAnchor(anchor)
490          << ", " << FormatMode(flags_)
491          << ")";
492
493  // Compare the others.
494  bool all_okay = true;
495  for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
496    if (!(Engines() & (1<<i)))
497      continue;
498
499    Result r;
500    RunSearch(i, text, context, anchor, &r);
501    if (ResultOkay(r, correct)) {
502      if (FLAGS_log_okay)
503        LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
504      continue;
505    }
506
507    // We disagree with PCRE on the meaning of some Unicode matches.
508    // In particular, we treat all non-ASCII UTF-8 as word characters.
509    // We also treat "empty" character sets like [^\w\W] as being
510    // impossible to match, while PCRE apparently excludes some code
511    // points (e.g., 0x0080) from both \w and \W.
512    if (i == kEnginePCRE && NonASCII(text))
513      continue;
514
515    if (!r.untrusted)
516      all_okay = false;
517
518    LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
519             context, anchor);
520    if (r.matched != correct.matched) {
521      if (r.matched) {
522        LOG(INFO) << "   Should not match (but does).";
523      } else {
524        LOG(INFO) << "   Should match (but does not).";
525        continue;
526      }
527    }
528    for (int i = 0; i < 1+num_captures_; i++) {
529      if (r.submatch[i].begin() != correct.submatch[i].begin() ||
530          r.submatch[i].end() != correct.submatch[i].end()) {
531        LOG(INFO) <<
532          StringPrintf("   $%d: should be %s is %s",
533                       i,
534                       FormatCapture(text, correct.submatch[i]).c_str(),
535                       FormatCapture(text, r.submatch[i]).c_str());
536      } else {
537        LOG(INFO) <<
538          StringPrintf("   $%d: %s ok", i,
539                       FormatCapture(text, r.submatch[i]).c_str());
540      }
541    }
542  }
543
544  if (!all_okay) {
545    if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
546      LOG(QFATAL) << "Too many regexp failures.";
547  }
548
549  return all_okay;
550}
551
552void TestInstance::LogMatch(const char* prefix, Engine e,
553                            const StringPiece& text, const StringPiece& context,
554                            Prog::Anchor anchor) {
555  LOG(INFO) << prefix
556    << EngineString(e)
557    << " regexp "
558    << CEscape(regexp_str_)
559    << " "
560    << CEscape(regexp_->ToString())
561    << " text "
562    << CEscape(text)
563    << " ("
564    << text.begin() - context.begin()
565    << ","
566    << text.end() - context.begin()
567    << ") of context "
568    << CEscape(context)
569    << " (" << FormatKind(kind_)
570    << ", " << FormatAnchor(anchor)
571    << ", " << FormatMode(flags_)
572    << ")";
573}
574
575static Prog::MatchKind kinds[] = {
576  Prog::kFirstMatch,
577  Prog::kLongestMatch,
578  Prog::kFullMatch,
579};
580
581// Test all possible match kinds and parse modes.
582Tester::Tester(const StringPiece& regexp) {
583  error_ = false;
584  for (int i = 0; i < arraysize(kinds); i++) {
585    for (int j = 0; j < arraysize(parse_modes); j++) {
586      TestInstance* t = new TestInstance(regexp, kinds[i],
587                                         parse_modes[j].parse_flags);
588      error_ |= t->error();
589      v_.push_back(t);
590    }
591  }
592}
593
594Tester::~Tester() {
595  for (int i = 0; i < v_.size(); i++)
596    delete v_[i];
597}
598
599bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
600                         Prog::Anchor anchor) {
601  bool okay = true;
602  for (int i = 0; i < v_.size(); i++)
603    okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
604  return okay;
605}
606
607static Prog::Anchor anchors[] = {
608  Prog::kAnchored,
609  Prog::kUnanchored
610};
611
612bool Tester::TestInput(const StringPiece& text) {
613  bool okay = TestInputInContext(text, text);
614  if (text.size() > 0) {
615    StringPiece sp;
616    sp = text;
617    sp.remove_prefix(1);
618    okay &= TestInputInContext(sp, text);
619    sp = text;
620    sp.remove_suffix(1);
621    okay &= TestInputInContext(sp, text);
622  }
623  return okay;
624}
625
626bool Tester::TestInputInContext(const StringPiece& text,
627                                const StringPiece& context) {
628  bool okay = true;
629  for (int i = 0; i < arraysize(anchors); i++)
630    okay &= TestCase(text, context, anchors[i]);
631  return okay;
632}
633
634bool TestRegexpOnText(const StringPiece& regexp,
635                      const StringPiece& text) {
636  Tester t(regexp);
637  return t.TestInput(text);
638}
639
640}  // namespace re2
641