prefilter.cc revision f8ee788a64d60abd8f2d742a5fdedde054ecd910
1// Copyright 2009 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "util/util.h"
6#include "re2/prefilter.h"
7#include "re2/re2.h"
8#include "re2/unicode_casefold.h"
9#include "re2/walker-inl.h"
10
11namespace re2 {
12
13static const int Trace = false;
14
15typedef set<string>::iterator SSIter;
16typedef set<string>::const_iterator ConstSSIter;
17
18static int alloc_id = 100000;  // Used for debugging.
19// Initializes a Prefilter, allocating subs_ as necessary.
20Prefilter::Prefilter(Op op) {
21  op_ = op;
22  subs_ = NULL;
23  if (op_ == AND || op_ == OR)
24    subs_ = new vector<Prefilter*>;
25
26  alloc_id_ = alloc_id++;
27  VLOG(10) << "alloc_id: " << alloc_id_;
28}
29
30// Destroys a Prefilter.
31Prefilter::~Prefilter() {
32  VLOG(10) << "Deleted: " << alloc_id_;
33  if (subs_) {
34    for (int i = 0; i < subs_->size(); i++)
35      delete (*subs_)[i];
36    delete subs_;
37    subs_ = NULL;
38  }
39}
40
41// Simplify if the node is an empty Or or And.
42Prefilter* Prefilter::Simplify() {
43  if (op_ != AND && op_ != OR) {
44    return this;
45  }
46
47  // Nothing left in the AND/OR.
48  if (subs_->size() == 0) {
49    if (op_ == AND)
50      op_ = ALL;  // AND of nothing is true
51    else
52      op_ = NONE;  // OR of nothing is false
53
54    return this;
55  }
56
57  // Just one subnode: throw away wrapper.
58  if (subs_->size() == 1) {
59    Prefilter* a = (*subs_)[0];
60    subs_->clear();
61    delete this;
62    return a->Simplify();
63  }
64
65  return this;
66}
67
68// Combines two Prefilters together to create an "op" (AND or OR).
69// The passed Prefilters will be part of the returned Prefilter or deleted.
70// Does lots of work to avoid creating unnecessarily complicated structures.
71Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
72  // If a, b can be rewritten as op, do so.
73  a = a->Simplify();
74  b = b->Simplify();
75
76  // Canonicalize: a->op <= b->op.
77  if (a->op() > b->op()) {
78    Prefilter* t = a;
79    a = b;
80    b = t;
81  }
82
83  // Trivial cases.
84  //    ALL AND b = b
85  //    NONE OR b = b
86  //    ALL OR b   = ALL
87  //    NONE AND b = NONE
88  // Don't need to look at b, because of canonicalization above.
89  // ALL and NONE are smallest opcodes.
90  if (a->op() == ALL || a->op() == NONE) {
91    if ((a->op() == ALL && op == AND) ||
92        (a->op() == NONE && op == OR)) {
93      delete a;
94      return b;
95    } else {
96      delete b;
97      return a;
98    }
99  }
100
101  // If a and b match op, merge their contents.
102  if (a->op() == op && b->op() == op) {
103    for (int i = 0; i < b->subs()->size(); i++) {
104      Prefilter* bb = (*b->subs())[i];
105      a->subs()->push_back(bb);
106    }
107    b->subs()->clear();
108    delete b;
109    return a;
110  }
111
112  // If a already has the same op as the op that is under construction
113  // add in b (similarly if b already has the same op, add in a).
114  if (b->op() == op) {
115    Prefilter* t = a;
116    a = b;
117    b = t;
118  }
119  if (a->op() == op) {
120    a->subs()->push_back(b);
121    return a;
122  }
123
124  // Otherwise just return the op.
125  Prefilter* c = new Prefilter(op);
126  c->subs()->push_back(a);
127  c->subs()->push_back(b);
128  return c;
129}
130
131Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
132  return AndOr(AND, a, b);
133}
134
135Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
136  return AndOr(OR, a, b);
137}
138
139static void SimplifyStringSet(set<string> *ss) {
140  // Now make sure that the strings aren't redundant.  For example, if
141  // we know "ab" is a required string, then it doesn't help at all to
142  // know that "abc" is also a required string, so delete "abc". This
143  // is because, when we are performing a string search to filter
144  // regexps, matching ab will already allow this regexp to be a
145  // candidate for match, so further matching abc is redundant.
146
147  for (SSIter i = ss->begin(); i != ss->end(); ++i) {
148    SSIter j = i;
149    ++j;
150    while (j != ss->end()) {
151      // Increment j early so that we can erase the element it points to.
152      SSIter old_j = j;
153      ++j;
154      if (old_j->find(*i) != string::npos)
155        ss->erase(old_j);
156    }
157  }
158}
159
160Prefilter* Prefilter::OrStrings(set<string>* ss) {
161  SimplifyStringSet(ss);
162  Prefilter* or_prefilter = NULL;
163  if (!ss->empty()) {
164    or_prefilter = new Prefilter(NONE);
165    for (SSIter i = ss->begin(); i != ss->end(); ++i)
166      or_prefilter = Or(or_prefilter, FromString(*i));
167  }
168  return or_prefilter;
169}
170
171static Rune ToLowerRune(Rune r) {
172  if (r < Runeself) {
173    if ('A' <= r && r <= 'Z')
174      r += 'a' - 'A';
175    return r;
176  }
177
178  CaseFold *f = LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
179  if (f == NULL || r < f->lo)
180    return r;
181  return ApplyFold(f, r);
182}
183
184static Rune ToLowerRuneLatin1(Rune r) {
185  if ('A' <= r && r <= 'Z')
186    r += 'a' - 'A';
187  return r;
188}
189
190Prefilter* Prefilter::FromString(const string& str) {
191  Prefilter* m = new Prefilter(Prefilter::ATOM);
192  m->atom_ = str;
193  return m;
194}
195
196// Information about a regexp used during computation of Prefilter.
197// Can be thought of as information about the set of strings matching
198// the given regular expression.
199class Prefilter::Info {
200 public:
201  Info();
202  ~Info();
203
204  // More constructors.  They delete their Info* arguments.
205  static Info* Alt(Info* a, Info* b);
206  static Info* Concat(Info* a, Info* b);
207  static Info* And(Info* a, Info* b);
208  static Info* Star(Info* a);
209  static Info* Plus(Info* a);
210  static Info* Quest(Info* a);
211  static Info* EmptyString();
212  static Info* NoMatch();
213  static Info* AnyChar();
214  static Info* CClass(CharClass* cc, bool latin1);
215  static Info* Literal(Rune r);
216  static Info* LiteralLatin1(Rune r);
217  static Info* AnyMatch();
218
219  // Format Info as a string.
220  string ToString();
221
222  // Caller takes ownership of the Prefilter.
223  Prefilter* TakeMatch();
224
225  set<string>& exact() { return exact_; }
226
227  bool is_exact() const { return is_exact_; }
228
229  class Walker;
230
231 private:
232  set<string> exact_;
233
234  // When is_exact_ is true, the strings that match
235  // are placed in exact_. When it is no longer an exact
236  // set of strings that match this RE, then is_exact_
237  // is false and the match_ contains the required match
238  // criteria.
239  bool is_exact_;
240
241  // Accumulated Prefilter query that any
242  // match for this regexp is guaranteed to match.
243  Prefilter* match_;
244};
245
246
247Prefilter::Info::Info()
248  : is_exact_(false),
249    match_(NULL) {
250}
251
252Prefilter::Info::~Info() {
253  delete match_;
254}
255
256Prefilter* Prefilter::Info::TakeMatch() {
257  if (is_exact_) {
258    match_ = Prefilter::OrStrings(&exact_);
259    is_exact_ = false;
260  }
261  Prefilter* m = match_;
262  match_ = NULL;
263  return m;
264}
265
266// Format a Info in string form.
267string Prefilter::Info::ToString() {
268  if (is_exact_) {
269    int n = 0;
270    string s;
271    for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
272      if (n++ > 0)
273        s += ",";
274      s += *i;
275    }
276    return s;
277  }
278
279  if (match_)
280    return match_->DebugString();
281
282  return "";
283}
284
285// Add the strings from src to dst.
286static void CopyIn(const set<string>& src, set<string>* dst) {
287  for (ConstSSIter i = src.begin(); i != src.end(); ++i)
288    dst->insert(*i);
289}
290
291// Add the cross-product of a and b to dst.
292// (For each string i in a and j in b, add i+j.)
293static void CrossProduct(const set<string>& a,
294                         const set<string>& b,
295                         set<string>* dst) {
296  for (ConstSSIter i = a.begin(); i != a.end(); ++i)
297    for (ConstSSIter j = b.begin(); j != b.end(); ++j)
298      dst->insert(*i + *j);
299}
300
301// Concats a and b. Requires that both are exact sets.
302// Forms an exact set that is a crossproduct of a and b.
303Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
304  if (a == NULL)
305    return b;
306  DCHECK(a->is_exact_);
307  DCHECK(b && b->is_exact_);
308  Info *ab = new Info();
309
310  CrossProduct(a->exact_, b->exact_, &ab->exact_);
311  ab->is_exact_ = true;
312
313  delete a;
314  delete b;
315  return ab;
316}
317
318// Constructs an inexact Info for ab given a and b.
319// Used only when a or b is not exact or when the
320// exact cross product is likely to be too big.
321Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
322  if (a == NULL)
323    return b;
324  if (b == NULL)
325    return a;
326
327  Info *ab = new Info();
328
329  ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
330  ab->is_exact_ = false;
331  delete a;
332  delete b;
333  return ab;
334}
335
336// Constructs Info for a|b given a and b.
337Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
338  Info *ab = new Info();
339
340  if (a->is_exact_ && b->is_exact_) {
341    CopyIn(a->exact_, &ab->exact_);
342    CopyIn(b->exact_, &ab->exact_);
343    ab->is_exact_ = true;
344  } else {
345    // Either a or b has is_exact_ = false. If the other
346    // one has is_exact_ = true, we move it to match_ and
347    // then create a OR of a,b. The resulting Info has
348    // is_exact_ = false.
349    ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
350    ab->is_exact_ = false;
351  }
352
353  delete a;
354  delete b;
355  return ab;
356}
357
358// Constructs Info for a? given a.
359Prefilter::Info* Prefilter::Info::Quest(Info *a) {
360  Info *ab = new Info();
361
362  ab->is_exact_ = false;
363  ab->match_ = new Prefilter(ALL);
364  delete a;
365  return ab;
366}
367
368// Constructs Info for a* given a.
369// Same as a? -- not much to do.
370Prefilter::Info* Prefilter::Info::Star(Info *a) {
371  return Quest(a);
372}
373
374// Constructs Info for a+ given a. If a was exact set, it isn't
375// anymore.
376Prefilter::Info* Prefilter::Info::Plus(Info *a) {
377  Info *ab = new Info();
378
379  ab->match_ = a->TakeMatch();
380  ab->is_exact_ = false;
381
382  delete a;
383  return ab;
384}
385
386static string RuneToString(Rune r) {
387  char buf[UTFmax];
388  int n = runetochar(buf, &r);
389  return string(buf, n);
390}
391
392static string RuneToStringLatin1(Rune r) {
393  char c = r & 0xff;
394  return string(&c, 1);
395}
396
397// Constructs Info for literal rune.
398Prefilter::Info* Prefilter::Info::Literal(Rune r) {
399  Info* info = new Info();
400  info->exact_.insert(RuneToString(ToLowerRune(r)));
401  info->is_exact_ = true;
402  return info;
403}
404
405// Constructs Info for literal rune for Latin1 encoded string.
406Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
407  Info* info = new Info();
408  info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
409  info->is_exact_ = true;
410  return info;
411}
412
413// Constructs Info for dot (any character).
414Prefilter::Info* Prefilter::Info::AnyChar() {
415  Prefilter::Info* info = new Prefilter::Info();
416  info->match_ = new Prefilter(ALL);
417  return info;
418}
419
420// Constructs Prefilter::Info for no possible match.
421Prefilter::Info* Prefilter::Info::NoMatch() {
422  Prefilter::Info* info = new Prefilter::Info();
423  info->match_ = new Prefilter(NONE);
424  return info;
425}
426
427// Constructs Prefilter::Info for any possible match.
428// This Prefilter::Info is valid for any regular expression,
429// since it makes no assertions whatsoever about the
430// strings being matched.
431Prefilter::Info* Prefilter::Info::AnyMatch() {
432  Prefilter::Info *info = new Prefilter::Info();
433  info->match_ = new Prefilter(ALL);
434  return info;
435}
436
437// Constructs Prefilter::Info for just the empty string.
438Prefilter::Info* Prefilter::Info::EmptyString() {
439  Prefilter::Info* info = new Prefilter::Info();
440  info->is_exact_ = true;
441  info->exact_.insert("");
442  return info;
443}
444
445// Constructs Prefilter::Info for a character class.
446typedef CharClass::iterator CCIter;
447Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
448                                         bool latin1) {
449  if (Trace) {
450    VLOG(0) << "CharClassInfo:";
451    for (CCIter i = cc->begin(); i != cc->end(); ++i)
452      VLOG(0) << "  " << i->lo << "-" << i->hi;
453  }
454
455  // If the class is too large, it's okay to overestimate.
456  if (cc->size() > 10)
457    return AnyChar();
458
459  Prefilter::Info *a = new Prefilter::Info();
460  for (CCIter i = cc->begin(); i != cc->end(); ++i)
461    for (Rune r = i->lo; r <= i->hi; r++) {
462      if (latin1) {
463        a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
464      } else {
465        a->exact_.insert(RuneToString(ToLowerRune(r)));
466      }
467    }
468
469
470  a->is_exact_ = true;
471
472  if (Trace) {
473    VLOG(0) << " = " << a->ToString();
474  }
475
476  return a;
477}
478
479class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
480 public:
481  Walker(bool latin1) : latin1_(latin1) {}
482
483  virtual Info* PostVisit(
484      Regexp* re, Info* parent_arg,
485      Info* pre_arg,
486      Info** child_args, int nchild_args);
487
488  virtual Info* ShortVisit(
489      Regexp* re,
490      Info* parent_arg);
491
492  bool latin1() { return latin1_; }
493 private:
494  bool latin1_;
495  DISALLOW_EVIL_CONSTRUCTORS(Walker);
496};
497
498Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
499  if (Trace) {
500    LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
501  }
502
503  bool latin1 = re->parse_flags() & Regexp::Latin1;
504  Prefilter::Info::Walker w(latin1);
505  Prefilter::Info* info = w.WalkExponential(re, NULL, 100000);
506
507  if (w.stopped_early()) {
508    delete info;
509    return NULL;
510  }
511
512  return info;
513}
514
515Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
516    Regexp* re, Prefilter::Info* parent_arg) {
517  return AnyMatch();
518}
519
520// Constructs the Prefilter::Info for the given regular expression.
521// Assumes re is simplified.
522Prefilter::Info* Prefilter::Info::Walker::PostVisit(
523    Regexp* re, Prefilter::Info* parent_arg,
524    Prefilter::Info* pre_arg, Prefilter::Info** child_args,
525    int nchild_args) {
526  Prefilter::Info *info;
527  switch (re->op()) {
528    default:
529    case kRegexpRepeat:
530      LOG(DFATAL) << "Bad regexp op " << re->op();
531      info = EmptyString();
532      break;
533
534    case kRegexpNoMatch:
535      info = NoMatch();
536      break;
537
538    // These ops match the empty string:
539    case kRegexpEmptyMatch:      // anywhere
540    case kRegexpBeginLine:       // at beginning of line
541    case kRegexpEndLine:         // at end of line
542    case kRegexpBeginText:       // at beginning of text
543    case kRegexpEndText:         // at end of text
544    case kRegexpWordBoundary:    // at word boundary
545    case kRegexpNoWordBoundary:  // not at word boundary
546      info = EmptyString();
547      break;
548
549    case kRegexpLiteral:
550      if (latin1()) {
551        info = LiteralLatin1(re->rune());
552      }
553      else {
554        info = Literal(re->rune());
555      }
556      break;
557
558    case kRegexpLiteralString:
559      if (re->nrunes() == 0) {
560        info = NoMatch();
561        break;
562      }
563      if (latin1()) {
564        info = LiteralLatin1(re->runes()[0]);
565        for (int i = 1; i < re->nrunes(); i++) {
566          info = Concat(info, LiteralLatin1(re->runes()[i]));
567        }
568      } else {
569        info = Literal(re->runes()[0]);
570        for (int i = 1; i < re->nrunes(); i++) {
571          info = Concat(info, Literal(re->runes()[i]));
572        }
573      }
574      break;
575
576    case kRegexpConcat: {
577      // Accumulate in info.
578      // Exact is concat of recent contiguous exact nodes.
579      info = NULL;
580      Info* exact = NULL;
581      for (int i = 0; i < nchild_args; i++) {
582        Info* ci = child_args[i];  // child info
583        if (!ci->is_exact() ||
584            (exact && ci->exact().size() * exact->exact().size() > 16)) {
585          // Exact run is over.
586          info = And(info, exact);
587          exact = NULL;
588          // Add this child's info.
589          info = And(info, ci);
590        } else {
591          // Append to exact run.
592          exact = Concat(exact, ci);
593        }
594      }
595      info = And(info, exact);
596    }
597      break;
598
599    case kRegexpAlternate:
600      info = child_args[0];
601      for (int i = 1; i < nchild_args; i++)
602        info = Alt(info, child_args[i]);
603      VLOG(10) << "Alt: " << info->ToString();
604      break;
605
606    case kRegexpStar:
607      info = Star(child_args[0]);
608      break;
609
610    case kRegexpQuest:
611      info = Quest(child_args[0]);
612      break;
613
614    case kRegexpPlus:
615      info = Plus(child_args[0]);
616      break;
617
618    case kRegexpAnyChar:
619      // Claim nothing, except that it's not empty.
620      info = AnyChar();
621      break;
622
623    case kRegexpCharClass:
624      info = CClass(re->cc(), latin1());
625      break;
626
627    case kRegexpCapture:
628      // These don't affect the set of matching strings.
629      info = child_args[0];
630      break;
631  }
632
633  if (Trace) {
634    VLOG(0) << "BuildInfo " << re->ToString()
635            << ": " << (info ? info->ToString() : "");
636  }
637
638  return info;
639}
640
641
642Prefilter* Prefilter::FromRegexp(Regexp* re) {
643  if (re == NULL)
644    return NULL;
645
646  Regexp* simple = re->Simplify();
647  Prefilter::Info *info = BuildInfo(simple);
648
649  simple->Decref();
650  if (info == NULL)
651    return NULL;
652
653  Prefilter* m = info->TakeMatch();
654
655  delete info;
656  return m;
657}
658
659string Prefilter::DebugString() const {
660  switch (op_) {
661    default:
662      LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
663      return StringPrintf("op%d", op_);
664    case NONE:
665      return "*no-matches*";
666    case ATOM:
667      return atom_;
668    case ALL:
669      return "";
670    case AND: {
671      string s = "";
672      for (int i = 0; i < subs_->size(); i++) {
673        if (i > 0)
674          s += " ";
675        Prefilter* sub = (*subs_)[i];
676        s += sub ? sub->DebugString() : "<nil>";
677      }
678      return s;
679    }
680    case OR: {
681      string s = "(";
682      for (int i = 0; i < subs_->size(); i++) {
683        if (i > 0)
684          s += "|";
685        Prefilter* sub = (*subs_)[i];
686        s += sub ? sub->DebugString() : "<nil>";
687      }
688      s += ")";
689      return s;
690    }
691  }
692}
693
694Prefilter* Prefilter::FromRE2(const RE2* re2) {
695  if (re2 == NULL)
696    return NULL;
697
698  Regexp* regexp = re2->Regexp();
699  if (regexp == NULL)
700    return NULL;
701
702  return FromRegexp(regexp);
703}
704
705
706}  // namespace re2
707