15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2006 The RE2 Authors.  All Rights Reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Rewrite POSIX and other features in re
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to use simple extended regular expression features.
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Also sort and simplify character classes.
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/util.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/regexp.h"
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/walker-inl.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses the regexp src and then simplifies it and sets *dst to the
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// string representation of the simplified form.  Returns true on success.
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns false and sets *error (if error != NULL) on error.
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            string* dst,
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            RegexpStatus* status) {
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp* re = Parse(src, flags, status);
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (re == NULL)
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp* sre = re->Simplify();
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  re->Decref();
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (sre == NULL) {
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Should not happen, since Simplify never fails.
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOG(ERROR) << "Simplify failed on " << src;
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (status) {
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      status->set_code(kRegexpInternalError);
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      status->set_error_arg(src);
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *dst = sre->ToString();
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sre->Decref();
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return true;
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Assuming the simple_ flags on the children are accurate,
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is this Regexp* simple?
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ComputeSimple() {
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp** subs;
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (op_) {
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpNoMatch:
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEmptyMatch:
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpLiteral:
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpLiteralString:
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpBeginLine:
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEndLine:
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpBeginText:
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpWordBoundary:
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpNoWordBoundary:
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEndText:
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpAnyChar:
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpAnyByte:
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpHaveMatch:
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return true;
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpConcat:
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpAlternate:
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // These are simple as long as the subpieces are simple.
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      subs = sub();
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (int i = 0; i < nsub_; i++)
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (!subs[i]->simple_)
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return false;
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return true;
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpCharClass:
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Simple as long as the char class is not empty, not full.
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (ccb_ != NULL)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return !ccb_->empty() && !ccb_->full();
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return !cc_->empty() && !cc_->full();
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpCapture:
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      subs = sub();
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return subs[0]->simple_;
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpStar:
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpPlus:
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpQuest:
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      subs = sub();
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (!subs[0]->simple_)
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return false;
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      switch (subs[0]->op_) {
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case kRegexpStar:
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case kRegexpPlus:
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case kRegexpQuest:
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case kRegexpEmptyMatch:
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        case kRegexpNoMatch:
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return false;
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        default:
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          break;
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return true;
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpRepeat:
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return false;
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Walker subclass used by Simplify.
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The simplify walk is purely post-recursive: given the simplified children,
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// PostVisit creates the simplified result.
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The child_args are simplified Regexp*s.
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class SimplifyWalker : public Regexp::Walker<Regexp*> {
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  SimplifyWalker() {}
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual Regexp* PostVisit(Regexp* re,
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            Regexp* parent_arg,
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            Regexp* pre_arg,
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            Regexp** child_args, int nchild_args);
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual Regexp* Copy(Regexp* re);
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // These functions are declared inside SimplifyWalker so that
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // they can edit the private fields of the Regexps they construct.
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Caller must Decref return value when done with it.
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Simplifies the expression re{min,max} in terms of *, +, and ?.
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Caller must Decref return value when done with it.
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                Regexp::ParseFlags parse_flags);
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Simplifies a character class by expanding any named classes
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // into rune ranges.  Does not edit re.  Does not consume ref to re.
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Caller must Decref return value when done with it.
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static Regexp* SimplifyCharClass(Regexp* re);
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Simplifies a regular expression, returning a new regexp.
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The new regexp uses traditional Unix egrep features only,
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// plus the Perl (?:) non-capturing parentheses.
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Otherwise, no POSIX or Perl additions.  The new regexp
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// captures exactly the same subexpressions (with the same indices)
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// as the original.
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Does not edit current object.
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller must Decref() return value when done with it.
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* Regexp::Simplify() {
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (simple_)
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return Incref();
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  SimplifyWalker w;
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return w.Walk(this, NULL);
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define Simplify DontCallSimplify  // Avoid accidental recursion
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::Copy(Regexp* re) {
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return re->Incref();
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This should never be called, since we use Walk and not
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // WalkExponential.
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return re->Incref();
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (re->simple_) {
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    *stop = true;
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return re->Incref();
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return NULL;
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::PostVisit(Regexp* re,
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                  Regexp* parent_arg,
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                  Regexp* pre_arg,
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                  Regexp** child_args,
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                  int nchild_args) {
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (re->op()) {
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpNoMatch:
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEmptyMatch:
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpLiteral:
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpLiteralString:
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpBeginLine:
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEndLine:
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpBeginText:
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpWordBoundary:
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpNoWordBoundary:
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEndText:
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpAnyChar:
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpAnyByte:
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpHaveMatch:
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // All these are always simple.
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      re->simple_ = true;
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return re->Incref();
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpConcat:
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpAlternate: {
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // These are simple as long as the subpieces are simple.
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Two passes to avoid allocation in the common case.
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      bool changed = false;
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp** subs = re->sub();
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (int i = 0; i < re->nsub_; i++) {
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        Regexp* sub = subs[i];
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        Regexp* newsub = child_args[i];
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (newsub != sub) {
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          changed = true;
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          break;
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (!changed) {
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        for (int i = 0; i < re->nsub_; i++) {
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          Regexp* newsub = child_args[i];
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          newsub->Decref();
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        re->simple_ = true;
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return re->Incref();
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp* nre = new Regexp(re->op(), re->parse_flags());
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->AllocSub(re->nsub_);
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp** nre_subs = nre->sub();
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (int i = 0; i <re->nsub_; i++)
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        nre_subs[i] = child_args[i];
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->simple_ = true;
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return nre;
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpCapture: {
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp* newsub = child_args[0];
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (newsub == re->sub()[0]) {
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        newsub->Decref();
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        re->simple_ = true;
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return re->Incref();
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->AllocSub(1);
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->sub()[0] = newsub;
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->cap_ = re->cap_;
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->simple_ = true;
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return nre;
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpStar:
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpPlus:
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpQuest: {
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp* newsub = child_args[0];
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Special case: repeat the empty string as much as
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // you want, but it's still the empty string.
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (newsub->op() == kRegexpEmptyMatch)
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return newsub;
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // These are simple as long as the subpiece is simple.
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (newsub == re->sub()[0]) {
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        newsub->Decref();
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        re->simple_ = true;
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return re->Incref();
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // These are also idempotent if flags are constant.
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (re->op() == newsub->op() &&
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          re->parse_flags() == newsub->parse_flags())
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return newsub;
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp* nre = new Regexp(re->op(), re->parse_flags());
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->AllocSub(1);
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->sub()[0] = newsub;
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->simple_ = true;
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return nre;
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpRepeat: {
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp* newsub = child_args[0];
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Special case: repeat the empty string as much as
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // you want, but it's still the empty string.
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (newsub->op() == kRegexpEmptyMatch)
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return newsub;
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                   re->parse_flags());
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      newsub->Decref();
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->simple_ = true;
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return nre;
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpCharClass: {
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Regexp* nre = SimplifyCharClass(re);
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre->simple_ = true;
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return nre;
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  LOG(ERROR) << "Simplify case not handled: " << re->op();
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return re->Incref();
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a new Regexp, handing the ref to the caller.
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                Regexp::ParseFlags parse_flags) {
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  re->AllocSub(2);
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp** subs = re->sub();
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  subs[0] = re1;
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  subs[1] = re2;
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return re;
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Simplifies the expression re{min,max} in terms of *, +, and ?.
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller must Decref return value when done with it.
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The result will *not* necessarily have the right capturing parens
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but in the Regexp* representation, both (x) are marked as $1.
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                       Regexp::ParseFlags f) {
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // x{n,} means at least n matches of x.
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (max == -1) {
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Special case: x{0,} is x*
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (min == 0)
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return Regexp::Star(re->Incref(), f);
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Special case: x{1,} is x+
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (min == 1)
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return Regexp::Plus(re->Incref(), f);
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // General case: x{4,} is xxxx+
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Regexp* nre = new Regexp(kRegexpConcat, f);
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    nre->AllocSub(min);
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    VLOG(1) << "Simplify " << min;
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Regexp** nre_subs = nre->sub();
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (int i = 0; i < min-1; i++)
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre_subs[i] = re->Incref();
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return nre;
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Special case: (x){0} matches only empty string.
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (min == 0 && max == 0)
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return new Regexp(kRegexpEmptyMatch, f);
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Special case: x{1} is just x.
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (min == 1 && max == 1)
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return re->Incref();
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // General case: x{n,m} means n copies of x and m copies of x?.
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The machine will do less work if we nest the final m copies,
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // so that x{2,5} = xx(x(x(x)?)?)?
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Build leading prefix: xx.  Capturing only on the last one.
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp* nre = NULL;
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (min > 0) {
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    nre = new Regexp(kRegexpConcat, f);
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    nre->AllocSub(min);
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Regexp** nre_subs = nre->sub();
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (int i = 0; i < min; i++)
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre_subs[i] = re->Incref();
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Build and attach suffix: (x(x(x)?)?)?
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (max > min) {
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Regexp* suf = Regexp::Quest(re->Incref(), f);
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (int i = min+1; i < max; i++)
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (nre == NULL)
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre = suf;
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    else
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      nre = Concat2(nre, suf, f);
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (nre == NULL) {
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Some degenerate case, like min > max, or min < max < 0.
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // This shouldn't happen, because the parser rejects such regexps.
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return new Regexp(kRegexpNoMatch, f);
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return nre;
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Simplifies a character class.
3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller must Decref return value when done with it.
3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CharClass* cc = re->cc();
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Special cases
3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (cc->empty())
3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return new Regexp(kRegexpNoMatch, re->parse_flags());
3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (cc->full())
3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return new Regexp(kRegexpAnyChar, re->parse_flags());
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return re->Incref();
3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace re2
394