15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2006 The RE2 Authors. All Rights Reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Rewrite POSIX and other features in re 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to use simple extended regular expression features. 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Also sort and simplify character classes. 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/util.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/regexp.h" 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/walker-inl.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parses the regexp src and then simplifies it and sets *dst to the 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// string representation of the simplified form. Returns true on success. 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns false and sets *error (if error != NULL) on error. 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string* dst, 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexpStatus* status) { 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = Parse(src, flags, status); 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re == NULL) 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* sre = re->Simplify(); 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sre == NULL) { 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Should not happen, since Simplify never fails. 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) << "Simplify failed on " << src; 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (status) { 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_code(kRegexpInternalError); 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) status->set_error_arg(src); 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *dst = sre->ToString(); 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sre->Decref(); 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Assuming the simple_ flags on the children are accurate, 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is this Regexp* simple? 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Regexp::ComputeSimple() { 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** subs; 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (op_) { 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpNoMatch: 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEmptyMatch: 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpLiteral: 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpLiteralString: 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpBeginLine: 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEndLine: 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpBeginText: 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpWordBoundary: 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpNoWordBoundary: 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEndText: 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAnyChar: 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAnyByte: 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpHaveMatch: 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpConcat: 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAlternate: 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These are simple as long as the subpieces are simple. 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subs = sub(); 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < nsub_; i++) 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!subs[i]->simple_) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpCharClass: 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Simple as long as the char class is not empty, not full. 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ccb_ != NULL) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return !ccb_->empty() && !ccb_->full(); 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return !cc_->empty() && !cc_->full(); 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpCapture: 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subs = sub(); 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return subs[0]->simple_; 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpStar: 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpPlus: 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpQuest: 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subs = sub(); 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!subs[0]->simple_) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (subs[0]->op_) { 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpStar: 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpPlus: 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpQuest: 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEmptyMatch: 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpNoMatch: 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpRepeat: 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Walker subclass used by Simplify. 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The simplify walk is purely post-recursive: given the simplified children, 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// PostVisit creates the simplified result. 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The child_args are simplified Regexp*s. 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class SimplifyWalker : public Regexp::Walker<Regexp*> { 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SimplifyWalker() {} 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual Regexp* PostVisit(Regexp* re, 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* parent_arg, 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* pre_arg, 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** child_args, int nchild_args); 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual Regexp* Copy(Regexp* re); 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These functions are declared inside SimplifyWalker so that 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // they can edit the private fields of the Regexps they construct. 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Creates a concatenation of two Regexp, consuming refs to re1 and re2. 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Caller must Decref return value when done with it. 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Simplifies the expression re{min,max} in terms of *, +, and ?. 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns a new regexp. Does not edit re. Does not consume reference to re. 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Caller must Decref return value when done with it. 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static Regexp* SimplifyRepeat(Regexp* re, int min, int max, 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags parse_flags); 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Simplifies a character class by expanding any named classes 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // into rune ranges. Does not edit re. Does not consume ref to re. 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Caller must Decref return value when done with it. 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static Regexp* SimplifyCharClass(Regexp* re); 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker); 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Simplifies a regular expression, returning a new regexp. 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The new regexp uses traditional Unix egrep features only, 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// plus the Perl (?:) non-capturing parentheses. 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Otherwise, no POSIX or Perl additions. The new regexp 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// captures exactly the same subexpressions (with the same indices) 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// as the original. 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Does not edit current object. 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller must Decref() return value when done with it. 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* Regexp::Simplify() { 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (simple_) 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Incref(); 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SimplifyWalker w; 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return w.Walk(this, NULL); 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define Simplify DontCallSimplify // Avoid accidental recursion 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::Copy(Regexp* re) { 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This should never be called, since we use Walk and not 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // WalkExponential. 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->simple_) { 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *stop = true; 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::PostVisit(Regexp* re, 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* parent_arg, 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* pre_arg, 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** child_args, 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nchild_args) { 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (re->op()) { 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpNoMatch: 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEmptyMatch: 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpLiteral: 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpLiteralString: 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpBeginLine: 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEndLine: 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpBeginText: 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpWordBoundary: 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpNoWordBoundary: 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEndText: 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAnyChar: 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAnyByte: 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpHaveMatch: 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // All these are always simple. 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->simple_ = true; 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpConcat: 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAlternate: { 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These are simple as long as the subpieces are simple. 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Two passes to avoid allocation in the common case. 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool changed = false; 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** subs = re->sub(); 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < re->nsub_; i++) { 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* sub = subs[i]; 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* newsub = child_args[i]; 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (newsub != sub) { 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) changed = true; 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!changed) { 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < re->nsub_; i++) { 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* newsub = child_args[i]; 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) newsub->Decref(); 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->simple_ = true; 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* nre = new Regexp(re->op(), re->parse_flags()); 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->AllocSub(re->nsub_); 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** nre_subs = nre->sub(); 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i <re->nsub_; i++) 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre_subs[i] = child_args[i]; 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->simple_ = true; 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return nre; 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpCapture: { 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* newsub = child_args[0]; 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (newsub == re->sub()[0]) { 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) newsub->Decref(); 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->simple_ = true; 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->AllocSub(1); 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->sub()[0] = newsub; 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->cap_ = re->cap_; 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->simple_ = true; 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return nre; 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpStar: 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpPlus: 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpQuest: { 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* newsub = child_args[0]; 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special case: repeat the empty string as much as 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // you want, but it's still the empty string. 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (newsub->op() == kRegexpEmptyMatch) 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return newsub; 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These are simple as long as the subpiece is simple. 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (newsub == re->sub()[0]) { 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) newsub->Decref(); 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->simple_ = true; 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These are also idempotent if flags are constant. 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->op() == newsub->op() && 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->parse_flags() == newsub->parse_flags()) 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return newsub; 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* nre = new Regexp(re->op(), re->parse_flags()); 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->AllocSub(1); 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->sub()[0] = newsub; 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->simple_ = true; 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return nre; 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpRepeat: { 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* newsub = child_args[0]; 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special case: repeat the empty string as much as 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // you want, but it's still the empty string. 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (newsub->op() == kRegexpEmptyMatch) 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return newsub; 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->parse_flags()); 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) newsub->Decref(); 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->simple_ = true; 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return nre; 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpCharClass: { 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* nre = SimplifyCharClass(re); 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->simple_ = true; 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return nre; 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) << "Simplify case not handled: " << re->op(); 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Creates a concatenation of two Regexp, consuming refs to re1 and re2. 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a new Regexp, handing the ref to the caller. 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags parse_flags) { 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = new Regexp(kRegexpConcat, parse_flags); 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->AllocSub(2); 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** subs = re->sub(); 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subs[0] = re1; 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subs[1] = re2; 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re; 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Simplifies the expression re{min,max} in terms of *, +, and ?. 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a new regexp. Does not edit re. Does not consume reference to re. 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller must Decref return value when done with it. 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The result will *not* necessarily have the right capturing parens 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// if you call ToString() and re-parse it: (x){2} becomes (x)(x), 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but in the Regexp* representation, both (x) are marked as $1. 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags f) { 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // x{n,} means at least n matches of x. 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (max == -1) { 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special case: x{0,} is x* 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (min == 0) 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Regexp::Star(re->Incref(), f); 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special case: x{1,} is x+ 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (min == 1) 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Regexp::Plus(re->Incref(), f); 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // General case: x{4,} is xxxx+ 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* nre = new Regexp(kRegexpConcat, f); 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->AllocSub(min); 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) VLOG(1) << "Simplify " << min; 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** nre_subs = nre->sub(); 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < min-1; i++) 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre_subs[i] = re->Incref(); 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre_subs[min-1] = Regexp::Plus(re->Incref(), f); 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return nre; 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special case: (x){0} matches only empty string. 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (min == 0 && max == 0) 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return new Regexp(kRegexpEmptyMatch, f); 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special case: x{1} is just x. 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (min == 1 && max == 1) 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // General case: x{n,m} means n copies of x and m copies of x?. 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The machine will do less work if we nest the final m copies, 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // so that x{2,5} = xx(x(x(x)?)?)? 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Build leading prefix: xx. Capturing only on the last one. 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* nre = NULL; 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (min > 0) { 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre = new Regexp(kRegexpConcat, f); 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre->AllocSub(min); 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** nre_subs = nre->sub(); 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < min; i++) 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre_subs[i] = re->Incref(); 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Build and attach suffix: (x(x(x)?)?)? 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (max > min) { 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* suf = Regexp::Quest(re->Incref(), f); 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = min+1; i < max; i++) 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nre == NULL) 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre = suf; 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nre = Concat2(nre, suf, f); 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nre == NULL) { 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Some degenerate case, like min > max, or min < max < 0. 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This shouldn't happen, because the parser rejects such regexps. 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return new Regexp(kRegexpNoMatch, f); 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return nre; 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Simplifies a character class. 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller must Decref return value when done with it. 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CharClass* cc = re->cc(); 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special cases 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cc->empty()) 3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return new Regexp(kRegexpNoMatch, re->parse_flags()); 3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cc->full()) 3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return new Regexp(kRegexpAnyChar, re->parse_flags()); 3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return re->Incref(); 3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace re2 394