simplify.cc revision 2ee91b4af4353b9e6a9d591c32fedfc58fd4ef35
10a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// Copyright 2006 The RE2 Authors. All Rights Reserved. 20a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// Use of this source code is governed by a BSD-style 30a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// license that can be found in the LICENSE file. 40a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel 50a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// Rewrite POSIX and other features in re 60a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// to use simple extended regular expression features. 70a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// Also sort and simplify character classes. 80a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel 90a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel#include "util/util.h" 100a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel#include "re2/regexp.h" 110a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel#include "re2/walker-inl.h" 120a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel 130a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patelnamespace re2 { 140a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel 1537da0ad07bfd5e3856ce5d182b27b4c499c03c9aChris Lattner// Parses the regexp src and then simplifies it and sets *dst to the 16647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson// string representation of the simplified form. Returns true on success. 17f457d1316dec017cf204b54524878310c356bf64Devang Patel// Returns false and sets *error (if error != NULL) on error. 18937b1e92a9862862722732cb0f72d5ade77ac4c8Devang Patelbool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, 199d89df169027035bf692b3d397bb93cff7bdc860Devang Patel string* dst, 209d89df169027035bf692b3d397bb93cff7bdc860Devang Patel RegexpStatus* status) { 21d77fdba5737ee71b63681160fba5b2fc200583f4Devang Patel Regexp* re = Parse(src, flags, status); 2228bc9d88260a3e153ead4311c9129e3d3ad07736Devang Patel if (re == NULL) 2337da0ad07bfd5e3856ce5d182b27b4c499c03c9aChris Lattner return false; 240a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel Regexp* sre = re->Simplify(); 250a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel re->Decref(); 260a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel if (sre == NULL) { 27b2a33b46469a6d2c0e61122002079efb7d6d3f9cChris Lattner // Should not happen, since Simplify never fails. 28647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson LOG(ERROR) << "Simplify failed on " << src; 295e9cd434234a36089daeee915f1dc02b96947fbaChris Lattner if (status) { 305e9cd434234a36089daeee915f1dc02b96947fbaChris Lattner status->set_code(kRegexpInternalError); 31bc5201f8371f9041e79efcca3b158335da5c2604Devang Patel status->set_error_arg(src); 325e9cd434234a36089daeee915f1dc02b96947fbaChris Lattner } 3349b63a1f2c493f6452d22cf0df4c147e0539fb2eDevang Patel return false; 34647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson } 350d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov *dst = sre->ToString(); 36647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson sre->Decref(); 37647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson return true; 382353122a3c8f735deb1cb3c9c61785280923e09cNick Lewycky} 392353122a3c8f735deb1cb3c9c61785280923e09cNick Lewycky 40647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson// Assuming the simple_ flags on the children are accurate, 41647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson// is this Regexp* simple? 42647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Andersonbool Regexp::ComputeSimple() { 435d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner Regexp** subs; 44c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner switch (op_) { 45c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner case kRegexpNoMatch: 465d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner case kRegexpEmptyMatch: 47c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner case kRegexpLiteral: 485d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner case kRegexpLiteralString: 49c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner case kRegexpBeginLine: 50c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner case kRegexpEndLine: 515d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner case kRegexpBeginText: 525d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner case kRegexpWordBoundary: 530d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov case kRegexpNoWordBoundary: 54b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner case kRegexpEndText: 551d880e51142566838131b091306dd03549adb6d2Chris Lattner case kRegexpAnyChar: 561d880e51142566838131b091306dd03549adb6d2Chris Lattner case kRegexpAnyByte: 570d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov case kRegexpHaveMatch: 58c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner return true; 59c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner case kRegexpConcat: 60c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner case kRegexpAlternate: 61c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner // These are simple as long as the subpieces are simple. 62c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner subs = sub(); 63c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner for (int i = 0; i < nsub_; i++) 645d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner if (!subs[i]->simple_) 655d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner return false; 66c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner return true; 67c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner case kRegexpCharClass: 685d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner // Simple as long as the char class is not empty, not full. 695d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner if (ccb_ != NULL) 70c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner return !ccb_->empty() && !ccb_->full(); 71c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner return !cc_->empty() && !cc_->full(); 72c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner case kRegexpCapture: 73c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner subs = sub(); 74c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner return subs[0]->simple_; 75b2a33b46469a6d2c0e61122002079efb7d6d3f9cChris Lattner case kRegexpStar: 760a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel case kRegexpPlus: 77c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner case kRegexpQuest: 785d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner subs = sub(); 79b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner if (!subs[0]->simple_) 805d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner return false; 815d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner switch (subs[0]->op_) { 825d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner case kRegexpStar: 83b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner case kRegexpPlus: 84b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner case kRegexpQuest: 85b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner case kRegexpEmptyMatch: 86b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner case kRegexpNoMatch: 87bc5201f8371f9041e79efcca3b158335da5c2604Devang Patel return false; 88b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner default: 890d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov break; 90b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner } 91b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner return true; 92b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner case kRegexpRepeat: 93b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner return false; 945d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner } 95b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; 965d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner return false; 97b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner} 98b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner 99b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner// Walker subclass used by Simplify. 100f2410180d07cb1ca074ad2a9b8538bc64895e68bChris Lattner// The simplify walk is purely post-recursive: given the simplified children, 101f2410180d07cb1ca074ad2a9b8538bc64895e68bChris Lattner// PostVisit creates the simplified result. 1020d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov// The child_args are simplified Regexp*s. 103b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattnerclass SimplifyWalker : public Regexp::Walker<Regexp*> { 1046f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin public: 1056f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin SimplifyWalker() {} 1066f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); 1076f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin virtual Regexp* PostVisit(Regexp* re, 10854a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner Regexp* parent_arg, 10954a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner Regexp* pre_arg, 1100d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov Regexp** child_args, int nchild_args); 111b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner virtual Regexp* Copy(Regexp* re); 1125d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); 113b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner 1145d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner private: 115f2410180d07cb1ca074ad2a9b8538bc64895e68bChris Lattner // These functions are declared inside SimplifyWalker so that 116f2410180d07cb1ca074ad2a9b8538bc64895e68bChris Lattner // they can edit the private fields of the Regexps they construct. 1178fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez 1186f8694b272ffbd5b4746feccd4866bf89794fe86Victor Hernandez // Creates a concatenation of two Regexp, consuming refs to re1 and re2. 1198fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez // Caller must Decref return value when done with it. 1208fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); 1218fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez 122d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner // Simplifies the expression re{min,max} in terms of *, +, and ?. 123d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner // Returns a new regexp. Does not edit re. Does not consume reference to re. 124d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner // Caller must Decref return value when done with it. 125d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner static Regexp* SimplifyRepeat(Regexp* re, int min, int max, 1268fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez Regexp::ParseFlags parse_flags); 1278fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez 1288fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez // Simplifies a character class by expanding any named classes 129e05f66ef2e37bcbd88acaeef2bf14ddb1f04488fVictor Hernandez // into rune ranges. Does not edit re. Does not consume ref to re. 1308fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez // Caller must Decref return value when done with it. 131d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner static Regexp* SimplifyCharClass(Regexp* re); 132e05f66ef2e37bcbd88acaeef2bf14ddb1f04488fVictor Hernandez 133d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker); 134c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez}; 1358fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez 136d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// Simplifies a regular expression, returning a new regexp. 137d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// The new regexp uses traditional Unix egrep features only, 138d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// plus the Perl (?:) non-capturing parentheses. 139d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// Otherwise, no POSIX or Perl additions. The new regexp 14054630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez// captures exactly the same subexpressions (with the same indices) 141d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// as the original. 142d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// Does not edit current object. 143d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// Caller must Decref() return value when done with it. 144d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner 145c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor HernandezRegexp* Regexp::Simplify() { 146c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez if (simple_) 147c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez return Incref(); 14854630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez SimplifyWalker w; 149c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez return w.Walk(this, NULL); 150c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez} 151c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez 1526cead7879aa8902905809e3853d31093a5511c2fVictor Hernandez#define Simplify DontCallSimplify // Avoid accidental recursion 1536cead7879aa8902905809e3853d31093a5511c2fVictor Hernandez 1548fffff537194e2375e65600f27d716c99f0eb38aVictor HernandezRegexp* SimplifyWalker::Copy(Regexp* re) { 15554630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez return re->Incref(); 15654630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez} 15754630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez 158c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor HernandezRegexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { 15954630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez // This should never be called, since we use Walk and not 16054630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez // WalkExponential. 1618fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; 1629520cc2eae199f8974d5ed4f89ec43468be8f128Chandler Carruth return re->Incref(); 163d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner} 164d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner 1656f8694b272ffbd5b4746feccd4866bf89794fe86Victor HernandezRegexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { 1666f8694b272ffbd5b4746feccd4866bf89794fe86Victor Hernandez if (re->simple_) { 1679520cc2eae199f8974d5ed4f89ec43468be8f128Chandler Carruth *stop = true; 1688fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez return re->Incref(); 16954630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez } 17054630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez return NULL; 171c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez} 172c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez 173b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris LattnerRegexp* SimplifyWalker::PostVisit(Regexp* re, 174b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner Regexp* parent_arg, 175b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner Regexp* pre_arg, 176b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner Regexp** child_args, 177b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner int nchild_args) { 178b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner switch (re->op()) { 1790a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel case kRegexpNoMatch: 1800a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel case kRegexpEmptyMatch: 18124e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez case kRegexpLiteral: 182e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpLiteralString: 183e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpBeginLine: 1845f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel case kRegexpEndLine: 185e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpBeginText: 186e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpWordBoundary: 187e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpNoWordBoundary: 188e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpEndText: 189e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpAnyChar: 190e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpAnyByte: 191e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpHaveMatch: 192e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez // All these are always simple. 193e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez re->simple_ = true; 194e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez return re->Incref(); 19524e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez 19624e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez case kRegexpConcat: 197e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpAlternate: { 198e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez // These are simple as long as the subpieces are simple. 199e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez // Two passes to avoid allocation in the common case. 200e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez bool changed = false; 201e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez Regexp** subs = re->sub(); 202e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez for (int i = 0; i < re->nsub_; i++) { 203e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez Regexp* sub = subs[i]; 204e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez Regexp* newsub = child_args[i]; 20524e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez if (newsub != sub) { 206f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel changed = true; 207f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel break; 208f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel } 209f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel } 210f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel if (!changed) { 211f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel for (int i = 0; i < re->nsub_; i++) { 212f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel Regexp* newsub = child_args[i]; 213f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel newsub->Decref(); 214f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel } 215f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel re->simple_ = true; 216f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel return re->Incref(); 217f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel } 218f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel Regexp* nre = new Regexp(re->op(), re->parse_flags()); 219f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel nre->AllocSub(re->nsub_); 220e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez Regexp** nre_subs = nre->sub(); 221e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez for (int i = 0; i <re->nsub_; i++) 222e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez nre_subs[i] = child_args[i]; 223e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez nre->simple_ = true; 224e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez return nre; 225e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez } 2260d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov 2275f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel case kRegexpCapture: { 228647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson Regexp* newsub = child_args[0]; 229647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson if (newsub == re->sub()[0]) { 23024e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez newsub->Decref(); 23124e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez re->simple_ = true; 23224e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez return re->Incref(); 23324e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez } 234e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); 23524e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez nre->AllocSub(1); 23624e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez nre->sub()[0] = newsub; 23724e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez nre->cap_ = re->cap_; 23824e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez nre->simple_ = true; 239e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez return nre; 240e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez } 241e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez 242e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpStar: 243e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez case kRegexpPlus: 2445d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner case kRegexpQuest: { 2455d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner Regexp* newsub = child_args[0]; 246b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner // Special case: repeat the empty string as much as 247c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner // you want, but it's still the empty string. 248c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner if (newsub->op() == kRegexpEmptyMatch) 249b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner return newsub; 2505d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner 2515d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner // These are simple as long as the subpiece is simple. 252ea7b6bb32355c0d4cb1e1c16df7c8cf63787e73bDevang Patel if (newsub == re->sub()[0]) { 253b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner newsub->Decref(); 254c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner re->simple_ = true; 2556f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin return re->Incref(); 2566f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin } 2576f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin 2586f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin // These are also idempotent if flags are constant. 25906fdaccc89d9abdc7e03797b25173791a2f5692fDevang Patel if (re->op() == newsub->op() && 260c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner re->parse_flags() == newsub->parse_flags()) 2615d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner return newsub; 2625d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner 263df58389ef1fb8373d900975a6a66e4fbe0344f83Chris Lattner Regexp* nre = new Regexp(re->op(), re->parse_flags()); 2640d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov nre->AllocSub(1); 265df58389ef1fb8373d900975a6a66e4fbe0344f83Chris Lattner nre->sub()[0] = newsub; 2665f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel nre->simple_ = true; 2675f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel return nre; 26854a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner } 269b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner 27054a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner case kRegexpRepeat: { 27154a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner Regexp* newsub = child_args[0]; 27254a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner // Special case: repeat the empty string as much as 27354a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner // you want, but it's still the empty string. 2740d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov if (newsub->op() == kRegexpEmptyMatch) 275496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner return newsub; 276496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner 277496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, 278496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner re->parse_flags()); 279496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner newsub->Decref(); 280df58389ef1fb8373d900975a6a66e4fbe0344f83Chris Lattner nre->simple_ = true; 28154a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner return nre; 28254a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner } 28354a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner 28454a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner case kRegexpCharClass: { 28554a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner Regexp* nre = SimplifyCharClass(re); 28654a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner nre->simple_ = true; 28754a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner return nre; 2880d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov } 28954a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner } 29054a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner 29154a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner LOG(ERROR) << "Simplify case not handled: " << re->op(); 2925f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel return re->Incref(); 2935f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel} 2945f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel 2955f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel// Creates a concatenation of two Regexp, consuming refs to re1 and re2. 2965f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel// Returns a new Regexp, handing the ref to the caller. 2975f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang PatelRegexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, 2985f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel Regexp::ParseFlags parse_flags) { 299b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner Regexp* re = new Regexp(kRegexpConcat, parse_flags); 300496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner re->AllocSub(2); 301496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner Regexp** subs = re->sub(); 3025f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel subs[0] = re1; 3035f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel subs[1] = re2; 304496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner return re; 305496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner} 3065f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel 3075f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel// Simplifies the expression re{min,max} in terms of *, +, and ?. 308f457d1316dec017cf204b54524878310c356bf64Devang Patel// Returns a new regexp. Does not edit re. Does not consume reference to re. 309b2a33b46469a6d2c0e61122002079efb7d6d3f9cChris Lattner// Caller must Decref return value when done with it. 310f457d1316dec017cf204b54524878310c356bf64Devang Patel// The result will *not* necessarily have the right capturing parens 31126028f27ddd132b3284943e11aca130c2911abc4Devang Patel// if you call ToString() and re-parse it: (x){2} becomes (x)(x), 31226028f27ddd132b3284943e11aca130c2911abc4Devang Patel// but in the Regexp* representation, both (x) are marked as $1. 31326028f27ddd132b3284943e11aca130c2911abc4Devang PatelRegexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, 31426028f27ddd132b3284943e11aca130c2911abc4Devang Patel Regexp::ParseFlags f) { 31526028f27ddd132b3284943e11aca130c2911abc4Devang Patel // x{n,} means at least n matches of x. 31626028f27ddd132b3284943e11aca130c2911abc4Devang Patel if (max == -1) { 31726028f27ddd132b3284943e11aca130c2911abc4Devang Patel // Special case: x{0,} is x* 31826028f27ddd132b3284943e11aca130c2911abc4Devang Patel if (min == 0) 31926028f27ddd132b3284943e11aca130c2911abc4Devang Patel return Regexp::Star(re->Incref(), f); 32026028f27ddd132b3284943e11aca130c2911abc4Devang Patel 32126028f27ddd132b3284943e11aca130c2911abc4Devang Patel // Special case: x{1,} is x+ 32226028f27ddd132b3284943e11aca130c2911abc4Devang Patel if (min == 1) 32326028f27ddd132b3284943e11aca130c2911abc4Devang Patel return Regexp::Plus(re->Incref(), f); 32426028f27ddd132b3284943e11aca130c2911abc4Devang Patel 32526028f27ddd132b3284943e11aca130c2911abc4Devang Patel // General case: x{4,} is xxxx+ 32626028f27ddd132b3284943e11aca130c2911abc4Devang Patel Regexp* nre = new Regexp(kRegexpConcat, f); 32726028f27ddd132b3284943e11aca130c2911abc4Devang Patel nre->AllocSub(min); 32826028f27ddd132b3284943e11aca130c2911abc4Devang Patel VLOG(1) << "Simplify " << min; 32926028f27ddd132b3284943e11aca130c2911abc4Devang Patel Regexp** nre_subs = nre->sub(); 33026028f27ddd132b3284943e11aca130c2911abc4Devang Patel for (int i = 0; i < min-1; i++) 3313e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel nre_subs[i] = re->Incref(); 3323e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel nre_subs[min-1] = Regexp::Plus(re->Incref(), f); 33357109697282c6dffd04e2e275606352914217114Chris Lattner return nre; 33457109697282c6dffd04e2e275606352914217114Chris Lattner } 335d77fdba5737ee71b63681160fba5b2fc200583f4Devang Patel 3360d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov // Special case: (x){0} matches only empty string. 3375316bf0252ce9797e90a233645b6580695403837Devang Patel if (min == 0 && max == 0) 3388fba578be72dc98497508dec053e966858571f6aDevang Patel return new Regexp(kRegexpEmptyMatch, f); 33928bc9d88260a3e153ead4311c9129e3d3ad07736Devang Patel 3403e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel // Special case: x{1} is just x. 3410d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov if (min == 1 && max == 1) 3423e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel return re->Incref(); 34349708ad993529611cedfbe49ae44bb10beb73abeDevang Patel 3443e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel // General case: x{n,m} means n copies of x and m copies of x?. 34549708ad993529611cedfbe49ae44bb10beb73abeDevang Patel // The machine will do less work if we nest the final m copies, 34626028f27ddd132b3284943e11aca130c2911abc4Devang Patel // so that x{2,5} = xx(x(x(x)?)?)? 3475316bf0252ce9797e90a233645b6580695403837Devang Patel 348f457d1316dec017cf204b54524878310c356bf64Devang Patel // Build leading prefix: xx. Capturing only on the last one. 349a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel Regexp* nre = NULL; 350ab67e705f59d567afded845465f358b8a66ab62eDevang Patel if (min > 0) { 351b2a33b46469a6d2c0e61122002079efb7d6d3f9cChris Lattner nre = new Regexp(kRegexpConcat, f); 3523e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel nre->AllocSub(min); 3535d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner Regexp** nre_subs = nre->sub(); 3540d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov for (int i = 0; i < min; i++) 3555d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner nre_subs[i] = re->Incref(); 3565d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner } 3571d0be15f89cb5056e20e2d24faa8d6afb1573bcaOwen Anderson 3581d0be15f89cb5056e20e2d24faa8d6afb1573bcaOwen Anderson // Build and attach suffix: (x(x(x)?)?)? 359ab67e705f59d567afded845465f358b8a66ab62eDevang Patel if (max > min) { 360ab67e705f59d567afded845465f358b8a66ab62eDevang Patel Regexp* suf = Regexp::Quest(re->Incref(), f); 36157109697282c6dffd04e2e275606352914217114Chris Lattner for (int i = min+1; i < max; i++) 36257109697282c6dffd04e2e275606352914217114Chris Lattner suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); 36357109697282c6dffd04e2e275606352914217114Chris Lattner if (nre == NULL) 36457109697282c6dffd04e2e275606352914217114Chris Lattner nre = suf; 36557109697282c6dffd04e2e275606352914217114Chris Lattner else 3665d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner nre = Concat2(nre, suf, f); 3675d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner } 36857109697282c6dffd04e2e275606352914217114Chris Lattner 36957109697282c6dffd04e2e275606352914217114Chris Lattner if (nre == NULL) { 37057109697282c6dffd04e2e275606352914217114Chris Lattner // Some degenerate case, like min > max, or min < max < 0. 3715d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner // This shouldn't happen, because the parser rejects such regexps. 3723e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; 3735d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner return new Regexp(kRegexpNoMatch, f); 3743e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel } 37557109697282c6dffd04e2e275606352914217114Chris Lattner 37657109697282c6dffd04e2e275606352914217114Chris Lattner return nre; 3775d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner} 3783e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel 3793e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel// Simplifies a character class. 38057109697282c6dffd04e2e275606352914217114Chris Lattner// Caller must Decref return value when done with it. 38157109697282c6dffd04e2e275606352914217114Chris LattnerRegexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { 382a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel CharClass* cc = re->cc(); 383a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel 384a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel // Special cases 385a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel if (cc->empty()) 386a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel return new Regexp(kRegexpNoMatch, re->parse_flags()); 387a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel if (cc->full()) 388a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel return new Regexp(kRegexpAnyChar, re->parse_flags()); 389a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel 39057109697282c6dffd04e2e275606352914217114Chris Lattner return re->Incref(); 391a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel} 392a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel 3930386f01e061513094504bc11f8352a40173cada7Devang Patel} // namespace re2 394d77fdba5737ee71b63681160fba5b2fc200583f4Devang Patel