simplify.cc revision 2ee91b4af4353b9e6a9d591c32fedfc58fd4ef35
10a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// Copyright 2006 The RE2 Authors.  All Rights Reserved.
20a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// Use of this source code is governed by a BSD-style
30a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// license that can be found in the LICENSE file.
40a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel
50a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// Rewrite POSIX and other features in re
60a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// to use simple extended regular expression features.
70a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel// Also sort and simplify character classes.
80a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel
90a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel#include "util/util.h"
100a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel#include "re2/regexp.h"
110a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel#include "re2/walker-inl.h"
120a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel
130a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patelnamespace re2 {
140a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel
1537da0ad07bfd5e3856ce5d182b27b4c499c03c9aChris Lattner// Parses the regexp src and then simplifies it and sets *dst to the
16647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson// string representation of the simplified form.  Returns true on success.
17f457d1316dec017cf204b54524878310c356bf64Devang Patel// Returns false and sets *error (if error != NULL) on error.
18937b1e92a9862862722732cb0f72d5ade77ac4c8Devang Patelbool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
199d89df169027035bf692b3d397bb93cff7bdc860Devang Patel                            string* dst,
209d89df169027035bf692b3d397bb93cff7bdc860Devang Patel                            RegexpStatus* status) {
21d77fdba5737ee71b63681160fba5b2fc200583f4Devang Patel  Regexp* re = Parse(src, flags, status);
2228bc9d88260a3e153ead4311c9129e3d3ad07736Devang Patel  if (re == NULL)
2337da0ad07bfd5e3856ce5d182b27b4c499c03c9aChris Lattner    return false;
240a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel  Regexp* sre = re->Simplify();
250a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel  re->Decref();
260a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel  if (sre == NULL) {
27b2a33b46469a6d2c0e61122002079efb7d6d3f9cChris Lattner    // Should not happen, since Simplify never fails.
28647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson    LOG(ERROR) << "Simplify failed on " << src;
295e9cd434234a36089daeee915f1dc02b96947fbaChris Lattner    if (status) {
305e9cd434234a36089daeee915f1dc02b96947fbaChris Lattner      status->set_code(kRegexpInternalError);
31bc5201f8371f9041e79efcca3b158335da5c2604Devang Patel      status->set_error_arg(src);
325e9cd434234a36089daeee915f1dc02b96947fbaChris Lattner    }
3349b63a1f2c493f6452d22cf0df4c147e0539fb2eDevang Patel    return false;
34647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson  }
350d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov  *dst = sre->ToString();
36647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson  sre->Decref();
37647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson  return true;
382353122a3c8f735deb1cb3c9c61785280923e09cNick Lewycky}
392353122a3c8f735deb1cb3c9c61785280923e09cNick Lewycky
40647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson// Assuming the simple_ flags on the children are accurate,
41647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson// is this Regexp* simple?
42647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Andersonbool Regexp::ComputeSimple() {
435d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner  Regexp** subs;
44c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner  switch (op_) {
45c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner    case kRegexpNoMatch:
465d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner    case kRegexpEmptyMatch:
47c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner    case kRegexpLiteral:
485d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner    case kRegexpLiteralString:
49c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner    case kRegexpBeginLine:
50c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner    case kRegexpEndLine:
515d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner    case kRegexpBeginText:
525d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner    case kRegexpWordBoundary:
530d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov    case kRegexpNoWordBoundary:
54b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner    case kRegexpEndText:
551d880e51142566838131b091306dd03549adb6d2Chris Lattner    case kRegexpAnyChar:
561d880e51142566838131b091306dd03549adb6d2Chris Lattner    case kRegexpAnyByte:
570d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov    case kRegexpHaveMatch:
58c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      return true;
59c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner    case kRegexpConcat:
60c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner    case kRegexpAlternate:
61c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      // These are simple as long as the subpieces are simple.
62c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      subs = sub();
63c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      for (int i = 0; i < nsub_; i++)
645d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner        if (!subs[i]->simple_)
655d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner          return false;
66c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      return true;
67c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner    case kRegexpCharClass:
685d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner      // Simple as long as the char class is not empty, not full.
695d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner      if (ccb_ != NULL)
70c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner        return !ccb_->empty() && !ccb_->full();
71c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      return !cc_->empty() && !cc_->full();
72c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner    case kRegexpCapture:
73c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      subs = sub();
74c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      return subs[0]->simple_;
75b2a33b46469a6d2c0e61122002079efb7d6d3f9cChris Lattner    case kRegexpStar:
760a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel    case kRegexpPlus:
77c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner    case kRegexpQuest:
785d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner      subs = sub();
79b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner      if (!subs[0]->simple_)
805d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner        return false;
815d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner      switch (subs[0]->op_) {
825d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner        case kRegexpStar:
83b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner        case kRegexpPlus:
84b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner        case kRegexpQuest:
85b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner        case kRegexpEmptyMatch:
86b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner        case kRegexpNoMatch:
87bc5201f8371f9041e79efcca3b158335da5c2604Devang Patel          return false;
88b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner        default:
890d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov          break;
90b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner      }
91b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner      return true;
92b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner    case kRegexpRepeat:
93b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner      return false;
945d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner  }
95b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
965d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner  return false;
97b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner}
98b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner
99b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner// Walker subclass used by Simplify.
100f2410180d07cb1ca074ad2a9b8538bc64895e68bChris Lattner// The simplify walk is purely post-recursive: given the simplified children,
101f2410180d07cb1ca074ad2a9b8538bc64895e68bChris Lattner// PostVisit creates the simplified result.
1020d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov// The child_args are simplified Regexp*s.
103b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattnerclass SimplifyWalker : public Regexp::Walker<Regexp*> {
1046f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin public:
1056f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin  SimplifyWalker() {}
1066f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
1076f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin  virtual Regexp* PostVisit(Regexp* re,
10854a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner                            Regexp* parent_arg,
10954a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner                            Regexp* pre_arg,
1100d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov                            Regexp** child_args, int nchild_args);
111b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner  virtual Regexp* Copy(Regexp* re);
1125d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
113b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner
1145d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner private:
115f2410180d07cb1ca074ad2a9b8538bc64895e68bChris Lattner  // These functions are declared inside SimplifyWalker so that
116f2410180d07cb1ca074ad2a9b8538bc64895e68bChris Lattner  // they can edit the private fields of the Regexps they construct.
1178fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez
1186f8694b272ffbd5b4746feccd4866bf89794fe86Victor Hernandez  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
1198fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez  // Caller must Decref return value when done with it.
1208fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
1218fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez
122d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner  // Simplifies the expression re{min,max} in terms of *, +, and ?.
123d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
124d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner  // Caller must Decref return value when done with it.
125d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
1268fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez                                Regexp::ParseFlags parse_flags);
1278fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez
1288fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez  // Simplifies a character class by expanding any named classes
129e05f66ef2e37bcbd88acaeef2bf14ddb1f04488fVictor Hernandez  // into rune ranges.  Does not edit re.  Does not consume ref to re.
1308fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez  // Caller must Decref return value when done with it.
131d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner  static Regexp* SimplifyCharClass(Regexp* re);
132e05f66ef2e37bcbd88acaeef2bf14ddb1f04488fVictor Hernandez
133d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner  DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
134c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez};
1358fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez
136d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// Simplifies a regular expression, returning a new regexp.
137d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// The new regexp uses traditional Unix egrep features only,
138d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// plus the Perl (?:) non-capturing parentheses.
139d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// Otherwise, no POSIX or Perl additions.  The new regexp
14054630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez// captures exactly the same subexpressions (with the same indices)
141d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// as the original.
142d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// Does not edit current object.
143d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner// Caller must Decref() return value when done with it.
144d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner
145c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor HernandezRegexp* Regexp::Simplify() {
146c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez  if (simple_)
147c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez    return Incref();
14854630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez  SimplifyWalker w;
149c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez  return w.Walk(this, NULL);
150c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez}
151c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez
1526cead7879aa8902905809e3853d31093a5511c2fVictor Hernandez#define Simplify DontCallSimplify  // Avoid accidental recursion
1536cead7879aa8902905809e3853d31093a5511c2fVictor Hernandez
1548fffff537194e2375e65600f27d716c99f0eb38aVictor HernandezRegexp* SimplifyWalker::Copy(Regexp* re) {
15554630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez  return re->Incref();
15654630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez}
15754630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez
158c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor HernandezRegexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
15954630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez  // This should never be called, since we use Walk and not
16054630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez  // WalkExponential.
1618fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
1629520cc2eae199f8974d5ed4f89ec43468be8f128Chandler Carruth  return re->Incref();
163d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner}
164d3a6d90da74a59150ad781feb7cae0406c13e324Chris Lattner
1656f8694b272ffbd5b4746feccd4866bf89794fe86Victor HernandezRegexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
1666f8694b272ffbd5b4746feccd4866bf89794fe86Victor Hernandez  if (re->simple_) {
1679520cc2eae199f8974d5ed4f89ec43468be8f128Chandler Carruth    *stop = true;
1688fffff537194e2375e65600f27d716c99f0eb38aVictor Hernandez    return re->Incref();
16954630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez  }
17054630e1cef710e65751e2147cbdd2e018292c435Victor Hernandez  return NULL;
171c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez}
172c7650b4d1907becbf2ed112074fd2c4a6f174aa8Victor Hernandez
173b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris LattnerRegexp* SimplifyWalker::PostVisit(Regexp* re,
174b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner                                  Regexp* parent_arg,
175b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner                                  Regexp* pre_arg,
176b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner                                  Regexp** child_args,
177b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner                                  int nchild_args) {
178b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner  switch (re->op()) {
1790a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel    case kRegexpNoMatch:
1800a9f7b9c3ebe7d0ec033462e1a7c9101279956f9Devang Patel    case kRegexpEmptyMatch:
18124e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez    case kRegexpLiteral:
182e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpLiteralString:
183e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpBeginLine:
1845f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel    case kRegexpEndLine:
185e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpBeginText:
186e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpWordBoundary:
187e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpNoWordBoundary:
188e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpEndText:
189e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpAnyChar:
190e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpAnyByte:
191e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpHaveMatch:
192e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      // All these are always simple.
193e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      re->simple_ = true;
194e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      return re->Incref();
19524e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez
19624e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez    case kRegexpConcat:
197e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpAlternate: {
198e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      // These are simple as long as the subpieces are simple.
199e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      // Two passes to avoid allocation in the common case.
200e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      bool changed = false;
201e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      Regexp** subs = re->sub();
202e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      for (int i = 0; i < re->nsub_; i++) {
203e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez        Regexp* sub = subs[i];
204e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez        Regexp* newsub = child_args[i];
20524e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez        if (newsub != sub) {
206f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel          changed = true;
207f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel          break;
208f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel        }
209f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel      }
210f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel      if (!changed) {
211f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel        for (int i = 0; i < re->nsub_; i++) {
212f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel          Regexp* newsub = child_args[i];
213f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel          newsub->Decref();
214f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel        }
215f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel        re->simple_ = true;
216f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel        return re->Incref();
217f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel      }
218f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel      Regexp* nre = new Regexp(re->op(), re->parse_flags());
219f906cb933e4daa4b77c27941365b79cca1b697e9Devang Patel      nre->AllocSub(re->nsub_);
220e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      Regexp** nre_subs = nre->sub();
221e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      for (int i = 0; i <re->nsub_; i++)
222e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez        nre_subs[i] = child_args[i];
223e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      nre->simple_ = true;
224e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      return nre;
225e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    }
2260d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov
2275f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel    case kRegexpCapture: {
228647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson      Regexp* newsub = child_args[0];
229647e3016de18d2fc8b0f233a0b356809e3fdcc54Owen Anderson      if (newsub == re->sub()[0]) {
23024e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez        newsub->Decref();
23124e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez        re->simple_ = true;
23224e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez        return re->Incref();
23324e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez      }
234e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
23524e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez      nre->AllocSub(1);
23624e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez      nre->sub()[0] = newsub;
23724e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez      nre->cap_ = re->cap_;
23824e64df7ec25b55aa872c2ef33728dfbb8c353c4Victor Hernandez      nre->simple_ = true;
239e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez      return nre;
240e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    }
241e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez
242e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpStar:
243e685f230b6f4f7cd65f45e07cbbbcaa4d66a1ceaVictor Hernandez    case kRegexpPlus:
2445d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner    case kRegexpQuest: {
2455d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner      Regexp* newsub = child_args[0];
246b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner      // Special case: repeat the empty string as much as
247c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      // you want, but it's still the empty string.
248c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner      if (newsub->op() == kRegexpEmptyMatch)
249b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner        return newsub;
2505d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner
2515d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner      // These are simple as long as the subpiece is simple.
252ea7b6bb32355c0d4cb1e1c16df7c8cf63787e73bDevang Patel      if (newsub == re->sub()[0]) {
253b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner        newsub->Decref();
254c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner        re->simple_ = true;
2556f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin        return re->Incref();
2566f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin      }
2576f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin
2586f555ca2cd0bba50542adcbb131f541ae70d34cdJeffrey Yasskin      // These are also idempotent if flags are constant.
25906fdaccc89d9abdc7e03797b25173791a2f5692fDevang Patel      if (re->op() == newsub->op() &&
260c5e08a963973f43c10869cd249c0718d307dd031Chris Lattner          re->parse_flags() == newsub->parse_flags())
2615d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner        return newsub;
2625d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner
263df58389ef1fb8373d900975a6a66e4fbe0344f83Chris Lattner      Regexp* nre = new Regexp(re->op(), re->parse_flags());
2640d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov      nre->AllocSub(1);
265df58389ef1fb8373d900975a6a66e4fbe0344f83Chris Lattner      nre->sub()[0] = newsub;
2665f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel      nre->simple_ = true;
2675f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel      return nre;
26854a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner    }
269b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner
27054a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner    case kRegexpRepeat: {
27154a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner      Regexp* newsub = child_args[0];
27254a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner      // Special case: repeat the empty string as much as
27354a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner      // you want, but it's still the empty string.
2740d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov      if (newsub->op() == kRegexpEmptyMatch)
275496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner        return newsub;
276496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner
277496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
278496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner                                   re->parse_flags());
279496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner      newsub->Decref();
280df58389ef1fb8373d900975a6a66e4fbe0344f83Chris Lattner      nre->simple_ = true;
28154a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner      return nre;
28254a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner    }
28354a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner
28454a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner    case kRegexpCharClass: {
28554a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner      Regexp* nre = SimplifyCharClass(re);
28654a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner      nre->simple_ = true;
28754a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner      return nre;
2880d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov    }
28954a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner  }
29054a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner
29154a1f9f9c1e2cf6f4541e998b20f5c7cfbe642d9Chris Lattner  LOG(ERROR) << "Simplify case not handled: " << re->op();
2925f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel  return re->Incref();
2935f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel}
2945f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel
2955f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
2965f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel// Returns a new Regexp, handing the ref to the caller.
2975f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang PatelRegexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
2985f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel                                Regexp::ParseFlags parse_flags) {
299b76359e36e75dfe16c5153c3cac903efbb2cd8d7Chris Lattner  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
300496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner  re->AllocSub(2);
301496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner  Regexp** subs = re->sub();
3025f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel  subs[0] = re1;
3035f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel  subs[1] = re2;
304496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner  return re;
305496425774e8adbc526e298f91f4c6980177c1d7fChris Lattner}
3065f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel
3075f4ac848d94b0a92e19ac7f2b3d0284d7d323173Devang Patel// Simplifies the expression re{min,max} in terms of *, +, and ?.
308f457d1316dec017cf204b54524878310c356bf64Devang Patel// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
309b2a33b46469a6d2c0e61122002079efb7d6d3f9cChris Lattner// Caller must Decref return value when done with it.
310f457d1316dec017cf204b54524878310c356bf64Devang Patel// The result will *not* necessarily have the right capturing parens
31126028f27ddd132b3284943e11aca130c2911abc4Devang Patel// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
31226028f27ddd132b3284943e11aca130c2911abc4Devang Patel// but in the Regexp* representation, both (x) are marked as $1.
31326028f27ddd132b3284943e11aca130c2911abc4Devang PatelRegexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
31426028f27ddd132b3284943e11aca130c2911abc4Devang Patel                                       Regexp::ParseFlags f) {
31526028f27ddd132b3284943e11aca130c2911abc4Devang Patel  // x{n,} means at least n matches of x.
31626028f27ddd132b3284943e11aca130c2911abc4Devang Patel  if (max == -1) {
31726028f27ddd132b3284943e11aca130c2911abc4Devang Patel    // Special case: x{0,} is x*
31826028f27ddd132b3284943e11aca130c2911abc4Devang Patel    if (min == 0)
31926028f27ddd132b3284943e11aca130c2911abc4Devang Patel      return Regexp::Star(re->Incref(), f);
32026028f27ddd132b3284943e11aca130c2911abc4Devang Patel
32126028f27ddd132b3284943e11aca130c2911abc4Devang Patel    // Special case: x{1,} is x+
32226028f27ddd132b3284943e11aca130c2911abc4Devang Patel    if (min == 1)
32326028f27ddd132b3284943e11aca130c2911abc4Devang Patel      return Regexp::Plus(re->Incref(), f);
32426028f27ddd132b3284943e11aca130c2911abc4Devang Patel
32526028f27ddd132b3284943e11aca130c2911abc4Devang Patel    // General case: x{4,} is xxxx+
32626028f27ddd132b3284943e11aca130c2911abc4Devang Patel    Regexp* nre = new Regexp(kRegexpConcat, f);
32726028f27ddd132b3284943e11aca130c2911abc4Devang Patel    nre->AllocSub(min);
32826028f27ddd132b3284943e11aca130c2911abc4Devang Patel    VLOG(1) << "Simplify " << min;
32926028f27ddd132b3284943e11aca130c2911abc4Devang Patel    Regexp** nre_subs = nre->sub();
33026028f27ddd132b3284943e11aca130c2911abc4Devang Patel    for (int i = 0; i < min-1; i++)
3313e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel      nre_subs[i] = re->Incref();
3323e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
33357109697282c6dffd04e2e275606352914217114Chris Lattner    return nre;
33457109697282c6dffd04e2e275606352914217114Chris Lattner  }
335d77fdba5737ee71b63681160fba5b2fc200583f4Devang Patel
3360d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov  // Special case: (x){0} matches only empty string.
3375316bf0252ce9797e90a233645b6580695403837Devang Patel  if (min == 0 && max == 0)
3388fba578be72dc98497508dec053e966858571f6aDevang Patel    return new Regexp(kRegexpEmptyMatch, f);
33928bc9d88260a3e153ead4311c9129e3d3ad07736Devang Patel
3403e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel  // Special case: x{1} is just x.
3410d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov  if (min == 1 && max == 1)
3423e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel    return re->Incref();
34349708ad993529611cedfbe49ae44bb10beb73abeDevang Patel
3443e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel  // General case: x{n,m} means n copies of x and m copies of x?.
34549708ad993529611cedfbe49ae44bb10beb73abeDevang Patel  // The machine will do less work if we nest the final m copies,
34626028f27ddd132b3284943e11aca130c2911abc4Devang Patel  // so that x{2,5} = xx(x(x(x)?)?)?
3475316bf0252ce9797e90a233645b6580695403837Devang Patel
348f457d1316dec017cf204b54524878310c356bf64Devang Patel  // Build leading prefix: xx.  Capturing only on the last one.
349a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel  Regexp* nre = NULL;
350ab67e705f59d567afded845465f358b8a66ab62eDevang Patel  if (min > 0) {
351b2a33b46469a6d2c0e61122002079efb7d6d3f9cChris Lattner    nre = new Regexp(kRegexpConcat, f);
3523e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel    nre->AllocSub(min);
3535d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner    Regexp** nre_subs = nre->sub();
3540d75d874bfa3cee9fbe907d8aa0bab61556e3d0eMikhail Glushenkov    for (int i = 0; i < min; i++)
3555d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner      nre_subs[i] = re->Incref();
3565d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner  }
3571d0be15f89cb5056e20e2d24faa8d6afb1573bcaOwen Anderson
3581d0be15f89cb5056e20e2d24faa8d6afb1573bcaOwen Anderson  // Build and attach suffix: (x(x(x)?)?)?
359ab67e705f59d567afded845465f358b8a66ab62eDevang Patel  if (max > min) {
360ab67e705f59d567afded845465f358b8a66ab62eDevang Patel    Regexp* suf = Regexp::Quest(re->Incref(), f);
36157109697282c6dffd04e2e275606352914217114Chris Lattner    for (int i = min+1; i < max; i++)
36257109697282c6dffd04e2e275606352914217114Chris Lattner      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
36357109697282c6dffd04e2e275606352914217114Chris Lattner    if (nre == NULL)
36457109697282c6dffd04e2e275606352914217114Chris Lattner      nre = suf;
36557109697282c6dffd04e2e275606352914217114Chris Lattner    else
3665d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner      nre = Concat2(nre, suf, f);
3675d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner  }
36857109697282c6dffd04e2e275606352914217114Chris Lattner
36957109697282c6dffd04e2e275606352914217114Chris Lattner  if (nre == NULL) {
37057109697282c6dffd04e2e275606352914217114Chris Lattner    // Some degenerate case, like min > max, or min < max < 0.
3715d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner    // This shouldn't happen, because the parser rejects such regexps.
3723e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
3735d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner    return new Regexp(kRegexpNoMatch, f);
3743e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel  }
37557109697282c6dffd04e2e275606352914217114Chris Lattner
37657109697282c6dffd04e2e275606352914217114Chris Lattner  return nre;
3775d0cacdbb6577f2449986f345858db17dc1bcf59Chris Lattner}
3783e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel
3793e30c2a3c54c50246e6cccf0c8842619e29fe66cDevang Patel// Simplifies a character class.
38057109697282c6dffd04e2e275606352914217114Chris Lattner// Caller must Decref return value when done with it.
38157109697282c6dffd04e2e275606352914217114Chris LattnerRegexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
382a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel  CharClass* cc = re->cc();
383a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel
384a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel  // Special cases
385a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel  if (cc->empty())
386a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel    return new Regexp(kRegexpNoMatch, re->parse_flags());
387a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel  if (cc->full())
388a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel    return new Regexp(kRegexpAnyChar, re->parse_flags());
389a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel
39057109697282c6dffd04e2e275606352914217114Chris Lattner  return re->Incref();
391a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel}
392a82f8838c60b7a3b240e185983dacb3291396f3eDevang Patel
3930386f01e061513094504bc11f8352a40173cada7Devang Patel}  // namespace re2
394d77fdba5737ee71b63681160fba5b2fc200583f4Devang Patel