regexp_generator.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Regular expression generator: generates all possible
6// regular expressions within parameters (see regexp_generator.h for details).
7
8// The regexp generator first generates a sequence of commands in a simple
9// postfix language.  Each command in the language is a string,
10// like "a" or "%s*" or "%s|%s".
11//
12// To evaluate a command, enough arguments are popped from the value stack to
13// plug into the %s slots.  Then the result is pushed onto the stack.
14// For example, the command sequence
15//      a b %s%s c
16// results in the stack
17//      ab c
18//
19// GeneratePostfix generates all possible command sequences.
20// Then RunPostfix turns each sequence into a regular expression
21// and passes the regexp to HandleRegexp.
22
23#include <string.h>
24#include <string>
25#include <stack>
26#include <vector>
27#include "util/test.h"
28#include "re2/testing/regexp_generator.h"
29
30namespace re2 {
31
32// Returns a vector of the egrep regexp operators.
33const vector<string>& RegexpGenerator::EgrepOps() {
34  static const char *ops[] = {
35    "%s%s",
36    "%s|%s",
37    "%s*",
38    "%s+",
39    "%s?",
40    "%s\\C*",
41  };
42  static vector<string> v(ops, ops + arraysize(ops));
43  return v;
44}
45
46RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
47                                 const vector<string>& atoms,
48                                 const vector<string>& ops)
49    : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
50  // Degenerate case.
51  if (atoms_.size() == 0)
52    maxatoms_ = 0;
53  if (ops_.size() == 0)
54    maxops_ = 0;
55}
56
57// Generates all possible regular expressions (within the parameters),
58// calling HandleRegexp for each one.
59void RegexpGenerator::Generate() {
60  vector<string> postfix;
61  GeneratePostfix(&postfix, 0, 0, 0);
62}
63
64// Generates random regular expressions, calling HandleRegexp for each one.
65void RegexpGenerator::GenerateRandom(int32 seed, int n) {
66  ACMRandom acm(seed);
67  acm_ = &acm;
68
69  for (int i = 0; i < n; i++) {
70    vector<string> postfix;
71    GenerateRandomPostfix(&postfix, 0, 0, 0);
72  }
73
74  acm_ = NULL;
75}
76
77// Counts and returns the number of occurrences of "%s" in s.
78static int CountArgs(const string& s) {
79  const char *p = s.c_str();
80  int n = 0;
81  while ((p = strstr(p, "%s")) != NULL) {
82    p += 2;
83    n++;
84  }
85  return n;
86}
87
88// Generates all possible postfix command sequences.
89// Each sequence is handed off to RunPostfix to generate a regular expression.
90// The arguments are:
91//   post:  the current postfix sequence
92//   nstk:  the number of elements that would be on the stack after executing
93//          the sequence
94//   ops:   the number of operators used in the sequence
95//   atoms: the number of atoms used in the sequence
96// For example, if post were ["a", "b", "%s%s", "c"],
97// then nstk = 2, ops = 1, atoms = 3.
98//
99// The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
100//
101void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk,
102                                      int ops, int atoms) {
103  if (nstk == 1)
104    RunPostfix(*post);
105
106  // Early out: if used too many operators or can't
107  // get back down to a single expression on the stack
108  // using binary operators, give up.
109  if (ops + nstk - 1 > maxops_)
110    return;
111
112  // Add atoms if there is room.
113  if (atoms < maxatoms_) {
114    for (int i = 0; i < atoms_.size(); i++) {
115      post->push_back(atoms_[i]);
116      GeneratePostfix(post, nstk + 1, ops, atoms + 1);
117      post->pop_back();
118    }
119  }
120
121  // Add operators if there are enough arguments.
122  if (ops < maxops_) {
123    for (int i = 0; i < ops_.size(); i++) {
124      const string& fmt = ops_[i];
125      int nargs = CountArgs(fmt);
126      if (nargs <= nstk) {
127        post->push_back(fmt);
128        GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
129        post->pop_back();
130      }
131    }
132  }
133}
134
135// Generates a random postfix command sequence.
136// Stops and returns true once a single sequence has been generated.
137bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk,
138                                            int ops, int atoms) {
139  for (;;) {
140    // Stop if we get to a single element, but only sometimes.
141    if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) {
142      RunPostfix(*post);
143      return true;
144    }
145
146    // Early out: if used too many operators or can't
147    // get back down to a single expression on the stack
148    // using binary operators, give up.
149    if (ops + nstk - 1 > maxops_)
150      return false;
151
152    // Add operators if there are enough arguments.
153    if (ops < maxops_ && acm_->Uniform(2) == 0) {
154      const string& fmt = ops_[acm_->Uniform(ops_.size())];
155      int nargs = CountArgs(fmt);
156      if (nargs <= nstk) {
157        post->push_back(fmt);
158        bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
159                                         ops + 1, atoms);
160        post->pop_back();
161        if (ret)
162          return true;
163      }
164    }
165
166    // Add atoms if there is room.
167    if (atoms < maxatoms_ && acm_->Uniform(2) == 0) {
168      post->push_back(atoms_[acm_->Uniform(atoms_.size())]);
169      bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
170      post->pop_back();
171      if (ret)
172        return true;
173    }
174  }
175}
176
177// Interprets the postfix command sequence to create a regular expression
178// passed to HandleRegexp.  The results of operators like %s|%s are wrapped
179// in (?: ) to avoid needing to maintain a precedence table.
180void RegexpGenerator::RunPostfix(const vector<string>& post) {
181  stack<string> regexps;
182  for (int i = 0; i < post.size(); i++) {
183    switch (CountArgs(post[i])) {
184      default:
185        LOG(FATAL) << "Bad operator: " << post[i];
186      case 0:
187        regexps.push(post[i]);
188        break;
189      case 1: {
190        string a = regexps.top();
191        regexps.pop();
192        regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
193        break;
194      }
195      case 2: {
196        string b = regexps.top();
197        regexps.pop();
198        string a = regexps.top();
199        regexps.pop();
200        regexps.push("(?:" +
201                     StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
202                     ")");
203        break;
204      }
205    }
206  }
207
208  if (regexps.size() != 1) {
209    // Internal error - should never happen.
210    printf("Bad regexp program:\n");
211    for (int i = 0; i < post.size(); i++) {
212      printf("  %s\n", CEscape(post[i]).c_str());
213    }
214    printf("Stack after running program:\n");
215    while (!regexps.empty()) {
216      printf("  %s\n", CEscape(regexps.top()).c_str());
217      regexps.pop();
218    }
219    LOG(FATAL) << "Bad regexp program.";
220  }
221
222  HandleRegexp(regexps.top());
223  HandleRegexp("^(?:" + regexps.top() + ")$");
224  HandleRegexp("^(?:" + regexps.top() + ")");
225  HandleRegexp("(?:" + regexps.top() + ")$");
226}
227
228// Split s into an vector of strings, one for each UTF-8 character.
229vector<string> Explode(const StringPiece& s) {
230  vector<string> v;
231
232  for (const char *q = s.begin(); q < s.end(); ) {
233    const char* p = q;
234    Rune r;
235    q += chartorune(&r, q);
236    v.push_back(string(p, q - p));
237  }
238
239  return v;
240}
241
242// Split string everywhere a substring is found, returning
243// vector of pieces.
244vector<string> Split(const StringPiece& sep, const StringPiece& s) {
245  vector<string> v;
246
247  if (sep.size() == 0)
248    return Explode(s);
249
250  const char *p = s.begin();
251  for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) {
252    if (StringPiece(q, sep.size()) == sep) {
253      v.push_back(string(p, q - p));
254      p = q + sep.size();
255      q = p - 1;  // -1 for ++ in loop
256      continue;
257    }
258  }
259  if (p < s.end())
260    v.push_back(string(p, s.end() - p));
261  return v;
262}
263
264}  // namespace re2
265