prog.cc revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright 2007 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Compiled regular expression representation.
6// Tested by compile_test.cc
7
8#include "util/util.h"
9#include "util/sparse_set.h"
10#include "re2/prog.h"
11#include "re2/stringpiece.h"
12
13namespace re2 {
14
15// Constructors per Inst opcode
16
17void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
18  DCHECK_EQ(out_opcode_, 0);
19  set_out_opcode(out, kInstAlt);
20  out1_ = out1;
21}
22
23void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
24  DCHECK_EQ(out_opcode_, 0);
25  set_out_opcode(out, kInstByteRange);
26  lo_ = lo & 0xFF;
27  hi_ = hi & 0xFF;
28  foldcase_ = foldcase;
29}
30
31void Prog::Inst::InitCapture(int cap, uint32 out) {
32  DCHECK_EQ(out_opcode_, 0);
33  set_out_opcode(out, kInstCapture);
34  cap_ = cap;
35}
36
37void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
38  DCHECK_EQ(out_opcode_, 0);
39  set_out_opcode(out, kInstEmptyWidth);
40  empty_ = empty;
41}
42
43void Prog::Inst::InitMatch(int32 id) {
44  DCHECK_EQ(out_opcode_, 0);
45  set_opcode(kInstMatch);
46  match_id_ = id;
47}
48
49void Prog::Inst::InitNop(uint32 out) {
50  DCHECK_EQ(out_opcode_, 0);
51  set_opcode(kInstNop);
52}
53
54void Prog::Inst::InitFail() {
55  DCHECK_EQ(out_opcode_, 0);
56  set_opcode(kInstFail);
57}
58
59string Prog::Inst::Dump() {
60  switch (opcode()) {
61    default:
62      return StringPrintf("opcode %d", static_cast<int>(opcode()));
63
64    case kInstAlt:
65      return StringPrintf("alt -> %d | %d", out(), out1_);
66
67    case kInstAltMatch:
68      return StringPrintf("altmatch -> %d | %d", out(), out1_);
69
70    case kInstByteRange:
71      return StringPrintf("byte%s [%02x-%02x] -> %d",
72                          foldcase_ ? "/i" : "",
73                          lo_, hi_, out());
74
75    case kInstCapture:
76      return StringPrintf("capture %d -> %d", cap_, out());
77
78    case kInstEmptyWidth:
79      return StringPrintf("emptywidth %#x -> %d",
80                          static_cast<int>(empty_), out());
81
82    case kInstMatch:
83      return StringPrintf("match! %d", match_id());
84
85    case kInstNop:
86      return StringPrintf("nop -> %d", out());
87
88    case kInstFail:
89      return StringPrintf("fail");
90  }
91}
92
93Prog::Prog()
94  : anchor_start_(false),
95    anchor_end_(false),
96    reversed_(false),
97    did_onepass_(false),
98    start_(0),
99    start_unanchored_(0),
100    size_(0),
101    byte_inst_count_(0),
102    bytemap_range_(0),
103    flags_(0),
104    onepass_statesize_(0),
105    inst_(NULL),
106    dfa_first_(NULL),
107    dfa_longest_(NULL),
108    dfa_mem_(0),
109    delete_dfa_(NULL),
110    unbytemap_(NULL),
111    onepass_nodes_(NULL),
112    onepass_start_(NULL) {
113}
114
115Prog::~Prog() {
116  if (delete_dfa_) {
117    if (dfa_first_)
118      delete_dfa_(dfa_first_);
119    if (dfa_longest_)
120      delete_dfa_(dfa_longest_);
121  }
122  delete[] onepass_nodes_;
123  delete[] inst_;
124  delete[] unbytemap_;
125}
126
127typedef SparseSet Workq;
128
129static inline void AddToQueue(Workq* q, int id) {
130  if (id != 0)
131    q->insert(id);
132}
133
134static string ProgToString(Prog* prog, Workq* q) {
135  string s;
136
137  for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
138    int id = *i;
139    Prog::Inst* ip = prog->inst(id);
140    StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
141    AddToQueue(q, ip->out());
142    if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
143      AddToQueue(q, ip->out1());
144  }
145  return s;
146}
147
148string Prog::Dump() {
149  string map;
150  if (false) {  // Debugging
151    int lo = 0;
152    StringAppendF(&map, "byte map:\n");
153    for (int i = 0; i < bytemap_range_; i++) {
154      StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
155      lo = unbytemap_[i] + 1;
156    }
157    StringAppendF(&map, "\n");
158  }
159
160  Workq q(size_);
161  AddToQueue(&q, start_);
162  return map + ProgToString(this, &q);
163}
164
165string Prog::DumpUnanchored() {
166  Workq q(size_);
167  AddToQueue(&q, start_unanchored_);
168  return ProgToString(this, &q);
169}
170
171static bool IsMatch(Prog*, Prog::Inst*);
172
173// Peep-hole optimizer.
174void Prog::Optimize() {
175  Workq q(size_);
176
177  // Eliminate nops.  Most are taken out during compilation
178  // but a few are hard to avoid.
179  q.clear();
180  AddToQueue(&q, start_);
181  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
182    int id = *i;
183
184    Inst* ip = inst(id);
185    int j = ip->out();
186    Inst* jp;
187    while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
188      j = jp->out();
189    }
190    ip->set_out(j);
191    AddToQueue(&q, ip->out());
192
193    if (ip->opcode() == kInstAlt) {
194      j = ip->out1();
195      while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
196        j = jp->out();
197      }
198      ip->out1_ = j;
199      AddToQueue(&q, ip->out1());
200    }
201  }
202
203  // Insert kInstAltMatch instructions
204  // Look for
205  //   ip: Alt -> j | k
206  //	  j: ByteRange [00-FF] -> ip
207  //    k: Match
208  // or the reverse (the above is the greedy one).
209  // Rewrite Alt to AltMatch.
210  q.clear();
211  AddToQueue(&q, start_);
212  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
213    int id = *i;
214    Inst* ip = inst(id);
215    AddToQueue(&q, ip->out());
216    if (ip->opcode() == kInstAlt)
217      AddToQueue(&q, ip->out1());
218
219    if (ip->opcode() == kInstAlt) {
220      Inst* j = inst(ip->out());
221      Inst* k = inst(ip->out1());
222      if (j->opcode() == kInstByteRange && j->out() == id &&
223          j->lo() == 0x00 && j->hi() == 0xFF &&
224          IsMatch(this, k)) {
225        ip->set_opcode(kInstAltMatch);
226        continue;
227      }
228      if (IsMatch(this, j) &&
229          k->opcode() == kInstByteRange && k->out() == id &&
230          k->lo() == 0x00 && k->hi() == 0xFF) {
231        ip->set_opcode(kInstAltMatch);
232      }
233    }
234  }
235}
236
237// Is ip a guaranteed match at end of text, perhaps after some capturing?
238static bool IsMatch(Prog* prog, Prog::Inst* ip) {
239  for (;;) {
240    switch (ip->opcode()) {
241      default:
242        LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
243        return false;
244
245      case kInstAlt:
246      case kInstAltMatch:
247      case kInstByteRange:
248      case kInstFail:
249      case kInstEmptyWidth:
250        return false;
251
252      case kInstCapture:
253      case kInstNop:
254        ip = prog->inst(ip->out());
255        break;
256
257      case kInstMatch:
258        return true;
259    }
260  }
261}
262
263uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
264  int flags = 0;
265
266  // ^ and \A
267  if (p == text.begin())
268    flags |= kEmptyBeginText | kEmptyBeginLine;
269  else if (p[-1] == '\n')
270    flags |= kEmptyBeginLine;
271
272  // $ and \z
273  if (p == text.end())
274    flags |= kEmptyEndText | kEmptyEndLine;
275  else if (p < text.end() && p[0] == '\n')
276    flags |= kEmptyEndLine;
277
278  // \b and \B
279  if (p == text.begin() && p == text.end()) {
280    // no word boundary here
281  } else if (p == text.begin()) {
282    if (IsWordChar(p[0]))
283      flags |= kEmptyWordBoundary;
284  } else if (p == text.end()) {
285    if (IsWordChar(p[-1]))
286      flags |= kEmptyWordBoundary;
287  } else {
288    if (IsWordChar(p[-1]) != IsWordChar(p[0]))
289      flags |= kEmptyWordBoundary;
290  }
291  if (!(flags & kEmptyWordBoundary))
292    flags |= kEmptyNonWordBoundary;
293
294  return flags;
295}
296
297void Prog::MarkByteRange(int lo, int hi) {
298  CHECK_GE(lo, 0);
299  CHECK_GE(hi, 0);
300  CHECK_LE(lo, 255);
301  CHECK_LE(hi, 255);
302  if (lo > 0)
303    byterange_.Set(lo - 1);
304  byterange_.Set(hi);
305}
306
307void Prog::ComputeByteMap() {
308  // Fill in bytemap with byte classes for prog_.
309  // Ranges of bytes that are treated as indistinguishable
310  // by the regexp program are mapped to a single byte class.
311  // The vector prog_->byterange() marks the end of each
312  // such range.
313  const Bitmap<256>& v = byterange();
314
315  COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
316  uint8 n = 0;
317  uint32 bits = 0;
318  for (int i = 0; i < 256; i++) {
319    if ((i&31) == 0)
320      bits = v.Word(i >> 5);
321    bytemap_[i] = n;
322    n += bits & 1;
323    bits >>= 1;
324  }
325  bytemap_range_ = bytemap_[255] + 1;
326  unbytemap_ = new uint8[bytemap_range_];
327  for (int i = 0; i < 256; i++)
328    unbytemap_[bytemap_[i]] = i;
329
330  if (0) {  // For debugging: use trivial byte map.
331    for (int i = 0; i < 256; i++) {
332      bytemap_[i] = i;
333      unbytemap_[i] = i;
334    }
335    bytemap_range_ = 256;
336    LOG(INFO) << "Using trivial bytemap.";
337  }
338}
339
340}  // namespace re2
341
342