1// Copyright 2007 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Compile regular expression to Prog.
6//
7// Prog and Inst are defined in prog.h.
8// This file's external interface is just Regexp::CompileToProg.
9// The Compiler class defined in this file is private.
10
11#include "re2/prog.h"
12#include "re2/re2.h"
13#include "re2/regexp.h"
14#include "re2/walker-inl.h"
15
16namespace re2 {
17
18// List of pointers to Inst* that need to be filled in (patched).
19// Because the Inst* haven't been filled in yet,
20// we can use the Inst* word to hold the list's "next" pointer.
21// It's kind of sleazy, but it works well in practice.
22// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
23//
24// Because the out and out1 fields in Inst are no longer pointers,
25// we can't use pointers directly here either.  Instead, p refers
26// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
27// p == 0 represents the NULL list.  This is okay because instruction #0
28// is always the fail instruction, which never appears on a list.
29
30struct PatchList {
31  uint32 p;
32
33  // Returns patch list containing just p.
34  static PatchList Mk(uint32 p);
35
36  // Patches all the entries on l to have value v.
37  // Caller must not ever use patch list again.
38  static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
39
40  // Deref returns the next pointer pointed at by p.
41  static PatchList Deref(Prog::Inst *inst0, PatchList l);
42
43  // Appends two patch lists and returns result.
44  static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
45};
46
47static PatchList nullPatchList = { 0 };
48
49// Returns patch list containing just p.
50PatchList PatchList::Mk(uint32 p) {
51  PatchList l;
52  l.p = p;
53  return l;
54}
55
56// Returns the next pointer pointed at by l.
57PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
58  Prog::Inst* ip = &inst0[l.p>>1];
59  if (l.p&1)
60    l.p = ip->out1();
61  else
62    l.p = ip->out();
63  return l;
64}
65
66// Patches all the entries on l to have value v.
67void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
68  while (l.p != 0) {
69    Prog::Inst* ip = &inst0[l.p>>1];
70    if (l.p&1) {
71      l.p = ip->out1();
72      ip->out1_ = val;
73    } else {
74      l.p = ip->out();
75      ip->set_out(val);
76    }
77  }
78}
79
80// Appends two patch lists and returns result.
81PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
82  if (l1.p == 0)
83    return l2;
84  if (l2.p == 0)
85    return l1;
86
87  PatchList l = l1;
88  for (;;) {
89    PatchList next = PatchList::Deref(inst0, l);
90    if (next.p == 0)
91      break;
92    l = next;
93  }
94
95  Prog::Inst* ip = &inst0[l.p>>1];
96  if (l.p&1)
97    ip->out1_ = l2.p;
98  else
99    ip->set_out(l2.p);
100
101  return l1;
102}
103
104// Compiled program fragment.
105struct Frag {
106  uint32 begin;
107  PatchList end;
108
109  Frag() : begin(0) { end.p = 0; }  // needed so Frag can go in vector
110  Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
111};
112
113static Frag kNullFrag;
114
115// Input encodings.
116enum Encoding {
117  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
118  kEncodingLatin1,    // Latin1 (0-FF)
119};
120
121class Compiler : public Regexp::Walker<Frag> {
122 public:
123  explicit Compiler();
124  ~Compiler();
125
126  // Compiles Regexp to a new Prog.
127  // Caller is responsible for deleting Prog when finished with it.
128  // If reversed is true, compiles for walking over the input
129  // string backward (reverses all concatenations).
130  static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
131
132  // Compiles alternation of all the re to a new Prog.
133  // Each re has a match with an id equal to its index in the vector.
134  static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
135                          Regexp* re);
136
137  // Interface for Regexp::Walker, which helps traverse the Regexp.
138  // The walk is purely post-recursive: given the machines for the
139  // children, PostVisit combines them to create the machine for
140  // the current node.  The child_args are Frags.
141  // The Compiler traverses the Regexp parse tree, visiting
142  // each node in depth-first order.  It invokes PreVisit before
143  // visiting the node's children and PostVisit after visiting
144  // the children.
145  Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
146  Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
147                 int nchild_args);
148  Frag ShortVisit(Regexp* re, Frag parent_arg);
149  Frag Copy(Frag arg);
150
151  // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
152  Frag Plus(Frag a, bool nongreedy);
153  Frag Star(Frag a, bool nongreedy);
154  Frag Quest(Frag a, bool nongreedy);
155
156  // Given fragment a, returns (a) capturing as \n.
157  Frag Capture(Frag a, int n);
158
159  // Given fragments a and b, returns ab; a|b
160  Frag Cat(Frag a, Frag b);
161  Frag Alt(Frag a, Frag b);
162
163  // Returns a fragment that can't match anything.
164  Frag NoMatch();
165
166  // Returns a fragment that matches the empty string.
167  Frag Match(int32 id);
168
169  // Returns a no-op fragment.
170  Frag Nop();
171
172  // Returns a fragment matching the byte range lo-hi.
173  Frag ByteRange(int lo, int hi, bool foldcase);
174
175  // Returns a fragment matching an empty-width special op.
176  Frag EmptyWidth(EmptyOp op);
177
178  // Adds n instructions to the program.
179  // Returns the index of the first one.
180  // Returns -1 if no more instructions are available.
181  int AllocInst(int n);
182
183  // Deletes unused instructions.
184  void Trim();
185
186  // Rune range compiler.
187
188  // Begins a new alternation.
189  void BeginRange();
190
191  // Adds a fragment matching the rune range lo-hi.
192  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
193  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
194  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
195  void Add_80_10ffff();
196
197  // New suffix that matches the byte range lo-hi, then goes to next.
198  int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
199  int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
200
201  // Adds a suffix to alternation.
202  void AddSuffix(int id);
203
204  // Returns the alternation of all the added suffixes.
205  Frag EndRange();
206
207  // Single rune.
208  Frag Literal(Rune r, bool foldcase);
209
210  void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
211  Prog* Finish();
212
213  // Returns .* where dot = any byte
214  Frag DotStar();
215
216 private:
217  Prog* prog_;         // Program being built.
218  bool failed_;        // Did we give up compiling?
219  Encoding encoding_;  // Input encoding
220  bool reversed_;      // Should program run backward over text?
221
222  int max_inst_;       // Maximum number of instructions.
223
224  Prog::Inst* inst_;   // Pointer to first instruction.
225  int inst_len_;       // Number of instructions used.
226  int inst_cap_;       // Number of instructions allocated.
227
228  int64 max_mem_;      // Total memory budget.
229
230  map<uint64, int> rune_cache_;
231  Frag rune_range_;
232
233  RE2::Anchor anchor_;  // anchor mode for RE2::Set
234
235  DISALLOW_EVIL_CONSTRUCTORS(Compiler);
236};
237
238Compiler::Compiler() {
239  prog_ = new Prog();
240  failed_ = false;
241  encoding_ = kEncodingUTF8;
242  reversed_ = false;
243  inst_ = NULL;
244  inst_len_ = 0;
245  inst_cap_ = 0;
246  max_inst_ = 1;  // make AllocInst for fail instruction okay
247  max_mem_ = 0;
248  int fail = AllocInst(1);
249  inst_[fail].InitFail();
250  max_inst_ = 0;  // Caller must change
251}
252
253Compiler::~Compiler() {
254  delete prog_;
255  delete[] inst_;
256}
257
258int Compiler::AllocInst(int n) {
259  if (failed_ || inst_len_ + n > max_inst_) {
260    failed_ = true;
261    return -1;
262  }
263
264  if (inst_len_ + n > inst_cap_) {
265    if (inst_cap_ == 0)
266      inst_cap_ = 8;
267    while (inst_len_ + n > inst_cap_)
268      inst_cap_ *= 2;
269    Prog::Inst* ip = new Prog::Inst[inst_cap_];
270    memmove(ip, inst_, inst_len_ * sizeof ip[0]);
271    memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
272    delete[] inst_;
273    inst_ = ip;
274  }
275  int id = inst_len_;
276  inst_len_ += n;
277  return id;
278}
279
280void Compiler::Trim() {
281  if (inst_len_ < inst_cap_) {
282    Prog::Inst* ip = new Prog::Inst[inst_len_];
283    memmove(ip, inst_, inst_len_ * sizeof ip[0]);
284    delete[] inst_;
285    inst_ = ip;
286    inst_cap_ = inst_len_;
287  }
288}
289
290// These routines are somewhat hard to visualize in text --
291// see http://swtch.com/~rsc/regexp/regexp1.html for
292// pictures explaining what is going on here.
293
294// Returns an unmatchable fragment.
295Frag Compiler::NoMatch() {
296  return Frag(0, nullPatchList);
297}
298
299// Is a an unmatchable fragment?
300static bool IsNoMatch(Frag a) {
301  return a.begin == 0;
302}
303
304// Given fragments a and b, returns fragment for ab.
305Frag Compiler::Cat(Frag a, Frag b) {
306  if (IsNoMatch(a) || IsNoMatch(b))
307    return NoMatch();
308
309  // Elide no-op.
310  Prog::Inst* begin = &inst_[a.begin];
311  if (begin->opcode() == kInstNop &&
312      a.end.p == (a.begin << 1) &&
313      begin->out() == 0) {
314    PatchList::Patch(inst_, a.end, b.begin);  // in case refs to a somewhere
315    return b;
316  }
317
318  // To run backward over string, reverse all concatenations.
319  if (reversed_) {
320    PatchList::Patch(inst_, b.end, a.begin);
321    return Frag(b.begin, a.end);
322  }
323
324  PatchList::Patch(inst_, a.end, b.begin);
325  return Frag(a.begin, b.end);
326}
327
328// Given fragments for a and b, returns fragment for a|b.
329Frag Compiler::Alt(Frag a, Frag b) {
330  // Special case for convenience in loops.
331  if (IsNoMatch(a))
332    return b;
333  if (IsNoMatch(b))
334    return a;
335
336  int id = AllocInst(1);
337  if (id < 0)
338    return NoMatch();
339
340  inst_[id].InitAlt(a.begin, b.begin);
341  return Frag(id, PatchList::Append(inst_, a.end, b.end));
342}
343
344// When capturing submatches in like-Perl mode, a kOpAlt Inst
345// treats out_ as the first choice, out1_ as the second.
346//
347// For *, +, and ?, if out_ causes another repetition,
348// then the operator is greedy.  If out1_ is the repetition
349// (and out_ moves forward), then the operator is non-greedy.
350
351// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
352Frag Compiler::Star(Frag a, bool nongreedy) {
353  int id = AllocInst(1);
354  if (id < 0)
355    return NoMatch();
356  inst_[id].InitAlt(0, 0);
357  PatchList::Patch(inst_, a.end, id);
358  if (nongreedy) {
359    inst_[id].out1_ = a.begin;
360    return Frag(id, PatchList::Mk(id << 1));
361  } else {
362    inst_[id].set_out(a.begin);
363    return Frag(id, PatchList::Mk((id << 1) | 1));
364  }
365}
366
367// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
368Frag Compiler::Plus(Frag a, bool nongreedy) {
369  // a+ is just a* with a different entry point.
370  Frag f = Star(a, nongreedy);
371  return Frag(a.begin, f.end);
372}
373
374// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
375Frag Compiler::Quest(Frag a, bool nongreedy) {
376  int id = AllocInst(1);
377  if (id < 0)
378    return NoMatch();
379  PatchList pl;
380  if (nongreedy) {
381    inst_[id].InitAlt(0, a.begin);
382    pl = PatchList::Mk(id << 1);
383  } else {
384    inst_[id].InitAlt(a.begin, 0);
385    pl = PatchList::Mk((id << 1) | 1);
386  }
387  return Frag(id, PatchList::Append(inst_, pl, a.end));
388}
389
390// Returns a fragment for the byte range lo-hi.
391Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
392  int id = AllocInst(1);
393  if (id < 0)
394    return NoMatch();
395  inst_[id].InitByteRange(lo, hi, foldcase, 0);
396  prog_->byte_inst_count_++;
397  prog_->MarkByteRange(lo, hi);
398  if (foldcase && lo <= 'z' && hi >= 'a') {
399    if (lo < 'a')
400      lo = 'a';
401    if (hi > 'z')
402      hi = 'z';
403    if (lo <= hi)
404      prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
405  }
406  return Frag(id, PatchList::Mk(id << 1));
407}
408
409// Returns a no-op fragment.  Sometimes unavoidable.
410Frag Compiler::Nop() {
411  int id = AllocInst(1);
412  if (id < 0)
413    return NoMatch();
414  inst_[id].InitNop(0);
415  return Frag(id, PatchList::Mk(id << 1));
416}
417
418// Returns a fragment that signals a match.
419Frag Compiler::Match(int32 match_id) {
420  int id = AllocInst(1);
421  if (id < 0)
422    return NoMatch();
423  inst_[id].InitMatch(match_id);
424  return Frag(id, nullPatchList);
425}
426
427// Returns a fragment matching a particular empty-width op (like ^ or $)
428Frag Compiler::EmptyWidth(EmptyOp empty) {
429  int id = AllocInst(1);
430  if (id < 0)
431    return NoMatch();
432  inst_[id].InitEmptyWidth(empty, 0);
433  if (empty & (kEmptyBeginLine|kEmptyEndLine))
434    prog_->MarkByteRange('\n', '\n');
435  if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
436    int j;
437    for (int i = 0; i < 256; i = j) {
438      for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
439        ;
440      prog_->MarkByteRange(i, j-1);
441    }
442  }
443  return Frag(id, PatchList::Mk(id << 1));
444}
445
446// Given a fragment a, returns a fragment with capturing parens around a.
447Frag Compiler::Capture(Frag a, int n) {
448  int id = AllocInst(2);
449  if (id < 0)
450    return NoMatch();
451  inst_[id].InitCapture(2*n, a.begin);
452  inst_[id+1].InitCapture(2*n+1, 0);
453  PatchList::Patch(inst_, a.end, id+1);
454
455  return Frag(id, PatchList::Mk((id+1) << 1));
456}
457
458// A Rune is a name for a Unicode code point.
459// Returns maximum rune encoded by UTF-8 sequence of length len.
460static int MaxRune(int len) {
461  int b;  // number of Rune blents lenn len-byte UTF-8 sequence (len < UTFmax)
462  if (len == 1)
463    b = 7;
464  else
465    b = 8-(len+1) + 6*(len-1);
466  return (1<<b) - 1;   // maximum Rune for b bits.
467}
468
469// The rune range compiler caches common suffix fragments,
470// which are very common in UTF-8 (e.g., [80-bf]).
471// The fragment suffixes are identified by their start
472// instructions.  NULL denotes the eventual end match.
473// The Frag accumulates in rune_range_.  Caching common
474// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
475// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
476
477void Compiler::BeginRange() {
478  rune_cache_.clear();
479  rune_range_.begin = 0;
480  rune_range_.end = nullPatchList;
481}
482
483int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
484                                     int next) {
485  Frag f = ByteRange(lo, hi, foldcase);
486  if (next != 0) {
487    PatchList::Patch(inst_, f.end, next);
488  } else {
489    rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
490  }
491  return f.begin;
492}
493
494int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
495  // In Latin1 mode, there's no point in caching.
496  // In forward UTF-8 mode, only need to cache continuation bytes.
497  if (encoding_ == kEncodingLatin1 ||
498      (encoding_ == kEncodingUTF8 &&
499       !reversed_ &&
500       !(0x80 <= lo && hi <= 0xbf))) {
501    return UncachedRuneByteSuffix(lo, hi, foldcase, next);
502  }
503
504  uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase;
505  map<uint64, int>::iterator it = rune_cache_.find(key);
506  if (it != rune_cache_.end())
507    return it->second;
508  int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
509  rune_cache_[key] = id;
510  return id;
511}
512
513void Compiler::AddSuffix(int id) {
514  if (rune_range_.begin == 0) {
515    rune_range_.begin = id;
516    return;
517  }
518
519  int alt = AllocInst(1);
520  if (alt < 0) {
521    rune_range_.begin = 0;
522    return;
523  }
524  inst_[alt].InitAlt(rune_range_.begin, id);
525  rune_range_.begin = alt;
526}
527
528Frag Compiler::EndRange() {
529  return rune_range_;
530}
531
532// Converts rune range lo-hi into a fragment that recognizes
533// the bytes that would make up those runes in the current
534// encoding (Latin 1 or UTF-8).
535// This lets the machine work byte-by-byte even when
536// using multibyte encodings.
537
538void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
539  switch (encoding_) {
540    default:
541    case kEncodingUTF8:
542      AddRuneRangeUTF8(lo, hi, foldcase);
543      break;
544    case kEncodingLatin1:
545      AddRuneRangeLatin1(lo, hi, foldcase);
546      break;
547  }
548}
549
550void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
551  // Latin1 is easy: runes *are* bytes.
552  if (lo > hi || lo > 0xFF)
553    return;
554  if (hi > 0xFF)
555    hi = 0xFF;
556  AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
557}
558
559// Table describing how to make a UTF-8 matching machine
560// for the rune range 80-10FFFF (Runeself-Runemax).
561// This range happens frequently enough (for example /./ and /[^a-z]/)
562// and the rune_cache_ map is slow enough that this is worth
563// special handling.  Makes compilation of a small expression
564// with a dot in it about 10% faster.
565// The * in the comments below mark whole sequences.
566static struct ByteRangeProg {
567  int next;
568  int lo;
569  int hi;
570} prog_80_10ffff[] = {
571  // Two-byte
572  { -1, 0x80, 0xBF, },  // 0:  80-BF
573  {  0, 0xC2, 0xDF, },  // 1:  C2-DF 80-BF*
574
575  // Three-byte
576  {  0, 0xA0, 0xBF, },  // 2:  A0-BF 80-BF
577  {  2, 0xE0, 0xE0, },  // 3:  E0 A0-BF 80-BF*
578  {  0, 0x80, 0xBF, },  // 4:  80-BF 80-BF
579  {  4, 0xE1, 0xEF, },  // 5:  E1-EF 80-BF 80-BF*
580
581  // Four-byte
582  {  4, 0x90, 0xBF, },  // 6:  90-BF 80-BF 80-BF
583  {  6, 0xF0, 0xF0, },  // 7:  F0 90-BF 80-BF 80-BF*
584  {  4, 0x80, 0xBF, },  // 8:  80-BF 80-BF 80-BF
585  {  8, 0xF1, 0xF3, },  // 9: F1-F3 80-BF 80-BF 80-BF*
586  {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
587  { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
588};
589
590void Compiler::Add_80_10ffff() {
591  int inst[arraysize(prog_80_10ffff)];
592  for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
593    const ByteRangeProg& p = prog_80_10ffff[i];
594    int next = 0;
595    if (p.next >= 0)
596      next = inst[p.next];
597    inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
598    if ((p.lo & 0xC0) != 0x80)
599      AddSuffix(inst[i]);
600  }
601}
602
603void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
604  if (lo > hi)
605    return;
606
607  // Pick off 80-10FFFF as a common special case
608  // that can bypass the slow rune_cache_.
609  if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
610    Add_80_10ffff();
611    return;
612  }
613
614  // Split range into same-length sized ranges.
615  for (int i = 1; i < UTFmax; i++) {
616    Rune max = MaxRune(i);
617    if (lo <= max && max < hi) {
618      AddRuneRangeUTF8(lo, max, foldcase);
619      AddRuneRangeUTF8(max+1, hi, foldcase);
620      return;
621    }
622  }
623
624  // ASCII range is always a special case.
625  if (hi < Runeself) {
626    AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
627    return;
628  }
629
630  // Split range into sections that agree on leading bytes.
631  for (int i = 1; i < UTFmax; i++) {
632    uint m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
633    if ((lo & ~m) != (hi & ~m)) {
634      if ((lo & m) != 0) {
635        AddRuneRangeUTF8(lo, lo|m, foldcase);
636        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
637        return;
638      }
639      if ((hi & m) != m) {
640        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
641        AddRuneRangeUTF8(hi&~m, hi, foldcase);
642        return;
643      }
644    }
645  }
646
647  // Finally.  Generate byte matching equivalent for lo-hi.
648  uint8 ulo[UTFmax], uhi[UTFmax];
649  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
650  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
651  (void)m;  // USED(m)
652  DCHECK_EQ(n, m);
653
654  int id = 0;
655  if (reversed_) {
656    for (int i = 0; i < n; i++)
657      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
658  } else {
659    for (int i = n-1; i >= 0; i--)
660      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
661  }
662  AddSuffix(id);
663}
664
665// Should not be called.
666Frag Compiler::Copy(Frag arg) {
667  // We're using WalkExponential; there should be no copying.
668  LOG(DFATAL) << "Compiler::Copy called!";
669  failed_ = true;
670  return NoMatch();
671}
672
673// Visits a node quickly; called once WalkExponential has
674// decided to cut this walk short.
675Frag Compiler::ShortVisit(Regexp* re, Frag) {
676  failed_ = true;
677  return NoMatch();
678}
679
680// Called before traversing a node's children during the walk.
681Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
682  // Cut off walk if we've already failed.
683  if (failed_)
684    *stop = true;
685
686  return kNullFrag;  // not used by caller
687}
688
689Frag Compiler::Literal(Rune r, bool foldcase) {
690  switch (encoding_) {
691    default:
692      return kNullFrag;
693
694    case kEncodingLatin1:
695      return ByteRange(r, r, foldcase);
696
697    case kEncodingUTF8: {
698      if (r < Runeself)  // Make common case fast.
699        return ByteRange(r, r, foldcase);
700      uint8 buf[UTFmax];
701      int n = runetochar(reinterpret_cast<char*>(buf), &r);
702      Frag f = ByteRange((uint8)buf[0], buf[0], false);
703      for (int i = 1; i < n; i++)
704        f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
705      return f;
706    }
707  }
708}
709
710// Called after traversing the node's children during the walk.
711// Given their frags, build and return the frag for this re.
712Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
713                         int nchild_frags) {
714  // If a child failed, don't bother going forward, especially
715  // since the child_frags might contain Frags with NULLs in them.
716  if (failed_)
717    return NoMatch();
718
719  // Given the child fragments, return the fragment for this node.
720  switch (re->op()) {
721    case kRegexpRepeat:
722      // Should not see; code at bottom of function will print error
723      break;
724
725    case kRegexpNoMatch:
726      return NoMatch();
727
728    case kRegexpEmptyMatch:
729      return Nop();
730
731    case kRegexpHaveMatch: {
732      Frag f = Match(re->match_id());
733      // Remember unanchored match to end of string.
734      if (anchor_ != RE2::ANCHOR_BOTH)
735        f = Cat(DotStar(), f);
736      return f;
737    }
738
739    case kRegexpConcat: {
740      Frag f = child_frags[0];
741      for (int i = 1; i < nchild_frags; i++)
742        f = Cat(f, child_frags[i]);
743      return f;
744    }
745
746    case kRegexpAlternate: {
747      Frag f = child_frags[0];
748      for (int i = 1; i < nchild_frags; i++)
749        f = Alt(f, child_frags[i]);
750      return f;
751    }
752
753    case kRegexpStar:
754      return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
755
756    case kRegexpPlus:
757      return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
758
759    case kRegexpQuest:
760      return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
761
762    case kRegexpLiteral:
763      return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
764
765    case kRegexpLiteralString: {
766      // Concatenation of literals.
767      if (re->nrunes() == 0)
768        return Nop();
769      Frag f;
770      for (int i = 0; i < re->nrunes(); i++) {
771        Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
772        if (i == 0)
773          f = f1;
774        else
775          f = Cat(f, f1);
776      }
777      return f;
778    }
779
780    case kRegexpAnyChar:
781      BeginRange();
782      AddRuneRange(0, Runemax, false);
783      return EndRange();
784
785    case kRegexpAnyByte:
786      return ByteRange(0x00, 0xFF, false);
787
788    case kRegexpCharClass: {
789      CharClass* cc = re->cc();
790      if (cc->empty()) {
791        // This can't happen.
792        LOG(DFATAL) << "No ranges in char class";
793        failed_ = true;
794        return NoMatch();
795      }
796
797      // ASCII case-folding optimization: if the char class
798      // behaves the same on A-Z as it does on a-z,
799      // discard any ranges wholly contained in A-Z
800      // and mark the other ranges as foldascii.
801      // This reduces the size of a program for
802      // (?i)abc from 3 insts per letter to 1 per letter.
803      bool foldascii = cc->FoldsASCII();
804
805      // Character class is just a big OR of the different
806      // character ranges in the class.
807      BeginRange();
808      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
809        // ASCII case-folding optimization (see above).
810        if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
811          continue;
812
813        // If this range contains all of A-Za-z or none of it,
814        // the fold flag is unnecessary; don't bother.
815        bool fold = foldascii;
816        if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
817          fold = false;
818
819        AddRuneRange(i->lo, i->hi, fold);
820      }
821      return EndRange();
822    }
823
824    case kRegexpCapture:
825      // If this is a non-capturing parenthesis -- (?:foo) --
826      // just use the inner expression.
827      if (re->cap() < 0)
828        return child_frags[0];
829      return Capture(child_frags[0], re->cap());
830
831    case kRegexpBeginLine:
832      return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
833
834    case kRegexpEndLine:
835      return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
836
837    case kRegexpBeginText:
838      return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
839
840    case kRegexpEndText:
841      return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
842
843    case kRegexpWordBoundary:
844      return EmptyWidth(kEmptyWordBoundary);
845
846    case kRegexpNoWordBoundary:
847      return EmptyWidth(kEmptyNonWordBoundary);
848  }
849  LOG(DFATAL) << "Missing case in Compiler: " << re->op();
850  failed_ = true;
851  return NoMatch();
852}
853
854// Is this regexp required to start at the beginning of the text?
855// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
856// but handles (\A(a|b)).  Could use the Walker to write a more exact one.
857static bool IsAnchorStart(Regexp** pre, int depth) {
858  Regexp* re = *pre;
859  Regexp* sub;
860  // The depth limit makes sure that we don't overflow
861  // the stack on a deeply nested regexp.  As the comment
862  // above says, IsAnchorStart is conservative, so returning
863  // a false negative is okay.  The exact limit is somewhat arbitrary.
864  if (re == NULL || depth >= 4)
865    return false;
866  switch (re->op()) {
867    default:
868      break;
869    case kRegexpConcat:
870      if (re->nsub() > 0) {
871        sub = re->sub()[0]->Incref();
872        if (IsAnchorStart(&sub, depth+1)) {
873          Regexp** subcopy = new Regexp*[re->nsub()];
874          subcopy[0] = sub;  // already have reference
875          for (int i = 1; i < re->nsub(); i++)
876            subcopy[i] = re->sub()[i]->Incref();
877          *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
878          delete[] subcopy;
879          re->Decref();
880          return true;
881        }
882        sub->Decref();
883      }
884      break;
885    case kRegexpCapture:
886      sub = re->sub()[0]->Incref();
887      if (IsAnchorStart(&sub, depth+1)) {
888        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
889        re->Decref();
890        return true;
891      }
892      sub->Decref();
893      break;
894    case kRegexpBeginText:
895      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
896      re->Decref();
897      return true;
898  }
899  return false;
900}
901
902// Is this regexp required to start at the end of the text?
903// Only approximate; can return false for complicated regexps like (a\z|b\z),
904// but handles ((a|b)\z).  Could use the Walker to write a more exact one.
905static bool IsAnchorEnd(Regexp** pre, int depth) {
906  Regexp* re = *pre;
907  Regexp* sub;
908  // The depth limit makes sure that we don't overflow
909  // the stack on a deeply nested regexp.  As the comment
910  // above says, IsAnchorEnd is conservative, so returning
911  // a false negative is okay.  The exact limit is somewhat arbitrary.
912  if (re == NULL || depth >= 4)
913    return false;
914  switch (re->op()) {
915    default:
916      break;
917    case kRegexpConcat:
918      if (re->nsub() > 0) {
919        sub = re->sub()[re->nsub() - 1]->Incref();
920        if (IsAnchorEnd(&sub, depth+1)) {
921          Regexp** subcopy = new Regexp*[re->nsub()];
922          subcopy[re->nsub() - 1] = sub;  // already have reference
923          for (int i = 0; i < re->nsub() - 1; i++)
924            subcopy[i] = re->sub()[i]->Incref();
925          *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
926          delete[] subcopy;
927          re->Decref();
928          return true;
929        }
930        sub->Decref();
931      }
932      break;
933    case kRegexpCapture:
934      sub = re->sub()[0]->Incref();
935      if (IsAnchorEnd(&sub, depth+1)) {
936        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
937        re->Decref();
938        return true;
939      }
940      sub->Decref();
941      break;
942    case kRegexpEndText:
943      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
944      re->Decref();
945      return true;
946  }
947  return false;
948}
949
950void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
951                     RE2::Anchor anchor) {
952  prog_->set_flags(flags);
953
954  if (flags & Regexp::Latin1)
955    encoding_ = kEncodingLatin1;
956  max_mem_ = max_mem;
957  if (max_mem <= 0) {
958    max_inst_ = 100000;  // more than enough
959  } else if (max_mem <= sizeof(Prog)) {
960    // No room for anything.
961    max_inst_ = 0;
962  } else {
963    int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
964    // Limit instruction count so that inst->id() fits nicely in an int.
965    // SparseArray also assumes that the indices (inst->id()) are ints.
966    // The call to WalkExponential uses 2*max_inst_ below,
967    // and other places in the code use 2 or 3 * prog->size().
968    // Limiting to 2^24 should avoid overflow in those places.
969    // (The point of allowing more than 32 bits of memory is to
970    // have plenty of room for the DFA states, not to use it up
971    // on the program.)
972    if (m >= 1<<24)
973      m = 1<<24;
974
975    // Inst imposes its own limit (currently bigger than 2^24 but be safe).
976    if (m > Prog::Inst::kMaxInst)
977      m = Prog::Inst::kMaxInst;
978
979    max_inst_ = m;
980  }
981
982  anchor_ = anchor;
983}
984
985// Compiles re, returning program.
986// Caller is responsible for deleting prog_.
987// If reversed is true, compiles a program that expects
988// to run over the input string backward (reverses all concatenations).
989// The reversed flag is also recorded in the returned program.
990Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
991  Compiler c;
992
993  c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
994  c.reversed_ = reversed;
995
996  // Simplify to remove things like counted repetitions
997  // and character classes like \d.
998  Regexp* sre = re->Simplify();
999  if (sre == NULL)
1000    return NULL;
1001
1002  // Record whether prog is anchored, removing the anchors.
1003  // (They get in the way of other optimizations.)
1004  bool is_anchor_start = IsAnchorStart(&sre, 0);
1005  bool is_anchor_end = IsAnchorEnd(&sre, 0);
1006
1007  // Generate fragment for entire regexp.
1008  Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_);
1009  sre->Decref();
1010  if (c.failed_)
1011    return NULL;
1012
1013  // Success!  Finish by putting Match node at end, and record start.
1014  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1015  // to behave normally.
1016  c.reversed_ = false;
1017  Frag all = c.Cat(f, c.Match(0));
1018  c.prog_->set_start(all.begin);
1019
1020  if (reversed) {
1021    c.prog_->set_anchor_start(is_anchor_end);
1022    c.prog_->set_anchor_end(is_anchor_start);
1023  } else {
1024    c.prog_->set_anchor_start(is_anchor_start);
1025    c.prog_->set_anchor_end(is_anchor_end);
1026  }
1027
1028  // Also create unanchored version, which starts with a .*? loop.
1029  if (c.prog_->anchor_start()) {
1030    c.prog_->set_start_unanchored(c.prog_->start());
1031  } else {
1032    Frag unanchored = c.Cat(c.DotStar(), all);
1033    c.prog_->set_start_unanchored(unanchored.begin);
1034  }
1035
1036  c.prog_->set_reversed(reversed);
1037
1038  // Hand ownership of prog_ to caller.
1039  return c.Finish();
1040}
1041
1042Prog* Compiler::Finish() {
1043  if (failed_)
1044    return NULL;
1045
1046  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1047    // No possible matches; keep Fail instruction only.
1048    inst_len_ = 1;
1049  }
1050
1051  // Trim instruction to minimum array and transfer to Prog.
1052  Trim();
1053  prog_->inst_ = inst_;
1054  prog_->size_ = inst_len_;
1055  inst_ = NULL;
1056
1057  // Compute byte map.
1058  prog_->ComputeByteMap();
1059
1060  prog_->Optimize();
1061
1062  // Record remaining memory for DFA.
1063  if (max_mem_ <= 0) {
1064    prog_->set_dfa_mem(1<<20);
1065  } else {
1066    int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
1067    if (m < 0)
1068      m = 0;
1069    prog_->set_dfa_mem(m);
1070  }
1071
1072  Prog* p = prog_;
1073  prog_ = NULL;
1074  return p;
1075}
1076
1077// Converts Regexp to Prog.
1078Prog* Regexp::CompileToProg(int64 max_mem) {
1079  return Compiler::Compile(this, false, max_mem);
1080}
1081
1082Prog* Regexp::CompileToReverseProg(int64 max_mem) {
1083  return Compiler::Compile(this, true, max_mem);
1084}
1085
1086Frag Compiler::DotStar() {
1087  return Star(ByteRange(0x00, 0xff, false), true);
1088}
1089
1090// Compiles RE set to Prog.
1091Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1092                           Regexp* re) {
1093  Compiler c;
1094
1095  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
1096  c.Setup(pf, options.max_mem(), anchor);
1097
1098  // Compile alternation of fragments.
1099  Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_);
1100  re->Decref();
1101  if (c.failed_)
1102    return NULL;
1103
1104  if (anchor == RE2::UNANCHORED) {
1105    // The trailing .* was added while handling kRegexpHaveMatch.
1106    // We just have to add the leading one.
1107    all = c.Cat(c.DotStar(), all);
1108  }
1109
1110  c.prog_->set_start(all.begin);
1111  c.prog_->set_start_unanchored(all.begin);
1112  c.prog_->set_anchor_start(true);
1113  c.prog_->set_anchor_end(true);
1114
1115  Prog* prog = c.Finish();
1116  if (prog == NULL)
1117    return NULL;
1118
1119  // Make sure DFA has enough memory to operate,
1120  // since we're not going to fall back to the NFA.
1121  bool failed;
1122  StringPiece sp = "hello, world";
1123  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1124                  NULL, &failed, NULL);
1125  if (failed) {
1126    delete prog;
1127    return NULL;
1128  }
1129
1130  return prog;
1131}
1132
1133Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1134                       Regexp* re) {
1135  return Compiler::CompileSet(options, anchor, re);
1136}
1137
1138}  // namespace re2
1139