1// Copyright 2007 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Compile regular expression to Prog.
6//
7// Prog and Inst are defined in prog.h.
8// This file's external interface is just Regexp::CompileToProg.
9// The Compiler class defined in this file is private.
10
11#include "re2/prog.h"
12#include "re2/re2.h"
13#include "re2/regexp.h"
14#include "re2/walker-inl.h"
15
16namespace re2 {
17
18// List of pointers to Inst* that need to be filled in (patched).
19// Because the Inst* haven't been filled in yet,
20// we can use the Inst* word to hold the list's "next" pointer.
21// It's kind of sleazy, but it works well in practice.
22// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
23//
24// Because the out and out1 fields in Inst are no longer pointers,
25// we can't use pointers directly here either.  Instead, p refers
26// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
27// p == 0 represents the NULL list.  This is okay because instruction #0
28// is always the fail instruction, which never appears on a list.
29
30struct PatchList {
31  uint32 p;
32
33  // Returns patch list containing just p.
34  static PatchList Mk(uint32 p);
35
36  // Patches all the entries on l to have value v.
37  // Caller must not ever use patch list again.
38  static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
39
40  // Deref returns the next pointer pointed at by p.
41  static PatchList Deref(Prog::Inst *inst0, PatchList l);
42
43  // Appends two patch lists and returns result.
44  static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
45};
46
47static PatchList nullPatchList = { 0 };
48
49// Returns patch list containing just p.
50PatchList PatchList::Mk(uint32 p) {
51  PatchList l;
52  l.p = p;
53  return l;
54}
55
56// Returns the next pointer pointed at by l.
57PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
58  Prog::Inst* ip = &inst0[l.p>>1];
59  if (l.p&1)
60    l.p = ip->out1();
61  else
62    l.p = ip->out();
63  return l;
64}
65
66// Patches all the entries on l to have value v.
67void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
68  while (l.p != 0) {
69    Prog::Inst* ip = &inst0[l.p>>1];
70    if (l.p&1) {
71      l.p = ip->out1();
72      ip->out1_ = val;
73    } else {
74      l.p = ip->out();
75      ip->set_out(val);
76    }
77  }
78}
79
80// Appends two patch lists and returns result.
81PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
82  if (l1.p == 0)
83    return l2;
84  if (l2.p == 0)
85    return l1;
86
87  PatchList l = l1;
88  for (;;) {
89    PatchList next = PatchList::Deref(inst0, l);
90    if (next.p == 0)
91      break;
92    l = next;
93  }
94
95  Prog::Inst* ip = &inst0[l.p>>1];
96  if (l.p&1)
97    ip->out1_ = l2.p;
98  else
99    ip->set_out(l2.p);
100
101  return l1;
102}
103
104// Compiled program fragment.
105struct Frag {
106  uint32 begin;
107  PatchList end;
108
109  Frag() : begin(0) { end.p = 0; }  // needed so Frag can go in vector
110  Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
111};
112
113static Frag NullFrag() {
114  return Frag();
115}
116
117// Input encodings.
118enum Encoding {
119  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
120  kEncodingLatin1,    // Latin1 (0-FF)
121};
122
123class Compiler : public Regexp::Walker<Frag> {
124 public:
125  explicit Compiler();
126  ~Compiler();
127
128  // Compiles Regexp to a new Prog.
129  // Caller is responsible for deleting Prog when finished with it.
130  // If reversed is true, compiles for walking over the input
131  // string backward (reverses all concatenations).
132  static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
133
134  // Compiles alternation of all the re to a new Prog.
135  // Each re has a match with an id equal to its index in the vector.
136  static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
137                          Regexp* re);
138
139  // Interface for Regexp::Walker, which helps traverse the Regexp.
140  // The walk is purely post-recursive: given the machines for the
141  // children, PostVisit combines them to create the machine for
142  // the current node.  The child_args are Frags.
143  // The Compiler traverses the Regexp parse tree, visiting
144  // each node in depth-first order.  It invokes PreVisit before
145  // visiting the node's children and PostVisit after visiting
146  // the children.
147  Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
148  Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
149                 int nchild_args);
150  Frag ShortVisit(Regexp* re, Frag parent_arg);
151  Frag Copy(Frag arg);
152
153  // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
154  Frag Plus(Frag a, bool nongreedy);
155  Frag Star(Frag a, bool nongreedy);
156  Frag Quest(Frag a, bool nongreedy);
157
158  // Given fragment a, returns (a) capturing as \n.
159  Frag Capture(Frag a, int n);
160
161  // Given fragments a and b, returns ab; a|b
162  Frag Cat(Frag a, Frag b);
163  Frag Alt(Frag a, Frag b);
164
165  // Returns a fragment that can't match anything.
166  Frag NoMatch();
167
168  // Returns a fragment that matches the empty string.
169  Frag Match(int32 id);
170
171  // Returns a no-op fragment.
172  Frag Nop();
173
174  // Returns a fragment matching the byte range lo-hi.
175  Frag ByteRange(int lo, int hi, bool foldcase);
176
177  // Returns a fragment matching an empty-width special op.
178  Frag EmptyWidth(EmptyOp op);
179
180  // Adds n instructions to the program.
181  // Returns the index of the first one.
182  // Returns -1 if no more instructions are available.
183  int AllocInst(int n);
184
185  // Deletes unused instructions.
186  void Trim();
187
188  // Rune range compiler.
189
190  // Begins a new alternation.
191  void BeginRange();
192
193  // Adds a fragment matching the rune range lo-hi.
194  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
195  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
196  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
197  void Add_80_10ffff();
198
199  // New suffix that matches the byte range lo-hi, then goes to next.
200  int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
201  int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
202
203  // Adds a suffix to alternation.
204  void AddSuffix(int id);
205
206  // Returns the alternation of all the added suffixes.
207  Frag EndRange();
208
209  // Single rune.
210  Frag Literal(Rune r, bool foldcase);
211
212  void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
213  Prog* Finish();
214
215  // Returns .* where dot = any byte
216  Frag DotStar();
217
218 private:
219  Prog* prog_;         // Program being built.
220  bool failed_;        // Did we give up compiling?
221  Encoding encoding_;  // Input encoding
222  bool reversed_;      // Should program run backward over text?
223
224  int max_inst_;       // Maximum number of instructions.
225
226  Prog::Inst* inst_;   // Pointer to first instruction.
227  int inst_len_;       // Number of instructions used.
228  int inst_cap_;       // Number of instructions allocated.
229
230  int64 max_mem_;      // Total memory budget.
231
232  map<uint64, int> rune_cache_;
233  Frag rune_range_;
234
235  RE2::Anchor anchor_;  // anchor mode for RE2::Set
236
237  DISALLOW_EVIL_CONSTRUCTORS(Compiler);
238};
239
240Compiler::Compiler() {
241  prog_ = new Prog();
242  failed_ = false;
243  encoding_ = kEncodingUTF8;
244  reversed_ = false;
245  inst_ = NULL;
246  inst_len_ = 0;
247  inst_cap_ = 0;
248  max_inst_ = 1;  // make AllocInst for fail instruction okay
249  max_mem_ = 0;
250  int fail = AllocInst(1);
251  inst_[fail].InitFail();
252  max_inst_ = 0;  // Caller must change
253}
254
255Compiler::~Compiler() {
256  delete prog_;
257  delete[] inst_;
258}
259
260int Compiler::AllocInst(int n) {
261  if (failed_ || inst_len_ + n > max_inst_) {
262    failed_ = true;
263    return -1;
264  }
265
266  if (inst_len_ + n > inst_cap_) {
267    if (inst_cap_ == 0)
268      inst_cap_ = 8;
269    while (inst_len_ + n > inst_cap_)
270      inst_cap_ *= 2;
271    Prog::Inst* ip = new Prog::Inst[inst_cap_];
272    memmove(ip, inst_, inst_len_ * sizeof ip[0]);
273    memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
274    delete[] inst_;
275    inst_ = ip;
276  }
277  int id = inst_len_;
278  inst_len_ += n;
279  return id;
280}
281
282void Compiler::Trim() {
283  if (inst_len_ < inst_cap_) {
284    Prog::Inst* ip = new Prog::Inst[inst_len_];
285    memmove(ip, inst_, inst_len_ * sizeof ip[0]);
286    delete[] inst_;
287    inst_ = ip;
288    inst_cap_ = inst_len_;
289  }
290}
291
292// These routines are somewhat hard to visualize in text --
293// see http://swtch.com/~rsc/regexp/regexp1.html for
294// pictures explaining what is going on here.
295
296// Returns an unmatchable fragment.
297Frag Compiler::NoMatch() {
298  return Frag(0, nullPatchList);
299}
300
301// Is a an unmatchable fragment?
302static bool IsNoMatch(Frag a) {
303  return a.begin == 0;
304}
305
306// Given fragments a and b, returns fragment for ab.
307Frag Compiler::Cat(Frag a, Frag b) {
308  if (IsNoMatch(a) || IsNoMatch(b))
309    return NoMatch();
310
311  // Elide no-op.
312  Prog::Inst* begin = &inst_[a.begin];
313  if (begin->opcode() == kInstNop &&
314      a.end.p == (a.begin << 1) &&
315      begin->out() == 0) {
316    PatchList::Patch(inst_, a.end, b.begin);  // in case refs to a somewhere
317    return b;
318  }
319
320  // To run backward over string, reverse all concatenations.
321  if (reversed_) {
322    PatchList::Patch(inst_, b.end, a.begin);
323    return Frag(b.begin, a.end);
324  }
325
326  PatchList::Patch(inst_, a.end, b.begin);
327  return Frag(a.begin, b.end);
328}
329
330// Given fragments for a and b, returns fragment for a|b.
331Frag Compiler::Alt(Frag a, Frag b) {
332  // Special case for convenience in loops.
333  if (IsNoMatch(a))
334    return b;
335  if (IsNoMatch(b))
336    return a;
337
338  int id = AllocInst(1);
339  if (id < 0)
340    return NoMatch();
341
342  inst_[id].InitAlt(a.begin, b.begin);
343  return Frag(id, PatchList::Append(inst_, a.end, b.end));
344}
345
346// When capturing submatches in like-Perl mode, a kOpAlt Inst
347// treats out_ as the first choice, out1_ as the second.
348//
349// For *, +, and ?, if out_ causes another repetition,
350// then the operator is greedy.  If out1_ is the repetition
351// (and out_ moves forward), then the operator is non-greedy.
352
353// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
354Frag Compiler::Star(Frag a, bool nongreedy) {
355  int id = AllocInst(1);
356  if (id < 0)
357    return NoMatch();
358  inst_[id].InitAlt(0, 0);
359  PatchList::Patch(inst_, a.end, id);
360  if (nongreedy) {
361    inst_[id].out1_ = a.begin;
362    return Frag(id, PatchList::Mk(id << 1));
363  } else {
364    inst_[id].set_out(a.begin);
365    return Frag(id, PatchList::Mk((id << 1) | 1));
366  }
367}
368
369// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
370Frag Compiler::Plus(Frag a, bool nongreedy) {
371  // a+ is just a* with a different entry point.
372  Frag f = Star(a, nongreedy);
373  return Frag(a.begin, f.end);
374}
375
376// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
377Frag Compiler::Quest(Frag a, bool nongreedy) {
378  int id = AllocInst(1);
379  if (id < 0)
380    return NoMatch();
381  PatchList pl;
382  if (nongreedy) {
383    inst_[id].InitAlt(0, a.begin);
384    pl = PatchList::Mk(id << 1);
385  } else {
386    inst_[id].InitAlt(a.begin, 0);
387    pl = PatchList::Mk((id << 1) | 1);
388  }
389  return Frag(id, PatchList::Append(inst_, pl, a.end));
390}
391
392// Returns a fragment for the byte range lo-hi.
393Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
394  int id = AllocInst(1);
395  if (id < 0)
396    return NoMatch();
397  inst_[id].InitByteRange(lo, hi, foldcase, 0);
398  prog_->byte_inst_count_++;
399  prog_->MarkByteRange(lo, hi);
400  if (foldcase && lo <= 'z' && hi >= 'a') {
401    if (lo < 'a')
402      lo = 'a';
403    if (hi > 'z')
404      hi = 'z';
405    if (lo <= hi)
406      prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
407  }
408  return Frag(id, PatchList::Mk(id << 1));
409}
410
411// Returns a no-op fragment.  Sometimes unavoidable.
412Frag Compiler::Nop() {
413  int id = AllocInst(1);
414  if (id < 0)
415    return NoMatch();
416  inst_[id].InitNop(0);
417  return Frag(id, PatchList::Mk(id << 1));
418}
419
420// Returns a fragment that signals a match.
421Frag Compiler::Match(int32 match_id) {
422  int id = AllocInst(1);
423  if (id < 0)
424    return NoMatch();
425  inst_[id].InitMatch(match_id);
426  return Frag(id, nullPatchList);
427}
428
429// Returns a fragment matching a particular empty-width op (like ^ or $)
430Frag Compiler::EmptyWidth(EmptyOp empty) {
431  int id = AllocInst(1);
432  if (id < 0)
433    return NoMatch();
434  inst_[id].InitEmptyWidth(empty, 0);
435  if (empty & (kEmptyBeginLine|kEmptyEndLine))
436    prog_->MarkByteRange('\n', '\n');
437  if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
438    int j;
439    for (int i = 0; i < 256; i = j) {
440      for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
441        ;
442      prog_->MarkByteRange(i, j-1);
443    }
444  }
445  return Frag(id, PatchList::Mk(id << 1));
446}
447
448// Given a fragment a, returns a fragment with capturing parens around a.
449Frag Compiler::Capture(Frag a, int n) {
450  int id = AllocInst(2);
451  if (id < 0)
452    return NoMatch();
453  inst_[id].InitCapture(2*n, a.begin);
454  inst_[id+1].InitCapture(2*n+1, 0);
455  PatchList::Patch(inst_, a.end, id+1);
456
457  return Frag(id, PatchList::Mk((id+1) << 1));
458}
459
460// A Rune is a name for a Unicode code point.
461// Returns maximum rune encoded by UTF-8 sequence of length len.
462static int MaxRune(int len) {
463  int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
464  if (len == 1)
465    b = 7;
466  else
467    b = 8-(len+1) + 6*(len-1);
468  return (1<<b) - 1;   // maximum Rune for b bits.
469}
470
471// The rune range compiler caches common suffix fragments,
472// which are very common in UTF-8 (e.g., [80-bf]).
473// The fragment suffixes are identified by their start
474// instructions.  NULL denotes the eventual end match.
475// The Frag accumulates in rune_range_.  Caching common
476// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
477// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
478
479void Compiler::BeginRange() {
480  rune_cache_.clear();
481  rune_range_.begin = 0;
482  rune_range_.end = nullPatchList;
483}
484
485int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
486                                     int next) {
487  Frag f = ByteRange(lo, hi, foldcase);
488  if (next != 0) {
489    PatchList::Patch(inst_, f.end, next);
490  } else {
491    rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
492  }
493  return f.begin;
494}
495
496int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
497  // In Latin1 mode, there's no point in caching.
498  // In forward UTF-8 mode, only need to cache continuation bytes.
499  if (encoding_ == kEncodingLatin1 ||
500      (encoding_ == kEncodingUTF8 &&
501       !reversed_ &&
502       !(0x80 <= lo && hi <= 0xbf))) {
503    return UncachedRuneByteSuffix(lo, hi, foldcase, next);
504  }
505
506  uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | (foldcase ? 1ULL : 0ULL);
507  map<uint64, int>::iterator it = rune_cache_.find(key);
508  if (it != rune_cache_.end())
509    return it->second;
510  int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
511  rune_cache_[key] = id;
512  return id;
513}
514
515void Compiler::AddSuffix(int id) {
516  if (rune_range_.begin == 0) {
517    rune_range_.begin = id;
518    return;
519  }
520
521  int alt = AllocInst(1);
522  if (alt < 0) {
523    rune_range_.begin = 0;
524    return;
525  }
526  inst_[alt].InitAlt(rune_range_.begin, id);
527  rune_range_.begin = alt;
528}
529
530Frag Compiler::EndRange() {
531  return rune_range_;
532}
533
534// Converts rune range lo-hi into a fragment that recognizes
535// the bytes that would make up those runes in the current
536// encoding (Latin 1 or UTF-8).
537// This lets the machine work byte-by-byte even when
538// using multibyte encodings.
539
540void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
541  switch (encoding_) {
542    default:
543    case kEncodingUTF8:
544      AddRuneRangeUTF8(lo, hi, foldcase);
545      break;
546    case kEncodingLatin1:
547      AddRuneRangeLatin1(lo, hi, foldcase);
548      break;
549  }
550}
551
552void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
553  // Latin1 is easy: runes *are* bytes.
554  if (lo > hi || lo > 0xFF)
555    return;
556  if (hi > 0xFF)
557    hi = 0xFF;
558  AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
559}
560
561// Table describing how to make a UTF-8 matching machine
562// for the rune range 80-10FFFF (Runeself-Runemax).
563// This range happens frequently enough (for example /./ and /[^a-z]/)
564// and the rune_cache_ map is slow enough that this is worth
565// special handling.  Makes compilation of a small expression
566// with a dot in it about 10% faster.
567// The * in the comments below mark whole sequences.
568static struct ByteRangeProg {
569  int next;
570  int lo;
571  int hi;
572} prog_80_10ffff[] = {
573  // Two-byte
574  { -1, 0x80, 0xBF, },  // 0:  80-BF
575  {  0, 0xC2, 0xDF, },  // 1:  C2-DF 80-BF*
576
577  // Three-byte
578  {  0, 0xA0, 0xBF, },  // 2:  A0-BF 80-BF
579  {  2, 0xE0, 0xE0, },  // 3:  E0 A0-BF 80-BF*
580  {  0, 0x80, 0xBF, },  // 4:  80-BF 80-BF
581  {  4, 0xE1, 0xEF, },  // 5:  E1-EF 80-BF 80-BF*
582
583  // Four-byte
584  {  4, 0x90, 0xBF, },  // 6:  90-BF 80-BF 80-BF
585  {  6, 0xF0, 0xF0, },  // 7:  F0 90-BF 80-BF 80-BF*
586  {  4, 0x80, 0xBF, },  // 8:  80-BF 80-BF 80-BF
587  {  8, 0xF1, 0xF3, },  // 9: F1-F3 80-BF 80-BF 80-BF*
588  {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
589  { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
590};
591
592void Compiler::Add_80_10ffff() {
593  int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
594  for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
595    const ByteRangeProg& p = prog_80_10ffff[i];
596    int next = 0;
597    if (p.next >= 0)
598      next = inst[p.next];
599    inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
600    if ((p.lo & 0xC0) != 0x80)
601      AddSuffix(inst[i]);
602  }
603}
604
605void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
606  if (lo > hi)
607    return;
608
609  // Pick off 80-10FFFF as a common special case
610  // that can bypass the slow rune_cache_.
611  if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
612    Add_80_10ffff();
613    return;
614  }
615
616  // Split range into same-length sized ranges.
617  for (int i = 1; i < UTFmax; i++) {
618    Rune max = MaxRune(i);
619    if (lo <= max && max < hi) {
620      AddRuneRangeUTF8(lo, max, foldcase);
621      AddRuneRangeUTF8(max+1, hi, foldcase);
622      return;
623    }
624  }
625
626  // ASCII range is always a special case.
627  if (hi < Runeself) {
628    AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
629    return;
630  }
631
632  // Split range into sections that agree on leading bytes.
633  for (int i = 1; i < UTFmax; i++) {
634    uint m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
635    if ((lo & ~m) != (hi & ~m)) {
636      if ((lo & m) != 0) {
637        AddRuneRangeUTF8(lo, lo|m, foldcase);
638        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
639        return;
640      }
641      if ((hi & m) != m) {
642        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
643        AddRuneRangeUTF8(hi&~m, hi, foldcase);
644        return;
645      }
646    }
647  }
648
649  // Finally.  Generate byte matching equivalent for lo-hi.
650  uint8 ulo[UTFmax], uhi[UTFmax];
651  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
652  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
653  (void)m;  // USED(m)
654  DCHECK_EQ(n, m);
655
656  int id = 0;
657  if (reversed_) {
658    for (int i = 0; i < n; i++)
659      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
660  } else {
661    for (int i = n-1; i >= 0; i--)
662      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
663  }
664  AddSuffix(id);
665}
666
667// Should not be called.
668Frag Compiler::Copy(Frag arg) {
669  // We're using WalkExponential; there should be no copying.
670  LOG(DFATAL) << "Compiler::Copy called!";
671  failed_ = true;
672  return NoMatch();
673}
674
675// Visits a node quickly; called once WalkExponential has
676// decided to cut this walk short.
677Frag Compiler::ShortVisit(Regexp* re, Frag) {
678  failed_ = true;
679  return NoMatch();
680}
681
682// Called before traversing a node's children during the walk.
683Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
684  // Cut off walk if we've already failed.
685  if (failed_)
686    *stop = true;
687
688  return NullFrag();  // not used by caller
689}
690
691Frag Compiler::Literal(Rune r, bool foldcase) {
692  switch (encoding_) {
693    default:
694      return NullFrag();
695
696    case kEncodingLatin1:
697      return ByteRange(r, r, foldcase);
698
699    case kEncodingUTF8: {
700      if (r < Runeself)  // Make common case fast.
701        return ByteRange(r, r, foldcase);
702      uint8 buf[UTFmax];
703      int n = runetochar(reinterpret_cast<char*>(buf), &r);
704      Frag f = ByteRange((uint8)buf[0], buf[0], false);
705      for (int i = 1; i < n; i++)
706        f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
707      return f;
708    }
709  }
710}
711
712// Called after traversing the node's children during the walk.
713// Given their frags, build and return the frag for this re.
714Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
715                         int nchild_frags) {
716  // If a child failed, don't bother going forward, especially
717  // since the child_frags might contain Frags with NULLs in them.
718  if (failed_)
719    return NoMatch();
720
721  // Given the child fragments, return the fragment for this node.
722  switch (re->op()) {
723    case kRegexpRepeat:
724      // Should not see; code at bottom of function will print error
725      break;
726
727    case kRegexpNoMatch:
728      return NoMatch();
729
730    case kRegexpEmptyMatch:
731      return Nop();
732
733    case kRegexpHaveMatch: {
734      Frag f = Match(re->match_id());
735      // Remember unanchored match to end of string.
736      if (anchor_ != RE2::ANCHOR_BOTH)
737        f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
738      return f;
739    }
740
741    case kRegexpConcat: {
742      Frag f = child_frags[0];
743      for (int i = 1; i < nchild_frags; i++)
744        f = Cat(f, child_frags[i]);
745      return f;
746    }
747
748    case kRegexpAlternate: {
749      Frag f = child_frags[0];
750      for (int i = 1; i < nchild_frags; i++)
751        f = Alt(f, child_frags[i]);
752      return f;
753    }
754
755    case kRegexpStar:
756      return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
757
758    case kRegexpPlus:
759      return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
760
761    case kRegexpQuest:
762      return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
763
764    case kRegexpLiteral:
765      return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
766
767    case kRegexpLiteralString: {
768      // Concatenation of literals.
769      if (re->nrunes() == 0)
770        return Nop();
771      Frag f;
772      for (int i = 0; i < re->nrunes(); i++) {
773        Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
774        if (i == 0)
775          f = f1;
776        else
777          f = Cat(f, f1);
778      }
779      return f;
780    }
781
782    case kRegexpAnyChar:
783      BeginRange();
784      AddRuneRange(0, Runemax, false);
785      return EndRange();
786
787    case kRegexpAnyByte:
788      return ByteRange(0x00, 0xFF, false);
789
790    case kRegexpCharClass: {
791      CharClass* cc = re->cc();
792      if (cc->empty()) {
793        // This can't happen.
794        LOG(DFATAL) << "No ranges in char class";
795        failed_ = true;
796        return NoMatch();
797      }
798
799      // ASCII case-folding optimization: if the char class
800      // behaves the same on A-Z as it does on a-z,
801      // discard any ranges wholly contained in A-Z
802      // and mark the other ranges as foldascii.
803      // This reduces the size of a program for
804      // (?i)abc from 3 insts per letter to 1 per letter.
805      bool foldascii = cc->FoldsASCII();
806
807      // Character class is just a big OR of the different
808      // character ranges in the class.
809      BeginRange();
810      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
811        // ASCII case-folding optimization (see above).
812        if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
813          continue;
814
815        // If this range contains all of A-Za-z or none of it,
816        // the fold flag is unnecessary; don't bother.
817        bool fold = foldascii;
818        if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
819          fold = false;
820
821        AddRuneRange(i->lo, i->hi, fold);
822      }
823      return EndRange();
824    }
825
826    case kRegexpCapture:
827      // If this is a non-capturing parenthesis -- (?:foo) --
828      // just use the inner expression.
829      if (re->cap() < 0)
830        return child_frags[0];
831      return Capture(child_frags[0], re->cap());
832
833    case kRegexpBeginLine:
834      return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
835
836    case kRegexpEndLine:
837      return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
838
839    case kRegexpBeginText:
840      return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
841
842    case kRegexpEndText:
843      return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
844
845    case kRegexpWordBoundary:
846      return EmptyWidth(kEmptyWordBoundary);
847
848    case kRegexpNoWordBoundary:
849      return EmptyWidth(kEmptyNonWordBoundary);
850  }
851  LOG(DFATAL) << "Missing case in Compiler: " << re->op();
852  failed_ = true;
853  return NoMatch();
854}
855
856// Is this regexp required to start at the beginning of the text?
857// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
858// but handles (\A(a|b)).  Could use the Walker to write a more exact one.
859static bool IsAnchorStart(Regexp** pre, int depth) {
860  Regexp* re = *pre;
861  Regexp* sub;
862  // The depth limit makes sure that we don't overflow
863  // the stack on a deeply nested regexp.  As the comment
864  // above says, IsAnchorStart is conservative, so returning
865  // a false negative is okay.  The exact limit is somewhat arbitrary.
866  if (re == NULL || depth >= 4)
867    return false;
868  switch (re->op()) {
869    default:
870      break;
871    case kRegexpConcat:
872      if (re->nsub() > 0) {
873        sub = re->sub()[0]->Incref();
874        if (IsAnchorStart(&sub, depth+1)) {
875          Regexp** subcopy = new Regexp*[re->nsub()];
876          subcopy[0] = sub;  // already have reference
877          for (int i = 1; i < re->nsub(); i++)
878            subcopy[i] = re->sub()[i]->Incref();
879          *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
880          delete[] subcopy;
881          re->Decref();
882          return true;
883        }
884        sub->Decref();
885      }
886      break;
887    case kRegexpCapture:
888      sub = re->sub()[0]->Incref();
889      if (IsAnchorStart(&sub, depth+1)) {
890        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
891        re->Decref();
892        return true;
893      }
894      sub->Decref();
895      break;
896    case kRegexpBeginText:
897      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
898      re->Decref();
899      return true;
900  }
901  return false;
902}
903
904// Is this regexp required to start at the end of the text?
905// Only approximate; can return false for complicated regexps like (a\z|b\z),
906// but handles ((a|b)\z).  Could use the Walker to write a more exact one.
907static bool IsAnchorEnd(Regexp** pre, int depth) {
908  Regexp* re = *pre;
909  Regexp* sub;
910  // The depth limit makes sure that we don't overflow
911  // the stack on a deeply nested regexp.  As the comment
912  // above says, IsAnchorEnd is conservative, so returning
913  // a false negative is okay.  The exact limit is somewhat arbitrary.
914  if (re == NULL || depth >= 4)
915    return false;
916  switch (re->op()) {
917    default:
918      break;
919    case kRegexpConcat:
920      if (re->nsub() > 0) {
921        sub = re->sub()[re->nsub() - 1]->Incref();
922        if (IsAnchorEnd(&sub, depth+1)) {
923          Regexp** subcopy = new Regexp*[re->nsub()];
924          subcopy[re->nsub() - 1] = sub;  // already have reference
925          for (int i = 0; i < re->nsub() - 1; i++)
926            subcopy[i] = re->sub()[i]->Incref();
927          *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
928          delete[] subcopy;
929          re->Decref();
930          return true;
931        }
932        sub->Decref();
933      }
934      break;
935    case kRegexpCapture:
936      sub = re->sub()[0]->Incref();
937      if (IsAnchorEnd(&sub, depth+1)) {
938        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
939        re->Decref();
940        return true;
941      }
942      sub->Decref();
943      break;
944    case kRegexpEndText:
945      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
946      re->Decref();
947      return true;
948  }
949  return false;
950}
951
952void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
953                     RE2::Anchor anchor) {
954  prog_->set_flags(flags);
955
956  if (flags & Regexp::Latin1)
957    encoding_ = kEncodingLatin1;
958  max_mem_ = max_mem;
959  if (max_mem <= 0) {
960    max_inst_ = 100000;  // more than enough
961  } else if (max_mem <= sizeof(Prog)) {
962    // No room for anything.
963    max_inst_ = 0;
964  } else {
965    int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
966    // Limit instruction count so that inst->id() fits nicely in an int.
967    // SparseArray also assumes that the indices (inst->id()) are ints.
968    // The call to WalkExponential uses 2*max_inst_ below,
969    // and other places in the code use 2 or 3 * prog->size().
970    // Limiting to 2^24 should avoid overflow in those places.
971    // (The point of allowing more than 32 bits of memory is to
972    // have plenty of room for the DFA states, not to use it up
973    // on the program.)
974    if (m >= 1<<24)
975      m = 1<<24;
976
977    // Inst imposes its own limit (currently bigger than 2^24 but be safe).
978    if (m > Prog::Inst::kMaxInst)
979      m = Prog::Inst::kMaxInst;
980
981    max_inst_ = m;
982  }
983
984  anchor_ = anchor;
985}
986
987// Compiles re, returning program.
988// Caller is responsible for deleting prog_.
989// If reversed is true, compiles a program that expects
990// to run over the input string backward (reverses all concatenations).
991// The reversed flag is also recorded in the returned program.
992Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
993  Compiler c;
994
995  c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
996  c.reversed_ = reversed;
997
998  // Simplify to remove things like counted repetitions
999  // and character classes like \d.
1000  Regexp* sre = re->Simplify();
1001  if (sre == NULL)
1002    return NULL;
1003
1004  // Record whether prog is anchored, removing the anchors.
1005  // (They get in the way of other optimizations.)
1006  bool is_anchor_start = IsAnchorStart(&sre, 0);
1007  bool is_anchor_end = IsAnchorEnd(&sre, 0);
1008
1009  // Generate fragment for entire regexp.
1010  Frag f = c.WalkExponential(sre, NullFrag(), 2*c.max_inst_);
1011  sre->Decref();
1012  if (c.failed_)
1013    return NULL;
1014
1015  // Success!  Finish by putting Match node at end, and record start.
1016  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1017  // to behave normally.
1018  c.reversed_ = false;
1019  Frag all = c.Cat(f, c.Match(0));
1020  c.prog_->set_start(all.begin);
1021
1022  if (reversed) {
1023    c.prog_->set_anchor_start(is_anchor_end);
1024    c.prog_->set_anchor_end(is_anchor_start);
1025  } else {
1026    c.prog_->set_anchor_start(is_anchor_start);
1027    c.prog_->set_anchor_end(is_anchor_end);
1028  }
1029
1030  // Also create unanchored version, which starts with a .*? loop.
1031  if (c.prog_->anchor_start()) {
1032    c.prog_->set_start_unanchored(c.prog_->start());
1033  } else {
1034    Frag unanchored = c.Cat(c.DotStar(), all);
1035    c.prog_->set_start_unanchored(unanchored.begin);
1036  }
1037
1038  c.prog_->set_reversed(reversed);
1039
1040  // Hand ownership of prog_ to caller.
1041  return c.Finish();
1042}
1043
1044Prog* Compiler::Finish() {
1045  if (failed_)
1046    return NULL;
1047
1048  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1049    // No possible matches; keep Fail instruction only.
1050    inst_len_ = 1;
1051  }
1052
1053  // Trim instruction to minimum array and transfer to Prog.
1054  Trim();
1055  prog_->inst_ = inst_;
1056  prog_->size_ = inst_len_;
1057  inst_ = NULL;
1058
1059  // Compute byte map.
1060  prog_->ComputeByteMap();
1061
1062  prog_->Optimize();
1063
1064  // Record remaining memory for DFA.
1065  if (max_mem_ <= 0) {
1066    prog_->set_dfa_mem(1<<20);
1067  } else {
1068    int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
1069    if (m < 0)
1070      m = 0;
1071    prog_->set_dfa_mem(m);
1072  }
1073
1074  Prog* p = prog_;
1075  prog_ = NULL;
1076  return p;
1077}
1078
1079// Converts Regexp to Prog.
1080Prog* Regexp::CompileToProg(int64 max_mem) {
1081  return Compiler::Compile(this, false, max_mem);
1082}
1083
1084Prog* Regexp::CompileToReverseProg(int64 max_mem) {
1085  return Compiler::Compile(this, true, max_mem);
1086}
1087
1088Frag Compiler::DotStar() {
1089  return Star(ByteRange(0x00, 0xff, false), true);
1090}
1091
1092// Compiles RE set to Prog.
1093Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1094                           Regexp* re) {
1095  Compiler c;
1096
1097  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
1098  c.Setup(pf, options.max_mem(), anchor);
1099
1100  // Compile alternation of fragments.
1101  Frag all = c.WalkExponential(re, NullFrag(), 2*c.max_inst_);
1102  re->Decref();
1103  if (c.failed_)
1104    return NULL;
1105
1106  if (anchor == RE2::UNANCHORED) {
1107    // The trailing .* was added while handling kRegexpHaveMatch.
1108    // We just have to add the leading one.
1109    all = c.Cat(c.DotStar(), all);
1110  }
1111
1112  c.prog_->set_start(all.begin);
1113  c.prog_->set_start_unanchored(all.begin);
1114  c.prog_->set_anchor_start(true);
1115  c.prog_->set_anchor_end(true);
1116
1117  Prog* prog = c.Finish();
1118  if (prog == NULL)
1119    return NULL;
1120
1121  // Make sure DFA has enough memory to operate,
1122  // since we're not going to fall back to the NFA.
1123  bool failed;
1124  StringPiece sp = "hello, world";
1125  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1126                  NULL, &failed, NULL);
1127  if (failed) {
1128    delete prog;
1129    return NULL;
1130  }
1131
1132  return prog;
1133}
1134
1135Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1136                       Regexp* re) {
1137  return Compiler::CompileSet(options, anchor, re);
1138}
1139
1140}  // namespace re2
1141