15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2007 The RE2 Authors.  All Rights Reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compile regular expression to Prog.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Prog and Inst are defined in prog.h.
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This file's external interface is just Regexp::CompileToProg.
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The Compiler class defined in this file is private.
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/prog.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/re2.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/regexp.h"
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/walker-inl.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// List of pointers to Inst* that need to be filled in (patched).
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Because the Inst* haven't been filled in yet,
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we can use the Inst* word to hold the list's "next" pointer.
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// It's kind of sleazy, but it works well in practice.
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Because the out and out1 fields in Inst are no longer pointers,
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we can't use pointers directly here either.  Instead, p refers
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1).
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// p == 0 represents the NULL list.  This is okay because instruction #0
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is always the fail instruction, which never appears on a list.
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct PatchList {
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 p;
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns patch list containing just p.
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static PatchList Mk(uint32 p);
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Patches all the entries on l to have value v.
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Caller must not ever use patch list again.
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static void Patch(Prog::Inst *inst0, PatchList l, uint32 v);
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Deref returns the next pointer pointed at by p.
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static PatchList Deref(Prog::Inst *inst0, PatchList l);
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Appends two patch lists and returns result.
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2);
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static PatchList nullPatchList = { 0 };
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns patch list containing just p.
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PatchList PatchList::Mk(uint32 p) {
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PatchList l;
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  l.p = p;
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return l;
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the next pointer pointed at by l.
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) {
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog::Inst* ip = &inst0[l.p>>1];
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (l.p&1)
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    l.p = ip->out1();
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  else
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    l.p = ip->out();
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return l;
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Patches all the entries on l to have value v.
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) {
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  while (l.p != 0) {
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Prog::Inst* ip = &inst0[l.p>>1];
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (l.p&1) {
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      l.p = ip->out1();
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ip->out1_ = val;
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    } else {
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      l.p = ip->out();
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      ip->set_out(val);
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Appends two patch lists and returns result.
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (l1.p == 0)
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return l2;
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (l2.p == 0)
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return l1;
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PatchList l = l1;
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (;;) {
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    PatchList next = PatchList::Deref(inst0, l);
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (next.p == 0)
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    l = next;
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog::Inst* ip = &inst0[l.p>>1];
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (l.p&1)
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ip->out1_ = l2.p;
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  else
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ip->set_out(l2.p);
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return l1;
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compiled program fragment.
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct Frag {
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32 begin;
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PatchList end;
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag() : begin(0) { end.p = 0; }  // needed so Frag can go in vector
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag(uint32 begin, PatchList end) : begin(begin), end(end) {}
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static Frag NullFrag() {
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag();
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Input encodings.
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)enum Encoding {
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  kEncodingLatin1,    // Latin1 (0-FF)
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Compiler : public Regexp::Walker<Frag> {
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  explicit Compiler();
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~Compiler();
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Compiles Regexp to a new Prog.
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Caller is responsible for deleting Prog when finished with it.
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If reversed is true, compiles for walking over the input
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // string backward (reverses all concatenations).
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static Prog *Compile(Regexp* re, bool reversed, int64 max_mem);
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Compiles alternation of all the re to a new Prog.
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Each re has a match with an id equal to its index in the vector.
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                          Regexp* re);
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Interface for Regexp::Walker, which helps traverse the Regexp.
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The walk is purely post-recursive: given the machines for the
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // children, PostVisit combines them to create the machine for
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the current node.  The child_args are Frags.
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The Compiler traverses the Regexp parse tree, visiting
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // each node in depth-first order.  It invokes PreVisit before
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // visiting the node's children and PostVisit after visiting
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the children.
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 int nchild_args);
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag ShortVisit(Regexp* re, Frag parent_arg);
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Copy(Frag arg);
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Plus(Frag a, bool nongreedy);
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Star(Frag a, bool nongreedy);
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Quest(Frag a, bool nongreedy);
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Given fragment a, returns (a) capturing as \n.
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Capture(Frag a, int n);
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Given fragments a and b, returns ab; a|b
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Cat(Frag a, Frag b);
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Alt(Frag a, Frag b);
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns a fragment that can't match anything.
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag NoMatch();
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns a fragment that matches the empty string.
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Match(int32 id);
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns a no-op fragment.
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Nop();
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns a fragment matching the byte range lo-hi.
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag ByteRange(int lo, int hi, bool foldcase);
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns a fragment matching an empty-width special op.
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag EmptyWidth(EmptyOp op);
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Adds n instructions to the program.
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the index of the first one.
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns -1 if no more instructions are available.
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int AllocInst(int n);
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Deletes unused instructions.
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Trim();
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Rune range compiler.
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Begins a new alternation.
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void BeginRange();
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Adds a fragment matching the rune range lo-hi.
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Add_80_10ffff();
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // New suffix that matches the byte range lo-hi, then goes to next.
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next);
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Adds a suffix to alternation.
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void AddSuffix(int id);
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the alternation of all the added suffixes.
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag EndRange();
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Single rune.
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag Literal(Rune r, bool foldcase);
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Setup(Regexp::ParseFlags, int64, RE2::Anchor);
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog* Finish();
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns .* where dot = any byte
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag DotStar();
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog* prog_;         // Program being built.
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool failed_;        // Did we give up compiling?
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Encoding encoding_;  // Input encoding
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool reversed_;      // Should program run backward over text?
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int max_inst_;       // Maximum number of instructions.
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog::Inst* inst_;   // Pointer to first instruction.
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int inst_len_;       // Number of instructions used.
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int inst_cap_;       // Number of instructions allocated.
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int64 max_mem_;      // Total memory budget.
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  map<uint64, int> rune_cache_;
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag rune_range_;
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  RE2::Anchor anchor_;  // anchor mode for RE2::Set
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_EVIL_CONSTRUCTORS(Compiler);
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Compiler::Compiler() {
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_ = new Prog();
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  failed_ = false;
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  encoding_ = kEncodingUTF8;
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  reversed_ = false;
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_ = NULL;
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_len_ = 0;
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_cap_ = 0;
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  max_inst_ = 1;  // make AllocInst for fail instruction okay
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  max_mem_ = 0;
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int fail = AllocInst(1);
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[fail].InitFail();
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  max_inst_ = 0;  // Caller must change
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Compiler::~Compiler() {
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete prog_;
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  delete[] inst_;
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int Compiler::AllocInst(int n) {
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (failed_ || inst_len_ + n > max_inst_) {
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    failed_ = true;
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return -1;
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (inst_len_ + n > inst_cap_) {
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (inst_cap_ == 0)
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      inst_cap_ = 8;
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    while (inst_len_ + n > inst_cap_)
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      inst_cap_ *= 2;
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Prog::Inst* ip = new Prog::Inst[inst_cap_];
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    memmove(ip, inst_, inst_len_ * sizeof ip[0]);
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]);
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    delete[] inst_;
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inst_ = ip;
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = inst_len_;
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_len_ += n;
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return id;
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::Trim() {
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (inst_len_ < inst_cap_) {
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Prog::Inst* ip = new Prog::Inst[inst_len_];
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    memmove(ip, inst_, inst_len_ * sizeof ip[0]);
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    delete[] inst_;
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inst_ = ip;
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inst_cap_ = inst_len_;
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// These routines are somewhat hard to visualize in text --
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// see http://swtch.com/~rsc/regexp/regexp1.html for
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// pictures explaining what is going on here.
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns an unmatchable fragment.
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::NoMatch() {
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(0, nullPatchList);
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Is a an unmatchable fragment?
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool IsNoMatch(Frag a) {
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return a.begin == 0;
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given fragments a and b, returns fragment for ab.
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Cat(Frag a, Frag b) {
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (IsNoMatch(a) || IsNoMatch(b))
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Elide no-op.
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog::Inst* begin = &inst_[a.begin];
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (begin->opcode() == kInstNop &&
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      a.end.p == (a.begin << 1) &&
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      begin->out() == 0) {
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    PatchList::Patch(inst_, a.end, b.begin);  // in case refs to a somewhere
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return b;
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // To run backward over string, reverse all concatenations.
3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (reversed_) {
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    PatchList::Patch(inst_, b.end, a.begin);
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return Frag(b.begin, a.end);
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PatchList::Patch(inst_, a.end, b.begin);
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(a.begin, b.end);
3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given fragments for a and b, returns fragment for a|b.
3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Alt(Frag a, Frag b) {
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Special case for convenience in loops.
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (IsNoMatch(a))
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return b;
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (IsNoMatch(b))
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return a;
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = AllocInst(1);
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (id < 0)
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[id].InitAlt(a.begin, b.begin);
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(id, PatchList::Append(inst_, a.end, b.end));
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// When capturing submatches in like-Perl mode, a kOpAlt Inst
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// treats out_ as the first choice, out1_ as the second.
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For *, +, and ?, if out_ causes another repetition,
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// then the operator is greedy.  If out1_ is the repetition
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (and out_ moves forward), then the operator is non-greedy.
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Star(Frag a, bool nongreedy) {
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = AllocInst(1);
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (id < 0)
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[id].InitAlt(0, 0);
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PatchList::Patch(inst_, a.end, id);
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (nongreedy) {
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inst_[id].out1_ = a.begin;
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return Frag(id, PatchList::Mk(id << 1));
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inst_[id].set_out(a.begin);
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return Frag(id, PatchList::Mk((id << 1) | 1));
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Plus(Frag a, bool nongreedy) {
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // a+ is just a* with a different entry point.
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag f = Star(a, nongreedy);
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(a.begin, f.end);
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Quest(Frag a, bool nongreedy) {
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = AllocInst(1);
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (id < 0)
3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PatchList pl;
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (nongreedy) {
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inst_[id].InitAlt(0, a.begin);
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    pl = PatchList::Mk(id << 1);
3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inst_[id].InitAlt(a.begin, 0);
3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    pl = PatchList::Mk((id << 1) | 1);
3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(id, PatchList::Append(inst_, pl, a.end));
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a fragment for the byte range lo-hi.
3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = AllocInst(1);
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (id < 0)
3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[id].InitByteRange(lo, hi, foldcase, 0);
3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_->byte_inst_count_++;
3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_->MarkByteRange(lo, hi);
4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (foldcase && lo <= 'z' && hi >= 'a') {
4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (lo < 'a')
4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      lo = 'a';
4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (hi > 'z')
4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      hi = 'z';
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (lo <= hi)
4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a');
4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(id, PatchList::Mk(id << 1));
4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a no-op fragment.  Sometimes unavoidable.
4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Nop() {
4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = AllocInst(1);
4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (id < 0)
4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[id].InitNop(0);
4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(id, PatchList::Mk(id << 1));
4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a fragment that signals a match.
4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Match(int32 match_id) {
4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = AllocInst(1);
4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (id < 0)
4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[id].InitMatch(match_id);
4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(id, nullPatchList);
4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a fragment matching a particular empty-width op (like ^ or $)
4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::EmptyWidth(EmptyOp empty) {
4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = AllocInst(1);
4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (id < 0)
4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[id].InitEmptyWidth(empty, 0);
4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (empty & (kEmptyBeginLine|kEmptyEndLine))
4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    prog_->MarkByteRange('\n', '\n');
4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) {
4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int j;
4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (int i = 0; i < 256; i = j) {
4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++)
4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        ;
4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      prog_->MarkByteRange(i, j-1);
4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(id, PatchList::Mk(id << 1));
4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a fragment a, returns a fragment with capturing parens around a.
4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Capture(Frag a, int n) {
4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = AllocInst(2);
4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (id < 0)
4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[id].InitCapture(2*n, a.begin);
4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[id+1].InitCapture(2*n+1, 0);
4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PatchList::Patch(inst_, a.end, id+1);
4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Frag(id, PatchList::Mk((id+1) << 1));
4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A Rune is a name for a Unicode code point.
4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns maximum rune encoded by UTF-8 sequence of length len.
4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int MaxRune(int len) {
4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (len == 1)
4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    b = 7;
4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  else
4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    b = 8-(len+1) + 6*(len-1);
4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return (1<<b) - 1;   // maximum Rune for b bits.
4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The rune range compiler caches common suffix fragments,
4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// which are very common in UTF-8 (e.g., [80-bf]).
4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The fragment suffixes are identified by their start
4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// instructions.  NULL denotes the eventual end match.
4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The Frag accumulates in rune_range_.  Caching common
4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::BeginRange() {
4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  rune_cache_.clear();
4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  rune_range_.begin = 0;
4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  rune_range_.end = nullPatchList;
4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase,
4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                     int next) {
4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag f = ByteRange(lo, hi, foldcase);
4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (next != 0) {
4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    PatchList::Patch(inst_, f.end, next);
4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end);
4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return f.begin;
4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) {
4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // In Latin1 mode, there's no point in caching.
4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // In forward UTF-8 mode, only need to cache continuation bytes.
4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (encoding_ == kEncodingLatin1 ||
5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      (encoding_ == kEncodingUTF8 &&
5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       !reversed_ &&
5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       !(0x80 <= lo && hi <= 0xbf))) {
5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return UncachedRuneByteSuffix(lo, hi, foldcase, next);
5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | (foldcase ? 1ULL : 0ULL);
5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  map<uint64, int>::iterator it = rune_cache_.find(key);
5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (it != rune_cache_.end())
5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return it->second;
5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  rune_cache_[key] = id;
5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return id;
5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::AddSuffix(int id) {
5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (rune_range_.begin == 0) {
5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rune_range_.begin = id;
5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int alt = AllocInst(1);
5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (alt < 0) {
5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    rune_range_.begin = 0;
5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_[alt].InitAlt(rune_range_.begin, id);
5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  rune_range_.begin = alt;
5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::EndRange() {
5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return rune_range_;
5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Converts rune range lo-hi into a fragment that recognizes
5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the bytes that would make up those runes in the current
5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// encoding (Latin 1 or UTF-8).
5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This lets the machine work byte-by-byte even when
5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// using multibyte encodings.
5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (encoding_) {
5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    default:
5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kEncodingUTF8:
5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      AddRuneRangeUTF8(lo, hi, foldcase);
5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kEncodingLatin1:
5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      AddRuneRangeLatin1(lo, hi, foldcase);
5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Latin1 is easy: runes *are* bytes.
5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (lo > hi || lo > 0xFF)
5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (hi > 0xFF)
5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    hi = 0xFF;
5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Table describing how to make a UTF-8 matching machine
5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for the rune range 80-10FFFF (Runeself-Runemax).
5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This range happens frequently enough (for example /./ and /[^a-z]/)
5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and the rune_cache_ map is slow enough that this is worth
5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// special handling.  Makes compilation of a small expression
5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// with a dot in it about 10% faster.
5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The * in the comments below mark whole sequences.
5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static struct ByteRangeProg {
5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int next;
5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int lo;
5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int hi;
5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} prog_80_10ffff[] = {
5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Two-byte
5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  { -1, 0x80, 0xBF, },  // 0:  80-BF
5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  0, 0xC2, 0xDF, },  // 1:  C2-DF 80-BF*
5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Three-byte
5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  0, 0xA0, 0xBF, },  // 2:  A0-BF 80-BF
5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  2, 0xE0, 0xE0, },  // 3:  E0 A0-BF 80-BF*
5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  0, 0x80, 0xBF, },  // 4:  80-BF 80-BF
5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  4, 0xE1, 0xEF, },  // 5:  E1-EF 80-BF 80-BF*
5825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Four-byte
5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  4, 0x90, 0xBF, },  // 6:  90-BF 80-BF 80-BF
5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  6, 0xF0, 0xF0, },  // 7:  F0 90-BF 80-BF 80-BF*
5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  4, 0x80, 0xBF, },  // 8:  80-BF 80-BF 80-BF
5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  8, 0xF1, 0xF3, },  // 9: F1-F3 80-BF 80-BF 80-BF*
5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  {  4, 0x80, 0x8F, },  // 10: 80-8F 80-BF 80-BF
5895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  { 10, 0xF4, 0xF4, },  // 11: F4 80-8F 80-BF 80-BF*
5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
5915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::Add_80_10ffff() {
5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning
5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < arraysize(prog_80_10ffff); i++) {
5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const ByteRangeProg& p = prog_80_10ffff[i];
5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int next = 0;
5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (p.next >= 0)
5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      next = inst[p.next];
5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next);
6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if ((p.lo & 0xC0) != 0x80)
6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      AddSuffix(inst[i]);
6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (lo > hi)
6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Pick off 80-10FFFF as a common special case
6105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // that can bypass the slow rune_cache_.
6115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (lo == 0x80 && hi == 0x10ffff && !reversed_) {
6125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Add_80_10ffff();
6135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Split range into same-length sized ranges.
6175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 1; i < UTFmax; i++) {
6185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Rune max = MaxRune(i);
6195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (lo <= max && max < hi) {
6205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      AddRuneRangeUTF8(lo, max, foldcase);
6215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      AddRuneRangeUTF8(max+1, hi, foldcase);
6225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return;
6235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
6245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ASCII range is always a special case.
6275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (hi < Runeself) {
6285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0));
6295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
6305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Split range into sections that agree on leading bytes.
6335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 1; i < UTFmax; i++) {
6345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    uint m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
6355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if ((lo & ~m) != (hi & ~m)) {
6365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if ((lo & m) != 0) {
6375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        AddRuneRangeUTF8(lo, lo|m, foldcase);
6385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
6395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return;
6405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
6415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if ((hi & m) != m) {
6425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
6435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        AddRuneRangeUTF8(hi&~m, hi, foldcase);
6445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return;
6455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
6465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
6475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Finally.  Generate byte matching equivalent for lo-hi.
6505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint8 ulo[UTFmax], uhi[UTFmax];
6515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
6525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
6535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  (void)m;  // USED(m)
6545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK_EQ(n, m);
6555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int id = 0;
6575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (reversed_) {
6585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (int i = 0; i < n; i++)
6595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
6605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
6615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    for (int i = n-1; i >= 0; i--)
6625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      id = RuneByteSuffix(ulo[i], uhi[i], false, id);
6635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
6645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  AddSuffix(id);
6655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Should not be called.
6685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Copy(Frag arg) {
6695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // We're using WalkExponential; there should be no copying.
6705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  LOG(DFATAL) << "Compiler::Copy called!";
6715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  failed_ = true;
6725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return NoMatch();
6735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Visits a node quickly; called once WalkExponential has
6765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// decided to cut this walk short.
6775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::ShortVisit(Regexp* re, Frag) {
6785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  failed_ = true;
6795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return NoMatch();
6805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Called before traversing a node's children during the walk.
6835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
6845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Cut off walk if we've already failed.
6855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (failed_)
6865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    *stop = true;
6875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return NullFrag();  // not used by caller
6895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
6905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Literal(Rune r, bool foldcase) {
6925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (encoding_) {
6935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    default:
6945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return NullFrag();
6955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kEncodingLatin1:
6975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return ByteRange(r, r, foldcase);
6985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
6995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kEncodingUTF8: {
7005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (r < Runeself)  // Make common case fast.
7015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return ByteRange(r, r, foldcase);
7025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      uint8 buf[UTFmax];
7035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      int n = runetochar(reinterpret_cast<char*>(buf), &r);
7045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Frag f = ByteRange((uint8)buf[0], buf[0], false);
7055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (int i = 1; i < n; i++)
7065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        f = Cat(f, ByteRange((uint8)buf[i], buf[i], false));
7075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return f;
7085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
7095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
7105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
7115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Called after traversing the node's children during the walk.
7135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given their frags, build and return the frag for this re.
7145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
7155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                         int nchild_frags) {
7165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If a child failed, don't bother going forward, especially
7175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // since the child_frags might contain Frags with NULLs in them.
7185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (failed_)
7195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NoMatch();
7205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Given the child fragments, return the fragment for this node.
7225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (re->op()) {
7235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpRepeat:
7245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Should not see; code at bottom of function will print error
7255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
7265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpNoMatch:
7285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return NoMatch();
7295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEmptyMatch:
7315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return Nop();
7325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpHaveMatch: {
7345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Frag f = Match(re->match_id());
7355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Remember unanchored match to end of string.
7365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (anchor_ != RE2::ANCHOR_BOTH)
7375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f));
7385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return f;
7395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
7405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpConcat: {
7425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Frag f = child_frags[0];
7435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (int i = 1; i < nchild_frags; i++)
7445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        f = Cat(f, child_frags[i]);
7455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return f;
7465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
7475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpAlternate: {
7495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Frag f = child_frags[0];
7505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (int i = 1; i < nchild_frags; i++)
7515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        f = Alt(f, child_frags[i]);
7525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return f;
7535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
7545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpStar:
7565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
7575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpPlus:
7595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
7605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpQuest:
7625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy);
7635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpLiteral:
7655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase);
7665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpLiteralString: {
7685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Concatenation of literals.
7695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (re->nrunes() == 0)
7705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return Nop();
7715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      Frag f;
7725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (int i = 0; i < re->nrunes(); i++) {
7735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase);
7745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (i == 0)
7755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          f = f1;
7765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        else
7775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          f = Cat(f, f1);
7785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
7795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return f;
7805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
7815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpAnyChar:
7835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      BeginRange();
7845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      AddRuneRange(0, Runemax, false);
7855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return EndRange();
7865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpAnyByte:
7885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return ByteRange(0x00, 0xFF, false);
7895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpCharClass: {
7915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      CharClass* cc = re->cc();
7925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (cc->empty()) {
7935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // This can't happen.
7945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        LOG(DFATAL) << "No ranges in char class";
7955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        failed_ = true;
7965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return NoMatch();
7975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
7985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // ASCII case-folding optimization: if the char class
8005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // behaves the same on A-Z as it does on a-z,
8015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // discard any ranges wholly contained in A-Z
8025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // and mark the other ranges as foldascii.
8035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // This reduces the size of a program for
8045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // (?i)abc from 3 insts per letter to 1 per letter.
8055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      bool foldascii = cc->FoldsASCII();
8065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // Character class is just a big OR of the different
8085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // character ranges in the class.
8095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      BeginRange();
8105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
8115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // ASCII case-folding optimization (see above).
8125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
8135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          continue;
8145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // If this range contains all of A-Za-z or none of it,
8165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        // the fold flag is unnecessary; don't bother.
8175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        bool fold = foldascii;
8185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo)
8195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          fold = false;
8205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        AddRuneRange(i->lo, i->hi, fold);
8225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
8235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return EndRange();
8245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
8255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpCapture:
8275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // If this is a non-capturing parenthesis -- (?:foo) --
8285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      // just use the inner expression.
8295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (re->cap() < 0)
8305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return child_frags[0];
8315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return Capture(child_frags[0], re->cap());
8325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpBeginLine:
8345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
8355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEndLine:
8375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
8385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpBeginText:
8405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
8415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEndText:
8435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
8445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpWordBoundary:
8465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return EmptyWidth(kEmptyWordBoundary);
8475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpNoWordBoundary:
8495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return EmptyWidth(kEmptyNonWordBoundary);
8505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
8515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  LOG(DFATAL) << "Missing case in Compiler: " << re->op();
8525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  failed_ = true;
8535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return NoMatch();
8545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
8555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
8565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Is this regexp required to start at the beginning of the text?
8575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
8585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but handles (\A(a|b)).  Could use the Walker to write a more exact one.
8595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool IsAnchorStart(Regexp** pre, int depth) {
8605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp* re = *pre;
8615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp* sub;
8625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The depth limit makes sure that we don't overflow
8635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the stack on a deeply nested regexp.  As the comment
8645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // above says, IsAnchorStart is conservative, so returning
8655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // a false negative is okay.  The exact limit is somewhat arbitrary.
8665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (re == NULL || depth >= 4)
8675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
8685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (re->op()) {
8695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    default:
8705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
8715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpConcat:
8725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (re->nsub() > 0) {
8735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        sub = re->sub()[0]->Incref();
8745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (IsAnchorStart(&sub, depth+1)) {
8755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          Regexp** subcopy = new Regexp*[re->nsub()];
8765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          subcopy[0] = sub;  // already have reference
8775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          for (int i = 1; i < re->nsub(); i++)
8785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            subcopy[i] = re->sub()[i]->Incref();
8795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
8805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          delete[] subcopy;
8815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          re->Decref();
8825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return true;
8835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
8845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        sub->Decref();
8855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
8865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
8875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpCapture:
8885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      sub = re->sub()[0]->Incref();
8895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (IsAnchorStart(&sub, depth+1)) {
8905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
8915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        re->Decref();
8925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return true;
8935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
8945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      sub->Decref();
8955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
8965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpBeginText:
8975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
8985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      re->Decref();
8995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return true;
9005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
9015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
9025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
9035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Is this regexp required to start at the end of the text?
9055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Only approximate; can return false for complicated regexps like (a\z|b\z),
9065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but handles ((a|b)\z).  Could use the Walker to write a more exact one.
9075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool IsAnchorEnd(Regexp** pre, int depth) {
9085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp* re = *pre;
9095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp* sub;
9105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The depth limit makes sure that we don't overflow
9115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the stack on a deeply nested regexp.  As the comment
9125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // above says, IsAnchorEnd is conservative, so returning
9135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // a false negative is okay.  The exact limit is somewhat arbitrary.
9145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (re == NULL || depth >= 4)
9155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return false;
9165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (re->op()) {
9175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    default:
9185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
9195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpConcat:
9205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (re->nsub() > 0) {
9215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        sub = re->sub()[re->nsub() - 1]->Incref();
9225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        if (IsAnchorEnd(&sub, depth+1)) {
9235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          Regexp** subcopy = new Regexp*[re->nsub()];
9245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          subcopy[re->nsub() - 1] = sub;  // already have reference
9255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          for (int i = 0; i < re->nsub() - 1; i++)
9265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            subcopy[i] = re->sub()[i]->Incref();
9275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags());
9285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          delete[] subcopy;
9295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          re->Decref();
9305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)          return true;
9315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        }
9325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        sub->Decref();
9335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
9345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
9355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpCapture:
9365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      sub = re->sub()[0]->Incref();
9375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (IsAnchorEnd(&sub, depth+1)) {
9385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
9395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        re->Decref();
9405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        return true;
9415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
9425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      sub->Decref();
9435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      break;
9445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    case kRegexpEndText:
9455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
9465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      re->Decref();
9475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      return true;
9485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
9495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return false;
9505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
9515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem,
9535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     RE2::Anchor anchor) {
9545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_->set_flags(flags);
9555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (flags & Regexp::Latin1)
9575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    encoding_ = kEncodingLatin1;
9585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  max_mem_ = max_mem;
9595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (max_mem <= 0) {
9605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    max_inst_ = 100000;  // more than enough
9615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else if (max_mem <= sizeof(Prog)) {
9625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // No room for anything.
9635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    max_inst_ = 0;
9645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
9655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
9665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Limit instruction count so that inst->id() fits nicely in an int.
9675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // SparseArray also assumes that the indices (inst->id()) are ints.
9685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // The call to WalkExponential uses 2*max_inst_ below,
9695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // and other places in the code use 2 or 3 * prog->size().
9705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Limiting to 2^24 should avoid overflow in those places.
9715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // (The point of allowing more than 32 bits of memory is to
9725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // have plenty of room for the DFA states, not to use it up
9735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // on the program.)
9745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (m >= 1<<24)
9755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      m = 1<<24;
9765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Inst imposes its own limit (currently bigger than 2^24 but be safe).
9785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (m > Prog::Inst::kMaxInst)
9795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      m = Prog::Inst::kMaxInst;
9805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    max_inst_ = m;
9825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
9835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  anchor_ = anchor;
9855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
9865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compiles re, returning program.
9885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller is responsible for deleting prog_.
9895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If reversed is true, compiles a program that expects
9905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to run over the input string backward (reverses all concatenations).
9915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The reversed flag is also recorded in the returned program.
9925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) {
9935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Compiler c;
9945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */);
9965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.reversed_ = reversed;
9975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
9985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Simplify to remove things like counted repetitions
9995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and character classes like \d.
10005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp* sre = re->Simplify();
10015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (sre == NULL)
10025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NULL;
10035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record whether prog is anchored, removing the anchors.
10055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // (They get in the way of other optimizations.)
10065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool is_anchor_start = IsAnchorStart(&sre, 0);
10075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool is_anchor_end = IsAnchorEnd(&sre, 0);
10085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Generate fragment for entire regexp.
10105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag f = c.WalkExponential(sre, NullFrag(), 2*c.max_inst_);
10115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  sre->Decref();
10125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (c.failed_)
10135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NULL;
10145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Success!  Finish by putting Match node at end, and record start.
10165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
10175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to behave normally.
10185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.reversed_ = false;
10195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag all = c.Cat(f, c.Match(0));
10205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.prog_->set_start(all.begin);
10215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (reversed) {
10235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    c.prog_->set_anchor_start(is_anchor_end);
10245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    c.prog_->set_anchor_end(is_anchor_start);
10255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
10265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    c.prog_->set_anchor_start(is_anchor_start);
10275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    c.prog_->set_anchor_end(is_anchor_end);
10285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
10295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Also create unanchored version, which starts with a .*? loop.
10315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (c.prog_->anchor_start()) {
10325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    c.prog_->set_start_unanchored(c.prog_->start());
10335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
10345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Frag unanchored = c.Cat(c.DotStar(), all);
10355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    c.prog_->set_start_unanchored(unanchored.begin);
10365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
10375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.prog_->set_reversed(reversed);
10395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Hand ownership of prog_ to caller.
10415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return c.Finish();
10425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
10435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Compiler::Finish() {
10455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (failed_)
10465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NULL;
10475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
10495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // No possible matches; keep Fail instruction only.
10505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    inst_len_ = 1;
10515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
10525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Trim instruction to minimum array and transfer to Prog.
10545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Trim();
10555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_->inst_ = inst_;
10565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_->size_ = inst_len_;
10575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inst_ = NULL;
10585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Compute byte map.
10605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_->ComputeByteMap();
10615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_->Optimize();
10635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Record remaining memory for DFA.
10655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (max_mem_ <= 0) {
10665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    prog_->set_dfa_mem(1<<20);
10675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } else {
10685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst);
10695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (m < 0)
10705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      m = 0;
10715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    prog_->set_dfa_mem(m);
10725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
10735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog* p = prog_;
10755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog_ = NULL;
10765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return p;
10775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
10785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Converts Regexp to Prog.
10805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Regexp::CompileToProg(int64 max_mem) {
10815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Compiler::Compile(this, false, max_mem);
10825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
10835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Regexp::CompileToReverseProg(int64 max_mem) {
10855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Compiler::Compile(this, true, max_mem);
10865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
10875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::DotStar() {
10895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Star(ByteRange(0x00, 0xff, false), true);
10905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
10915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compiles RE set to Prog.
10935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
10945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           Regexp* re) {
10955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Compiler c;
10965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
10975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags());
10985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.Setup(pf, options.max_mem(), anchor);
10995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Compile alternation of fragments.
11015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Frag all = c.WalkExponential(re, NullFrag(), 2*c.max_inst_);
11025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  re->Decref();
11035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (c.failed_)
11045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NULL;
11055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (anchor == RE2::UNANCHORED) {
11075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // The trailing .* was added while handling kRegexpHaveMatch.
11085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // We just have to add the leading one.
11095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    all = c.Cat(c.DotStar(), all);
11105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
11115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.prog_->set_start(all.begin);
11135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.prog_->set_start_unanchored(all.begin);
11145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.prog_->set_anchor_start(true);
11155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  c.prog_->set_anchor_end(true);
11165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prog* prog = c.Finish();
11185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (prog == NULL)
11195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NULL;
11205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Make sure DFA has enough memory to operate,
11225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // since we're not going to fall back to the NFA.
11235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool failed;
11245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  StringPiece sp = "hello, world";
11255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
11265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                  NULL, &failed, NULL);
11275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (failed) {
11285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    delete prog;
11295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return NULL;
11305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
11315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return prog;
11335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
11345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor,
11365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       Regexp* re) {
11375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return Compiler::CompileSet(options, anchor, re);
11385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
11395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
11405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace re2
1141