15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2007 The RE2 Authors. All Rights Reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compile regular expression to Prog. 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Prog and Inst are defined in prog.h. 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This file's external interface is just Regexp::CompileToProg. 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The Compiler class defined in this file is private. 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/prog.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/re2.h" 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/regexp.h" 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/walker-inl.h" 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// List of pointers to Inst* that need to be filled in (patched). 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Because the Inst* haven't been filled in yet, 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we can use the Inst* word to hold the list's "next" pointer. 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// It's kind of sleazy, but it works well in practice. 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Because the out and out1 fields in Inst are no longer pointers, 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// we can't use pointers directly here either. Instead, p refers 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// p == 0 represents the NULL list. This is okay because instruction #0 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is always the fail instruction, which never appears on a list. 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct PatchList { 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 p; 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns patch list containing just p. 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static PatchList Mk(uint32 p); 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Patches all the entries on l to have value v. 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Caller must not ever use patch list again. 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Deref returns the next pointer pointed at by p. 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static PatchList Deref(Prog::Inst *inst0, PatchList l); 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Appends two patch lists and returns result. 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static PatchList nullPatchList = { 0 }; 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns patch list containing just p. 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PatchList PatchList::Mk(uint32 p) { 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList l; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) l.p = p; 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return l; 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns the next pointer pointed at by l. 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = &inst0[l.p>>1]; 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (l.p&1) 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) l.p = ip->out1(); 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) l.p = ip->out(); 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return l; 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Patches all the entries on l to have value v. 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (l.p != 0) { 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = &inst0[l.p>>1]; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (l.p&1) { 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) l.p = ip->out1(); 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ip->out1_ = val; 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) l.p = ip->out(); 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ip->set_out(val); 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Appends two patch lists and returns result. 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (l1.p == 0) 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return l2; 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (l2.p == 0) 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return l1; 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList l = l1; 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (;;) { 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList next = PatchList::Deref(inst0, l); 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (next.p == 0) 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) l = next; 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = &inst0[l.p>>1]; 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (l.p&1) 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ip->out1_ = l2.p; 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ip->set_out(l2.p); 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return l1; 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compiled program fragment. 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct Frag { 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 begin; 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList end; 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static Frag NullFrag() { 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(); 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Input encodings. 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)enum Encoding { 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kEncodingLatin1, // Latin1 (0-FF) 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Compiler : public Regexp::Walker<Frag> { 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) explicit Compiler(); 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~Compiler(); 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Compiles Regexp to a new Prog. 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Caller is responsible for deleting Prog when finished with it. 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If reversed is true, compiles for walking over the input 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // string backward (reverses all concatenations). 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Compiles alternation of all the re to a new Prog. 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Each re has a match with an id equal to its index in the vector. 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re); 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Interface for Regexp::Walker, which helps traverse the Regexp. 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The walk is purely post-recursive: given the machines for the 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // children, PostVisit combines them to create the machine for 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the current node. The child_args are Frags. 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The Compiler traverses the Regexp parse tree, visiting 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // each node in depth-first order. It invokes PreVisit before 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // visiting the node's children and PostVisit after visiting 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the children. 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nchild_args); 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag ShortVisit(Regexp* re, Frag parent_arg); 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Copy(Frag arg); 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Plus(Frag a, bool nongreedy); 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Star(Frag a, bool nongreedy); 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Quest(Frag a, bool nongreedy); 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Given fragment a, returns (a) capturing as \n. 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Capture(Frag a, int n); 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Given fragments a and b, returns ab; a|b 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Cat(Frag a, Frag b); 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Alt(Frag a, Frag b); 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns a fragment that can't match anything. 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag NoMatch(); 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns a fragment that matches the empty string. 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Match(int32 id); 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns a no-op fragment. 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Nop(); 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns a fragment matching the byte range lo-hi. 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag ByteRange(int lo, int hi, bool foldcase); 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns a fragment matching an empty-width special op. 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag EmptyWidth(EmptyOp op); 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Adds n instructions to the program. 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns the index of the first one. 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns -1 if no more instructions are available. 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int AllocInst(int n); 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Deletes unused instructions. 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Trim(); 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Rune range compiler. 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Begins a new alternation. 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void BeginRange(); 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Adds a fragment matching the rune range lo-hi. 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void AddRuneRange(Rune lo, Rune hi, bool foldcase); 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Add_80_10ffff(); 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // New suffix that matches the byte range lo-hi, then goes to next. 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Adds a suffix to alternation. 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void AddSuffix(int id); 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns the alternation of all the added suffixes. 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag EndRange(); 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Single rune. 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag Literal(Rune r, bool foldcase); 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Setup(Regexp::ParseFlags, int64, RE2::Anchor); 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog* Finish(); 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns .* where dot = any byte 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag DotStar(); 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog* prog_; // Program being built. 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool failed_; // Did we give up compiling? 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Encoding encoding_; // Input encoding 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool reversed_; // Should program run backward over text? 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int max_inst_; // Maximum number of instructions. 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* inst_; // Pointer to first instruction. 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int inst_len_; // Number of instructions used. 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int inst_cap_; // Number of instructions allocated. 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int64 max_mem_; // Total memory budget. 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) map<uint64, int> rune_cache_; 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag rune_range_; 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RE2::Anchor anchor_; // anchor mode for RE2::Set 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_EVIL_CONSTRUCTORS(Compiler); 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Compiler::Compiler() { 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_ = new Prog(); 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) failed_ = false; 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoding_ = kEncodingUTF8; 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reversed_ = false; 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_ = NULL; 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_len_ = 0; 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_cap_ = 0; 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max_inst_ = 1; // make AllocInst for fail instruction okay 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max_mem_ = 0; 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int fail = AllocInst(1); 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[fail].InitFail(); 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max_inst_ = 0; // Caller must change 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Compiler::~Compiler() { 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete prog_; 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] inst_; 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int Compiler::AllocInst(int n) { 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (failed_ || inst_len_ + n > max_inst_) { 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) failed_ = true; 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return -1; 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (inst_len_ + n > inst_cap_) { 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (inst_cap_ == 0) 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_cap_ = 8; 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (inst_len_ + n > inst_cap_) 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_cap_ *= 2; 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = new Prog::Inst[inst_cap_]; 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memmove(ip, inst_, inst_len_ * sizeof ip[0]); 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] inst_; 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_ = ip; 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = inst_len_; 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_len_ += n; 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return id; 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::Trim() { 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (inst_len_ < inst_cap_) { 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = new Prog::Inst[inst_len_]; 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memmove(ip, inst_, inst_len_ * sizeof ip[0]); 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] inst_; 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_ = ip; 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_cap_ = inst_len_; 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// These routines are somewhat hard to visualize in text -- 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// see http://swtch.com/~rsc/regexp/regexp1.html for 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// pictures explaining what is going on here. 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns an unmatchable fragment. 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::NoMatch() { 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(0, nullPatchList); 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Is a an unmatchable fragment? 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool IsNoMatch(Frag a) { 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return a.begin == 0; 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given fragments a and b, returns fragment for ab. 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Cat(Frag a, Frag b) { 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsNoMatch(a) || IsNoMatch(b)) 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Elide no-op. 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* begin = &inst_[a.begin]; 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (begin->opcode() == kInstNop && 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) a.end.p == (a.begin << 1) && 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) begin->out() == 0) { 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return b; 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // To run backward over string, reverse all concatenations. 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (reversed_) { 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList::Patch(inst_, b.end, a.begin); 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(b.begin, a.end); 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList::Patch(inst_, a.end, b.begin); 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(a.begin, b.end); 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given fragments for a and b, returns fragment for a|b. 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Alt(Frag a, Frag b) { 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special case for convenience in loops. 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsNoMatch(a)) 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return b; 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsNoMatch(b)) 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return a; 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = AllocInst(1); 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id < 0) 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].InitAlt(a.begin, b.begin); 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(id, PatchList::Append(inst_, a.end, b.end)); 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// When capturing submatches in like-Perl mode, a kOpAlt Inst 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// treats out_ as the first choice, out1_ as the second. 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For *, +, and ?, if out_ causes another repetition, 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// then the operator is greedy. If out1_ is the repetition 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (and out_ moves forward), then the operator is non-greedy. 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a fragment a, returns a fragment for a* or a*? (if nongreedy) 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Star(Frag a, bool nongreedy) { 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = AllocInst(1); 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id < 0) 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].InitAlt(0, 0); 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList::Patch(inst_, a.end, id); 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nongreedy) { 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].out1_ = a.begin; 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(id, PatchList::Mk(id << 1)); 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].set_out(a.begin); 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(id, PatchList::Mk((id << 1) | 1)); 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Plus(Frag a, bool nongreedy) { 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // a+ is just a* with a different entry point. 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag f = Star(a, nongreedy); 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(a.begin, f.end); 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Quest(Frag a, bool nongreedy) { 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = AllocInst(1); 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id < 0) 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList pl; 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nongreedy) { 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].InitAlt(0, a.begin); 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) pl = PatchList::Mk(id << 1); 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].InitAlt(a.begin, 0); 3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) pl = PatchList::Mk((id << 1) | 1); 3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(id, PatchList::Append(inst_, pl, a.end)); 3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a fragment for the byte range lo-hi. 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { 3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = AllocInst(1); 3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id < 0) 3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].InitByteRange(lo, hi, foldcase, 0); 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->byte_inst_count_++; 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->MarkByteRange(lo, hi); 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (foldcase && lo <= 'z' && hi >= 'a') { 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo < 'a') 4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lo = 'a'; 4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (hi > 'z') 4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) hi = 'z'; 4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo <= hi) 4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(id, PatchList::Mk(id << 1)); 4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a no-op fragment. Sometimes unavoidable. 4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Nop() { 4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = AllocInst(1); 4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id < 0) 4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].InitNop(0); 4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(id, PatchList::Mk(id << 1)); 4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a fragment that signals a match. 4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Match(int32 match_id) { 4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = AllocInst(1); 4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id < 0) 4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].InitMatch(match_id); 4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(id, nullPatchList); 4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns a fragment matching a particular empty-width op (like ^ or $) 4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::EmptyWidth(EmptyOp empty) { 4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = AllocInst(1); 4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id < 0) 4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].InitEmptyWidth(empty, 0); 4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (empty & (kEmptyBeginLine|kEmptyEndLine)) 4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->MarkByteRange('\n', '\n'); 4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { 4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int j; 4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < 256; i = j) { 4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) 4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ; 4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->MarkByteRange(i, j-1); 4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(id, PatchList::Mk(id << 1)); 4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given a fragment a, returns a fragment with capturing parens around a. 4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Capture(Frag a, int n) { 4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = AllocInst(2); 4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id < 0) 4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id].InitCapture(2*n, a.begin); 4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[id+1].InitCapture(2*n+1, 0); 4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList::Patch(inst_, a.end, id+1); 4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Frag(id, PatchList::Mk((id+1) << 1)); 4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A Rune is a name for a Unicode code point. 4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns maximum rune encoded by UTF-8 sequence of length len. 4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static int MaxRune(int len) { 4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) 4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (len == 1) 4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) b = 7; 4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) b = 8-(len+1) + 6*(len-1); 4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return (1<<b) - 1; // maximum Rune for b bits. 4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The rune range compiler caches common suffix fragments, 4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// which are very common in UTF-8 (e.g., [80-bf]). 4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The fragment suffixes are identified by their start 4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// instructions. NULL denotes the eventual end match. 4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The Frag accumulates in rune_range_. Caching common 4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// suffixes reduces the UTF-8 "." from 32 to 24 instructions, 4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and it reduces the corresponding one-pass NFA from 16 nodes to 8. 4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::BeginRange() { 4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_cache_.clear(); 4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_range_.begin = 0; 4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_range_.end = nullPatchList; 4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, 4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int next) { 4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag f = ByteRange(lo, hi, foldcase); 4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (next != 0) { 4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PatchList::Patch(inst_, f.end, next); 4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); 4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return f.begin; 4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { 4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In Latin1 mode, there's no point in caching. 4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In forward UTF-8 mode, only need to cache continuation bytes. 4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (encoding_ == kEncodingLatin1 || 5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (encoding_ == kEncodingUTF8 && 5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) !reversed_ && 5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) !(0x80 <= lo && hi <= 0xbf))) { 5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return UncachedRuneByteSuffix(lo, hi, foldcase, next); 5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | (foldcase ? 1ULL : 0ULL); 5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) map<uint64, int>::iterator it = rune_cache_.find(key); 5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (it != rune_cache_.end()) 5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return it->second; 5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); 5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_cache_[key] = id; 5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return id; 5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::AddSuffix(int id) { 5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (rune_range_.begin == 0) { 5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_range_.begin = id; 5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int alt = AllocInst(1); 5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (alt < 0) { 5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_range_.begin = 0; 5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_[alt].InitAlt(rune_range_.begin, id); 5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) rune_range_.begin = alt; 5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::EndRange() { 5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return rune_range_; 5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Converts rune range lo-hi into a fragment that recognizes 5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the bytes that would make up those runes in the current 5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// encoding (Latin 1 or UTF-8). 5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This lets the machine work byte-by-byte even when 5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// using multibyte encodings. 5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { 5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (encoding_) { 5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kEncodingUTF8: 5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRangeUTF8(lo, hi, foldcase); 5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kEncodingLatin1: 5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRangeLatin1(lo, hi, foldcase); 5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { 5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Latin1 is easy: runes *are* bytes. 5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo > hi || lo > 0xFF) 5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (hi > 0xFF) 5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) hi = 0xFF; 5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Table describing how to make a UTF-8 matching machine 5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// for the rune range 80-10FFFF (Runeself-Runemax). 5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This range happens frequently enough (for example /./ and /[^a-z]/) 5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and the rune_cache_ map is slow enough that this is worth 5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// special handling. Makes compilation of a small expression 5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// with a dot in it about 10% faster. 5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The * in the comments below mark whole sequences. 5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static struct ByteRangeProg { 5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int next; 5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int lo; 5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int hi; 5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} prog_80_10ffff[] = { 5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Two-byte 5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { -1, 0x80, 0xBF, }, // 0: 80-BF 5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* 5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Three-byte 5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF 5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* 5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF 5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* 5825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Four-byte 5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF 5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* 5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF 5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* 5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF 5895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* 5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 5915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::Add_80_10ffff() { 5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning 5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < arraysize(prog_80_10ffff); i++) { 5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const ByteRangeProg& p = prog_80_10ffff[i]; 5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int next = 0; 5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (p.next >= 0) 5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) next = inst[p.next]; 5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); 6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((p.lo & 0xC0) != 0x80) 6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddSuffix(inst[i]); 6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { 6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo > hi) 6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pick off 80-10FFFF as a common special case 6105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // that can bypass the slow rune_cache_. 6115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo == 0x80 && hi == 0x10ffff && !reversed_) { 6125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Add_80_10ffff(); 6135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Split range into same-length sized ranges. 6175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 1; i < UTFmax; i++) { 6185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune max = MaxRune(i); 6195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (lo <= max && max < hi) { 6205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRangeUTF8(lo, max, foldcase); 6215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRangeUTF8(max+1, hi, foldcase); 6225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 6235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ASCII range is always a special case. 6275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (hi < Runeself) { 6285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 6295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 6305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Split range into sections that agree on leading bytes. 6335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 1; i < UTFmax; i++) { 6345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence 6355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((lo & ~m) != (hi & ~m)) { 6365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((lo & m) != 0) { 6375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRangeUTF8(lo, lo|m, foldcase); 6385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRangeUTF8((lo|m)+1, hi, foldcase); 6395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 6405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((hi & m) != m) { 6425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); 6435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRangeUTF8(hi&~m, hi, foldcase); 6445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 6455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Finally. Generate byte matching equivalent for lo-hi. 6505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint8 ulo[UTFmax], uhi[UTFmax]; 6515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = runetochar(reinterpret_cast<char*>(ulo), &lo); 6525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int m = runetochar(reinterpret_cast<char*>(uhi), &hi); 6535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (void)m; // USED(m) 6545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_EQ(n, m); 6555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = 0; 6575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (reversed_) { 6585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < n; i++) 6595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) id = RuneByteSuffix(ulo[i], uhi[i], false, id); 6605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 6615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = n-1; i >= 0; i--) 6625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) id = RuneByteSuffix(ulo[i], uhi[i], false, id); 6635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddSuffix(id); 6655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Should not be called. 6685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Copy(Frag arg) { 6695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We're using WalkExponential; there should be no copying. 6705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Compiler::Copy called!"; 6715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) failed_ = true; 6725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 6735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Visits a node quickly; called once WalkExponential has 6765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// decided to cut this walk short. 6775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::ShortVisit(Regexp* re, Frag) { 6785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) failed_ = true; 6795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 6805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Called before traversing a node's children during the walk. 6835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { 6845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Cut off walk if we've already failed. 6855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (failed_) 6865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *stop = true; 6875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NullFrag(); // not used by caller 6895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::Literal(Rune r, bool foldcase) { 6925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (encoding_) { 6935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 6945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NullFrag(); 6955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kEncodingLatin1: 6975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ByteRange(r, r, foldcase); 6985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kEncodingUTF8: { 7005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (r < Runeself) // Make common case fast. 7015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ByteRange(r, r, foldcase); 7025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint8 buf[UTFmax]; 7035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = runetochar(reinterpret_cast<char*>(buf), &r); 7045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag f = ByteRange((uint8)buf[0], buf[0], false); 7055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 1; i < n; i++) 7065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); 7075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return f; 7085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 7115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Called after traversing the node's children during the walk. 7135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Given their frags, build and return the frag for this re. 7145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, 7155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nchild_frags) { 7165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If a child failed, don't bother going forward, especially 7175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // since the child_frags might contain Frags with NULLs in them. 7185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (failed_) 7195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 7205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Given the child fragments, return the fragment for this node. 7225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (re->op()) { 7235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpRepeat: 7245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Should not see; code at bottom of function will print error 7255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 7265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpNoMatch: 7285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 7295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEmptyMatch: 7315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Nop(); 7325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpHaveMatch: { 7345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag f = Match(re->match_id()); 7355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Remember unanchored match to end of string. 7365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (anchor_ != RE2::ANCHOR_BOTH) 7375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); 7385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return f; 7395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpConcat: { 7425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag f = child_frags[0]; 7435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 1; i < nchild_frags; i++) 7445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) f = Cat(f, child_frags[i]); 7455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return f; 7465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAlternate: { 7495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag f = child_frags[0]; 7505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 1; i < nchild_frags; i++) 7515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) f = Alt(f, child_frags[i]); 7525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return f; 7535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpStar: 7565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 7575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpPlus: 7595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 7605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpQuest: 7625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 7635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpLiteral: 7655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); 7665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpLiteralString: { 7685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Concatenation of literals. 7695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->nrunes() == 0) 7705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Nop(); 7715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag f; 7725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < re->nrunes(); i++) { 7735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); 7745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (i == 0) 7755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) f = f1; 7765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 7775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) f = Cat(f, f1); 7785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return f; 7805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAnyChar: 7835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) BeginRange(); 7845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRange(0, Runemax, false); 7855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return EndRange(); 7865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpAnyByte: 7885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ByteRange(0x00, 0xFF, false); 7895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpCharClass: { 7915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CharClass* cc = re->cc(); 7925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cc->empty()) { 7935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This can't happen. 7945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "No ranges in char class"; 7955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) failed_ = true; 7965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 7975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ASCII case-folding optimization: if the char class 8005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // behaves the same on A-Z as it does on a-z, 8015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // discard any ranges wholly contained in A-Z 8025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and mark the other ranges as foldascii. 8035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This reduces the size of a program for 8045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (?i)abc from 3 insts per letter to 1 per letter. 8055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool foldascii = cc->FoldsASCII(); 8065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Character class is just a big OR of the different 8085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // character ranges in the class. 8095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) BeginRange(); 8105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { 8115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ASCII case-folding optimization (see above). 8125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (foldascii && 'A' <= i->lo && i->hi <= 'Z') 8135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 8145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If this range contains all of A-Za-z or none of it, 8165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the fold flag is unnecessary; don't bother. 8175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool fold = foldascii; 8185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) 8195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fold = false; 8205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddRuneRange(i->lo, i->hi, fold); 8225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return EndRange(); 8245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpCapture: 8275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If this is a non-capturing parenthesis -- (?:foo) -- 8285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // just use the inner expression. 8295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->cap() < 0) 8305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return child_frags[0]; 8315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Capture(child_frags[0], re->cap()); 8325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpBeginLine: 8345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); 8355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEndLine: 8375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); 8385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpBeginText: 8405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); 8415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEndText: 8435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); 8445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpWordBoundary: 8465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return EmptyWidth(kEmptyWordBoundary); 8475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpNoWordBoundary: 8495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return EmptyWidth(kEmptyNonWordBoundary); 8505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Missing case in Compiler: " << re->op(); 8525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) failed_ = true; 8535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NoMatch(); 8545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 8555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Is this regexp required to start at the beginning of the text? 8575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Only approximate; can return false for complicated regexps like (\Aa|\Ab), 8585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but handles (\A(a|b)). Could use the Walker to write a more exact one. 8595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool IsAnchorStart(Regexp** pre, int depth) { 8605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = *pre; 8615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* sub; 8625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The depth limit makes sure that we don't overflow 8635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the stack on a deeply nested regexp. As the comment 8645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // above says, IsAnchorStart is conservative, so returning 8655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // a false negative is okay. The exact limit is somewhat arbitrary. 8665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re == NULL || depth >= 4) 8675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 8685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (re->op()) { 8695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 8705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 8715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpConcat: 8725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->nsub() > 0) { 8735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub = re->sub()[0]->Incref(); 8745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsAnchorStart(&sub, depth+1)) { 8755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** subcopy = new Regexp*[re->nsub()]; 8765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subcopy[0] = sub; // already have reference 8775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 1; i < re->nsub(); i++) 8785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subcopy[i] = re->sub()[i]->Incref(); 8795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 8805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] subcopy; 8815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 8825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 8835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub->Decref(); 8855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 8875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpCapture: 8885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub = re->sub()[0]->Incref(); 8895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsAnchorStart(&sub, depth+1)) { 8905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 8915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 8925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 8935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub->Decref(); 8955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 8965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpBeginText: 8975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 8985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 8995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 9005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 9025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 9035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Is this regexp required to start at the end of the text? 9055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Only approximate; can return false for complicated regexps like (a\z|b\z), 9065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but handles ((a|b)\z). Could use the Walker to write a more exact one. 9075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool IsAnchorEnd(Regexp** pre, int depth) { 9085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re = *pre; 9095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* sub; 9105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The depth limit makes sure that we don't overflow 9115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the stack on a deeply nested regexp. As the comment 9125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // above says, IsAnchorEnd is conservative, so returning 9135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // a false negative is okay. The exact limit is somewhat arbitrary. 9145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re == NULL || depth >= 4) 9155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 9165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (re->op()) { 9175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: 9185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 9195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpConcat: 9205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (re->nsub() > 0) { 9215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub = re->sub()[re->nsub() - 1]->Incref(); 9225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsAnchorEnd(&sub, depth+1)) { 9235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp** subcopy = new Regexp*[re->nsub()]; 9245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subcopy[re->nsub() - 1] = sub; // already have reference 9255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < re->nsub() - 1; i++) 9265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) subcopy[i] = re->sub()[i]->Incref(); 9275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 9285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] subcopy; 9295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 9305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 9315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub->Decref(); 9335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 9355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpCapture: 9365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub = re->sub()[0]->Incref(); 9375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (IsAnchorEnd(&sub, depth+1)) { 9385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 9395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 9405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 9415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sub->Decref(); 9435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 9445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kRegexpEndText: 9455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 9465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 9475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 9485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 9505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 9515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, 9535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RE2::Anchor anchor) { 9545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->set_flags(flags); 9555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (flags & Regexp::Latin1) 9575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoding_ = kEncodingLatin1; 9585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max_mem_ = max_mem; 9595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (max_mem <= 0) { 9605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max_inst_ = 100000; // more than enough 9615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (max_mem <= sizeof(Prog)) { 9625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // No room for anything. 9635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max_inst_ = 0; 9645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 9655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); 9665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Limit instruction count so that inst->id() fits nicely in an int. 9675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // SparseArray also assumes that the indices (inst->id()) are ints. 9685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The call to WalkExponential uses 2*max_inst_ below, 9695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and other places in the code use 2 or 3 * prog->size(). 9705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Limiting to 2^24 should avoid overflow in those places. 9715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (The point of allowing more than 32 bits of memory is to 9725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // have plenty of room for the DFA states, not to use it up 9735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // on the program.) 9745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (m >= 1<<24) 9755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) m = 1<<24; 9765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Inst imposes its own limit (currently bigger than 2^24 but be safe). 9785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (m > Prog::Inst::kMaxInst) 9795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) m = Prog::Inst::kMaxInst; 9805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max_inst_ = m; 9825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) anchor_ = anchor; 9855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 9865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compiles re, returning program. 9885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller is responsible for deleting prog_. 9895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If reversed is true, compiles a program that expects 9905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to run over the input string backward (reverses all concatenations). 9915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The reversed flag is also recorded in the returned program. 9925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { 9935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Compiler c; 9945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); 9965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.reversed_ = reversed; 9975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Simplify to remove things like counted repetitions 9995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and character classes like \d. 10005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* sre = re->Simplify(); 10015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sre == NULL) 10025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 10035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record whether prog is anchored, removing the anchors. 10055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (They get in the way of other optimizations.) 10065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_anchor_start = IsAnchorStart(&sre, 0); 10075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_anchor_end = IsAnchorEnd(&sre, 0); 10085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Generate fragment for entire regexp. 10105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag f = c.WalkExponential(sre, NullFrag(), 2*c.max_inst_); 10115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sre->Decref(); 10125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c.failed_) 10135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 10145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Success! Finish by putting Match node at end, and record start. 10165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Turn off c.reversed_ (if it is set) to force the remaining concatenations 10175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to behave normally. 10185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.reversed_ = false; 10195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag all = c.Cat(f, c.Match(0)); 10205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_start(all.begin); 10215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (reversed) { 10235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_anchor_start(is_anchor_end); 10245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_anchor_end(is_anchor_start); 10255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 10265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_anchor_start(is_anchor_start); 10275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_anchor_end(is_anchor_end); 10285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Also create unanchored version, which starts with a .*? loop. 10315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c.prog_->anchor_start()) { 10325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_start_unanchored(c.prog_->start()); 10335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 10345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag unanchored = c.Cat(c.DotStar(), all); 10355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_start_unanchored(unanchored.begin); 10365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_reversed(reversed); 10395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Hand ownership of prog_ to caller. 10415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return c.Finish(); 10425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 10435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Compiler::Finish() { 10455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (failed_) 10465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 10475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (prog_->start() == 0 && prog_->start_unanchored() == 0) { 10495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // No possible matches; keep Fail instruction only. 10505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_len_ = 1; 10515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Trim instruction to minimum array and transfer to Prog. 10545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Trim(); 10555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->inst_ = inst_; 10565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->size_ = inst_len_; 10575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_ = NULL; 10585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Compute byte map. 10605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->ComputeByteMap(); 10615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->Optimize(); 10635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Record remaining memory for DFA. 10655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (max_mem_ <= 0) { 10665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->set_dfa_mem(1<<20); 10675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 10685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); 10695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (m < 0) 10705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) m = 0; 10715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->set_dfa_mem(m); 10725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog* p = prog_; 10755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_ = NULL; 10765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return p; 10775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 10785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Converts Regexp to Prog. 10805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Regexp::CompileToProg(int64 max_mem) { 10815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Compiler::Compile(this, false, max_mem); 10825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 10835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Regexp::CompileToReverseProg(int64 max_mem) { 10855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Compiler::Compile(this, true, max_mem); 10865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 10875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Frag Compiler::DotStar() { 10895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Star(ByteRange(0x00, 0xff, false), true); 10905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 10915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compiles RE set to Prog. 10935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 10945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re) { 10955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Compiler c; 10965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); 10985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.Setup(pf, options.max_mem(), anchor); 10995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Compile alternation of fragments. 11015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Frag all = c.WalkExponential(re, NullFrag(), 2*c.max_inst_); 11025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) re->Decref(); 11035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c.failed_) 11045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 11055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (anchor == RE2::UNANCHORED) { 11075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The trailing .* was added while handling kRegexpHaveMatch. 11085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We just have to add the leading one. 11095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) all = c.Cat(c.DotStar(), all); 11105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_start(all.begin); 11135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_start_unanchored(all.begin); 11145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_anchor_start(true); 11155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c.prog_->set_anchor_end(true); 11165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog* prog = c.Finish(); 11185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (prog == NULL) 11195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 11205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Make sure DFA has enough memory to operate, 11225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // since we're not going to fall back to the NFA. 11235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool failed; 11245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece sp = "hello, world"; 11255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, 11265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL, &failed, NULL); 11275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (failed) { 11285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete prog; 11295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 11305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return prog; 11335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 11365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Regexp* re) { 11375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Compiler::CompileSet(options, anchor, re); 11385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace re2 1141