12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2007 The RE2 Authors. All Rights Reserved. 22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style 32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file. 42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compile regular expression to Prog. 62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Prog and Inst are defined in prog.h. 82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This file's external interface is just Regexp::CompileToProg. 92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The Compiler class defined in this file is private. 102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prog.h" 122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/re2.h" 132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/regexp.h" 142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/walker-inl.h" 152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 { 172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// List of pointers to Inst* that need to be filled in (patched). 192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Because the Inst* haven't been filled in yet, 202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// we can use the Inst* word to hold the list's "next" pointer. 212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// It's kind of sleazy, but it works well in practice. 222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. 232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Because the out and out1 fields in Inst are no longer pointers, 252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// we can't use pointers directly here either. Instead, p refers 262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). 272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// p == 0 represents the NULL list. This is okay because instruction #0 282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// is always the fail instruction, which never appears on a list. 292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct PatchList { 312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 p; 322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns patch list containing just p. 342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static PatchList Mk(uint32 p); 352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Patches all the entries on l to have value v. 372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Caller must not ever use patch list again. 382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); 392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Deref returns the next pointer pointed at by p. 412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static PatchList Deref(Prog::Inst *inst0, PatchList l); 422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Appends two patch lists and returns result. 442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); 452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic PatchList nullPatchList = { 0 }; 482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns patch list containing just p. 502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonPatchList PatchList::Mk(uint32 p) { 512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList l; 522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson l.p = p; 532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return l; 542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns the next pointer pointed at by l. 572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonPatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { 582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = &inst0[l.p>>1]; 592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (l.p&1) 602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson l.p = ip->out1(); 612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson l.p = ip->out(); 632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return l; 642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Patches all the entries on l to have value v. 672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { 682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (l.p != 0) { 692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = &inst0[l.p>>1]; 702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (l.p&1) { 712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson l.p = ip->out1(); 722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip->out1_ = val; 732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson l.p = ip->out(); 752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip->set_out(val); 762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Appends two patch lists and returns result. 812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonPatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { 822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (l1.p == 0) 832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return l2; 842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (l2.p == 0) 852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return l1; 862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList l = l1; 882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (;;) { 892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList next = PatchList::Deref(inst0, l); 902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (next.p == 0) 912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson l = next; 932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = &inst0[l.p>>1]; 962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (l.p&1) 972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip->out1_ = l2.p; 982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip->set_out(l2.p); 1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return l1; 1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiled program fragment. 1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstruct Frag { 1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 begin; 1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList end; 1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector 1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} 1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic Frag kNullFrag; 1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Input encodings. 1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonenum Encoding { 1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) 1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kEncodingLatin1, // Latin1 (0-FF) 1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass Compiler : public Regexp::Walker<Frag> { 1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson explicit Compiler(); 1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ~Compiler(); 1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Compiles Regexp to a new Prog. 1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Caller is responsible for deleting Prog when finished with it. 1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If reversed is true, compiles for walking over the input 1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // string backward (reverses all concatenations). 1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); 1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Compiles alternation of all the re to a new Prog. 1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Each re has a match with an id equal to its index in the vector. 1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re); 1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Interface for Regexp::Walker, which helps traverse the Regexp. 1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The walk is purely post-recursive: given the machines for the 1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // children, PostVisit combines them to create the machine for 1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the current node. The child_args are Frags. 1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The Compiler traverses the Regexp parse tree, visiting 1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // each node in depth-first order. It invokes PreVisit before 1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // visiting the node's children and PostVisit after visiting 1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the children. 1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); 1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, 1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nchild_args); 1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag ShortVisit(Regexp* re, Frag parent_arg); 1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Copy(Frag arg); 1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? 1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Plus(Frag a, bool nongreedy); 1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Star(Frag a, bool nongreedy); 1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Quest(Frag a, bool nongreedy); 1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Given fragment a, returns (a) capturing as \n. 1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Capture(Frag a, int n); 1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Given fragments a and b, returns ab; a|b 1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Cat(Frag a, Frag b); 1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Alt(Frag a, Frag b); 1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns a fragment that can't match anything. 1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag NoMatch(); 1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns a fragment that matches the empty string. 1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Match(int32 id); 1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns a no-op fragment. 1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Nop(); 1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns a fragment matching the byte range lo-hi. 1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag ByteRange(int lo, int hi, bool foldcase); 1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns a fragment matching an empty-width special op. 1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag EmptyWidth(EmptyOp op); 1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Adds n instructions to the program. 1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns the index of the first one. 1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns -1 if no more instructions are available. 1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int AllocInst(int n); 1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Deletes unused instructions. 1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void Trim(); 1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Rune range compiler. 1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Begins a new alternation. 1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void BeginRange(); 1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Adds a fragment matching the rune range lo-hi. 1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void AddRuneRange(Rune lo, Rune hi, bool foldcase); 1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); 1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); 1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void Add_80_10ffff(); 1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // New suffix that matches the byte range lo-hi, then goes to next. 1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Adds a suffix to alternation. 2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void AddSuffix(int id); 2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns the alternation of all the added suffixes. 2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag EndRange(); 2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Single rune. 2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag Literal(Rune r, bool foldcase); 2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void Setup(Regexp::ParseFlags, int64, RE2::Anchor); 2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog* Finish(); 2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns .* where dot = any byte 2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag DotStar(); 2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog* prog_; // Program being built. 2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool failed_; // Did we give up compiling? 2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Encoding encoding_; // Input encoding 2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool reversed_; // Should program run backward over text? 2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int max_inst_; // Maximum number of instructions. 2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* inst_; // Pointer to first instruction. 2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int inst_len_; // Number of instructions used. 2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int inst_cap_; // Number of instructions allocated. 2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 max_mem_; // Total memory budget. 2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson map<uint64, int> rune_cache_; 2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag rune_range_; 2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RE2::Anchor anchor_; // anchor mode for RE2::Set 2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(Compiler); 2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonCompiler::Compiler() { 2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_ = new Prog(); 2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson failed_ = false; 2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson encoding_ = kEncodingUTF8; 2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson reversed_ = false; 2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_ = NULL; 2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_len_ = 0; 2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_cap_ = 0; 2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson max_inst_ = 1; // make AllocInst for fail instruction okay 2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson max_mem_ = 0; 2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int fail = AllocInst(1); 2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[fail].InitFail(); 2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson max_inst_ = 0; // Caller must change 2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonCompiler::~Compiler() { 2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete prog_; 2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] inst_; 2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint Compiler::AllocInst(int n) { 2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (failed_ || inst_len_ + n > max_inst_) { 2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson failed_ = true; 2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return -1; 2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (inst_len_ + n > inst_cap_) { 2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (inst_cap_ == 0) 2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_cap_ = 8; 2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (inst_len_ + n > inst_cap_) 2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_cap_ *= 2; 2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = new Prog::Inst[inst_cap_]; 2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memmove(ip, inst_, inst_len_ * sizeof ip[0]); 2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); 2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] inst_; 2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_ = ip; 2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = inst_len_; 2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_len_ += n; 2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return id; 2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Compiler::Trim() { 2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (inst_len_ < inst_cap_) { 2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = new Prog::Inst[inst_len_]; 2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memmove(ip, inst_, inst_len_ * sizeof ip[0]); 2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] inst_; 2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_ = ip; 2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_cap_ = inst_len_; 2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// These routines are somewhat hard to visualize in text -- 2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// see http://swtch.com/~rsc/regexp/regexp1.html for 2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// pictures explaining what is going on here. 2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns an unmatchable fragment. 2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::NoMatch() { 2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(0, nullPatchList); 2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Is a an unmatchable fragment? 3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool IsNoMatch(Frag a) { 3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return a.begin == 0; 3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Given fragments a and b, returns fragment for ab. 3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Cat(Frag a, Frag b) { 3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsNoMatch(a) || IsNoMatch(b)) 3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Elide no-op. 3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* begin = &inst_[a.begin]; 3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (begin->opcode() == kInstNop && 3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson a.end.p == (a.begin << 1) && 3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson begin->out() == 0) { 3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere 3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return b; 3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // To run backward over string, reverse all concatenations. 3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (reversed_) { 3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList::Patch(inst_, b.end, a.begin); 3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(b.begin, a.end); 3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList::Patch(inst_, a.end, b.begin); 3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(a.begin, b.end); 3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Given fragments for a and b, returns fragment for a|b. 3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Alt(Frag a, Frag b) { 3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Special case for convenience in loops. 3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsNoMatch(a)) 3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return b; 3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsNoMatch(b)) 3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return a; 3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = AllocInst(1); 3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id < 0) 3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].InitAlt(a.begin, b.begin); 3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(id, PatchList::Append(inst_, a.end, b.end)); 3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// When capturing submatches in like-Perl mode, a kOpAlt Inst 3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// treats out_ as the first choice, out1_ as the second. 3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For *, +, and ?, if out_ causes another repetition, 3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// then the operator is greedy. If out1_ is the repetition 3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (and out_ moves forward), then the operator is non-greedy. 3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Given a fragment a, returns a fragment for a* or a*? (if nongreedy) 3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Star(Frag a, bool nongreedy) { 3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = AllocInst(1); 3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id < 0) 3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].InitAlt(0, 0); 3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList::Patch(inst_, a.end, id); 3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (nongreedy) { 3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].out1_ = a.begin; 3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(id, PatchList::Mk(id << 1)); 3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].set_out(a.begin); 3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(id, PatchList::Mk((id << 1) | 1)); 3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) 3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Plus(Frag a, bool nongreedy) { 3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // a+ is just a* with a different entry point. 3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag f = Star(a, nongreedy); 3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(a.begin, f.end); 3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) 3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Quest(Frag a, bool nongreedy) { 3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = AllocInst(1); 3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id < 0) 3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 3792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList pl; 3802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (nongreedy) { 3812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].InitAlt(0, a.begin); 3822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson pl = PatchList::Mk(id << 1); 3832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 3842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].InitAlt(a.begin, 0); 3852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson pl = PatchList::Mk((id << 1) | 1); 3862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(id, PatchList::Append(inst_, pl, a.end)); 3882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns a fragment for the byte range lo-hi. 3912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::ByteRange(int lo, int hi, bool foldcase) { 3922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = AllocInst(1); 3932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id < 0) 3942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 3952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].InitByteRange(lo, hi, foldcase, 0); 3962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->byte_inst_count_++; 3972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->MarkByteRange(lo, hi); 3982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (foldcase && lo <= 'z' && hi >= 'a') { 3992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo < 'a') 4002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lo = 'a'; 4012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (hi > 'z') 4022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson hi = 'z'; 4032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo <= hi) 4042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); 4052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(id, PatchList::Mk(id << 1)); 4072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns a no-op fragment. Sometimes unavoidable. 4102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Nop() { 4112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = AllocInst(1); 4122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id < 0) 4132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 4142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].InitNop(0); 4152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(id, PatchList::Mk(id << 1)); 4162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns a fragment that signals a match. 4192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Match(int32 match_id) { 4202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = AllocInst(1); 4212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id < 0) 4222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 4232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].InitMatch(match_id); 4242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(id, nullPatchList); 4252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns a fragment matching a particular empty-width op (like ^ or $) 4282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::EmptyWidth(EmptyOp empty) { 4292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = AllocInst(1); 4302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id < 0) 4312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 4322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].InitEmptyWidth(empty, 0); 4332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (empty & (kEmptyBeginLine|kEmptyEndLine)) 4342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->MarkByteRange('\n', '\n'); 4352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { 4362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int j; 4372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < 256; i = j) { 4382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) 4392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ; 4402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->MarkByteRange(i, j-1); 4412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(id, PatchList::Mk(id << 1)); 4442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Given a fragment a, returns a fragment with capturing parens around a. 4472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Capture(Frag a, int n) { 4482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = AllocInst(2); 4492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id < 0) 4502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 4512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id].InitCapture(2*n, a.begin); 4522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[id+1].InitCapture(2*n+1, 0); 4532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList::Patch(inst_, a.end, id+1); 4542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Frag(id, PatchList::Mk((id+1) << 1)); 4562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// A Rune is a name for a Unicode code point. 4592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns maximum rune encoded by UTF-8 sequence of length len. 4602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic int MaxRune(int len) { 4612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int b; // number of Rune blents lenn len-byte UTF-8 sequence (len < UTFmax) 4622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (len == 1) 4632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson b = 7; 4642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 4652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson b = 8-(len+1) + 6*(len-1); 4662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return (1<<b) - 1; // maximum Rune for b bits. 4672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The rune range compiler caches common suffix fragments, 4702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// which are very common in UTF-8 (e.g., [80-bf]). 4712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The fragment suffixes are identified by their start 4722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// instructions. NULL denotes the eventual end match. 4732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The Frag accumulates in rune_range_. Caching common 4742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// suffixes reduces the UTF-8 "." from 32 to 24 instructions, 4752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and it reduces the corresponding one-pass NFA from 16 nodes to 8. 4762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Compiler::BeginRange() { 4782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_cache_.clear(); 4792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_range_.begin = 0; 4802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_range_.end = nullPatchList; 4812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, 4842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int next) { 4852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag f = ByteRange(lo, hi, foldcase); 4862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (next != 0) { 4872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PatchList::Patch(inst_, f.end, next); 4882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 4892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); 4902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return f.begin; 4922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { 4952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // In Latin1 mode, there's no point in caching. 4962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // In forward UTF-8 mode, only need to cache continuation bytes. 4972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (encoding_ == kEncodingLatin1 || 4982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (encoding_ == kEncodingUTF8 && 4992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson !reversed_ && 5002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson !(0x80 <= lo && hi <= 0xbf))) { 5012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return UncachedRuneByteSuffix(lo, hi, foldcase, next); 5022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | foldcase; 5052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson map<uint64, int>::iterator it = rune_cache_.find(key); 5062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (it != rune_cache_.end()) 5072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return it->second; 5082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); 5092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_cache_[key] = id; 5102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return id; 5112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Compiler::AddSuffix(int id) { 5142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (rune_range_.begin == 0) { 5152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_range_.begin = id; 5162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 5172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int alt = AllocInst(1); 5202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (alt < 0) { 5212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_range_.begin = 0; 5222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 5232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_[alt].InitAlt(rune_range_.begin, id); 5252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson rune_range_.begin = alt; 5262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::EndRange() { 5292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return rune_range_; 5302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Converts rune range lo-hi into a fragment that recognizes 5332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the bytes that would make up those runes in the current 5342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// encoding (Latin 1 or UTF-8). 5352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This lets the machine work byte-by-byte even when 5362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// using multibyte encodings. 5372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { 5392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (encoding_) { 5402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 5412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kEncodingUTF8: 5422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRangeUTF8(lo, hi, foldcase); 5432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kEncodingLatin1: 5452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRangeLatin1(lo, hi, foldcase); 5462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 5472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { 5512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Latin1 is easy: runes *are* bytes. 5522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo > hi || lo > 0xFF) 5532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 5542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (hi > 0xFF) 5552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson hi = 0xFF; 5562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 5572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Table describing how to make a UTF-8 matching machine 5602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// for the rune range 80-10FFFF (Runeself-Runemax). 5612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This range happens frequently enough (for example /./ and /[^a-z]/) 5622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and the rune_cache_ map is slow enough that this is worth 5632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// special handling. Makes compilation of a small expression 5642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// with a dot in it about 10% faster. 5652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The * in the comments below mark whole sequences. 5662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic struct ByteRangeProg { 5672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int next; 5682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int lo; 5692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int hi; 5702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} prog_80_10ffff[] = { 5712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Two-byte 5722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { -1, 0x80, 0xBF, }, // 0: 80-BF 5732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* 5742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Three-byte 5762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF 5772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* 5782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF 5792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* 5802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Four-byte 5822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF 5832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* 5842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF 5852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* 5862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF 5872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* 5882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 5892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Compiler::Add_80_10ffff() { 5912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int inst[arraysize(prog_80_10ffff)]; 5922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < arraysize(prog_80_10ffff); i++) { 5932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const ByteRangeProg& p = prog_80_10ffff[i]; 5942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int next = 0; 5952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p.next >= 0) 5962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson next = inst[p.next]; 5972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); 5982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((p.lo & 0xC0) != 0x80) 5992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddSuffix(inst[i]); 6002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { 6042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo > hi) 6052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 6062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pick off 80-10FFFF as a common special case 6082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // that can bypass the slow rune_cache_. 6092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo == 0x80 && hi == 0x10ffff && !reversed_) { 6102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Add_80_10ffff(); 6112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 6122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Split range into same-length sized ranges. 6152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 1; i < UTFmax; i++) { 6162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Rune max = MaxRune(i); 6172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo <= max && max < hi) { 6182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRangeUTF8(lo, max, foldcase); 6192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRangeUTF8(max+1, hi, foldcase); 6202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 6212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ASCII range is always a special case. 6252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (hi < Runeself) { 6262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 6272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 6282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Split range into sections that agree on leading bytes. 6312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 1; i < UTFmax; i++) { 6322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence 6332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((lo & ~m) != (hi & ~m)) { 6342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((lo & m) != 0) { 6352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRangeUTF8(lo, lo|m, foldcase); 6362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRangeUTF8((lo|m)+1, hi, foldcase); 6372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 6382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((hi & m) != m) { 6402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); 6412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRangeUTF8(hi&~m, hi, foldcase); 6422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 6432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Finally. Generate byte matching equivalent for lo-hi. 6482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint8 ulo[UTFmax], uhi[UTFmax]; 6492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int n = runetochar(reinterpret_cast<char*>(ulo), &lo); 6502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int m = runetochar(reinterpret_cast<char*>(uhi), &hi); 6512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (void)m; // USED(m) 6522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(n, m); 6532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = 0; 6552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (reversed_) { 6562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < n; i++) 6572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = RuneByteSuffix(ulo[i], uhi[i], false, id); 6582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 6592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = n-1; i >= 0; i--) 6602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = RuneByteSuffix(ulo[i], uhi[i], false, id); 6612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddSuffix(id); 6632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Should not be called. 6662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Copy(Frag arg) { 6672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We're using WalkExponential; there should be no copying. 6682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Compiler::Copy called!"; 6692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson failed_ = true; 6702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 6712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Visits a node quickly; called once WalkExponential has 6742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// decided to cut this walk short. 6752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::ShortVisit(Regexp* re, Frag) { 6762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson failed_ = true; 6772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 6782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Called before traversing a node's children during the walk. 6812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { 6822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Cut off walk if we've already failed. 6832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (failed_) 6842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *stop = true; 6852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kNullFrag; // not used by caller 6872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 6882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::Literal(Rune r, bool foldcase) { 6902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (encoding_) { 6912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 6922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return kNullFrag; 6932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kEncodingLatin1: 6952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ByteRange(r, r, foldcase); 6962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kEncodingUTF8: { 6982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (r < Runeself) // Make common case fast. 6992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ByteRange(r, r, foldcase); 7002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint8 buf[UTFmax]; 7012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int n = runetochar(reinterpret_cast<char*>(buf), &r); 7022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag f = ByteRange((uint8)buf[0], buf[0], false); 7032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 1; i < n; i++) 7042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); 7052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return f; 7062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 7092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Called after traversing the node's children during the walk. 7112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Given their frags, build and return the frag for this re. 7122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, 7132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nchild_frags) { 7142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If a child failed, don't bother going forward, especially 7152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // since the child_frags might contain Frags with NULLs in them. 7162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (failed_) 7172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 7182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Given the child fragments, return the fragment for this node. 7202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (re->op()) { 7212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpRepeat: 7222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Should not see; code at bottom of function will print error 7232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 7242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpNoMatch: 7262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 7272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpEmptyMatch: 7292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Nop(); 7302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpHaveMatch: { 7322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag f = Match(re->match_id()); 7332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Remember unanchored match to end of string. 7342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (anchor_ != RE2::ANCHOR_BOTH) 7352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson f = Cat(DotStar(), f); 7362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return f; 7372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpConcat: { 7402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag f = child_frags[0]; 7412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 1; i < nchild_frags; i++) 7422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson f = Cat(f, child_frags[i]); 7432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return f; 7442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpAlternate: { 7472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag f = child_frags[0]; 7482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 1; i < nchild_frags; i++) 7492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson f = Alt(f, child_frags[i]); 7502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return f; 7512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpStar: 7542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 7552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpPlus: 7572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 7582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpQuest: 7602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 7612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpLiteral: 7632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); 7642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpLiteralString: { 7662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Concatenation of literals. 7672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->nrunes() == 0) 7682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Nop(); 7692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag f; 7702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < re->nrunes(); i++) { 7712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); 7722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i == 0) 7732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson f = f1; 7742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 7752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson f = Cat(f, f1); 7762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return f; 7782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpAnyChar: 7812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson BeginRange(); 7822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRange(0, Runemax, false); 7832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return EndRange(); 7842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpAnyByte: 7862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ByteRange(0x00, 0xFF, false); 7872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpCharClass: { 7892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CharClass* cc = re->cc(); 7902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (cc->empty()) { 7912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This can't happen. 7922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "No ranges in char class"; 7932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson failed_ = true; 7942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 7952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ASCII case-folding optimization: if the char class 7982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // behaves the same on A-Z as it does on a-z, 7992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // discard any ranges wholly contained in A-Z 8002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and mark the other ranges as foldascii. 8012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This reduces the size of a program for 8022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (?i)abc from 3 insts per letter to 1 per letter. 8032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool foldascii = cc->FoldsASCII(); 8042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Character class is just a big OR of the different 8062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // character ranges in the class. 8072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson BeginRange(); 8082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { 8092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ASCII case-folding optimization (see above). 8102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (foldascii && 'A' <= i->lo && i->hi <= 'Z') 8112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 8122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If this range contains all of A-Za-z or none of it, 8142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the fold flag is unnecessary; don't bother. 8152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool fold = foldascii; 8162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) 8172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fold = false; 8182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddRuneRange(i->lo, i->hi, fold); 8202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return EndRange(); 8222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpCapture: 8252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If this is a non-capturing parenthesis -- (?:foo) -- 8262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // just use the inner expression. 8272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->cap() < 0) 8282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return child_frags[0]; 8292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Capture(child_frags[0], re->cap()); 8302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpBeginLine: 8322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); 8332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpEndLine: 8352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); 8362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpBeginText: 8382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); 8392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpEndText: 8412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); 8422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpWordBoundary: 8442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return EmptyWidth(kEmptyWordBoundary); 8452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpNoWordBoundary: 8472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return EmptyWidth(kEmptyNonWordBoundary); 8482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Missing case in Compiler: " << re->op(); 8502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson failed_ = true; 8512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NoMatch(); 8522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 8532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Is this regexp required to start at the beginning of the text? 8552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Only approximate; can return false for complicated regexps like (\Aa|\Ab), 8562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// but handles (\A(a|b)). Could use the Walker to write a more exact one. 8572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool IsAnchorStart(Regexp** pre, int depth) { 8582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = *pre; 8592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* sub; 8602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The depth limit makes sure that we don't overflow 8612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the stack on a deeply nested regexp. As the comment 8622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // above says, IsAnchorStart is conservative, so returning 8632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // a false negative is okay. The exact limit is somewhat arbitrary. 8642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re == NULL || depth >= 4) 8652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 8662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (re->op()) { 8672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 8682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 8692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpConcat: 8702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->nsub() > 0) { 8712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub = re->sub()[0]->Incref(); 8722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsAnchorStart(&sub, depth+1)) { 8732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** subcopy = new Regexp*[re->nsub()]; 8742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson subcopy[0] = sub; // already have reference 8752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 1; i < re->nsub(); i++) 8762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson subcopy[i] = re->sub()[i]->Incref(); 8772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 8782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] subcopy; 8792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 8802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 8812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub->Decref(); 8832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 8852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpCapture: 8862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub = re->sub()[0]->Incref(); 8872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsAnchorStart(&sub, depth+1)) { 8882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 8892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 8902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 8912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub->Decref(); 8932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 8942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpBeginText: 8952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 8962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 8972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 8982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 9002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 9012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Is this regexp required to start at the end of the text? 9032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Only approximate; can return false for complicated regexps like (a\z|b\z), 9042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// but handles ((a|b)\z). Could use the Walker to write a more exact one. 9052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool IsAnchorEnd(Regexp** pre, int depth) { 9062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re = *pre; 9072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* sub; 9082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The depth limit makes sure that we don't overflow 9092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the stack on a deeply nested regexp. As the comment 9102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // above says, IsAnchorEnd is conservative, so returning 9112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // a false negative is okay. The exact limit is somewhat arbitrary. 9122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re == NULL || depth >= 4) 9132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 9142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (re->op()) { 9152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 9162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 9172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpConcat: 9182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (re->nsub() > 0) { 9192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub = re->sub()[re->nsub() - 1]->Incref(); 9202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsAnchorEnd(&sub, depth+1)) { 9212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp** subcopy = new Regexp*[re->nsub()]; 9222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson subcopy[re->nsub() - 1] = sub; // already have reference 9232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < re->nsub() - 1; i++) 9242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson subcopy[i] = re->sub()[i]->Incref(); 9252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 9262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] subcopy; 9272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 9282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 9292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub->Decref(); 9312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 9332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpCapture: 9342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub = re->sub()[0]->Incref(); 9352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsAnchorEnd(&sub, depth+1)) { 9362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 9372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 9382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 9392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sub->Decref(); 9412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 9422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kRegexpEndText: 9432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 9442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 9452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 9462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 9482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 9492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, 9512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RE2::Anchor anchor) { 9522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->set_flags(flags); 9532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (flags & Regexp::Latin1) 9552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson encoding_ = kEncodingLatin1; 9562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson max_mem_ = max_mem; 9572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (max_mem <= 0) { 9582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson max_inst_ = 100000; // more than enough 9592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (max_mem <= sizeof(Prog)) { 9602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // No room for anything. 9612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson max_inst_ = 0; 9622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 9632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); 9642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Limit instruction count so that inst->id() fits nicely in an int. 9652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // SparseArray also assumes that the indices (inst->id()) are ints. 9662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The call to WalkExponential uses 2*max_inst_ below, 9672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and other places in the code use 2 or 3 * prog->size(). 9682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Limiting to 2^24 should avoid overflow in those places. 9692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (The point of allowing more than 32 bits of memory is to 9702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // have plenty of room for the DFA states, not to use it up 9712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // on the program.) 9722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (m >= 1<<24) 9732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson m = 1<<24; 9742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Inst imposes its own limit (currently bigger than 2^24 but be safe). 9762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (m > Prog::Inst::kMaxInst) 9772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson m = Prog::Inst::kMaxInst; 9782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson max_inst_ = m; 9802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson anchor_ = anchor; 9832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 9842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiles re, returning program. 9862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Caller is responsible for deleting prog_. 9872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If reversed is true, compiles a program that expects 9882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// to run over the input string backward (reverses all concatenations). 9892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The reversed flag is also recorded in the returned program. 9902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { 9912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Compiler c; 9922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); 9942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.reversed_ = reversed; 9952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Simplify to remove things like counted repetitions 9972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and character classes like \d. 9982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* sre = re->Simplify(); 9992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (sre == NULL) 10002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 10012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Record whether prog is anchored, removing the anchors. 10032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (They get in the way of other optimizations.) 10042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool is_anchor_start = IsAnchorStart(&sre, 0); 10052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool is_anchor_end = IsAnchorEnd(&sre, 0); 10062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Generate fragment for entire regexp. 10082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag f = c.WalkExponential(sre, kNullFrag, 2*c.max_inst_); 10092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sre->Decref(); 10102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c.failed_) 10112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 10122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Success! Finish by putting Match node at end, and record start. 10142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Turn off c.reversed_ (if it is set) to force the remaining concatenations 10152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // to behave normally. 10162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.reversed_ = false; 10172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag all = c.Cat(f, c.Match(0)); 10182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_start(all.begin); 10192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (reversed) { 10212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_anchor_start(is_anchor_end); 10222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_anchor_end(is_anchor_start); 10232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 10242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_anchor_start(is_anchor_start); 10252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_anchor_end(is_anchor_end); 10262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Also create unanchored version, which starts with a .*? loop. 10292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c.prog_->anchor_start()) { 10302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_start_unanchored(c.prog_->start()); 10312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 10322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag unanchored = c.Cat(c.DotStar(), all); 10332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_start_unanchored(unanchored.begin); 10342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_reversed(reversed); 10372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Hand ownership of prog_ to caller. 10392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return c.Finish(); 10402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 10412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg* Compiler::Finish() { 10432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (failed_) 10442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 10452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prog_->start() == 0 && prog_->start_unanchored() == 0) { 10472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // No possible matches; keep Fail instruction only. 10482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_len_ = 1; 10492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Trim instruction to minimum array and transfer to Prog. 10522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Trim(); 10532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->inst_ = inst_; 10542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->size_ = inst_len_; 10552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_ = NULL; 10562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Compute byte map. 10582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->ComputeByteMap(); 10592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->Optimize(); 10612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Record remaining memory for DFA. 10632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (max_mem_ <= 0) { 10642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->set_dfa_mem(1<<20); 10652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 10662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); 10672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (m < 0) 10682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson m = 0; 10692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->set_dfa_mem(m); 10702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog* p = prog_; 10732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_ = NULL; 10742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return p; 10752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 10762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Converts Regexp to Prog. 10782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg* Regexp::CompileToProg(int64 max_mem) { 10792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Compiler::Compile(this, false, max_mem); 10802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 10812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg* Regexp::CompileToReverseProg(int64 max_mem) { 10832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Compiler::Compile(this, true, max_mem); 10842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 10852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonFrag Compiler::DotStar() { 10872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Star(ByteRange(0x00, 0xff, false), true); 10882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 10892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiles RE set to Prog. 10912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 10922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re) { 10932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Compiler c; 10942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); 10962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.Setup(pf, options.max_mem(), anchor); 10972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Compile alternation of fragments. 10992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Frag all = c.WalkExponential(re, kNullFrag, 2*c.max_inst_); 11002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson re->Decref(); 11012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c.failed_) 11022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 11032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (anchor == RE2::UNANCHORED) { 11052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The trailing .* was added while handling kRegexpHaveMatch. 11062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We just have to add the leading one. 11072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson all = c.Cat(c.DotStar(), all); 11082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_start(all.begin); 11112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_start_unanchored(all.begin); 11122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_anchor_start(true); 11132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c.prog_->set_anchor_end(true); 11142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog* prog = c.Finish(); 11162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prog == NULL) 11172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 11182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Make sure DFA has enough memory to operate, 11202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // since we're not going to fall back to the NFA. 11212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool failed; 11222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece sp = "hello, world"; 11232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, 11242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson NULL, &failed, NULL); 11252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (failed) { 11262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete prog; 11272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 11282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return prog; 11312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 11322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 11342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Regexp* re) { 11352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Compiler::CompileSet(options, anchor, re); 11362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 11372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} // namespace re2 1139