12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2007 The RE2 Authors. All Rights Reserved. 22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style 32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file. 42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiled regular expression representation. 62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Tested by compile_test.cc 72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h" 92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/sparse_set.h" 102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prog.h" 112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/stringpiece.h" 122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 { 142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Constructors per Inst opcode 162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitAlt(uint32 out, uint32 out1) { 182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(out_opcode_, 0); 192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson set_out_opcode(out, kInstAlt); 202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson out1_ = out1; 212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) { 242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(out_opcode_, 0); 252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson set_out_opcode(out, kInstByteRange); 262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lo_ = lo & 0xFF; 272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson hi_ = hi & 0xFF; 282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson foldcase_ = foldcase; 292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitCapture(int cap, uint32 out) { 322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(out_opcode_, 0); 332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson set_out_opcode(out, kInstCapture); 342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cap_ = cap; 352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) { 382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(out_opcode_, 0); 392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson set_out_opcode(out, kInstEmptyWidth); 402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson empty_ = empty; 412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitMatch(int32 id) { 442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(out_opcode_, 0); 452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson set_opcode(kInstMatch); 462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson match_id_ = id; 472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitNop(uint32 out) { 502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(out_opcode_, 0); 512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson set_opcode(kInstNop); 522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitFail() { 552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_EQ(out_opcode_, 0); 562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson set_opcode(kInstFail); 572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring Prog::Inst::Dump() { 602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (opcode()) { 612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPrintf("opcode %d", static_cast<int>(opcode())); 632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAlt: 652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPrintf("alt -> %d | %d", out(), out1_); 662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAltMatch: 682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPrintf("altmatch -> %d | %d", out(), out1_); 692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstByteRange: 712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPrintf("byte%s [%02x-%02x] -> %d", 722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson foldcase_ ? "/i" : "", 732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lo_, hi_, out()); 742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstCapture: 762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPrintf("capture %d -> %d", cap_, out()); 772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstEmptyWidth: 792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPrintf("emptywidth %#x -> %d", 802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static_cast<int>(empty_), out()); 812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: 832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPrintf("match! %d", match_id()); 842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstNop: 862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPrintf("nop -> %d", out()); 872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstFail: 892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return StringPrintf("fail"); 902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg::Prog() 942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : anchor_start_(false), 952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson anchor_end_(false), 962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson reversed_(false), 972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson did_onepass_(false), 982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start_(0), 992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start_unanchored_(0), 1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson size_(0), 1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson byte_inst_count_(0), 1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bytemap_range_(0), 1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags_(0), 1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson onepass_statesize_(0), 1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_(NULL), 1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dfa_first_(NULL), 1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dfa_longest_(NULL), 1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dfa_mem_(0), 1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete_dfa_(NULL), 1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson unbytemap_(NULL), 1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson onepass_nodes_(NULL), 1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson onepass_start_(NULL) { 1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg::~Prog() { 1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (delete_dfa_) { 1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (dfa_first_) 1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete_dfa_(dfa_first_); 1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (dfa_longest_) 1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete_dfa_(dfa_longest_); 1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] onepass_nodes_; 1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] inst_; 1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] unbytemap_; 1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsontypedef SparseSet Workq; 1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic inline void AddToQueue(Workq* q, int id) { 1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id != 0) 1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q->insert(id); 1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic string ProgToString(Prog* prog, Workq* q) { 1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string s; 1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Workq::iterator i = q->begin(); i != q->end(); ++i) { 1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = *i; 1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = prog->inst(id); 1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str()); 1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(q, ip->out()); 1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) 1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(q, ip->out1()); 1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return s; 1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring Prog::Dump() { 1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string map; 1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (false) { // Debugging 1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int lo = 0; 1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&map, "byte map:\n"); 1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < bytemap_range_; i++) { 1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]); 1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lo = unbytemap_[i] + 1; 1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&map, "\n"); 1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Workq q(size_); 1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(&q, start_); 1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return map + ProgToString(this, &q); 1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring Prog::DumpUnanchored() { 1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Workq q(size_); 1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(&q, start_unanchored_); 1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ProgToString(this, &q); 1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool IsMatch(Prog*, Prog::Inst*); 1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Peep-hole optimizer. 1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Optimize() { 1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Workq q(size_); 1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Eliminate nops. Most are taken out during compilation 1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // but a few are hard to avoid. 1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q.clear(); 1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(&q, start_); 1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Workq::iterator i = q.begin(); i != q.end(); ++i) { 1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = *i; 1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Inst* ip = inst(id); 1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int j = ip->out(); 1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Inst* jp; 1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { 1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson j = jp->out(); 1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip->set_out(j); 1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(&q, ip->out()); 1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->opcode() == kInstAlt) { 1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson j = ip->out1(); 1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { 1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson j = jp->out(); 1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip->out1_ = j; 1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(&q, ip->out1()); 2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Insert kInstAltMatch instructions 2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Look for 2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ip: Alt -> j | k 2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // j: ByteRange [00-FF] -> ip 2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // k: Match 2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // or the reverse (the above is the greedy one). 2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Rewrite Alt to AltMatch. 2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q.clear(); 2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(&q, start_); 2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Workq::iterator i = q.begin(); i != q.end(); ++i) { 2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = *i; 2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Inst* ip = inst(id); 2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(&q, ip->out()); 2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->opcode() == kInstAlt) 2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(&q, ip->out1()); 2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->opcode() == kInstAlt) { 2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Inst* j = inst(ip->out()); 2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Inst* k = inst(ip->out1()); 2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (j->opcode() == kInstByteRange && j->out() == id && 2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson j->lo() == 0x00 && j->hi() == 0xFF && 2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson IsMatch(this, k)) { 2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip->set_opcode(kInstAltMatch); 2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsMatch(this, j) && 2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson k->opcode() == kInstByteRange && k->out() == id && 2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson k->lo() == 0x00 && k->hi() == 0xFF) { 2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip->set_opcode(kInstAltMatch); 2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Is ip a guaranteed match at end of text, perhaps after some capturing? 2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool IsMatch(Prog* prog, Prog::Inst* ip) { 2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (;;) { 2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ip->opcode()) { 2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); 2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAlt: 2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAltMatch: 2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstByteRange: 2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstFail: 2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstEmptyWidth: 2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstCapture: 2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstNop: 2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip = prog->inst(ip->out()); 2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: 2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonuint32 Prog::EmptyFlags(const StringPiece& text, const char* p) { 2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int flags = 0; 2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ^ and \A 2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p == text.begin()) 2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags |= kEmptyBeginText | kEmptyBeginLine; 2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else if (p[-1] == '\n') 2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags |= kEmptyBeginLine; 2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // $ and \z 2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p == text.end()) 2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags |= kEmptyEndText | kEmptyEndLine; 2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else if (p < text.end() && p[0] == '\n') 2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags |= kEmptyEndLine; 2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // \b and \B 2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (p == text.begin() && p == text.end()) { 2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // no word boundary here 2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (p == text.begin()) { 2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsWordChar(p[0])) 2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags |= kEmptyWordBoundary; 2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (p == text.end()) { 2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsWordChar(p[-1])) 2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags |= kEmptyWordBoundary; 2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (IsWordChar(p[-1]) != IsWordChar(p[0])) 2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags |= kEmptyWordBoundary; 2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!(flags & kEmptyWordBoundary)) 2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags |= kEmptyNonWordBoundary; 2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return flags; 2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::MarkByteRange(int lo, int hi) { 2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CHECK_GE(lo, 0); 2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CHECK_GE(hi, 0); 3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CHECK_LE(lo, 255); 3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson CHECK_LE(hi, 255); 3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (lo > 0) 3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson byterange_.Set(lo - 1); 3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson byterange_.Set(hi); 3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::ComputeByteMap() { 3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Fill in bytemap with byte classes for prog_. 3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Ranges of bytes that are treated as indistinguishable 3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // by the regexp program are mapped to a single byte class. 3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The vector prog_->byterange() marks the end of each 3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // such range. 3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const Bitmap<256>& v = byterange(); 3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize); 3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint8 n = 0; 3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint32 bits = 0; 3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < 256; i++) { 3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((i&31) == 0) 3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bits = v.Word(i >> 5); 3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bytemap_[i] = n; 3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n += bits & 1; 3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bits >>= 1; 3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bytemap_range_ = bytemap_[255] + 1; 3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson unbytemap_ = new uint8[bytemap_range_]; 3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < 256; i++) 3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson unbytemap_[bytemap_[i]] = i; 3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (0) { // For debugging: use trivial byte map. 3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < 256; i++) { 3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bytemap_[i] = i; 3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson unbytemap_[i] = i; 3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bytemap_range_ = 256; 3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(INFO) << "Using trivial bytemap."; 3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} // namespace re2 3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 342