12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2007 The RE2 Authors.  All Rights Reserved.
22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style
32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file.
42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Compiled regular expression representation.
62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Tested by compile_test.cc
72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h"
92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/sparse_set.h"
102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prog.h"
112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/stringpiece.h"
122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 {
142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Constructors per Inst opcode
162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitAlt(uint32 out, uint32 out1) {
182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DCHECK_EQ(out_opcode_, 0);
192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  set_out_opcode(out, kInstAlt);
202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  out1_ = out1;
212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DCHECK_EQ(out_opcode_, 0);
252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  set_out_opcode(out, kInstByteRange);
262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  lo_ = lo & 0xFF;
272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  hi_ = hi & 0xFF;
282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  foldcase_ = foldcase;
292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitCapture(int cap, uint32 out) {
322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DCHECK_EQ(out_opcode_, 0);
332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  set_out_opcode(out, kInstCapture);
342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  cap_ = cap;
352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DCHECK_EQ(out_opcode_, 0);
392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  set_out_opcode(out, kInstEmptyWidth);
402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  empty_ = empty;
412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitMatch(int32 id) {
442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DCHECK_EQ(out_opcode_, 0);
452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  set_opcode(kInstMatch);
462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  match_id_ = id;
472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitNop(uint32 out) {
502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DCHECK_EQ(out_opcode_, 0);
512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  set_opcode(kInstNop);
522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Inst::InitFail() {
552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  DCHECK_EQ(out_opcode_, 0);
562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  set_opcode(kInstFail);
572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring Prog::Inst::Dump() {
602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  switch (opcode()) {
612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    default:
622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return StringPrintf("opcode %d", static_cast<int>(opcode()));
632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstAlt:
652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return StringPrintf("alt -> %d | %d", out(), out1_);
662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstAltMatch:
682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return StringPrintf("altmatch -> %d | %d", out(), out1_);
692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstByteRange:
712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return StringPrintf("byte%s [%02x-%02x] -> %d",
722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                          foldcase_ ? "/i" : "",
732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                          lo_, hi_, out());
742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstCapture:
762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return StringPrintf("capture %d -> %d", cap_, out());
772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstEmptyWidth:
792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return StringPrintf("emptywidth %#x -> %d",
802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                          static_cast<int>(empty_), out());
812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstMatch:
832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return StringPrintf("match! %d", match_id());
842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstNop:
862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return StringPrintf("nop -> %d", out());
872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case kInstFail:
892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return StringPrintf("fail");
902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg::Prog()
942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  : anchor_start_(false),
952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    anchor_end_(false),
962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    reversed_(false),
972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    did_onepass_(false),
982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    start_(0),
992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    start_unanchored_(0),
1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    size_(0),
1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    byte_inst_count_(0),
1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    bytemap_range_(0),
1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    flags_(0),
1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    onepass_statesize_(0),
1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    inst_(NULL),
1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    dfa_first_(NULL),
1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    dfa_longest_(NULL),
1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    dfa_mem_(0),
1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    delete_dfa_(NULL),
1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    unbytemap_(NULL),
1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    onepass_nodes_(NULL),
1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    onepass_start_(NULL) {
1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonProg::~Prog() {
1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (delete_dfa_) {
1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (dfa_first_)
1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      delete_dfa_(dfa_first_);
1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (dfa_longest_)
1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      delete_dfa_(dfa_longest_);
1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  delete[] onepass_nodes_;
1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  delete[] inst_;
1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  delete[] unbytemap_;
1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsontypedef SparseSet Workq;
1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic inline void AddToQueue(Workq* q, int id) {
1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (id != 0)
1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    q->insert(id);
1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic string ProgToString(Prog* prog, Workq* q) {
1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string s;
1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id = *i;
1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prog::Inst* ip = prog->inst(id);
1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    StringAppendF(&s, "%d. %s\n", id, ip->Dump().c_str());
1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    AddToQueue(q, ip->out());
1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      AddToQueue(q, ip->out1());
1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return s;
1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring Prog::Dump() {
1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string map;
1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (false) {  // Debugging
1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int lo = 0;
1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    StringAppendF(&map, "byte map:\n");
1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (int i = 0; i < bytemap_range_; i++) {
1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      StringAppendF(&map, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      lo = unbytemap_[i] + 1;
1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    StringAppendF(&map, "\n");
1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Workq q(size_);
1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  AddToQueue(&q, start_);
1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return map + ProgToString(this, &q);
1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring Prog::DumpUnanchored() {
1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Workq q(size_);
1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  AddToQueue(&q, start_unanchored_);
1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return ProgToString(this, &q);
1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool IsMatch(Prog*, Prog::Inst*);
1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Peep-hole optimizer.
1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::Optimize() {
1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  Workq q(size_);
1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Eliminate nops.  Most are taken out during compilation
1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // but a few are hard to avoid.
1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  q.clear();
1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  AddToQueue(&q, start_);
1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id = *i;
1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Inst* ip = inst(id);
1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int j = ip->out();
1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Inst* jp;
1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      j = jp->out();
1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    ip->set_out(j);
1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    AddToQueue(&q, ip->out());
1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (ip->opcode() == kInstAlt) {
1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      j = ip->out1();
1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        j = jp->out();
1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      ip->out1_ = j;
1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      AddToQueue(&q, ip->out1());
2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Insert kInstAltMatch instructions
2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Look for
2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //   ip: Alt -> j | k
2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //	  j: ByteRange [00-FF] -> ip
2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  //    k: Match
2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // or the reverse (the above is the greedy one).
2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Rewrite Alt to AltMatch.
2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  q.clear();
2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  AddToQueue(&q, start_);
2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id = *i;
2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Inst* ip = inst(id);
2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    AddToQueue(&q, ip->out());
2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (ip->opcode() == kInstAlt)
2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      AddToQueue(&q, ip->out1());
2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (ip->opcode() == kInstAlt) {
2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      Inst* j = inst(ip->out());
2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      Inst* k = inst(ip->out1());
2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (j->opcode() == kInstByteRange && j->out() == id &&
2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          j->lo() == 0x00 && j->hi() == 0xFF &&
2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          IsMatch(this, k)) {
2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        ip->set_opcode(kInstAltMatch);
2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        continue;
2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (IsMatch(this, j) &&
2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          k->opcode() == kInstByteRange && k->out() == id &&
2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          k->lo() == 0x00 && k->hi() == 0xFF) {
2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        ip->set_opcode(kInstAltMatch);
2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Is ip a guaranteed match at end of text, perhaps after some capturing?
2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool IsMatch(Prog* prog, Prog::Inst* ip) {
2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (;;) {
2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    switch (ip->opcode()) {
2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      default:
2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        return false;
2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstAlt:
2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstAltMatch:
2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstByteRange:
2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstFail:
2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstEmptyWidth:
2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        return false;
2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstCapture:
2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstNop:
2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        ip = prog->inst(ip->out());
2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case kInstMatch:
2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        return true;
2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonuint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int flags = 0;
2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // ^ and \A
2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (p == text.begin())
2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    flags |= kEmptyBeginText | kEmptyBeginLine;
2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  else if (p[-1] == '\n')
2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    flags |= kEmptyBeginLine;
2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // $ and \z
2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (p == text.end())
2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    flags |= kEmptyEndText | kEmptyEndLine;
2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  else if (p < text.end() && p[0] == '\n')
2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    flags |= kEmptyEndLine;
2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // \b and \B
2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (p == text.begin() && p == text.end()) {
2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // no word boundary here
2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  } else if (p == text.begin()) {
2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (IsWordChar(p[0]))
2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      flags |= kEmptyWordBoundary;
2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  } else if (p == text.end()) {
2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (IsWordChar(p[-1]))
2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      flags |= kEmptyWordBoundary;
2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  } else {
2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (IsWordChar(p[-1]) != IsWordChar(p[0]))
2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      flags |= kEmptyWordBoundary;
2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (!(flags & kEmptyWordBoundary))
2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    flags |= kEmptyNonWordBoundary;
2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return flags;
2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::MarkByteRange(int lo, int hi) {
2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CHECK_GE(lo, 0);
2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CHECK_GE(hi, 0);
3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CHECK_LE(lo, 255);
3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  CHECK_LE(hi, 255);
3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (lo > 0)
3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    byterange_.Set(lo - 1);
3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  byterange_.Set(hi);
3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid Prog::ComputeByteMap() {
3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Fill in bytemap with byte classes for prog_.
3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Ranges of bytes that are treated as indistinguishable
3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // by the regexp program are mapped to a single byte class.
3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // The vector prog_->byterange() marks the end of each
3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // such range.
3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  const Bitmap<256>& v = byterange();
3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint8 n = 0;
3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  uint32 bits = 0;
3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < 256; i++) {
3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if ((i&31) == 0)
3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      bits = v.Word(i >> 5);
3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    bytemap_[i] = n;
3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    n += bits & 1;
3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    bits >>= 1;
3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  bytemap_range_ = bytemap_[255] + 1;
3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  unbytemap_ = new uint8[bytemap_range_];
3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < 256; i++)
3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    unbytemap_[bytemap_[i]] = i;
3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (0) {  // For debugging: use trivial byte map.
3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (int i = 0; i < 256; i++) {
3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      bytemap_[i] = i;
3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      unbytemap_[i] = i;
3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    bytemap_range_ = 256;
3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    LOG(INFO) << "Using trivial bytemap.";
3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}  // namespace re2
3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
342