12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2009 The RE2 Authors.  All Rights Reserved.
22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style
32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file.
42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h"
62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/flags.h"
72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prefilter.h"
82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prefilter_tree.h"
92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/re2.h"
102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDEFINE_int32(filtered_re2_min_atom_len,
122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson             3,
132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson             "Strings less than this length are not stored as atoms");
142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 {
162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonPrefilterTree::PrefilterTree()
182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    : compiled_(false) {
192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonPrefilterTree::~PrefilterTree() {
222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < prefilter_vec_.size(); i++)
232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    delete prefilter_vec_[i];
242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < entries_.size(); i++)
262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    delete entries_[i].parents;
272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Functions used for adding and Compiling prefilters to the
302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// PrefilterTree.
312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool KeepPart(Prefilter* prefilter, int level) {
322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (prefilter == NULL)
332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return false;
342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  switch (prefilter->op()) {
362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    default:
372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      LOG(DFATAL) << "Unexpected op in KeepPart: "
382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                  << prefilter->op();
392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return false;
402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case Prefilter::ALL:
422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return false;
432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case Prefilter::ATOM:
452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return prefilter->atom().size() >=
462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          FLAGS_filtered_re2_min_atom_len;
472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case Prefilter::AND: {
492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      int j = 0;
502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      vector<Prefilter*>* subs = prefilter->subs();
512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      for (int i = 0; i < subs->size(); i++)
522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (KeepPart((*subs)[i], level + 1))
532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          (*subs)[j++] = (*subs)[i];
542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        else
552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          delete (*subs)[i];
562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      subs->resize(j);
582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return j > 0;
592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    case Prefilter::OR:
622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      for (int i = 0; i < prefilter->subs()->size(); i++)
632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (!KeepPart((*prefilter->subs())[i], level + 1))
642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          return false;
652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      return true;
662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::Add(Prefilter *f) {
702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (compiled_) {
712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    LOG(DFATAL) << "Add after Compile.";
722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return;
732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (f != NULL && !KeepPart(f, 0)) {
752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    delete f;
762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    f = NULL;
772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  prefilter_vec_.push_back(f);
802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::Compile(vector<string>* atom_vec) {
832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (compiled_) {
842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    LOG(DFATAL) << "Compile after Compile.";
852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return;
862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // We do this check to support some legacy uses of
892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // PrefilterTree that call Compile before adding any regexps,
902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // and expect Compile not to have effect.
912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (prefilter_vec_.empty())
922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return;
932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  compiled_ = true;
952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  AssignUniqueIds(atom_vec);
972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Identify nodes that are too common among prefilters and are
992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // triggering too many parents. Then get rid of them if possible.
1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Note that getting rid of a prefilter node simply means they are
1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // no longer necessary for their parent to trigger; that is, we do
1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // not miss out on any regexps triggering by getting rid of a
1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // prefilter node.
1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < entries_.size(); i++) {
1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    IntMap* parents = entries_[i].parents;
1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (parents->size() > 8) {
1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // This one triggers too many things. If all the parents are AND
1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // nodes and have other things guarding them, then get rid of
1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // this trigger. TODO(vsri): Adjust the threshold appropriately,
1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // make it a function of total number of nodes?
1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      bool have_other_guard = true;
1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        have_other_guard = have_other_guard &&
1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            (entries_[it->index()].propagate_up_at_count > 1);
1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (have_other_guard) {
1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        for (IntMap::iterator it = parents->begin();
1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson             it != parents->end(); ++it)
1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          entries_[it->index()].propagate_up_at_count -= 1;
1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        parents->clear();  // Forget the parents
1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  PrintDebugInfo();
1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonPrefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string node_string = NodeString(node);
1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (iter == node_map_.end())
1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    return NULL;
1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return (*iter).second;
1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic string Itoa(int n) {
1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  char buf[100];
1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  snprintf(buf, sizeof buf, "%d", n);
1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return string(buf);
1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring PrefilterTree::NodeString(Prefilter* node) const {
1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Adding the operation disambiguates AND/OR/atom nodes.
1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string s = Itoa(node->op()) + ":";
1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (node->op() == Prefilter::ATOM) {
1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    s += node->atom();
1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  } else {
1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (int i = 0; i < node->subs()->size() ; i++) {
1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (i > 0)
1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        s += ',';
1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      s += Itoa((*node->subs())[i]->unique_id());
1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return s;
1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  atom_vec->clear();
1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Build vector of all filter nodes, sorted topologically
1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // from top to bottom in v.
1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  vector<Prefilter*> v;
1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Add the top level nodes of each regexp prefilter.
1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < prefilter_vec_.size(); i++) {
1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prefilter* f = prefilter_vec_[i];
1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (f == NULL)
1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      unfiltered_.push_back(i);
1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // We push NULL also on to v, so that we maintain the
1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // mapping of index==regexpid for level=0 prefilter nodes.
1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    v.push_back(f);
1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Now add all the descendant nodes.
1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < v.size(); i++) {
1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prefilter* f = v[i];
1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (f == NULL)
1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      const vector<Prefilter*>& subs = *f->subs();
1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      for (int j = 0; j < subs.size(); j++)
1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        v.push_back(subs[j]);
1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Identify unique nodes.
1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  int unique_id = 0;
1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = v.size() - 1; i >= 0; i--) {
1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prefilter *node = v[i];
1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (node == NULL)
1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    node->set_unique_id(-1);
1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prefilter* canonical = CanonicalNode(node);
1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (canonical == NULL) {
1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Any further nodes that have the same node string
1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // will find this node as the canonical node.
1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      node_map_[NodeString(node)] = node;
2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (node->op() == Prefilter::ATOM) {
2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        atom_vec->push_back(node->atom());
2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        atom_index_to_id_.push_back(unique_id);
2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      node->set_unique_id(unique_id++);
2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    } else {
2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      node->set_unique_id(canonical->unique_id());
2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  entries_.resize(node_map_.size());
2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Create parent IntMap for the entries.
2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = v.size()  - 1; i >= 0; i--) {
2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prefilter* prefilter = v[i];
2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (prefilter == NULL)
2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (CanonicalNode(prefilter) != prefilter)
2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Entry* entry = &entries_[prefilter->unique_id()];
2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    entry->parents = new IntMap(node_map_.size());
2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // Fill the entries.
2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = v.size()  - 1; i >= 0; i--) {
2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Prefilter* prefilter = v[i];
2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (prefilter == NULL)
2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (CanonicalNode(prefilter) != prefilter)
2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Entry* entry = &entries_[prefilter->unique_id()];
2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    switch (prefilter->op()) {
2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      default:
2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case Prefilter::ALL:
2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        LOG(DFATAL) << "Unexpected op: " << prefilter->op();
2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        return;
2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case Prefilter::ATOM:
2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        entry->propagate_up_at_count = 1;
2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case Prefilter::OR:
2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      case Prefilter::AND: {
2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        IntMap uniq_child(node_map_.size());
2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        for (int j = 0; j < prefilter->subs()->size() ; j++) {
2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          Prefilter* child = (*prefilter->subs())[j];
2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          Prefilter* canonical = CanonicalNode(child);
2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          if (canonical == NULL) {
2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            LOG(DFATAL) << "Null canonical node";
2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            return;
2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          }
2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          int child_id = canonical->unique_id();
2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          if (!uniq_child.has_index(child_id))
2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            uniq_child.set_new(child_id, 1);
2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          // To the child, we want to add to parent indices.
2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          Entry* child_entry = &entries_[child_id];
2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          if (!child_entry->parents->has_index(prefilter->unique_id()))
2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            child_entry->parents->set_new(prefilter->unique_id(), 1);
2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        entry->propagate_up_at_count =
2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        break;
2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  // For top level nodes, populate regexp id.
2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < prefilter_vec_.size(); i++) {
2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (prefilter_vec_[i] == NULL)
2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      continue;
2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int id = CanonicalNode(prefilter_vec_[i])->unique_id();
2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    DCHECK_LE(0, id);
2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    Entry* entry = &entries_[id];
2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    entry->regexps.push_back(i);
2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Functions for triggering during search.
2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::RegexpsGivenStrings(
2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    const vector<int>& matched_atoms,
2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    vector<int>* regexps) const {
2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  regexps->clear();
2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (!compiled_) {
2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    LOG(WARNING) << "Compile() not called";
2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (int i = 0; i < prefilter_vec_.size(); ++i)
2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      regexps->push_back(i);
2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  } else {
2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    if (!prefilter_vec_.empty()) {
2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      IntMap regexps_map(prefilter_vec_.size());
2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      vector<int> matched_atom_ids;
2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      for (int j = 0; j < matched_atoms.size(); j++) {
2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      PropagateMatch(matched_atom_ids, &regexps_map);
3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      for (IntMap::iterator it = regexps_map.begin();
3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson           it != regexps_map.end();
3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson           ++it)
3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        regexps->push_back(it->index());
3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  sort(regexps->begin(), regexps->end());
3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson                                   IntMap* regexps) const {
3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  IntMap count(entries_.size());
3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  IntMap work(entries_.size());
3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < atom_ids.size(); i++)
3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    work.set(atom_ids[i], 1);
3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    const Entry& entry = entries_[it->index()];
3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    VLOG(10) << "Processing: " << it->index();
3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Record regexps triggered.
3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (int i = 0; i < entry.regexps.size(); i++) {
3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      VLOG(10) << "Regexp triggered: " << entry.regexps[i];
3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      regexps->set(entry.regexps[i], 1);
3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    int c;
3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Pass trigger up to parents.
3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (IntMap::iterator it = entry.parents->begin();
3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson         it != entry.parents->end();
3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson         ++it) {
3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      int j = it->index();
3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      const Entry& parent = entries_[j];
3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Delay until all the children have succeeded.
3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (parent.propagate_up_at_count > 1) {
3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (count.has_index(j)) {
3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          c = count.get_existing(j) + 1;
3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          count.set_existing(j, c);
3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        } else {
3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          c = 1;
3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          count.set_new(j, c);
3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        }
3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        if (c < parent.propagate_up_at_count)
3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson          continue;
3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      }
3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      VLOG(10) << "Triggering: " << j;
3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      // Trigger the parent.
3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      work.set(j, 1);
3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Debugging help.
3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::PrintPrefilter(int regexpid) {
3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::PrintDebugInfo() {
3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  VLOG(10) << "#Unique Nodes: " << entries_.size();
3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (int i = 0; i < entries_.size(); ++i) {
3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    IntMap* parents = entries_[i].parents;
3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    const vector<int>& regexps = entries_[i].regexps;
3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    VLOG(10) << "EntryId: " << i
3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            << " N: " << parents->size() << " R: " << regexps.size();
3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      VLOG(10) << it->index();
3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  VLOG(10) << "Map:";
3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson       iter != node_map_.end(); ++iter)
3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    VLOG(10) << "NodeId: " << (*iter).second->unique_id()
3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson            << " Str: " << (*iter).first;
3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring PrefilterTree::DebugNodeString(Prefilter* node) const {
3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  string node_string = "";
3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  if (node->op() == Prefilter::ATOM) {
3802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    DCHECK(!node->atom().empty());
3812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    node_string += node->atom();
3822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  } else {
3832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    // Adding the operation disambiguates AND and OR nodes.
3842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    node_string +=  node->op() == Prefilter::AND ? "AND" : "OR";
3852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    node_string += "(";
3862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    for (int i = 0; i < node->subs()->size() ; i++) {
3872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      if (i > 0)
3882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson        node_string += ',';
3892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      node_string += Itoa((*node->subs())[i]->unique_id());
3902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      node_string += ":";
3912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson      node_string += DebugNodeString((*node->subs())[i]);
3922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    }
3932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson    node_string += ")";
3942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  }
3952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson  return node_string;
3962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}
3972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson
3982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}  // namespace re2
399