12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2009 The RE2 Authors. All Rights Reserved. 22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style 32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file. 42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/util.h" 62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/flags.h" 72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prefilter.h" 82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prefilter_tree.h" 92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/re2.h" 102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDEFINE_int32(filtered_re2_min_atom_len, 122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3, 132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson "Strings less than this length are not stored as atoms"); 142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 { 162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonPrefilterTree::PrefilterTree() 182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : compiled_(false) { 192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonPrefilterTree::~PrefilterTree() { 222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < prefilter_vec_.size(); i++) 232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete prefilter_vec_[i]; 242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < entries_.size(); i++) 262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete entries_[i].parents; 272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Functions used for adding and Compiling prefilters to the 302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// PrefilterTree. 312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic bool KeepPart(Prefilter* prefilter, int level) { 322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prefilter == NULL) 332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (prefilter->op()) { 362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Unexpected op in KeepPart: " 382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << prefilter->op(); 392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case Prefilter::ALL: 422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case Prefilter::ATOM: 452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return prefilter->atom().size() >= 462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson FLAGS_filtered_re2_min_atom_len; 472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case Prefilter::AND: { 492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int j = 0; 502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson vector<Prefilter*>* subs = prefilter->subs(); 512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < subs->size(); i++) 522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (KeepPart((*subs)[i], level + 1)) 532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (*subs)[j++] = (*subs)[i]; 542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete (*subs)[i]; 562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson subs->resize(j); 582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return j > 0; 592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case Prefilter::OR: 622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < prefilter->subs()->size(); i++) 632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!KeepPart((*prefilter->subs())[i], level + 1)) 642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::Add(Prefilter *f) { 702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (compiled_) { 712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Add after Compile."; 722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (f != NULL && !KeepPart(f, 0)) { 752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete f; 762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson f = NULL; 772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prefilter_vec_.push_back(f); 802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::Compile(vector<string>* atom_vec) { 832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (compiled_) { 842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Compile after Compile."; 852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We do this check to support some legacy uses of 892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // PrefilterTree that call Compile before adding any regexps, 902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and expect Compile not to have effect. 912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prefilter_vec_.empty()) 922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson compiled_ = true; 952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AssignUniqueIds(atom_vec); 972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Identify nodes that are too common among prefilters and are 992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // triggering too many parents. Then get rid of them if possible. 1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Note that getting rid of a prefilter node simply means they are 1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // no longer necessary for their parent to trigger; that is, we do 1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // not miss out on any regexps triggering by getting rid of a 1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // prefilter node. 1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < entries_.size(); i++) { 1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson IntMap* parents = entries_[i].parents; 1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (parents->size() > 8) { 1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This one triggers too many things. If all the parents are AND 1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // nodes and have other things guarding them, then get rid of 1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // this trigger. TODO(vsri): Adjust the threshold appropriately, 1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // make it a function of total number of nodes? 1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool have_other_guard = true; 1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it) 1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson have_other_guard = have_other_guard && 1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (entries_[it->index()].propagate_up_at_count > 1); 1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (have_other_guard) { 1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (IntMap::iterator it = parents->begin(); 1182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson it != parents->end(); ++it) 1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson entries_[it->index()].propagate_up_at_count -= 1; 1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson parents->clear(); // Forget the parents 1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PrintDebugInfo(); 1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonPrefilter* PrefilterTree::CanonicalNode(Prefilter* node) { 1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string node_string = NodeString(node); 1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson map<string, Prefilter*>::iterator iter = node_map_.find(node_string); 1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (iter == node_map_.end()) 1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return (*iter).second; 1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic string Itoa(int n) { 1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson char buf[100]; 1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson snprintf(buf, sizeof buf, "%d", n); 1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return string(buf); 1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring PrefilterTree::NodeString(Prefilter* node) const { 1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Adding the operation disambiguates AND/OR/atom nodes. 1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string s = Itoa(node->op()) + ":"; 1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (node->op() == Prefilter::ATOM) { 1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s += node->atom(); 1482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 1492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < node->subs()->size() ; i++) { 1502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i > 0) 1512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s += ','; 1522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s += Itoa((*node->subs())[i]->unique_id()); 1532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return s; 1562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 1572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { 1592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson atom_vec->clear(); 1602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Build vector of all filter nodes, sorted topologically 1622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // from top to bottom in v. 1632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson vector<Prefilter*> v; 1642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Add the top level nodes of each regexp prefilter. 1662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < prefilter_vec_.size(); i++) { 1672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prefilter* f = prefilter_vec_[i]; 1682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (f == NULL) 1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson unfiltered_.push_back(i); 1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We push NULL also on to v, so that we maintain the 1722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // mapping of index==regexpid for level=0 prefilter nodes. 1732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson v.push_back(f); 1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Now add all the descendant nodes. 1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < v.size(); i++) { 1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prefilter* f = v[i]; 1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (f == NULL) 1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { 1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const vector<Prefilter*>& subs = *f->subs(); 1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int j = 0; j < subs.size(); j++) 1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson v.push_back(subs[j]); 1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Identify unique nodes. 1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int unique_id = 0; 1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = v.size() - 1; i >= 0; i--) { 1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prefilter *node = v[i]; 1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (node == NULL) 1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node->set_unique_id(-1); 1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prefilter* canonical = CanonicalNode(node); 1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (canonical == NULL) { 1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Any further nodes that have the same node string 1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // will find this node as the canonical node. 1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node_map_[NodeString(node)] = node; 2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (node->op() == Prefilter::ATOM) { 2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson atom_vec->push_back(node->atom()); 2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson atom_index_to_id_.push_back(unique_id); 2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node->set_unique_id(unique_id++); 2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node->set_unique_id(canonical->unique_id()); 2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson entries_.resize(node_map_.size()); 2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Create parent IntMap for the entries. 2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = v.size() - 1; i >= 0; i--) { 2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prefilter* prefilter = v[i]; 2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prefilter == NULL) 2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (CanonicalNode(prefilter) != prefilter) 2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Entry* entry = &entries_[prefilter->unique_id()]; 2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson entry->parents = new IntMap(node_map_.size()); 2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Fill the entries. 2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = v.size() - 1; i >= 0; i--) { 2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prefilter* prefilter = v[i]; 2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prefilter == NULL) 2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (CanonicalNode(prefilter) != prefilter) 2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Entry* entry = &entries_[prefilter->unique_id()]; 2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (prefilter->op()) { 2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: 2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case Prefilter::ALL: 2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Unexpected op: " << prefilter->op(); 2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case Prefilter::ATOM: 2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson entry->propagate_up_at_count = 1; 2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case Prefilter::OR: 2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case Prefilter::AND: { 2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson IntMap uniq_child(node_map_.size()); 2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int j = 0; j < prefilter->subs()->size() ; j++) { 2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prefilter* child = (*prefilter->subs())[j]; 2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prefilter* canonical = CanonicalNode(child); 2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (canonical == NULL) { 2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Null canonical node"; 2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int child_id = canonical->unique_id(); 2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!uniq_child.has_index(child_id)) 2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uniq_child.set_new(child_id, 1); 2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // To the child, we want to add to parent indices. 2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Entry* child_entry = &entries_[child_id]; 2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!child_entry->parents->has_index(prefilter->unique_id())) 2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson child_entry->parents->set_new(prefilter->unique_id(), 1); 2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson entry->propagate_up_at_count = 2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prefilter->op() == Prefilter::AND ? uniq_child.size() : 1; 2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For top level nodes, populate regexp id. 2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < prefilter_vec_.size(); i++) { 2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prefilter_vec_[i] == NULL) 2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = CanonicalNode(prefilter_vec_[i])->unique_id(); 2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_LE(0, id); 2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Entry* entry = &entries_[id]; 2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson entry->regexps.push_back(i); 2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Functions for triggering during search. 2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::RegexpsGivenStrings( 2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const vector<int>& matched_atoms, 2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson vector<int>* regexps) const { 2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson regexps->clear(); 2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!compiled_) { 2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(WARNING) << "Compile() not called"; 2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < prefilter_vec_.size(); ++i) 2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson regexps->push_back(i); 2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!prefilter_vec_.empty()) { 2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson IntMap regexps_map(prefilter_vec_.size()); 2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson vector<int> matched_atom_ids; 2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int j = 0; j < matched_atoms.size(); j++) { 2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); 2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; 2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson PropagateMatch(matched_atom_ids, ®exps_map); 3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (IntMap::iterator it = regexps_map.begin(); 3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson it != regexps_map.end(); 3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ++it) 3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson regexps->push_back(it->index()); 3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); 3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sort(regexps->begin(), regexps->end()); 3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::PropagateMatch(const vector<int>& atom_ids, 3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson IntMap* regexps) const { 3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson IntMap count(entries_.size()); 3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson IntMap work(entries_.size()); 3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < atom_ids.size(); i++) 3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson work.set(atom_ids[i], 1); 3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { 3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const Entry& entry = entries_[it->index()]; 3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << "Processing: " << it->index(); 3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Record regexps triggered. 3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < entry.regexps.size(); i++) { 3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << "Regexp triggered: " << entry.regexps[i]; 3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson regexps->set(entry.regexps[i], 1); 3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int c; 3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pass trigger up to parents. 3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (IntMap::iterator it = entry.parents->begin(); 3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson it != entry.parents->end(); 3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ++it) { 3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int j = it->index(); 3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const Entry& parent = entries_[j]; 3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count; 3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Delay until all the children have succeeded. 3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (parent.propagate_up_at_count > 1) { 3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (count.has_index(j)) { 3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c = count.get_existing(j) + 1; 3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson count.set_existing(j, c); 3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c = 1; 3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson count.set_new(j, c); 3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c < parent.propagate_up_at_count) 3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << "Triggering: " << j; 3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Trigger the parent. 3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson work.set(j, 1); 3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Debugging help. 3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::PrintPrefilter(int regexpid) { 3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]); 3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid PrefilterTree::PrintDebugInfo() { 3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); 3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << "#Unique Nodes: " << entries_.size(); 3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < entries_.size(); ++i) { 3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson IntMap* parents = entries_[i].parents; 3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const vector<int>& regexps = entries_[i].regexps; 3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << "EntryId: " << i 3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << " N: " << parents->size() << " R: " << regexps.size(); 3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it) 3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << it->index(); 3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << "Map:"; 3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (map<string, Prefilter*>::const_iterator iter = node_map_.begin(); 3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson iter != node_map_.end(); ++iter) 3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(10) << "NodeId: " << (*iter).second->unique_id() 3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << " Str: " << (*iter).first; 3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring PrefilterTree::DebugNodeString(Prefilter* node) const { 3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string node_string = ""; 3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (node->op() == Prefilter::ATOM) { 3802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK(!node->atom().empty()); 3812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node_string += node->atom(); 3822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 3832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Adding the operation disambiguates AND and OR nodes. 3842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node_string += node->op() == Prefilter::AND ? "AND" : "OR"; 3852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node_string += "("; 3862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < node->subs()->size() ; i++) { 3872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (i > 0) 3882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node_string += ','; 3892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node_string += Itoa((*node->subs())[i]->unique_id()); 3902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node_string += ":"; 3912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node_string += DebugNodeString((*node->subs())[i]); 3922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson node_string += ")"; 3942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return node_string; 3962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} // namespace re2 399