1// Copyright 2009 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "util/util.h"
6#include "util/flags.h"
7#include "re2/prefilter.h"
8#include "re2/prefilter_tree.h"
9#include "re2/re2.h"
10
11DEFINE_int32(filtered_re2_min_atom_len,
12             3,
13             "Strings less than this length are not stored as atoms");
14
15namespace re2 {
16
17PrefilterTree::PrefilterTree()
18    : compiled_(false) {
19}
20
21PrefilterTree::~PrefilterTree() {
22  for (int i = 0; i < prefilter_vec_.size(); i++)
23    delete prefilter_vec_[i];
24
25  for (int i = 0; i < entries_.size(); i++)
26    delete entries_[i].parents;
27}
28
29// Functions used for adding and Compiling prefilters to the
30// PrefilterTree.
31static bool KeepPart(Prefilter* prefilter, int level) {
32  if (prefilter == NULL)
33    return false;
34
35  switch (prefilter->op()) {
36    default:
37      LOG(DFATAL) << "Unexpected op in KeepPart: "
38                  << prefilter->op();
39      return false;
40
41    case Prefilter::ALL:
42      return false;
43
44    case Prefilter::ATOM:
45      return prefilter->atom().size() >=
46          FLAGS_filtered_re2_min_atom_len;
47
48    case Prefilter::AND: {
49      int j = 0;
50      vector<Prefilter*>* subs = prefilter->subs();
51      for (int i = 0; i < subs->size(); i++)
52        if (KeepPart((*subs)[i], level + 1))
53          (*subs)[j++] = (*subs)[i];
54        else
55          delete (*subs)[i];
56
57      subs->resize(j);
58      return j > 0;
59    }
60
61    case Prefilter::OR:
62      for (int i = 0; i < prefilter->subs()->size(); i++)
63        if (!KeepPart((*prefilter->subs())[i], level + 1))
64          return false;
65      return true;
66  }
67}
68
69void PrefilterTree::Add(Prefilter *f) {
70  if (compiled_) {
71    LOG(DFATAL) << "Add after Compile.";
72    return;
73  }
74  if (f != NULL && !KeepPart(f, 0)) {
75    delete f;
76    f = NULL;
77  }
78
79  prefilter_vec_.push_back(f);
80}
81
82void PrefilterTree::Compile(vector<string>* atom_vec) {
83  if (compiled_) {
84    LOG(DFATAL) << "Compile after Compile.";
85    return;
86  }
87
88  // We do this check to support some legacy uses of
89  // PrefilterTree that call Compile before adding any regexps,
90  // and expect Compile not to have effect.
91  if (prefilter_vec_.empty())
92    return;
93
94  compiled_ = true;
95
96  AssignUniqueIds(atom_vec);
97
98  // Identify nodes that are too common among prefilters and are
99  // triggering too many parents. Then get rid of them if possible.
100  // Note that getting rid of a prefilter node simply means they are
101  // no longer necessary for their parent to trigger; that is, we do
102  // not miss out on any regexps triggering by getting rid of a
103  // prefilter node.
104  for (int i = 0; i < entries_.size(); i++) {
105    IntMap* parents = entries_[i].parents;
106    if (parents->size() > 8) {
107      // This one triggers too many things. If all the parents are AND
108      // nodes and have other things guarding them, then get rid of
109      // this trigger. TODO(vsri): Adjust the threshold appropriately,
110      // make it a function of total number of nodes?
111      bool have_other_guard = true;
112      for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
113        have_other_guard = have_other_guard &&
114            (entries_[it->index()].propagate_up_at_count > 1);
115
116      if (have_other_guard) {
117        for (IntMap::iterator it = parents->begin();
118             it != parents->end(); ++it)
119          entries_[it->index()].propagate_up_at_count -= 1;
120
121        parents->clear();  // Forget the parents
122      }
123    }
124  }
125
126  PrintDebugInfo();
127}
128
129Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
130  string node_string = NodeString(node);
131  map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
132  if (iter == node_map_.end())
133    return NULL;
134  return (*iter).second;
135}
136
137static string Itoa(int n) {
138  char buf[100];
139  snprintf(buf, sizeof buf, "%d", n);
140  return string(buf);
141}
142
143string PrefilterTree::NodeString(Prefilter* node) const {
144  // Adding the operation disambiguates AND/OR/atom nodes.
145  string s = Itoa(node->op()) + ":";
146  if (node->op() == Prefilter::ATOM) {
147    s += node->atom();
148  } else {
149    for (int i = 0; i < node->subs()->size() ; i++) {
150      if (i > 0)
151        s += ',';
152      s += Itoa((*node->subs())[i]->unique_id());
153    }
154  }
155  return s;
156}
157
158void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
159  atom_vec->clear();
160
161  // Build vector of all filter nodes, sorted topologically
162  // from top to bottom in v.
163  vector<Prefilter*> v;
164
165  // Add the top level nodes of each regexp prefilter.
166  for (int i = 0; i < prefilter_vec_.size(); i++) {
167    Prefilter* f = prefilter_vec_[i];
168    if (f == NULL)
169      unfiltered_.push_back(i);
170
171    // We push NULL also on to v, so that we maintain the
172    // mapping of index==regexpid for level=0 prefilter nodes.
173    v.push_back(f);
174  }
175
176  // Now add all the descendant nodes.
177  for (int i = 0; i < v.size(); i++) {
178    Prefilter* f = v[i];
179    if (f == NULL)
180      continue;
181    if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
182      const vector<Prefilter*>& subs = *f->subs();
183      for (int j = 0; j < subs.size(); j++)
184        v.push_back(subs[j]);
185    }
186  }
187
188  // Identify unique nodes.
189  int unique_id = 0;
190  for (int i = v.size() - 1; i >= 0; i--) {
191    Prefilter *node = v[i];
192    if (node == NULL)
193      continue;
194    node->set_unique_id(-1);
195    Prefilter* canonical = CanonicalNode(node);
196    if (canonical == NULL) {
197      // Any further nodes that have the same node string
198      // will find this node as the canonical node.
199      node_map_[NodeString(node)] = node;
200      if (node->op() == Prefilter::ATOM) {
201        atom_vec->push_back(node->atom());
202        atom_index_to_id_.push_back(unique_id);
203      }
204      node->set_unique_id(unique_id++);
205    } else {
206      node->set_unique_id(canonical->unique_id());
207    }
208  }
209  entries_.resize(node_map_.size());
210
211  // Create parent IntMap for the entries.
212  for (int i = v.size()  - 1; i >= 0; i--) {
213    Prefilter* prefilter = v[i];
214    if (prefilter == NULL)
215      continue;
216
217    if (CanonicalNode(prefilter) != prefilter)
218      continue;
219
220    Entry* entry = &entries_[prefilter->unique_id()];
221    entry->parents = new IntMap(node_map_.size());
222  }
223
224  // Fill the entries.
225  for (int i = v.size()  - 1; i >= 0; i--) {
226    Prefilter* prefilter = v[i];
227    if (prefilter == NULL)
228      continue;
229
230    if (CanonicalNode(prefilter) != prefilter)
231      continue;
232
233    Entry* entry = &entries_[prefilter->unique_id()];
234
235    switch (prefilter->op()) {
236      default:
237      case Prefilter::ALL:
238        LOG(DFATAL) << "Unexpected op: " << prefilter->op();
239        return;
240
241      case Prefilter::ATOM:
242        entry->propagate_up_at_count = 1;
243        break;
244
245      case Prefilter::OR:
246      case Prefilter::AND: {
247        IntMap uniq_child(node_map_.size());
248        for (int j = 0; j < prefilter->subs()->size() ; j++) {
249          Prefilter* child = (*prefilter->subs())[j];
250          Prefilter* canonical = CanonicalNode(child);
251          if (canonical == NULL) {
252            LOG(DFATAL) << "Null canonical node";
253            return;
254          }
255          int child_id = canonical->unique_id();
256          if (!uniq_child.has_index(child_id))
257            uniq_child.set_new(child_id, 1);
258          // To the child, we want to add to parent indices.
259          Entry* child_entry = &entries_[child_id];
260          if (!child_entry->parents->has_index(prefilter->unique_id()))
261            child_entry->parents->set_new(prefilter->unique_id(), 1);
262        }
263        entry->propagate_up_at_count =
264            prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
265
266        break;
267      }
268    }
269  }
270
271  // For top level nodes, populate regexp id.
272  for (int i = 0; i < prefilter_vec_.size(); i++) {
273    if (prefilter_vec_[i] == NULL)
274      continue;
275    int id = CanonicalNode(prefilter_vec_[i])->unique_id();
276    DCHECK_LE(0, id);
277    Entry* entry = &entries_[id];
278    entry->regexps.push_back(i);
279  }
280}
281
282// Functions for triggering during search.
283void PrefilterTree::RegexpsGivenStrings(
284    const vector<int>& matched_atoms,
285    vector<int>* regexps) const {
286  regexps->clear();
287  if (!compiled_) {
288    LOG(WARNING) << "Compile() not called";
289    for (int i = 0; i < prefilter_vec_.size(); ++i)
290      regexps->push_back(i);
291  } else {
292    if (!prefilter_vec_.empty()) {
293      IntMap regexps_map(prefilter_vec_.size());
294      vector<int> matched_atom_ids;
295      for (int j = 0; j < matched_atoms.size(); j++) {
296        matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
297        VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
298      }
299      PropagateMatch(matched_atom_ids, &regexps_map);
300      for (IntMap::iterator it = regexps_map.begin();
301           it != regexps_map.end();
302           ++it)
303        regexps->push_back(it->index());
304
305      regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
306    }
307  }
308  sort(regexps->begin(), regexps->end());
309}
310
311void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
312                                   IntMap* regexps) const {
313  IntMap count(entries_.size());
314  IntMap work(entries_.size());
315  for (int i = 0; i < atom_ids.size(); i++)
316    work.set(atom_ids[i], 1);
317  for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
318    const Entry& entry = entries_[it->index()];
319    VLOG(10) << "Processing: " << it->index();
320    // Record regexps triggered.
321    for (int i = 0; i < entry.regexps.size(); i++) {
322      VLOG(10) << "Regexp triggered: " << entry.regexps[i];
323      regexps->set(entry.regexps[i], 1);
324    }
325    int c;
326    // Pass trigger up to parents.
327    for (IntMap::iterator it = entry.parents->begin();
328         it != entry.parents->end();
329         ++it) {
330      int j = it->index();
331      const Entry& parent = entries_[j];
332      VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
333      // Delay until all the children have succeeded.
334      if (parent.propagate_up_at_count > 1) {
335        if (count.has_index(j)) {
336          c = count.get_existing(j) + 1;
337          count.set_existing(j, c);
338        } else {
339          c = 1;
340          count.set_new(j, c);
341        }
342        if (c < parent.propagate_up_at_count)
343          continue;
344      }
345      VLOG(10) << "Triggering: " << j;
346      // Trigger the parent.
347      work.set(j, 1);
348    }
349  }
350}
351
352// Debugging help.
353void PrefilterTree::PrintPrefilter(int regexpid) {
354  LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
355}
356
357void PrefilterTree::PrintDebugInfo() {
358  VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
359  VLOG(10) << "#Unique Nodes: " << entries_.size();
360
361  for (int i = 0; i < entries_.size(); ++i) {
362    IntMap* parents = entries_[i].parents;
363    const vector<int>& regexps = entries_[i].regexps;
364    VLOG(10) << "EntryId: " << i
365            << " N: " << parents->size() << " R: " << regexps.size();
366    for (IntMap::iterator it = parents->begin(); it != parents->end(); ++it)
367      VLOG(10) << it->index();
368  }
369  VLOG(10) << "Map:";
370  for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
371       iter != node_map_.end(); ++iter)
372    VLOG(10) << "NodeId: " << (*iter).second->unique_id()
373            << " Str: " << (*iter).first;
374}
375
376string PrefilterTree::DebugNodeString(Prefilter* node) const {
377  string node_string = "";
378
379  if (node->op() == Prefilter::ATOM) {
380    DCHECK(!node->atom().empty());
381    node_string += node->atom();
382  } else {
383    // Adding the operation disambiguates AND and OR nodes.
384    node_string +=  node->op() == Prefilter::AND ? "AND" : "OR";
385    node_string += "(";
386    for (int i = 0; i < node->subs()->size() ; i++) {
387      if (i > 0)
388        node_string += ',';
389      node_string += Itoa((*node->subs())[i]->unique_id());
390      node_string += ":";
391      node_string += DebugNodeString((*node->subs())[i]);
392    }
393    node_string += ")";
394  }
395  return node_string;
396}
397
398}  // namespace re2
399