1// Copyright 2009 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "util/util.h"
6#include "util/flags.h"
7#include "re2/prefilter.h"
8#include "re2/prefilter_tree.h"
9#include "re2/re2.h"
10
11#ifdef WIN32
12#include <stdio.h>
13#define snprintf _snprintf
14#endif
15
16DEFINE_int32(filtered_re2_min_atom_len,
17             3,
18             "Strings less than this length are not stored as atoms");
19
20namespace re2 {
21
22PrefilterTree::PrefilterTree()
23    : compiled_(false) {
24}
25
26PrefilterTree::~PrefilterTree() {
27  for (int i = 0; i < prefilter_vec_.size(); i++)
28    delete prefilter_vec_[i];
29
30  for (int i = 0; i < entries_.size(); i++)
31    delete entries_[i].parents;
32}
33
34// Functions used for adding and Compiling prefilters to the
35// PrefilterTree.
36static bool KeepPart(Prefilter* prefilter, int level) {
37  if (prefilter == NULL)
38    return false;
39
40  switch (prefilter->op()) {
41    default:
42      LOG(DFATAL) << "Unexpected op in KeepPart: "
43                  << prefilter->op();
44      return false;
45
46    case Prefilter::ALL:
47      return false;
48
49    case Prefilter::ATOM:
50      return prefilter->atom().size() >=
51          FLAGS_filtered_re2_min_atom_len;
52
53    case Prefilter::AND: {
54      int j = 0;
55      vector<Prefilter*>* subs = prefilter->subs();
56      for (int i = 0; i < subs->size(); i++)
57        if (KeepPart((*subs)[i], level + 1))
58          (*subs)[j++] = (*subs)[i];
59        else
60          delete (*subs)[i];
61
62      subs->resize(j);
63      return j > 0;
64    }
65
66    case Prefilter::OR:
67      for (int i = 0; i < prefilter->subs()->size(); i++)
68        if (!KeepPart((*prefilter->subs())[i], level + 1))
69          return false;
70      return true;
71  }
72}
73
74void PrefilterTree::Add(Prefilter *f) {
75  if (compiled_) {
76    LOG(DFATAL) << "Add after Compile.";
77    return;
78  }
79  if (f != NULL && !KeepPart(f, 0)) {
80    delete f;
81    f = NULL;
82  }
83
84  prefilter_vec_.push_back(f);
85}
86
87void PrefilterTree::Compile(vector<string>* atom_vec) {
88  if (compiled_) {
89    LOG(DFATAL) << "Compile after Compile.";
90    return;
91  }
92
93  // We do this check to support some legacy uses of
94  // PrefilterTree that call Compile before adding any regexps,
95  // and expect Compile not to have effect.
96  if (prefilter_vec_.empty())
97    return;
98
99  compiled_ = true;
100
101  AssignUniqueIds(atom_vec);
102
103  // Identify nodes that are too common among prefilters and are
104  // triggering too many parents. Then get rid of them if possible.
105  // Note that getting rid of a prefilter node simply means they are
106  // no longer necessary for their parent to trigger; that is, we do
107  // not miss out on any regexps triggering by getting rid of a
108  // prefilter node.
109  for (int i = 0; i < entries_.size(); i++) {
110    StdIntMap* parents = entries_[i].parents;
111    if (parents->size() > 8) {
112      // This one triggers too many things. If all the parents are AND
113      // nodes and have other things guarding them, then get rid of
114      // this trigger. TODO(vsri): Adjust the threshold appropriately,
115      // make it a function of total number of nodes?
116      bool have_other_guard = true;
117      for (StdIntMap::iterator it = parents->begin();
118           it != parents->end(); ++it) {
119        have_other_guard = have_other_guard &&
120            (entries_[it->first].propagate_up_at_count > 1);
121      }
122
123      if (have_other_guard) {
124        for (StdIntMap::iterator it = parents->begin();
125             it != parents->end(); ++it)
126          entries_[it->first].propagate_up_at_count -= 1;
127
128        parents->clear();  // Forget the parents
129      }
130    }
131  }
132
133  PrintDebugInfo();
134}
135
136Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
137  string node_string = NodeString(node);
138  map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
139  if (iter == node_map_.end())
140    return NULL;
141  return (*iter).second;
142}
143
144static string Itoa(int n) {
145  char buf[100];
146  snprintf(buf, sizeof buf, "%d", n);
147  return string(buf);
148}
149
150string PrefilterTree::NodeString(Prefilter* node) const {
151  // Adding the operation disambiguates AND/OR/atom nodes.
152  string s = Itoa(node->op()) + ":";
153  if (node->op() == Prefilter::ATOM) {
154    s += node->atom();
155  } else {
156    for (int i = 0; i < node->subs()->size() ; i++) {
157      if (i > 0)
158        s += ',';
159      s += Itoa((*node->subs())[i]->unique_id());
160    }
161  }
162  return s;
163}
164
165void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
166  atom_vec->clear();
167
168  // Build vector of all filter nodes, sorted topologically
169  // from top to bottom in v.
170  vector<Prefilter*> v;
171
172  // Add the top level nodes of each regexp prefilter.
173  for (int i = 0; i < prefilter_vec_.size(); i++) {
174    Prefilter* f = prefilter_vec_[i];
175    if (f == NULL)
176      unfiltered_.push_back(i);
177
178    // We push NULL also on to v, so that we maintain the
179    // mapping of index==regexpid for level=0 prefilter nodes.
180    v.push_back(f);
181  }
182
183  // Now add all the descendant nodes.
184  for (int i = 0; i < v.size(); i++) {
185    Prefilter* f = v[i];
186    if (f == NULL)
187      continue;
188    if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
189      const vector<Prefilter*>& subs = *f->subs();
190      for (int j = 0; j < subs.size(); j++)
191        v.push_back(subs[j]);
192    }
193  }
194
195  // Identify unique nodes.
196  int unique_id = 0;
197  for (int i = v.size() - 1; i >= 0; i--) {
198    Prefilter *node = v[i];
199    if (node == NULL)
200      continue;
201    node->set_unique_id(-1);
202    Prefilter* canonical = CanonicalNode(node);
203    if (canonical == NULL) {
204      // Any further nodes that have the same node string
205      // will find this node as the canonical node.
206      node_map_[NodeString(node)] = node;
207      if (node->op() == Prefilter::ATOM) {
208        atom_vec->push_back(node->atom());
209        atom_index_to_id_.push_back(unique_id);
210      }
211      node->set_unique_id(unique_id++);
212    } else {
213      node->set_unique_id(canonical->unique_id());
214    }
215  }
216  entries_.resize(node_map_.size());
217
218  // Create parent StdIntMap for the entries.
219  for (int i = v.size()  - 1; i >= 0; i--) {
220    Prefilter* prefilter = v[i];
221    if (prefilter == NULL)
222      continue;
223
224    if (CanonicalNode(prefilter) != prefilter)
225      continue;
226
227    Entry* entry = &entries_[prefilter->unique_id()];
228    entry->parents = new StdIntMap();
229  }
230
231  // Fill the entries.
232  for (int i = v.size()  - 1; i >= 0; i--) {
233    Prefilter* prefilter = v[i];
234    if (prefilter == NULL)
235      continue;
236
237    if (CanonicalNode(prefilter) != prefilter)
238      continue;
239
240    Entry* entry = &entries_[prefilter->unique_id()];
241
242    switch (prefilter->op()) {
243      default:
244      case Prefilter::ALL:
245        LOG(DFATAL) << "Unexpected op: " << prefilter->op();
246        return;
247
248      case Prefilter::ATOM:
249        entry->propagate_up_at_count = 1;
250        break;
251
252      case Prefilter::OR:
253      case Prefilter::AND: {
254        std::set<int> uniq_child;
255        for (int j = 0; j < prefilter->subs()->size() ; j++) {
256          Prefilter* child = (*prefilter->subs())[j];
257          Prefilter* canonical = CanonicalNode(child);
258          if (canonical == NULL) {
259            LOG(DFATAL) << "Null canonical node";
260            return;
261          }
262          int child_id = canonical->unique_id();
263          uniq_child.insert(child_id);
264          // To the child, we want to add to parent indices.
265          Entry* child_entry = &entries_[child_id];
266          if (child_entry->parents->find(prefilter->unique_id()) ==
267              child_entry->parents->end())
268            (*child_entry->parents)[prefilter->unique_id()] = 1;
269        }
270        entry->propagate_up_at_count =
271            prefilter->op() == Prefilter::AND ? uniq_child.size() : 1;
272
273        break;
274      }
275    }
276  }
277
278  // For top level nodes, populate regexp id.
279  for (int i = 0; i < prefilter_vec_.size(); i++) {
280    if (prefilter_vec_[i] == NULL)
281      continue;
282    int id = CanonicalNode(prefilter_vec_[i])->unique_id();
283    DCHECK_LE(0, id);
284    Entry* entry = &entries_[id];
285    entry->regexps.push_back(i);
286  }
287}
288
289// Functions for triggering during search.
290void PrefilterTree::RegexpsGivenStrings(
291    const vector<int>& matched_atoms,
292    vector<int>* regexps) const {
293  regexps->clear();
294  if (!compiled_) {
295    LOG(WARNING) << "Compile() not called";
296    for (int i = 0; i < prefilter_vec_.size(); ++i)
297      regexps->push_back(i);
298  } else {
299    if (!prefilter_vec_.empty()) {
300      IntMap regexps_map(prefilter_vec_.size());
301      vector<int> matched_atom_ids;
302      for (int j = 0; j < matched_atoms.size(); j++) {
303        matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
304        VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
305      }
306      PropagateMatch(matched_atom_ids, &regexps_map);
307      for (IntMap::iterator it = regexps_map.begin();
308           it != regexps_map.end();
309           ++it)
310        regexps->push_back(it->index());
311
312      regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
313    }
314  }
315  sort(regexps->begin(), regexps->end());
316}
317
318void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
319                                   IntMap* regexps) const {
320  IntMap count(entries_.size());
321  IntMap work(entries_.size());
322  for (int i = 0; i < atom_ids.size(); i++)
323    work.set(atom_ids[i], 1);
324  for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
325    const Entry& entry = entries_[it->index()];
326    VLOG(10) << "Processing: " << it->index();
327    // Record regexps triggered.
328    for (int i = 0; i < entry.regexps.size(); i++) {
329      VLOG(10) << "Regexp triggered: " << entry.regexps[i];
330      regexps->set(entry.regexps[i], 1);
331    }
332    int c;
333    // Pass trigger up to parents.
334    for (StdIntMap::iterator it = entry.parents->begin();
335         it != entry.parents->end();
336         ++it) {
337      int j = it->first;
338      const Entry& parent = entries_[j];
339      VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
340      // Delay until all the children have succeeded.
341      if (parent.propagate_up_at_count > 1) {
342        if (count.has_index(j)) {
343          c = count.get_existing(j) + 1;
344          count.set_existing(j, c);
345        } else {
346          c = 1;
347          count.set_new(j, c);
348        }
349        if (c < parent.propagate_up_at_count)
350          continue;
351      }
352      VLOG(10) << "Triggering: " << j;
353      // Trigger the parent.
354      work.set(j, 1);
355    }
356  }
357}
358
359// Debugging help.
360void PrefilterTree::PrintPrefilter(int regexpid) {
361  LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
362}
363
364void PrefilterTree::PrintDebugInfo() {
365  VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
366  VLOG(10) << "#Unique Nodes: " << entries_.size();
367
368  for (int i = 0; i < entries_.size(); ++i) {
369    StdIntMap* parents = entries_[i].parents;
370    const vector<int>& regexps = entries_[i].regexps;
371    VLOG(10) << "EntryId: " << i
372            << " N: " << parents->size() << " R: " << regexps.size();
373    for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
374      VLOG(10) << it->first;
375  }
376  VLOG(10) << "Map:";
377  for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
378       iter != node_map_.end(); ++iter)
379    VLOG(10) << "NodeId: " << (*iter).second->unique_id()
380            << " Str: " << (*iter).first;
381}
382
383string PrefilterTree::DebugNodeString(Prefilter* node) const {
384  string node_string = "";
385
386  if (node->op() == Prefilter::ATOM) {
387    DCHECK(!node->atom().empty());
388    node_string += node->atom();
389  } else {
390    // Adding the operation disambiguates AND and OR nodes.
391    node_string +=  node->op() == Prefilter::AND ? "AND" : "OR";
392    node_string += "(";
393    for (int i = 0; i < node->subs()->size() ; i++) {
394      if (i > 0)
395        node_string += ',';
396      node_string += Itoa((*node->subs())[i]->unique_id());
397      node_string += ":";
398      node_string += DebugNodeString((*node->subs())[i]);
399    }
400    node_string += ")";
401  }
402  return node_string;
403}
404
405}  // namespace re2
406