prefilter_tree.h revision c2e0dbddbe15c98d52c4786dac06cb8952a8ae6d
1// Copyright 2009 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// The PrefilterTree class is used to form an AND-OR tree of strings
6// that would trigger each regexp. The 'prefilter' of each regexp is
7// added tp PrefilterTree, and then PrefilterTree is used to find all
8// the unique strings across the prefilters. During search, by using
9// matches from a string matching engine, PrefilterTree deduces the
10// set of regexps that are to be triggered. The 'string matching
11// engine' itself is outside of this class, and the caller can use any
12// favorite engine. PrefilterTree provides a set of strings (called
13// atoms) that the user of this class should use to do the string
14// matching.
15//
16#ifndef RE2_PREFILTER_TREE_H_
17#define RE2_PREFILTER_TREE_H_
18
19#include <map>
20
21#include "util/util.h"
22#include "util/sparse_array.h"
23
24namespace re2 {
25
26typedef SparseArray<int> IntMap;
27typedef std::map<int, int> StdIntMap;
28
29class Prefilter;
30
31class PrefilterTree {
32 public:
33  PrefilterTree();
34  ~PrefilterTree();
35
36  // Adds the prefilter for the next regexp. Note that we assume that
37  // Add called sequentially for all regexps. All Add calls
38  // must precede Compile.
39  void Add(Prefilter* prefilter);
40
41  // The Compile returns a vector of string in atom_vec.
42  // Call this after all the prefilters are added through Add.
43  // No calls to Add after Compile are allowed.
44  // The caller should use the returned set of strings to do string matching.
45  // Each time a string matches, the corresponding index then has to be
46  // and passed to RegexpsGivenStrings below.
47  void Compile(vector<string>* atom_vec);
48
49  // Given the indices of the atoms that matched, returns the indexes
50  // of regexps that should be searched.  The matched_atoms should
51  // contain all the ids of string atoms that were found to match the
52  // content. The caller can use any string match engine to perform
53  // this function. This function is thread safe.
54  void RegexpsGivenStrings(const vector<int>& matched_atoms,
55                           vector<int>* regexps) const;
56
57  // Print debug prefilter. Also prints unique ids associated with
58  // nodes of the prefilter of the regexp.
59  void PrintPrefilter(int regexpid);
60
61
62  // Each unique node has a corresponding Entry that helps in
63  // passing the matching trigger information along the tree.
64  struct Entry {
65   public:
66    // How many children should match before this node triggers the
67    // parent. For an atom and an OR node, this is 1 and for an AND
68    // node, it is the number of unique children.
69    int propagate_up_at_count;
70
71    // When this node is ready to trigger the parent, what are the indices
72    // of the parent nodes to trigger. The reason there may be more than
73    // one is because of sharing. For example (abc | def) and (xyz | def)
74    // are two different nodes, but they share the atom 'def'. So when
75    // 'def' matches, it triggers two parents, corresponding to the two
76    // different OR nodes.
77    StdIntMap* parents;
78
79    // When this node is ready to trigger the parent, what are the
80    // regexps that are triggered.
81    vector<int> regexps;
82  };
83
84 private:
85  // This function assigns unique ids to various parts of the
86  // prefilter, by looking at if these nodes are already in the
87  // PrefilterTree.
88  void AssignUniqueIds(vector<string>* atom_vec);
89
90  // Given the matching atoms, find the regexps to be triggered.
91  void PropagateMatch(const vector<int>& atom_ids,
92                      IntMap* regexps) const;
93
94  // Returns the prefilter node that has the same NodeString as this
95  // node. For the canonical node, returns node.
96  Prefilter* CanonicalNode(Prefilter* node);
97
98  // A string that uniquely identifies the node. Assumes that the
99  // children of node has already been assigned unique ids.
100  string NodeString(Prefilter* node) const;
101
102  // Recursively constructs a readable prefilter string.
103  string DebugNodeString(Prefilter* node) const;
104
105  // Used for debugging.
106  void PrintDebugInfo();
107
108  // These are all the nodes formed by Compile. Essentially, there is
109  // one node for each unique atom and each unique AND/OR node.
110  vector<Entry> entries_;
111
112  // Map node string to canonical Prefilter node.
113  map<string, Prefilter*> node_map_;
114
115  // indices of regexps that always pass through the filter (since we
116  // found no required literals in these regexps).
117  vector<int> unfiltered_;
118
119  // vector of Prefilter for all regexps.
120  vector<Prefilter*> prefilter_vec_;
121
122  // Atom index in returned strings to entry id mapping.
123  vector<int> atom_index_to_id_;
124
125  // Has the prefilter tree been compiled.
126  bool compiled_;
127
128  DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
129};
130
131}  // namespace
132
133#endif  // RE2_PREFILTER_TREE_H_
134