prefilter_tree.h revision 5821806d5e7f356e8fa4b058a389a808ea183019
1// Copyright 2009 The RE2 Authors.  All Rights Reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// The PrefilterTree class is used to form an AND-OR tree of strings
6// that would trigger each regexp. The 'prefilter' of each regexp is
7// added tp PrefilterTree, and then PrefilterTree is used to find all
8// the unique strings across the prefilters. During search, by using
9// matches from a string matching engine, PrefilterTree deduces the
10// set of regexps that are to be triggered. The 'string matching
11// engine' itself is outside of this class, and the caller can use any
12// favorite engine. PrefilterTree provides a set of strings (called
13// atoms) that the user of this class should use to do the string
14// matching.
15//
16#ifndef RE2_PREFILTER_TREE_H_
17#define RE2_PREFILTER_TREE_H_
18
19#include "util/util.h"
20#include "util/sparse_array.h"
21
22namespace re2 {
23
24typedef SparseArray<int> IntMap;
25
26class Prefilter;
27
28class PrefilterTree {
29 public:
30  PrefilterTree();
31  ~PrefilterTree();
32
33  // Adds the prefilter for the next regexp. Note that we assume that
34  // Add called sequentially for all regexps. All Add calls
35  // must precede Compile.
36  void Add(Prefilter* prefilter);
37
38  // The Compile returns a vector of string in atom_vec.
39  // Call this after all the prefilters are added through Add.
40  // No calls to Add after Compile are allowed.
41  // The caller should use the returned set of strings to do string matching.
42  // Each time a string matches, the corresponding index then has to be
43  // and passed to RegexpsGivenStrings below.
44  void Compile(vector<string>* atom_vec);
45
46  // Given the indices of the atoms that matched, returns the indexes
47  // of regexps that should be searched.  The matched_atoms should
48  // contain all the ids of string atoms that were found to match the
49  // content. The caller can use any string match engine to perform
50  // this function. This function is thread safe.
51  void RegexpsGivenStrings(const vector<int>& matched_atoms,
52                           vector<int>* regexps) const;
53
54  // Print debug prefilter. Also prints unique ids associated with
55  // nodes of the prefilter of the regexp.
56  void PrintPrefilter(int regexpid);
57
58
59  // Each unique node has a corresponding Entry that helps in
60  // passing the matching trigger information along the tree.
61  struct Entry {
62   public:
63    // How many children should match before this node triggers the
64    // parent. For an atom and an OR node, this is 1 and for an AND
65    // node, it is the number of unique children.
66    int propagate_up_at_count;
67
68    // When this node is ready to trigger the parent, what are the indices
69    // of the parent nodes to trigger. The reason there may be more than
70    // one is because of sharing. For example (abc | def) and (xyz | def)
71    // are two different nodes, but they share the atom 'def'. So when
72    // 'def' matches, it triggers two parents, corresponding to the two
73    // different OR nodes.
74    IntMap* parents;
75
76    // When this node is ready to trigger the parent, what are the
77    // regexps that are triggered.
78    vector<int> regexps;
79  };
80
81 private:
82  // This function assigns unique ids to various parts of the
83  // prefilter, by looking at if these nodes are already in the
84  // PrefilterTree.
85  void AssignUniqueIds(vector<string>* atom_vec);
86
87  // Given the matching atoms, find the regexps to be triggered.
88  void PropagateMatch(const vector<int>& atom_ids,
89                      IntMap* regexps) const;
90
91  // Returns the prefilter node that has the same NodeString as this
92  // node. For the canonical node, returns node.
93  Prefilter* CanonicalNode(Prefilter* node);
94
95  // A string that uniquely identifies the node. Assumes that the
96  // children of node has already been assigned unique ids.
97  string NodeString(Prefilter* node) const;
98
99  // Recursively constructs a readable prefilter string.
100  string DebugNodeString(Prefilter* node) const;
101
102  // Used for debugging.
103  void PrintDebugInfo();
104
105  // These are all the nodes formed by Compile. Essentially, there is
106  // one node for each unique atom and each unique AND/OR node.
107  vector<Entry> entries_;
108
109  // Map node string to canonical Prefilter node.
110  map<string, Prefilter*> node_map_;
111
112  // indices of regexps that always pass through the filter (since we
113  // found no required literals in these regexps).
114  vector<int> unfiltered_;
115
116  // vector of Prefilter for all regexps.
117  vector<Prefilter*> prefilter_vec_;
118
119  // Atom index in returned strings to entry id mapping.
120  vector<int> atom_index_to_id_;
121
122  // Has the prefilter tree been compiled.
123  bool compiled_;
124
125  DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
126};
127
128}  // namespace
129
130#endif  // RE2_PREFILTER_TREE_H_
131