1// Copyright 2009 The RE2 Authors. All Rights Reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// The PrefilterTree class is used to form an AND-OR tree of strings 6// that would trigger each regexp. The 'prefilter' of each regexp is 7// added tp PrefilterTree, and then PrefilterTree is used to find all 8// the unique strings across the prefilters. During search, by using 9// matches from a string matching engine, PrefilterTree deduces the 10// set of regexps that are to be triggered. The 'string matching 11// engine' itself is outside of this class, and the caller can use any 12// favorite engine. PrefilterTree provides a set of strings (called 13// atoms) that the user of this class should use to do the string 14// matching. 15// 16#ifndef RE2_PREFILTER_TREE_H_ 17#define RE2_PREFILTER_TREE_H_ 18 19#include <map> 20 21#include "util/util.h" 22#include "util/sparse_array.h" 23 24namespace re2 { 25 26typedef SparseArray<int> IntMap; 27typedef std::map<int, int> StdIntMap; 28 29class Prefilter; 30 31class PrefilterTree { 32 public: 33 PrefilterTree(); 34 ~PrefilterTree(); 35 36 // Adds the prefilter for the next regexp. Note that we assume that 37 // Add called sequentially for all regexps. All Add calls 38 // must precede Compile. 39 void Add(Prefilter* prefilter); 40 41 // The Compile returns a vector of string in atom_vec. 42 // Call this after all the prefilters are added through Add. 43 // No calls to Add after Compile are allowed. 44 // The caller should use the returned set of strings to do string matching. 45 // Each time a string matches, the corresponding index then has to be 46 // and passed to RegexpsGivenStrings below. 47 void Compile(vector<string>* atom_vec); 48 49 // Given the indices of the atoms that matched, returns the indexes 50 // of regexps that should be searched. The matched_atoms should 51 // contain all the ids of string atoms that were found to match the 52 // content. The caller can use any string match engine to perform 53 // this function. This function is thread safe. 54 void RegexpsGivenStrings(const vector<int>& matched_atoms, 55 vector<int>* regexps) const; 56 57 // Print debug prefilter. Also prints unique ids associated with 58 // nodes of the prefilter of the regexp. 59 void PrintPrefilter(int regexpid); 60 61 62 // Each unique node has a corresponding Entry that helps in 63 // passing the matching trigger information along the tree. 64 struct Entry { 65 public: 66 // How many children should match before this node triggers the 67 // parent. For an atom and an OR node, this is 1 and for an AND 68 // node, it is the number of unique children. 69 int propagate_up_at_count; 70 71 // When this node is ready to trigger the parent, what are the indices 72 // of the parent nodes to trigger. The reason there may be more than 73 // one is because of sharing. For example (abc | def) and (xyz | def) 74 // are two different nodes, but they share the atom 'def'. So when 75 // 'def' matches, it triggers two parents, corresponding to the two 76 // different OR nodes. 77 StdIntMap* parents; 78 79 // When this node is ready to trigger the parent, what are the 80 // regexps that are triggered. 81 vector<int> regexps; 82 }; 83 84 private: 85 // This function assigns unique ids to various parts of the 86 // prefilter, by looking at if these nodes are already in the 87 // PrefilterTree. 88 void AssignUniqueIds(vector<string>* atom_vec); 89 90 // Given the matching atoms, find the regexps to be triggered. 91 void PropagateMatch(const vector<int>& atom_ids, 92 IntMap* regexps) const; 93 94 // Returns the prefilter node that has the same NodeString as this 95 // node. For the canonical node, returns node. 96 Prefilter* CanonicalNode(Prefilter* node); 97 98 // A string that uniquely identifies the node. Assumes that the 99 // children of node has already been assigned unique ids. 100 string NodeString(Prefilter* node) const; 101 102 // Recursively constructs a readable prefilter string. 103 string DebugNodeString(Prefilter* node) const; 104 105 // Used for debugging. 106 void PrintDebugInfo(); 107 108 // These are all the nodes formed by Compile. Essentially, there is 109 // one node for each unique atom and each unique AND/OR node. 110 vector<Entry> entries_; 111 112 // Map node string to canonical Prefilter node. 113 map<string, Prefilter*> node_map_; 114 115 // indices of regexps that always pass through the filter (since we 116 // found no required literals in these regexps). 117 vector<int> unfiltered_; 118 119 // vector of Prefilter for all regexps. 120 vector<Prefilter*> prefilter_vec_; 121 122 // Atom index in returned strings to entry id mapping. 123 vector<int> atom_index_to_id_; 124 125 // Has the prefilter tree been compiled. 126 bool compiled_; 127 128 DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree); 129}; 130 131} // namespace 132 133#endif // RE2_PREFILTER_TREE_H_ 134