15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2009 The RE2 Authors. All Rights Reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The PrefilterTree class is used to form an AND-OR tree of strings 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// that would trigger each regexp. The 'prefilter' of each regexp is 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// added tp PrefilterTree, and then PrefilterTree is used to find all 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the unique strings across the prefilters. During search, by using 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// matches from a string matching engine, PrefilterTree deduces the 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// set of regexps that are to be triggered. The 'string matching 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// engine' itself is outside of this class, and the caller can use any 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// favorite engine. PrefilterTree provides a set of strings (called 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// atoms) that the user of this class should use to do the string 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// matching. 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef RE2_PREFILTER_TREE_H_ 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define RE2_PREFILTER_TREE_H_ 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include <map> 20c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/util.h" 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/sparse_array.h" 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)typedef SparseArray<int> IntMap; 27c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)typedef std::map<int, int> StdIntMap; 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Prefilter; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PrefilterTree { 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PrefilterTree(); 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~PrefilterTree(); 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Adds the prefilter for the next regexp. Note that we assume that 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add called sequentially for all regexps. All Add calls 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // must precede Compile. 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Add(Prefilter* prefilter); 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The Compile returns a vector of string in atom_vec. 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Call this after all the prefilters are added through Add. 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // No calls to Add after Compile are allowed. 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The caller should use the returned set of strings to do string matching. 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Each time a string matches, the corresponding index then has to be 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and passed to RegexpsGivenStrings below. 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Compile(vector<string>* atom_vec); 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Given the indices of the atoms that matched, returns the indexes 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of regexps that should be searched. The matched_atoms should 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // contain all the ids of string atoms that were found to match the 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // content. The caller can use any string match engine to perform 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // this function. This function is thread safe. 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void RegexpsGivenStrings(const vector<int>& matched_atoms, 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<int>* regexps) const; 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Print debug prefilter. Also prints unique ids associated with 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // nodes of the prefilter of the regexp. 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void PrintPrefilter(int regexpid); 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Each unique node has a corresponding Entry that helps in 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // passing the matching trigger information along the tree. 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct Entry { 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // How many children should match before this node triggers the 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // parent. For an atom and an OR node, this is 1 and for an AND 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // node, it is the number of unique children. 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int propagate_up_at_count; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // When this node is ready to trigger the parent, what are the indices 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of the parent nodes to trigger. The reason there may be more than 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // one is because of sharing. For example (abc | def) and (xyz | def) 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // are two different nodes, but they share the atom 'def'. So when 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 'def' matches, it triggers two parents, corresponding to the two 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // different OR nodes. 77c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) StdIntMap* parents; 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // When this node is ready to trigger the parent, what are the 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // regexps that are triggered. 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<int> regexps; 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This function assigns unique ids to various parts of the 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // prefilter, by looking at if these nodes are already in the 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // PrefilterTree. 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void AssignUniqueIds(vector<string>* atom_vec); 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Given the matching atoms, find the regexps to be triggered. 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void PropagateMatch(const vector<int>& atom_ids, 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) IntMap* regexps) const; 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns the prefilter node that has the same NodeString as this 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // node. For the canonical node, returns node. 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prefilter* CanonicalNode(Prefilter* node); 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // A string that uniquely identifies the node. Assumes that the 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // children of node has already been assigned unique ids. 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string NodeString(Prefilter* node) const; 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Recursively constructs a readable prefilter string. 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string DebugNodeString(Prefilter* node) const; 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Used for debugging. 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void PrintDebugInfo(); 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These are all the nodes formed by Compile. Essentially, there is 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // one node for each unique atom and each unique AND/OR node. 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<Entry> entries_; 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Map node string to canonical Prefilter node. 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) map<string, Prefilter*> node_map_; 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // indices of regexps that always pass through the filter (since we 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // found no required literals in these regexps). 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<int> unfiltered_; 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // vector of Prefilter for all regexps. 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<Prefilter*> prefilter_vec_; 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Atom index in returned strings to entry id mapping. 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<int> atom_index_to_id_; 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Has the prefilter tree been compiled. 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool compiled_; 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree); 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // RE2_PREFILTER_TREE_H_ 134