15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2009 The RE2 Authors.  All Rights Reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The PrefilterTree class is used to form an AND-OR tree of strings
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// that would trigger each regexp. The 'prefilter' of each regexp is
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// added tp PrefilterTree, and then PrefilterTree is used to find all
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the unique strings across the prefilters. During search, by using
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// matches from a string matching engine, PrefilterTree deduces the
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// set of regexps that are to be triggered. The 'string matching
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// engine' itself is outside of this class, and the caller can use any
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// favorite engine. PrefilterTree provides a set of strings (called
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// atoms) that the user of this class should use to do the string
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// matching.
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef RE2_PREFILTER_TREE_H_
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define RE2_PREFILTER_TREE_H_
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
19c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)#include <map>
20c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/util.h"
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/sparse_array.h"
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)typedef SparseArray<int> IntMap;
27c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)typedef std::map<int, int> StdIntMap;
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Prefilter;
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PrefilterTree {
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PrefilterTree();
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~PrefilterTree();
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Adds the prefilter for the next regexp. Note that we assume that
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Add called sequentially for all regexps. All Add calls
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // must precede Compile.
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Add(Prefilter* prefilter);
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The Compile returns a vector of string in atom_vec.
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Call this after all the prefilters are added through Add.
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // No calls to Add after Compile are allowed.
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The caller should use the returned set of strings to do string matching.
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Each time a string matches, the corresponding index then has to be
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // and passed to RegexpsGivenStrings below.
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Compile(vector<string>* atom_vec);
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Given the indices of the atoms that matched, returns the indexes
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // of regexps that should be searched.  The matched_atoms should
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // contain all the ids of string atoms that were found to match the
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // content. The caller can use any string match engine to perform
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this function. This function is thread safe.
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void RegexpsGivenStrings(const vector<int>& matched_atoms,
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           vector<int>* regexps) const;
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Print debug prefilter. Also prints unique ids associated with
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // nodes of the prefilter of the regexp.
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void PrintPrefilter(int regexpid);
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Each unique node has a corresponding Entry that helps in
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // passing the matching trigger information along the tree.
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  struct Entry {
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   public:
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // How many children should match before this node triggers the
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // parent. For an atom and an OR node, this is 1 and for an AND
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // node, it is the number of unique children.
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int propagate_up_at_count;
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // When this node is ready to trigger the parent, what are the indices
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // of the parent nodes to trigger. The reason there may be more than
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // one is because of sharing. For example (abc | def) and (xyz | def)
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // are two different nodes, but they share the atom 'def'. So when
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // 'def' matches, it triggers two parents, corresponding to the two
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // different OR nodes.
77c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)    StdIntMap* parents;
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // When this node is ready to trigger the parent, what are the
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // regexps that are triggered.
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    vector<int> regexps;
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This function assigns unique ids to various parts of the
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // prefilter, by looking at if these nodes are already in the
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // PrefilterTree.
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void AssignUniqueIds(vector<string>* atom_vec);
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Given the matching atoms, find the regexps to be triggered.
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void PropagateMatch(const vector<int>& atom_ids,
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      IntMap* regexps) const;
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the prefilter node that has the same NodeString as this
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // node. For the canonical node, returns node.
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Prefilter* CanonicalNode(Prefilter* node);
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // A string that uniquely identifies the node. Assumes that the
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // children of node has already been assigned unique ids.
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  string NodeString(Prefilter* node) const;
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Recursively constructs a readable prefilter string.
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  string DebugNodeString(Prefilter* node) const;
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Used for debugging.
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void PrintDebugInfo();
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // These are all the nodes formed by Compile. Essentially, there is
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // one node for each unique atom and each unique AND/OR node.
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  vector<Entry> entries_;
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Map node string to canonical Prefilter node.
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  map<string, Prefilter*> node_map_;
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // indices of regexps that always pass through the filter (since we
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // found no required literals in these regexps).
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  vector<int> unfiltered_;
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // vector of Prefilter for all regexps.
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  vector<Prefilter*> prefilter_vec_;
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Atom index in returned strings to entry id mapping.
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  vector<int> atom_index_to_id_;
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Has the prefilter tree been compiled.
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool compiled_;
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_EVIL_CONSTRUCTORS(PrefilterTree);
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // RE2_PREFILTER_TREE_H_
134