1// Copyright (c) 2012 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifndef EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_ 6#define EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_ 7 8#include <limits> 9#include <map> 10#include <set> 11#include <string> 12#include <vector> 13 14#include "base/basictypes.h" 15#include "extensions/common/matcher/string_pattern.h" 16 17namespace extensions { 18 19// Class that store a set of string patterns and can find for a string S, 20// which string patterns occur in S. 21class SubstringSetMatcher { 22 public: 23 SubstringSetMatcher(); 24 ~SubstringSetMatcher(); 25 26 // Registers all |patterns|. The ownership remains with the caller. 27 // The same pattern cannot be registered twice and each pattern needs to have 28 // a unique ID. 29 // Ownership of the patterns remains with the caller. 30 void RegisterPatterns(const std::vector<const StringPattern*>& patterns); 31 32 // Unregisters the passed |patterns|. 33 void UnregisterPatterns(const std::vector<const StringPattern*>& patterns); 34 35 // Analogous to RegisterPatterns and UnregisterPatterns but executes both 36 // operations in one step, which is cheaper in the execution. 37 void RegisterAndUnregisterPatterns( 38 const std::vector<const StringPattern*>& to_register, 39 const std::vector<const StringPattern*>& to_unregister); 40 41 // Matches |text| against all registered StringPatterns. Stores the IDs 42 // of matching patterns in |matches|. |matches| is not cleared before adding 43 // to it. 44 bool Match(const std::string& text, 45 std::set<StringPattern::ID>* matches) const; 46 47 // Returns true if this object retains no allocated data. Only for debugging. 48 bool IsEmpty() const; 49 50 private: 51 // A node of an Aho Corasick Tree. This is implemented according to 52 // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf 53 // 54 // The algorithm is based on the idea of building a trie of all registered 55 // patterns. Each node of the tree is annotated with a set of pattern 56 // IDs that are used to report matches. 57 // 58 // The root of the trie represents an empty match. If we were looking whether 59 // any registered pattern matches a text at the beginning of the text (i.e. 60 // whether any pattern is a prefix of the text), we could just follow 61 // nodes in the trie according to the matching characters in the text. 62 // E.g., if text == "foobar", we would follow the trie from the root node 63 // to its child labeled 'f', from there to child 'o', etc. In this process we 64 // would report all pattern IDs associated with the trie nodes as matches. 65 // 66 // As we are not looking for all prefix matches but all substring matches, 67 // this algorithm would need to compare text.substr(0), text.substr(1), ... 68 // against the trie, which is in O(|text|^2). 69 // 70 // The Aho Corasick algorithm improves this runtime by using failure edges. 71 // In case we have found a partial match of length k in the text 72 // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at 73 // a node at depth k, but cannot find a match in the trie for character 74 // text[i + k] at depth k + 1, we follow a failure edge. This edge 75 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that 76 // is a prefix of any registered pattern. 77 // 78 // If your brain thinks "Forget it, let's go shopping.", don't worry. 79 // Take a nap and read an introductory text on the Aho Corasick algorithm. 80 // It will make sense. Eventually. 81 class AhoCorasickNode { 82 public: 83 // Key: label of the edge, value: node index in |tree_| of parent class. 84 typedef std::map<char, uint32> Edges; 85 typedef std::set<StringPattern::ID> Matches; 86 87 static const uint32 kNoSuchEdge; // Represents an invalid node index. 88 89 AhoCorasickNode(); 90 ~AhoCorasickNode(); 91 AhoCorasickNode(const AhoCorasickNode& other); 92 AhoCorasickNode& operator=(const AhoCorasickNode& other); 93 94 uint32 GetEdge(char c) const; 95 void SetEdge(char c, uint32 node); 96 const Edges& edges() const { return edges_; } 97 98 uint32 failure() const { return failure_; } 99 void set_failure(uint32 failure) { failure_ = failure; } 100 101 void AddMatch(StringPattern::ID id); 102 void AddMatches(const Matches& matches); 103 const Matches& matches() const { return matches_; } 104 105 private: 106 // Outgoing edges of current node. 107 Edges edges_; 108 109 // Node index that failure edge leads to. 110 uint32 failure_; 111 112 // Identifiers of matches. 113 Matches matches_; 114 }; 115 116 typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap; 117 typedef std::vector<const StringPattern*> SubstringPatternVector; 118 119 // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string. 120 void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns); 121 122 // Inserts a path for |pattern->pattern()| into the tree and adds 123 // |pattern->id()| to the set of matches. Ownership of |pattern| remains with 124 // the caller. 125 void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern); 126 void CreateFailureEdges(); 127 128 // Set of all registered StringPatterns. Used to regenerate the 129 // Aho-Corasick tree in case patterns are registered or unregistered. 130 SubstringPatternMap patterns_; 131 132 // The nodes of a Aho-Corasick tree. 133 std::vector<AhoCorasickNode> tree_; 134 135 DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher); 136}; 137 138} // namespace extensions 139 140#endif // EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_ 141