1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_
6#define EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_
7
8#include <limits>
9#include <map>
10#include <set>
11#include <string>
12#include <vector>
13
14#include "base/basictypes.h"
15#include "extensions/common/matcher/string_pattern.h"
16
17namespace extensions {
18
19// Class that store a set of string patterns and can find for a string S,
20// which string patterns occur in S.
21class SubstringSetMatcher {
22 public:
23  SubstringSetMatcher();
24  ~SubstringSetMatcher();
25
26  // Registers all |patterns|. The ownership remains with the caller.
27  // The same pattern cannot be registered twice and each pattern needs to have
28  // a unique ID.
29  // Ownership of the patterns remains with the caller.
30  void RegisterPatterns(const std::vector<const StringPattern*>& patterns);
31
32  // Unregisters the passed |patterns|.
33  void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);
34
35  // Analogous to RegisterPatterns and UnregisterPatterns but executes both
36  // operations in one step, which is cheaper in the execution.
37  void RegisterAndUnregisterPatterns(
38      const std::vector<const StringPattern*>& to_register,
39      const std::vector<const StringPattern*>& to_unregister);
40
41  // Matches |text| against all registered StringPatterns. Stores the IDs
42  // of matching patterns in |matches|. |matches| is not cleared before adding
43  // to it.
44  bool Match(const std::string& text,
45             std::set<StringPattern::ID>* matches) const;
46
47  // Returns true if this object retains no allocated data. Only for debugging.
48  bool IsEmpty() const;
49
50 private:
51  // A node of an Aho Corasick Tree. This is implemented according to
52  // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
53  //
54  // The algorithm is based on the idea of building a trie of all registered
55  // patterns. Each node of the tree is annotated with a set of pattern
56  // IDs that are used to report matches.
57  //
58  // The root of the trie represents an empty match. If we were looking whether
59  // any registered pattern matches a text at the beginning of the text (i.e.
60  // whether any pattern is a prefix of the text), we could just follow
61  // nodes in the trie according to the matching characters in the text.
62  // E.g., if text == "foobar", we would follow the trie from the root node
63  // to its child labeled 'f', from there to child 'o', etc. In this process we
64  // would report all pattern IDs associated with the trie nodes as matches.
65  //
66  // As we are not looking for all prefix matches but all substring matches,
67  // this algorithm would need to compare text.substr(0), text.substr(1), ...
68  // against the trie, which is in O(|text|^2).
69  //
70  // The Aho Corasick algorithm improves this runtime by using failure edges.
71  // In case we have found a partial match of length k in the text
72  // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
73  // a node at depth k, but cannot find a match in the trie for character
74  // text[i + k] at depth k + 1, we follow a failure edge. This edge
75  // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
76  // is a prefix of any registered pattern.
77  //
78  // If your brain thinks "Forget it, let's go shopping.", don't worry.
79  // Take a nap and read an introductory text on the Aho Corasick algorithm.
80  // It will make sense. Eventually.
81  class AhoCorasickNode {
82   public:
83    // Key: label of the edge, value: node index in |tree_| of parent class.
84    typedef std::map<char, uint32> Edges;
85    typedef std::set<StringPattern::ID> Matches;
86
87    static const uint32 kNoSuchEdge;  // Represents an invalid node index.
88
89    AhoCorasickNode();
90    ~AhoCorasickNode();
91    AhoCorasickNode(const AhoCorasickNode& other);
92    AhoCorasickNode& operator=(const AhoCorasickNode& other);
93
94    uint32 GetEdge(char c) const;
95    void SetEdge(char c, uint32 node);
96    const Edges& edges() const { return edges_; }
97
98    uint32 failure() const { return failure_; }
99    void set_failure(uint32 failure) { failure_ = failure; }
100
101    void AddMatch(StringPattern::ID id);
102    void AddMatches(const Matches& matches);
103    const Matches& matches() const { return matches_; }
104
105   private:
106    // Outgoing edges of current node.
107    Edges edges_;
108
109    // Node index that failure edge leads to.
110    uint32 failure_;
111
112    // Identifiers of matches.
113    Matches matches_;
114  };
115
116  typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap;
117  typedef std::vector<const StringPattern*> SubstringPatternVector;
118
119  // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
120  void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);
121
122  // Inserts a path for |pattern->pattern()| into the tree and adds
123  // |pattern->id()| to the set of matches. Ownership of |pattern| remains with
124  // the caller.
125  void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
126  void CreateFailureEdges();
127
128  // Set of all registered StringPatterns. Used to regenerate the
129  // Aho-Corasick tree in case patterns are registered or unregistered.
130  SubstringPatternMap patterns_;
131
132  // The nodes of a Aho-Corasick tree.
133  std::vector<AhoCorasickNode> tree_;
134
135  DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
136};
137
138}  // namespace extensions
139
140#endif  // EXTENSIONS_COMMON_MATCHER_SUBSTRING_SET_MATCHER_H_
141