1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
6#define COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
7
8#include <limits>
9#include <map>
10#include <set>
11#include <string>
12#include <vector>
13
14#include "base/basictypes.h"
15#include "components/url_matcher/string_pattern.h"
16#include "components/url_matcher/url_matcher_export.h"
17
18namespace url_matcher {
19
20// Class that store a set of string patterns and can find for a string S,
21// which string patterns occur in S.
22class URL_MATCHER_EXPORT SubstringSetMatcher {
23 public:
24  SubstringSetMatcher();
25  ~SubstringSetMatcher();
26
27  // Registers all |patterns|. The ownership remains with the caller.
28  // The same pattern cannot be registered twice and each pattern needs to have
29  // a unique ID.
30  // Ownership of the patterns remains with the caller.
31  void RegisterPatterns(const std::vector<const StringPattern*>& patterns);
32
33  // Unregisters the passed |patterns|.
34  void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);
35
36  // Analogous to RegisterPatterns and UnregisterPatterns but executes both
37  // operations in one step, which is cheaper in the execution.
38  void RegisterAndUnregisterPatterns(
39      const std::vector<const StringPattern*>& to_register,
40      const std::vector<const StringPattern*>& to_unregister);
41
42  // Matches |text| against all registered StringPatterns. Stores the IDs
43  // of matching patterns in |matches|. |matches| is not cleared before adding
44  // to it.
45  bool Match(const std::string& text,
46             std::set<StringPattern::ID>* matches) const;
47
48  // Returns true if this object retains no allocated data. Only for debugging.
49  bool IsEmpty() const;
50
51 private:
52  // A node of an Aho Corasick Tree. This is implemented according to
53  // http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
54  //
55  // The algorithm is based on the idea of building a trie of all registered
56  // patterns. Each node of the tree is annotated with a set of pattern
57  // IDs that are used to report matches.
58  //
59  // The root of the trie represents an empty match. If we were looking whether
60  // any registered pattern matches a text at the beginning of the text (i.e.
61  // whether any pattern is a prefix of the text), we could just follow
62  // nodes in the trie according to the matching characters in the text.
63  // E.g., if text == "foobar", we would follow the trie from the root node
64  // to its child labeled 'f', from there to child 'o', etc. In this process we
65  // would report all pattern IDs associated with the trie nodes as matches.
66  //
67  // As we are not looking for all prefix matches but all substring matches,
68  // this algorithm would need to compare text.substr(0), text.substr(1), ...
69  // against the trie, which is in O(|text|^2).
70  //
71  // The Aho Corasick algorithm improves this runtime by using failure edges.
72  // In case we have found a partial match of length k in the text
73  // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
74  // a node at depth k, but cannot find a match in the trie for character
75  // text[i + k] at depth k + 1, we follow a failure edge. This edge
76  // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
77  // is a prefix of any registered pattern.
78  //
79  // If your brain thinks "Forget it, let's go shopping.", don't worry.
80  // Take a nap and read an introductory text on the Aho Corasick algorithm.
81  // It will make sense. Eventually.
82  class AhoCorasickNode {
83   public:
84    // Key: label of the edge, value: node index in |tree_| of parent class.
85    typedef std::map<char, uint32> Edges;
86    typedef std::set<StringPattern::ID> Matches;
87
88    static const uint32 kNoSuchEdge;  // Represents an invalid node index.
89
90    AhoCorasickNode();
91    ~AhoCorasickNode();
92    AhoCorasickNode(const AhoCorasickNode& other);
93    AhoCorasickNode& operator=(const AhoCorasickNode& other);
94
95    uint32 GetEdge(char c) const;
96    void SetEdge(char c, uint32 node);
97    const Edges& edges() const { return edges_; }
98
99    uint32 failure() const { return failure_; }
100    void set_failure(uint32 failure) { failure_ = failure; }
101
102    void AddMatch(StringPattern::ID id);
103    void AddMatches(const Matches& matches);
104    const Matches& matches() const { return matches_; }
105
106   private:
107    // Outgoing edges of current node.
108    Edges edges_;
109
110    // Node index that failure edge leads to.
111    uint32 failure_;
112
113    // Identifiers of matches.
114    Matches matches_;
115  };
116
117  typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap;
118  typedef std::vector<const StringPattern*> SubstringPatternVector;
119
120  // |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
121  void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);
122
123  // Inserts a path for |pattern->pattern()| into the tree and adds
124  // |pattern->id()| to the set of matches. Ownership of |pattern| remains with
125  // the caller.
126  void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
127  void CreateFailureEdges();
128
129  // Set of all registered StringPatterns. Used to regenerate the
130  // Aho-Corasick tree in case patterns are registered or unregistered.
131  SubstringPatternMap patterns_;
132
133  // The nodes of a Aho-Corasick tree.
134  std::vector<AhoCorasickNode> tree_;
135
136  DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
137};
138
139}  // namespace url_matcher
140
141#endif  // COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
142