1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "components/url_matcher/substring_set_matcher.h"
6
7#include <algorithm>
8#include <queue>
9
10#include "base/logging.h"
11#include "base/stl_util.h"
12
13namespace url_matcher {
14
15namespace {
16
17// Compare StringPattern instances based on their string patterns.
18bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
19  return a->pattern() < b->pattern();
20}
21
22// Given the set of patterns, compute how many nodes will the corresponding
23// Aho-Corasick tree have. Note that |patterns| need to be sorted.
24uint32 TreeSize(const std::vector<const StringPattern*>& patterns) {
25  uint32 result = 1u;  // 1 for the root node.
26  if (patterns.empty())
27    return result;
28
29  std::vector<const StringPattern*>::const_iterator last = patterns.begin();
30  std::vector<const StringPattern*>::const_iterator current = last + 1;
31  // For the first pattern, each letter is a label of an edge to a new node.
32  result += (*last)->pattern().size();
33
34  // For the subsequent patterns, only count the edges which were not counted
35  // yet. For this it suffices to test against the previous pattern, because the
36  // patterns are sorted.
37  for (; current != patterns.end(); ++last, ++current) {
38    const std::string& last_pattern = (*last)->pattern();
39    const std::string& current_pattern = (*current)->pattern();
40    const uint32 prefix_bound =
41        std::min(last_pattern.size(), current_pattern.size());
42
43    uint32 common_prefix = 0;
44    while (common_prefix < prefix_bound &&
45           last_pattern[common_prefix] == current_pattern[common_prefix])
46      ++common_prefix;
47    result += current_pattern.size() - common_prefix;
48  }
49  return result;
50}
51
52}  // namespace
53
54//
55// SubstringSetMatcher
56//
57
58SubstringSetMatcher::SubstringSetMatcher() {
59  RebuildAhoCorasickTree(SubstringPatternVector());
60}
61
62SubstringSetMatcher::~SubstringSetMatcher() {}
63
64void SubstringSetMatcher::RegisterPatterns(
65    const std::vector<const StringPattern*>& patterns) {
66  RegisterAndUnregisterPatterns(patterns,
67                                std::vector<const StringPattern*>());
68}
69
70void SubstringSetMatcher::UnregisterPatterns(
71    const std::vector<const StringPattern*>& patterns) {
72  RegisterAndUnregisterPatterns(std::vector<const StringPattern*>(),
73                                patterns);
74}
75
76void SubstringSetMatcher::RegisterAndUnregisterPatterns(
77      const std::vector<const StringPattern*>& to_register,
78      const std::vector<const StringPattern*>& to_unregister) {
79  // Register patterns.
80  for (std::vector<const StringPattern*>::const_iterator i =
81      to_register.begin(); i != to_register.end(); ++i) {
82    DCHECK(patterns_.find((*i)->id()) == patterns_.end());
83    patterns_[(*i)->id()] = *i;
84  }
85
86  // Unregister patterns
87  for (std::vector<const StringPattern*>::const_iterator i =
88      to_unregister.begin(); i != to_unregister.end(); ++i) {
89    patterns_.erase((*i)->id());
90  }
91
92  // Now we compute the total number of tree nodes needed.
93  SubstringPatternVector sorted_patterns;
94  sorted_patterns.resize(patterns_.size());
95
96  size_t next = 0;
97  for (SubstringPatternMap::const_iterator i = patterns_.begin();
98       i != patterns_.end();
99       ++i, ++next) {
100    sorted_patterns[next] = i->second;
101  }
102
103  std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns);
104  tree_.reserve(TreeSize(sorted_patterns));
105
106  RebuildAhoCorasickTree(sorted_patterns);
107}
108
109bool SubstringSetMatcher::Match(const std::string& text,
110                                std::set<StringPattern::ID>* matches) const {
111  const size_t old_number_of_matches = matches->size();
112
113  // Handle patterns matching the empty string.
114  matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
115
116  uint32 current_node = 0;
117  for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) {
118    uint32 edge_from_current = tree_[current_node].GetEdge(*i);
119    while (edge_from_current == AhoCorasickNode::kNoSuchEdge &&
120           current_node != 0) {
121      current_node = tree_[current_node].failure();
122      edge_from_current = tree_[current_node].GetEdge(*i);
123    }
124    if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
125      current_node = edge_from_current;
126      matches->insert(tree_[current_node].matches().begin(),
127                      tree_[current_node].matches().end());
128    } else {
129      DCHECK_EQ(0u, current_node);
130    }
131  }
132
133  return old_number_of_matches != matches->size();
134}
135
136bool SubstringSetMatcher::IsEmpty() const {
137  // An empty tree consists of only the root node.
138  return patterns_.empty() && tree_.size() == 1u;
139}
140
141void SubstringSetMatcher::RebuildAhoCorasickTree(
142    const SubstringPatternVector& sorted_patterns) {
143  tree_.clear();
144
145  // Initialize root note of tree.
146  AhoCorasickNode root;
147  root.set_failure(0);
148  tree_.push_back(root);
149
150  // Insert all patterns.
151  for (SubstringPatternVector::const_iterator i = sorted_patterns.begin();
152       i != sorted_patterns.end();
153       ++i) {
154    InsertPatternIntoAhoCorasickTree(*i);
155  }
156
157  CreateFailureEdges();
158}
159
160void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
161    const StringPattern* pattern) {
162  const std::string& text = pattern->pattern();
163  const std::string::const_iterator text_end = text.end();
164
165  // Iterators on the tree and the text.
166  uint32 current_node = 0;
167  std::string::const_iterator i = text.begin();
168
169  // Follow existing paths for as long as possible.
170  while (i != text_end) {
171    uint32 edge_from_current = tree_[current_node].GetEdge(*i);
172    if (edge_from_current == AhoCorasickNode::kNoSuchEdge)
173      break;
174    current_node = edge_from_current;
175    ++i;
176  }
177
178  // Create new nodes if necessary.
179  while (i != text_end) {
180    tree_.push_back(AhoCorasickNode());
181    tree_[current_node].SetEdge(*i, tree_.size() - 1);
182    current_node = tree_.size() - 1;
183    ++i;
184  }
185
186  // Register match.
187  tree_[current_node].AddMatch(pattern->id());
188}
189
190void SubstringSetMatcher::CreateFailureEdges() {
191  typedef AhoCorasickNode::Edges Edges;
192
193  std::queue<uint32> queue;
194
195  AhoCorasickNode& root = tree_[0];
196  root.set_failure(0);
197  const Edges& root_edges = root.edges();
198  for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end();
199       ++e) {
200    const uint32& leads_to = e->second;
201    tree_[leads_to].set_failure(0);
202    queue.push(leads_to);
203  }
204
205  while (!queue.empty()) {
206    AhoCorasickNode& current_node = tree_[queue.front()];
207    queue.pop();
208    for (Edges::const_iterator e = current_node.edges().begin();
209         e != current_node.edges().end(); ++e) {
210      const char& edge_label = e->first;
211      const uint32& leads_to = e->second;
212      queue.push(leads_to);
213
214      uint32 failure = current_node.failure();
215      uint32 edge_from_failure = tree_[failure].GetEdge(edge_label);
216      while (edge_from_failure == AhoCorasickNode::kNoSuchEdge &&
217             failure != 0) {
218        failure = tree_[failure].failure();
219        edge_from_failure = tree_[failure].GetEdge(edge_label);
220      }
221
222      const uint32 follow_in_case_of_failure =
223          edge_from_failure != AhoCorasickNode::kNoSuchEdge
224              ? edge_from_failure
225              : 0;
226      tree_[leads_to].set_failure(follow_in_case_of_failure);
227      tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
228    }
229  }
230}
231
232const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF;
233
234SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
235    : failure_(kNoSuchEdge) {}
236
237SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
238
239SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(
240    const SubstringSetMatcher::AhoCorasickNode& other)
241    : edges_(other.edges_),
242      failure_(other.failure_),
243      matches_(other.matches_) {}
244
245SubstringSetMatcher::AhoCorasickNode&
246SubstringSetMatcher::AhoCorasickNode::operator=(
247    const SubstringSetMatcher::AhoCorasickNode& other) {
248  edges_ = other.edges_;
249  failure_ = other.failure_;
250  matches_ = other.matches_;
251  return *this;
252}
253
254uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
255  Edges::const_iterator i = edges_.find(c);
256  return i == edges_.end() ? kNoSuchEdge : i->second;
257}
258
259void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) {
260  edges_[c] = node;
261}
262
263void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) {
264  matches_.insert(id);
265}
266
267void SubstringSetMatcher::AhoCorasickNode::AddMatches(
268    const SubstringSetMatcher::AhoCorasickNode::Matches& matches) {
269  matches_.insert(matches.begin(), matches.end());
270}
271
272}  // namespace url_matcher
273