15d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Copyright 2013 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#ifndef COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_
65d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map>
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set>
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "components/url_matcher/string_pattern.h"
155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "components/url_matcher/substring_set_matcher.h"
165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "components/url_matcher/url_matcher_export.h"
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FilteredRE2;
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace url_matcher {
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Efficiently matches URLs against a collection of regular expressions,
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// using FilteredRE2 to reduce the number of regexes that must be matched
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// by pre-filtering with substring matching. See:
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// http://swtch.com/~rsc/regexp/regexp3.html#analysis
285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)class URL_MATCHER_EXPORT RegexSetMatcher {
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  RegexSetMatcher();
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual ~RegexSetMatcher();
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Adds the regex patterns in |regex_list| to the matcher. Also rebuilds
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the FilteredRE2 matcher; thus, for efficiency, prefer adding multiple
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // patterns at once.
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Ownership of the patterns remains with the caller.
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void AddPatterns(const std::vector<const StringPattern*>& regex_list);
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Removes all regex patterns.
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void ClearPatterns();
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Appends the IDs of regular expressions in our set that match the |text|
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to |matches|.
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool Match(const std::string& text,
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)             std::set<StringPattern::ID>* matches) const;
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
47c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  bool IsEmpty() const;
48c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef int RE2ID;
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef std::map<StringPattern::ID, const StringPattern*> RegexMap;
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef std::vector<StringPattern::ID> RE2IDMap;
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Use Aho-Corasick SubstringSetMatcher to find which literal patterns
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // match the |text|.
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<RE2ID> FindSubstringMatches(const std::string& text) const;
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Rebuild FilteredRE2 from scratch. Needs to be called whenever
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // our set of regexes changes.
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // TODO(yoz): investigate if it could be done incrementally;
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // apparently not supported by FilteredRE2.
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void RebuildMatcher();
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Clean up StringPatterns in |substring_patterns_|.
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void DeleteSubstringPatterns();
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Mapping of regex StringPattern::IDs to regexes.
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  RegexMap regexes_;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Mapping of RE2IDs from FilteredRE2 (which are assigned in order)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to regex StringPattern::IDs.
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  RE2IDMap re2_id_map_;
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<re2::FilteredRE2> filtered_re2_;
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scoped_ptr<SubstringSetMatcher> substring_matcher_;
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The substring patterns from FilteredRE2, which are used in
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // |substring_matcher_| but whose lifetime is managed here.
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  std::vector<const StringPattern*> substring_patterns_;
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)}  // namespace url_matcher
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#endif  // COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_
84