15d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)// Copyright 2013 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#ifndef COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_ 65d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#define COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_ 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map> 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <set> 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector> 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h" 145d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "components/url_matcher/string_pattern.h" 155d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "components/url_matcher/substring_set_matcher.h" 165d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#include "components/url_matcher/url_matcher_export.h" 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FilteredRE2; 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 225d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)namespace url_matcher { 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Efficiently matches URLs against a collection of regular expressions, 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// using FilteredRE2 to reduce the number of regexes that must be matched 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// by pre-filtering with substring matching. See: 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// http://swtch.com/~rsc/regexp/regexp3.html#analysis 285d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)class URL_MATCHER_EXPORT RegexSetMatcher { 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexSetMatcher(); 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual ~RegexSetMatcher(); 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Adds the regex patterns in |regex_list| to the matcher. Also rebuilds 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the FilteredRE2 matcher; thus, for efficiency, prefer adding multiple 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // patterns at once. 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Ownership of the patterns remains with the caller. 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void AddPatterns(const std::vector<const StringPattern*>& regex_list); 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Removes all regex patterns. 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ClearPatterns(); 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Appends the IDs of regular expressions in our set that match the |text| 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to |matches|. 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool Match(const std::string& text, 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::set<StringPattern::ID>* matches) const; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 47c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) bool IsEmpty() const; 48c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef int RE2ID; 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef std::map<StringPattern::ID, const StringPattern*> RegexMap; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef std::vector<StringPattern::ID> RE2IDMap; 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Use Aho-Corasick SubstringSetMatcher to find which literal patterns 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // match the |text|. 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<RE2ID> FindSubstringMatches(const std::string& text) const; 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Rebuild FilteredRE2 from scratch. Needs to be called whenever 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // our set of regexes changes. 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(yoz): investigate if it could be done incrementally; 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // apparently not supported by FilteredRE2. 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void RebuildMatcher(); 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Clean up StringPatterns in |substring_patterns_|. 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void DeleteSubstringPatterns(); 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Mapping of regex StringPattern::IDs to regexes. 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RegexMap regexes_; 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Mapping of RE2IDs from FilteredRE2 (which are assigned in order) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to regex StringPattern::IDs. 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RE2IDMap re2_id_map_; 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<re2::FilteredRE2> filtered_re2_; 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) scoped_ptr<SubstringSetMatcher> substring_matcher_; 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The substring patterns from FilteredRE2, which are used in 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // |substring_matcher_| but whose lifetime is managed here. 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) std::vector<const StringPattern*> substring_patterns_; 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 815d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)} // namespace url_matcher 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 835d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)#endif // COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_ 84