1// Copyright 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "components/url_matcher/url_matcher.h" 6 7#include <algorithm> 8#include <iterator> 9 10#include "base/logging.h" 11#include "base/profiler/scoped_profile.h" 12#include "url/gurl.h" 13#include "url/url_canon.h" 14 15namespace url_matcher { 16 17// This set of classes implement a mapping of URL Component Patterns, such as 18// host_prefix, host_suffix, host_equals, ..., etc., to StringPatterns 19// for use in substring comparisons. 20// 21// The idea of this mapping is to reduce the problem of comparing many 22// URL Component Patterns against one URL to the problem of searching many 23// substrings in one string: 24// 25// ---------------------- ----------------- 26// | URL Query operator | ----translate----> | StringPattern | 27// ---------------------- ----------------- 28// ^ 29// | 30// compare 31// | 32// v 33// ---------------------- ----------------- 34// | URL to compare | | | 35// | to all URL Query | ----translate----> | String | 36// | operators | | | 37// ---------------------- ----------------- 38// 39// The reason for this problem reduction is that there are efficient algorithms 40// for searching many substrings in one string (see Aho-Corasick algorithm). 41// 42// Additionally, some of the same pieces are reused to implement regular 43// expression comparisons. The FilteredRE2 implementation for matching many 44// regular expressions against one string uses prefiltering, in which a set 45// of substrings (derived from the regexes) are first searched for, to reduce 46// the number of regular expressions to test; the prefiltering step also 47// uses Aho-Corasick. 48// 49// Case 1: {host,path,query}_{prefix,suffix,equals} searches. 50// ========================================================== 51// 52// For searches in this class, we normalize URLs as follows: 53// 54// Step 1: 55// Remove scheme, port and segment from URL: 56// -> http://www.example.com:8080/index.html?search=foo#first_match becomes 57// www.example.com/index.html?search=foo 58// 59// We remove the scheme and port number because they can be checked later 60// in a secondary filter step. We remove the segment (the #... part) because 61// this is not guaranteed to be ASCII-7 encoded. 62// 63// Step 2: 64// Translate URL to String and add the following position markers: 65// - BU = Beginning of URL 66// - ED = End of Domain 67// - EP = End of Path 68// - EU = End of URL 69// Furthermore, the hostname is canonicalized to start with a ".". 70// 71// Position markers are represented as characters >127, which are therefore 72// guaranteed not to be part of the ASCII-7 encoded URL character set. 73// 74// -> www.example.com/index.html?search=foo becomes 75// BU .www.example.com ED /index.html EP ?search=foo EU 76// 77// -> www.example.com/index.html becomes 78// BU .www.example.com ED /index.html EP EU 79// 80// Step 3: 81// Translate URL Component Patterns as follows: 82// 83// host_prefix(prefix) = BU add_missing_dot_prefix(prefix) 84// -> host_prefix("www.example") = BU .www.example 85// 86// host_suffix(suffix) = suffix ED 87// -> host_suffix("example.com") = example.com ED 88// -> host_suffix(".example.com") = .example.com ED 89// 90// host_equals(domain) = BU add_missing_dot_prefix(domain) ED 91// -> host_equals("www.example.com") = BU .www.example.com ED 92// 93// Similarly for path query parameters ({path, query}_{prefix, suffix, equals}). 94// 95// With this, we can search the StringPatterns in the normalized URL. 96// 97// 98// Case 2: url_{prefix,suffix,equals,contains} searches. 99// ===================================================== 100// 101// Step 1: as above, except that 102// - the scheme is not removed 103// - the port is not removed if it is specified and does not match the default 104// port for the given scheme. 105// 106// Step 2: 107// Translate URL to String and add the following position markers: 108// - BU = Beginning of URL 109// - EU = End of URL 110// 111// -> http://www.example.com:8080/index.html?search=foo#first_match becomes 112// BU http://www.example.com:8080/index.html?search=foo EU 113// -> http://www.example.com:80/index.html?search=foo#first_match becomes 114// BU http://www.example.com/index.html?search=foo EU 115// 116// url_prefix(prefix) = BU prefix 117// -> url_prefix("http://www.example") = BU http://www.example 118// 119// url_contains(substring) = substring 120// -> url_contains("index") = index 121// 122// 123// Case 3: {host,path,query}_contains searches. 124// ============================================ 125// 126// These kinds of searches are not supported directly but can be derived 127// by a combination of a url_contains() query followed by an explicit test: 128// 129// host_contains(str) = url_contains(str) followed by test whether str occurs 130// in host component of original URL. 131// -> host_contains("example.co") = example.co 132// followed by gurl.host().find("example.co"); 133// 134// [similarly for path_contains and query_contains]. 135// 136// 137// Regular expression matching (url_matches searches) 138// ================================================== 139// 140// This class also supports matching regular expressions (RE2 syntax) 141// against full URLs, which are transformed as in case 2. 142 143namespace { 144 145bool IsRegexCriterion(URLMatcherCondition::Criterion criterion) { 146 return criterion == URLMatcherCondition::URL_MATCHES; 147} 148 149bool IsOriginAndPathRegexCriterion(URLMatcherCondition::Criterion criterion) { 150 return criterion == URLMatcherCondition::ORIGIN_AND_PATH_MATCHES; 151} 152 153} // namespace 154 155// 156// URLMatcherCondition 157// 158 159URLMatcherCondition::URLMatcherCondition() 160 : criterion_(HOST_PREFIX), 161 string_pattern_(NULL) {} 162 163URLMatcherCondition::~URLMatcherCondition() {} 164 165URLMatcherCondition::URLMatcherCondition( 166 Criterion criterion, 167 const StringPattern* string_pattern) 168 : criterion_(criterion), 169 string_pattern_(string_pattern) {} 170 171URLMatcherCondition::URLMatcherCondition(const URLMatcherCondition& rhs) 172 : criterion_(rhs.criterion_), 173 string_pattern_(rhs.string_pattern_) {} 174 175URLMatcherCondition& URLMatcherCondition::operator=( 176 const URLMatcherCondition& rhs) { 177 criterion_ = rhs.criterion_; 178 string_pattern_ = rhs.string_pattern_; 179 return *this; 180} 181 182bool URLMatcherCondition::operator<(const URLMatcherCondition& rhs) const { 183 if (criterion_ < rhs.criterion_) return true; 184 if (criterion_ > rhs.criterion_) return false; 185 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) 186 return *string_pattern_ < *rhs.string_pattern_; 187 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) return true; 188 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, 189 // or both are NULL. 190 return false; 191} 192 193bool URLMatcherCondition::IsFullURLCondition() const { 194 // For these criteria the SubstringMatcher needs to be executed on the 195 // GURL that is canonicalized with 196 // URLMatcherConditionFactory::CanonicalizeURLForFullSearches. 197 switch (criterion_) { 198 case HOST_CONTAINS: 199 case PATH_CONTAINS: 200 case QUERY_CONTAINS: 201 case URL_PREFIX: 202 case URL_SUFFIX: 203 case URL_CONTAINS: 204 case URL_EQUALS: 205 return true; 206 default: 207 break; 208 } 209 return false; 210} 211 212bool URLMatcherCondition::IsRegexCondition() const { 213 return IsRegexCriterion(criterion_); 214} 215 216bool URLMatcherCondition::IsOriginAndPathRegexCondition() const { 217 return IsOriginAndPathRegexCriterion(criterion_); 218} 219 220bool URLMatcherCondition::IsMatch( 221 const std::set<StringPattern::ID>& matching_patterns, 222 const GURL& url) const { 223 DCHECK(string_pattern_); 224 if (!ContainsKey(matching_patterns, string_pattern_->id())) 225 return false; 226 // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on 227 // a substring match on the raw URL. In case of a match, we need to verify 228 // that the match was found in the correct component of the URL. 229 switch (criterion_) { 230 case HOST_CONTAINS: 231 return url.host().find(string_pattern_->pattern()) != 232 std::string::npos; 233 case PATH_CONTAINS: 234 return url.path().find(string_pattern_->pattern()) != 235 std::string::npos; 236 case QUERY_CONTAINS: 237 return url.query().find(string_pattern_->pattern()) != 238 std::string::npos; 239 default: 240 break; 241 } 242 return true; 243} 244 245// 246// URLMatcherConditionFactory 247// 248 249namespace { 250// These are symbols that are not contained in 7-bit ASCII used in GURLs. 251const char kBeginningOfURL[] = {static_cast<char>(-1), 0}; 252const char kEndOfDomain[] = {static_cast<char>(-2), 0}; 253const char kEndOfPath[] = {static_cast<char>(-3), 0}; 254const char kQueryComponentDelimiter[] = {static_cast<char>(-4), 0}; 255const char kEndOfURL[] = {static_cast<char>(-5), 0}; 256 257// The delimiter for query parameters 258const char kQuerySeparator = '&'; 259} // namespace 260 261URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {} 262 263URLMatcherConditionFactory::~URLMatcherConditionFactory() { 264 STLDeleteElements(&substring_pattern_singletons_); 265 STLDeleteElements(®ex_pattern_singletons_); 266 STLDeleteElements(&origin_and_path_regex_pattern_singletons_); 267} 268 269std::string URLMatcherConditionFactory::CanonicalizeURLForComponentSearches( 270 const GURL& url) const { 271 return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain + 272 url.path() + kEndOfPath + 273 (url.has_query() ? CanonicalizeQuery(url.query(), true, true) 274 : std::string()) + 275 kEndOfURL; 276} 277 278URLMatcherCondition URLMatcherConditionFactory::CreateHostPrefixCondition( 279 const std::string& prefix) { 280 return CreateCondition(URLMatcherCondition::HOST_PREFIX, 281 kBeginningOfURL + CanonicalizeHostname(prefix)); 282} 283 284URLMatcherCondition URLMatcherConditionFactory::CreateHostSuffixCondition( 285 const std::string& suffix) { 286 return CreateCondition(URLMatcherCondition::HOST_SUFFIX, 287 suffix + kEndOfDomain); 288} 289 290URLMatcherCondition URLMatcherConditionFactory::CreateHostContainsCondition( 291 const std::string& str) { 292 return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str); 293} 294 295URLMatcherCondition URLMatcherConditionFactory::CreateHostEqualsCondition( 296 const std::string& str) { 297 return CreateCondition(URLMatcherCondition::HOST_EQUALS, 298 kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain); 299} 300 301URLMatcherCondition URLMatcherConditionFactory::CreatePathPrefixCondition( 302 const std::string& prefix) { 303 return CreateCondition(URLMatcherCondition::PATH_PREFIX, 304 kEndOfDomain + prefix); 305} 306 307URLMatcherCondition URLMatcherConditionFactory::CreatePathSuffixCondition( 308 const std::string& suffix) { 309 return CreateCondition(URLMatcherCondition::PATH_SUFFIX, suffix + kEndOfPath); 310} 311 312URLMatcherCondition URLMatcherConditionFactory::CreatePathContainsCondition( 313 const std::string& str) { 314 return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str); 315} 316 317URLMatcherCondition URLMatcherConditionFactory::CreatePathEqualsCondition( 318 const std::string& str) { 319 return CreateCondition(URLMatcherCondition::PATH_EQUALS, 320 kEndOfDomain + str + kEndOfPath); 321} 322 323URLMatcherCondition URLMatcherConditionFactory::CreateQueryPrefixCondition( 324 const std::string& prefix) { 325 std::string pattern; 326 if (!prefix.empty() && prefix[0] == '?') 327 pattern = kEndOfPath + CanonicalizeQuery(prefix.substr(1), true, false); 328 else 329 pattern = kEndOfPath + CanonicalizeQuery(prefix, true, false); 330 331 return CreateCondition(URLMatcherCondition::QUERY_PREFIX, pattern); 332} 333 334URLMatcherCondition URLMatcherConditionFactory::CreateQuerySuffixCondition( 335 const std::string& suffix) { 336 if (!suffix.empty() && suffix[0] == '?') { 337 return CreateQueryEqualsCondition(suffix); 338 } else { 339 return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, 340 CanonicalizeQuery(suffix, false, true) + kEndOfURL); 341 } 342} 343 344URLMatcherCondition URLMatcherConditionFactory::CreateQueryContainsCondition( 345 const std::string& str) { 346 if (!str.empty() && str[0] == '?') 347 return CreateQueryPrefixCondition(str); 348 else 349 return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str); 350} 351 352URLMatcherCondition URLMatcherConditionFactory::CreateQueryEqualsCondition( 353 const std::string& str) { 354 std::string pattern; 355 if (!str.empty() && str[0] == '?') 356 pattern = 357 kEndOfPath + CanonicalizeQuery(str.substr(1), true, true) + kEndOfURL; 358 else 359 pattern = kEndOfPath + CanonicalizeQuery(str, true, true) + kEndOfURL; 360 361 return CreateCondition(URLMatcherCondition::QUERY_EQUALS, pattern); 362} 363 364URLMatcherCondition 365 URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition( 366 const std::string& host_suffix, 367 const std::string& path_prefix) { 368 return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX, 369 host_suffix + kEndOfDomain + path_prefix); 370} 371 372URLMatcherCondition 373URLMatcherConditionFactory::CreateHostEqualsPathPrefixCondition( 374 const std::string& host, 375 const std::string& path_prefix) { 376 return CreateCondition(URLMatcherCondition::HOST_EQUALS_PATH_PREFIX, 377 kBeginningOfURL + CanonicalizeHostname(host) + kEndOfDomain + 378 path_prefix); 379} 380 381std::string URLMatcherConditionFactory::CanonicalizeURLForFullSearches( 382 const GURL& url) const { 383 GURL::Replacements replacements; 384 replacements.ClearPassword(); 385 replacements.ClearUsername(); 386 replacements.ClearRef(); 387 // Clear port if it is implicit from scheme. 388 if (url.has_port()) { 389 const std::string& port = url.scheme(); 390 if (url::DefaultPortForScheme(port.c_str(), port.size()) == 391 url.EffectiveIntPort()) { 392 replacements.ClearPort(); 393 } 394 } 395 return kBeginningOfURL + url.ReplaceComponents(replacements).spec() + 396 kEndOfURL; 397} 398 399static std::string CanonicalizeURLForRegexSearchesHelper( 400 const GURL& url, 401 bool clear_query) { 402 GURL::Replacements replacements; 403 replacements.ClearPassword(); 404 replacements.ClearUsername(); 405 replacements.ClearRef(); 406 if (clear_query) 407 replacements.ClearQuery(); 408 // Clear port if it is implicit from scheme. 409 if (url.has_port()) { 410 const std::string& port = url.scheme(); 411 if (url::DefaultPortForScheme(port.c_str(), port.size()) == 412 url.EffectiveIntPort()) { 413 replacements.ClearPort(); 414 } 415 } 416 return url.ReplaceComponents(replacements).spec(); 417} 418 419std::string URLMatcherConditionFactory::CanonicalizeURLForRegexSearches( 420 const GURL& url) const { 421 return CanonicalizeURLForRegexSearchesHelper(url, false); 422} 423 424std::string 425URLMatcherConditionFactory::CanonicalizeURLForOriginAndPathRegexSearches( 426 const GURL& url) const { 427 return CanonicalizeURLForRegexSearchesHelper(url, true); 428} 429 430URLMatcherCondition URLMatcherConditionFactory::CreateURLPrefixCondition( 431 const std::string& prefix) { 432 return CreateCondition(URLMatcherCondition::URL_PREFIX, 433 kBeginningOfURL + prefix); 434} 435 436URLMatcherCondition URLMatcherConditionFactory::CreateURLSuffixCondition( 437 const std::string& suffix) { 438 return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL); 439} 440 441URLMatcherCondition URLMatcherConditionFactory::CreateURLContainsCondition( 442 const std::string& str) { 443 return CreateCondition(URLMatcherCondition::URL_CONTAINS, str); 444} 445 446URLMatcherCondition URLMatcherConditionFactory::CreateURLEqualsCondition( 447 const std::string& str) { 448 return CreateCondition(URLMatcherCondition::URL_EQUALS, 449 kBeginningOfURL + str + kEndOfURL); 450} 451 452URLMatcherCondition URLMatcherConditionFactory::CreateURLMatchesCondition( 453 const std::string& regex) { 454 return CreateCondition(URLMatcherCondition::URL_MATCHES, regex); 455} 456 457URLMatcherCondition 458URLMatcherConditionFactory::CreateOriginAndPathMatchesCondition( 459 const std::string& regex) { 460 return CreateCondition(URLMatcherCondition::ORIGIN_AND_PATH_MATCHES, regex); 461} 462 463void URLMatcherConditionFactory::ForgetUnusedPatterns( 464 const std::set<StringPattern::ID>& used_patterns) { 465 PatternSingletons::iterator i = substring_pattern_singletons_.begin(); 466 while (i != substring_pattern_singletons_.end()) { 467 if (ContainsKey(used_patterns, (*i)->id())) { 468 ++i; 469 } else { 470 delete *i; 471 substring_pattern_singletons_.erase(i++); 472 } 473 } 474 i = regex_pattern_singletons_.begin(); 475 while (i != regex_pattern_singletons_.end()) { 476 if (ContainsKey(used_patterns, (*i)->id())) { 477 ++i; 478 } else { 479 delete *i; 480 regex_pattern_singletons_.erase(i++); 481 } 482 } 483 i = origin_and_path_regex_pattern_singletons_.begin(); 484 while (i != origin_and_path_regex_pattern_singletons_.end()) { 485 if (ContainsKey(used_patterns, (*i)->id())) { 486 ++i; 487 } else { 488 delete *i; 489 origin_and_path_regex_pattern_singletons_.erase(i++); 490 } 491 } 492} 493 494bool URLMatcherConditionFactory::IsEmpty() const { 495 return substring_pattern_singletons_.empty() && 496 regex_pattern_singletons_.empty() && 497 origin_and_path_regex_pattern_singletons_.empty(); 498} 499 500URLMatcherCondition URLMatcherConditionFactory::CreateCondition( 501 URLMatcherCondition::Criterion criterion, 502 const std::string& pattern) { 503 StringPattern search_pattern(pattern, 0); 504 PatternSingletons* pattern_singletons = NULL; 505 if (IsRegexCriterion(criterion)) 506 pattern_singletons = ®ex_pattern_singletons_; 507 else if (IsOriginAndPathRegexCriterion(criterion)) 508 pattern_singletons = &origin_and_path_regex_pattern_singletons_; 509 else 510 pattern_singletons = &substring_pattern_singletons_; 511 512 PatternSingletons::const_iterator iter = 513 pattern_singletons->find(&search_pattern); 514 515 if (iter != pattern_singletons->end()) { 516 return URLMatcherCondition(criterion, *iter); 517 } else { 518 StringPattern* new_pattern = 519 new StringPattern(pattern, id_counter_++); 520 pattern_singletons->insert(new_pattern); 521 return URLMatcherCondition(criterion, new_pattern); 522 } 523} 524 525std::string URLMatcherConditionFactory::CanonicalizeHostname( 526 const std::string& hostname) const { 527 if (!hostname.empty() && hostname[0] == '.') 528 return hostname; 529 else 530 return "." + hostname; 531} 532 533// This function prepares the query string by replacing query separator with a 534// magic value (|kQueryComponentDelimiter|). When the boolean 535// |prepend_beginning_of_query_component| is true the function prepends the 536// query with the same magic. This is done to locate the start of a key value 537// pair in the query string. The parameter |query| is passed by value 538// intentionally, since it is locally modified. 539std::string URLMatcherConditionFactory::CanonicalizeQuery( 540 std::string query, 541 bool prepend_beginning_of_query_component, 542 bool append_end_of_query_component) const { 543 for (std::string::iterator it = query.begin(); it != query.end(); ++it) { 544 if (*it == kQuerySeparator) 545 *it = kQueryComponentDelimiter[0]; 546 } 547 if (prepend_beginning_of_query_component) 548 query = kQueryComponentDelimiter + query; 549 if (append_end_of_query_component) 550 query += kQueryComponentDelimiter; 551 return query; 552} 553 554bool URLMatcherConditionFactory::StringPatternPointerCompare::operator()( 555 StringPattern* lhs, 556 StringPattern* rhs) const { 557 if (lhs == NULL && rhs != NULL) return true; 558 if (lhs != NULL && rhs != NULL) 559 return lhs->pattern() < rhs->pattern(); 560 // Either both are NULL or only rhs is NULL. 561 return false; 562} 563 564// 565// URLQueryElementMatcherCondition 566// 567 568URLQueryElementMatcherCondition::URLQueryElementMatcherCondition( 569 const std::string& key, 570 const std::string& value, 571 QueryValueMatchType query_value_match_type, 572 QueryElementType query_element_type, 573 Type match_type, 574 URLMatcherConditionFactory* factory) { 575 match_type_ = match_type; 576 577 if (query_element_type == ELEMENT_TYPE_KEY_VALUE) { 578 key_ = kQueryComponentDelimiter + key + "="; 579 value_ = value; 580 } else { 581 key_ = kQueryComponentDelimiter + key; 582 value_ = std::string(); 583 } 584 585 if (query_value_match_type == QUERY_VALUE_MATCH_EXACT) 586 value_ += kQueryComponentDelimiter; 587 588 // If |value_| is empty no need to find the |key_| and verify if the value 589 // matches. Simply checking the presence of key is sufficient, which is done 590 // by MATCH_ANY 591 if (value_.empty()) 592 match_type_ = MATCH_ANY; 593 594 URLMatcherCondition condition; 595 // If |match_type_| is MATCH_ANY, then we could simply look for the 596 // combination of |key_| + |value_|, which can be efficiently done by 597 // SubstringMatcher 598 if (match_type_ == MATCH_ANY) 599 condition = factory->CreateQueryContainsCondition(key_ + value_); 600 else 601 condition = factory->CreateQueryContainsCondition(key_); 602 string_pattern_ = condition.string_pattern(); 603 604 key_length_ = key_.length(); 605 value_length_ = value_.length(); 606} 607 608URLQueryElementMatcherCondition::~URLQueryElementMatcherCondition() {} 609 610bool URLQueryElementMatcherCondition::operator<( 611 const URLQueryElementMatcherCondition& rhs) const { 612 if (match_type_ != rhs.match_type_) 613 return match_type_ < rhs.match_type_; 614 if (string_pattern_ != NULL && rhs.string_pattern_ != NULL) 615 return *string_pattern_ < *rhs.string_pattern_; 616 if (string_pattern_ == NULL && rhs.string_pattern_ != NULL) 617 return true; 618 // Either string_pattern_ != NULL && rhs.string_pattern_ == NULL, 619 // or both are NULL. 620 return false; 621} 622 623bool URLQueryElementMatcherCondition::IsMatch( 624 const std::string& url_for_component_searches) const { 625 switch (match_type_) { 626 case MATCH_ANY: { 627 // For MATCH_ANY, no additional verification step is needed. We can trust 628 // the SubstringMatcher to do the verification. 629 return true; 630 } 631 case MATCH_ALL: { 632 size_t start = 0; 633 int found = 0; 634 size_t offset; 635 while ((offset = url_for_component_searches.find(key_, start)) != 636 std::string::npos) { 637 if (url_for_component_searches.compare( 638 offset + key_length_, value_length_, value_) != 0) { 639 return false; 640 } else { 641 ++found; 642 } 643 start = offset + key_length_ + value_length_ - 1; 644 } 645 return !!found; 646 } 647 case MATCH_FIRST: { 648 size_t offset = url_for_component_searches.find(key_); 649 return url_for_component_searches.compare( 650 offset + key_length_, value_length_, value_) == 0; 651 } 652 case MATCH_LAST: { 653 size_t offset = url_for_component_searches.rfind(key_); 654 return url_for_component_searches.compare( 655 offset + key_length_, value_length_, value_) == 0; 656 } 657 } 658 NOTREACHED(); 659 return false; 660} 661 662// 663// URLMatcherSchemeFilter 664// 665 666URLMatcherSchemeFilter::URLMatcherSchemeFilter(const std::string& filter) 667 : filters_(1) { 668 filters_.push_back(filter); 669} 670 671URLMatcherSchemeFilter::URLMatcherSchemeFilter( 672 const std::vector<std::string>& filters) 673 : filters_(filters) {} 674 675URLMatcherSchemeFilter::~URLMatcherSchemeFilter() {} 676 677bool URLMatcherSchemeFilter::IsMatch(const GURL& url) const { 678 return std::find(filters_.begin(), filters_.end(), url.scheme()) != 679 filters_.end(); 680} 681 682// 683// URLMatcherPortFilter 684// 685 686URLMatcherPortFilter::URLMatcherPortFilter( 687 const std::vector<URLMatcherPortFilter::Range>& ranges) 688 : ranges_(ranges) {} 689 690URLMatcherPortFilter::~URLMatcherPortFilter() {} 691 692bool URLMatcherPortFilter::IsMatch(const GURL& url) const { 693 int port = url.EffectiveIntPort(); 694 for (std::vector<Range>::const_iterator i = ranges_.begin(); 695 i != ranges_.end(); ++i) { 696 if (i->first <= port && port <= i->second) 697 return true; 698 } 699 return false; 700} 701 702// static 703URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int from, 704 int to) { 705 return Range(from, to); 706} 707 708// static 709URLMatcherPortFilter::Range URLMatcherPortFilter::CreateRange(int port) { 710 return Range(port, port); 711} 712 713// 714// URLMatcherConditionSet 715// 716 717URLMatcherConditionSet::~URLMatcherConditionSet() {} 718 719URLMatcherConditionSet::URLMatcherConditionSet( 720 ID id, 721 const Conditions& conditions) 722 : id_(id), 723 conditions_(conditions) {} 724 725URLMatcherConditionSet::URLMatcherConditionSet( 726 ID id, 727 const Conditions& conditions, 728 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 729 scoped_ptr<URLMatcherPortFilter> port_filter) 730 : id_(id), 731 conditions_(conditions), 732 scheme_filter_(scheme_filter.Pass()), 733 port_filter_(port_filter.Pass()) {} 734 735URLMatcherConditionSet::URLMatcherConditionSet( 736 ID id, 737 const Conditions& conditions, 738 const QueryConditions& query_conditions, 739 scoped_ptr<URLMatcherSchemeFilter> scheme_filter, 740 scoped_ptr<URLMatcherPortFilter> port_filter) 741 : id_(id), 742 conditions_(conditions), 743 query_conditions_(query_conditions), 744 scheme_filter_(scheme_filter.Pass()), 745 port_filter_(port_filter.Pass()) {} 746 747bool URLMatcherConditionSet::IsMatch( 748 const std::set<StringPattern::ID>& matching_patterns, 749 const GURL& url) const { 750 return IsMatch(matching_patterns, url, std::string()); 751} 752 753bool URLMatcherConditionSet::IsMatch( 754 const std::set<StringPattern::ID>& matching_patterns, 755 const GURL& url, 756 const std::string& url_for_component_searches) const { 757 for (Conditions::const_iterator i = conditions_.begin(); 758 i != conditions_.end(); ++i) { 759 if (!i->IsMatch(matching_patterns, url)) 760 return false; 761 } 762 if (scheme_filter_.get() && !scheme_filter_->IsMatch(url)) 763 return false; 764 if (port_filter_.get() && !port_filter_->IsMatch(url)) 765 return false; 766 if (query_conditions_.empty()) 767 return true; 768 // The loop is duplicated below for performance reasons. If not all query 769 // elements are found, no need to verify match that is expected to take more 770 // cycles. 771 for (QueryConditions::const_iterator i = query_conditions_.begin(); 772 i != query_conditions_.end(); 773 ++i) { 774 if (!ContainsKey(matching_patterns, i->string_pattern()->id())) 775 return false; 776 } 777 for (QueryConditions::const_iterator i = query_conditions_.begin(); 778 i != query_conditions_.end(); 779 ++i) { 780 if (!i->IsMatch(url_for_component_searches)) 781 return false; 782 } 783 return true; 784} 785 786// 787// URLMatcher 788// 789 790URLMatcher::URLMatcher() {} 791 792URLMatcher::~URLMatcher() {} 793 794void URLMatcher::AddConditionSets( 795 const URLMatcherConditionSet::Vector& condition_sets) { 796 for (URLMatcherConditionSet::Vector::const_iterator i = 797 condition_sets.begin(); i != condition_sets.end(); ++i) { 798 DCHECK(url_matcher_condition_sets_.find((*i)->id()) == 799 url_matcher_condition_sets_.end()); 800 url_matcher_condition_sets_[(*i)->id()] = *i; 801 } 802 UpdateInternalDatastructures(); 803} 804 805void URLMatcher::RemoveConditionSets( 806 const std::vector<URLMatcherConditionSet::ID>& condition_set_ids) { 807 for (std::vector<URLMatcherConditionSet::ID>::const_iterator i = 808 condition_set_ids.begin(); i != condition_set_ids.end(); ++i) { 809 DCHECK(url_matcher_condition_sets_.find(*i) != 810 url_matcher_condition_sets_.end()); 811 url_matcher_condition_sets_.erase(*i); 812 } 813 UpdateInternalDatastructures(); 814} 815 816void URLMatcher::ClearUnusedConditionSets() { 817 UpdateConditionFactory(); 818} 819 820std::set<URLMatcherConditionSet::ID> URLMatcher::MatchURL( 821 const GURL& url) const { 822 // Find all IDs of StringPatterns that match |url|. 823 // See URLMatcherConditionFactory for the canonicalization of URLs and the 824 // distinction between full url searches and url component searches. 825 std::set<StringPattern::ID> matches; 826 std::string url_for_component_searches; 827 828 if (!full_url_matcher_.IsEmpty()) { 829 full_url_matcher_.Match( 830 condition_factory_.CanonicalizeURLForFullSearches(url), &matches); 831 } 832 if (!url_component_matcher_.IsEmpty()) { 833 url_for_component_searches = 834 condition_factory_.CanonicalizeURLForComponentSearches(url); 835 url_component_matcher_.Match(url_for_component_searches, &matches); 836 } 837 if (!regex_set_matcher_.IsEmpty()) { 838 regex_set_matcher_.Match( 839 condition_factory_.CanonicalizeURLForRegexSearches(url), &matches); 840 } 841 if (!origin_and_path_regex_set_matcher_.IsEmpty()) { 842 origin_and_path_regex_set_matcher_.Match( 843 condition_factory_.CanonicalizeURLForOriginAndPathRegexSearches(url), 844 &matches); 845 } 846 847 // Calculate all URLMatcherConditionSets for which all URLMatcherConditions 848 // were fulfilled. 849 std::set<URLMatcherConditionSet::ID> result; 850 for (std::set<StringPattern::ID>::const_iterator i = matches.begin(); 851 i != matches.end(); ++i) { 852 // For each URLMatcherConditionSet there is exactly one condition 853 // registered in substring_match_triggers_. This means that the following 854 // logic tests each URLMatcherConditionSet exactly once if it can be 855 // completely fulfilled. 856 StringPatternTriggers::const_iterator triggered_condition_sets_iter = 857 substring_match_triggers_.find(*i); 858 if (triggered_condition_sets_iter == substring_match_triggers_.end()) 859 continue; // Not all substring matches are triggers for a condition set. 860 const std::set<URLMatcherConditionSet::ID>& condition_sets = 861 triggered_condition_sets_iter->second; 862 for (std::set<URLMatcherConditionSet::ID>::const_iterator j = 863 condition_sets.begin(); j != condition_sets.end(); ++j) { 864 URLMatcherConditionSets::const_iterator condition_set_iter = 865 url_matcher_condition_sets_.find(*j); 866 DCHECK(condition_set_iter != url_matcher_condition_sets_.end()); 867 if (condition_set_iter->second->IsMatch( 868 matches, url, url_for_component_searches)) 869 result.insert(*j); 870 } 871 } 872 873 return result; 874} 875 876bool URLMatcher::IsEmpty() const { 877 return condition_factory_.IsEmpty() && 878 url_matcher_condition_sets_.empty() && 879 substring_match_triggers_.empty() && 880 full_url_matcher_.IsEmpty() && 881 url_component_matcher_.IsEmpty() && 882 regex_set_matcher_.IsEmpty() && 883 origin_and_path_regex_set_matcher_.IsEmpty() && 884 registered_full_url_patterns_.empty() && 885 registered_url_component_patterns_.empty(); 886} 887 888void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) { 889 // The purpose of |full_url_conditions| is just that we need to execute 890 // the same logic once for Full URL searches and once for URL Component 891 // searches (see URLMatcherConditionFactory). 892 893 // Determine which patterns need to be registered when this function 894 // terminates. 895 std::set<const StringPattern*> new_patterns; 896 for (URLMatcherConditionSets::const_iterator condition_set_iter = 897 url_matcher_condition_sets_.begin(); 898 condition_set_iter != url_matcher_condition_sets_.end(); 899 ++condition_set_iter) { 900 const URLMatcherConditionSet::Conditions& conditions = 901 condition_set_iter->second->conditions(); 902 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 903 conditions.begin(); condition_iter != conditions.end(); 904 ++condition_iter) { 905 // If we are called to process Full URL searches, ignore others, and 906 // vice versa. (Regex conditions are updated in UpdateRegexSetMatcher.) 907 if (!condition_iter->IsRegexCondition() && 908 !condition_iter->IsOriginAndPathRegexCondition() && 909 full_url_conditions == condition_iter->IsFullURLCondition()) 910 new_patterns.insert(condition_iter->string_pattern()); 911 } 912 913 if (full_url_conditions) 914 continue; 915 916 const URLMatcherConditionSet::QueryConditions& query_conditions = 917 condition_set_iter->second->query_conditions(); 918 for (URLMatcherConditionSet::QueryConditions::const_iterator 919 query_condition_iter = query_conditions.begin(); 920 query_condition_iter != query_conditions.end(); 921 ++query_condition_iter) { 922 new_patterns.insert(query_condition_iter->string_pattern()); 923 } 924 } 925 926 // This is the set of patterns that were registered before this function 927 // is called. 928 std::set<const StringPattern*>& registered_patterns = 929 full_url_conditions ? registered_full_url_patterns_ 930 : registered_url_component_patterns_; 931 932 // Add all patterns that are in new_patterns but not in registered_patterns. 933 std::vector<const StringPattern*> patterns_to_register = 934 base::STLSetDifference<std::vector<const StringPattern*> >( 935 new_patterns, registered_patterns); 936 937 // Remove all patterns that are in registered_patterns but not in 938 // new_patterns. 939 std::vector<const StringPattern*> patterns_to_unregister = 940 base::STLSetDifference<std::vector<const StringPattern*> >( 941 registered_patterns, new_patterns); 942 943 // Update the SubstringSetMatcher. 944 SubstringSetMatcher& url_matcher = 945 full_url_conditions ? full_url_matcher_ : url_component_matcher_; 946 url_matcher.RegisterAndUnregisterPatterns(patterns_to_register, 947 patterns_to_unregister); 948 949 // Update the set of registered_patterns for the next time this function 950 // is being called. 951 registered_patterns.swap(new_patterns); 952} 953 954void URLMatcher::UpdateRegexSetMatcher() { 955 std::vector<const StringPattern*> new_patterns; 956 std::vector<const StringPattern*> new_origin_and_path_patterns; 957 958 for (URLMatcherConditionSets::const_iterator condition_set_iter = 959 url_matcher_condition_sets_.begin(); 960 condition_set_iter != url_matcher_condition_sets_.end(); 961 ++condition_set_iter) { 962 const URLMatcherConditionSet::Conditions& conditions = 963 condition_set_iter->second->conditions(); 964 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 965 conditions.begin(); condition_iter != conditions.end(); 966 ++condition_iter) { 967 if (condition_iter->IsRegexCondition()) { 968 new_patterns.push_back(condition_iter->string_pattern()); 969 } else if (condition_iter->IsOriginAndPathRegexCondition()) { 970 new_origin_and_path_patterns.push_back( 971 condition_iter->string_pattern()); 972 } 973 } 974 } 975 976 // Start over from scratch. We can't really do better than this, since the 977 // FilteredRE2 backend doesn't support incremental updates. 978 regex_set_matcher_.ClearPatterns(); 979 regex_set_matcher_.AddPatterns(new_patterns); 980 origin_and_path_regex_set_matcher_.ClearPatterns(); 981 origin_and_path_regex_set_matcher_.AddPatterns(new_origin_and_path_patterns); 982} 983 984void URLMatcher::UpdateTriggers() { 985 // Count substring pattern frequencies. 986 std::map<StringPattern::ID, size_t> substring_pattern_frequencies; 987 for (URLMatcherConditionSets::const_iterator condition_set_iter = 988 url_matcher_condition_sets_.begin(); 989 condition_set_iter != url_matcher_condition_sets_.end(); 990 ++condition_set_iter) { 991 const URLMatcherConditionSet::Conditions& conditions = 992 condition_set_iter->second->conditions(); 993 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 994 conditions.begin(); condition_iter != conditions.end(); 995 ++condition_iter) { 996 const StringPattern* pattern = condition_iter->string_pattern(); 997 substring_pattern_frequencies[pattern->id()]++; 998 } 999 1000 const URLMatcherConditionSet::QueryConditions& query_conditions = 1001 condition_set_iter->second->query_conditions(); 1002 for (URLMatcherConditionSet::QueryConditions::const_iterator 1003 query_condition_iter = query_conditions.begin(); 1004 query_condition_iter != query_conditions.end(); 1005 ++query_condition_iter) { 1006 const StringPattern* pattern = query_condition_iter->string_pattern(); 1007 substring_pattern_frequencies[pattern->id()]++; 1008 } 1009 } 1010 1011 // Update trigger conditions: Determine for each URLMatcherConditionSet which 1012 // URLMatcherCondition contains a StringPattern that occurs least 1013 // frequently in this URLMatcher. We assume that this condition is very 1014 // specific and occurs rarely in URLs. If a match occurs for this 1015 // URLMatcherCondition, we want to test all other URLMatcherCondition in the 1016 // respective URLMatcherConditionSet as well to see whether the entire 1017 // URLMatcherConditionSet is considered matching. 1018 substring_match_triggers_.clear(); 1019 for (URLMatcherConditionSets::const_iterator condition_set_iter = 1020 url_matcher_condition_sets_.begin(); 1021 condition_set_iter != url_matcher_condition_sets_.end(); 1022 ++condition_set_iter) { 1023 const URLMatcherConditionSet::Conditions& conditions = 1024 condition_set_iter->second->conditions(); 1025 if (conditions.empty()) 1026 continue; 1027 URLMatcherConditionSet::Conditions::const_iterator condition_iter = 1028 conditions.begin(); 1029 StringPattern::ID trigger = condition_iter->string_pattern()->id(); 1030 // We skip the first element in the following loop. 1031 ++condition_iter; 1032 for (; condition_iter != conditions.end(); ++condition_iter) { 1033 StringPattern::ID current_id = 1034 condition_iter->string_pattern()->id(); 1035 if (substring_pattern_frequencies[trigger] > 1036 substring_pattern_frequencies[current_id]) { 1037 trigger = current_id; 1038 } 1039 } 1040 1041 const URLMatcherConditionSet::QueryConditions& query_conditions = 1042 condition_set_iter->second->query_conditions(); 1043 for (URLMatcherConditionSet::QueryConditions::const_iterator 1044 query_condition_iter = query_conditions.begin(); 1045 query_condition_iter != query_conditions.end(); 1046 ++query_condition_iter) { 1047 StringPattern::ID current_id = 1048 query_condition_iter->string_pattern()->id(); 1049 if (substring_pattern_frequencies[trigger] > 1050 substring_pattern_frequencies[current_id]) { 1051 trigger = current_id; 1052 } 1053 } 1054 1055 substring_match_triggers_[trigger].insert(condition_set_iter->second->id()); 1056 } 1057} 1058 1059void URLMatcher::UpdateConditionFactory() { 1060 std::set<StringPattern::ID> used_patterns; 1061 for (URLMatcherConditionSets::const_iterator condition_set_iter = 1062 url_matcher_condition_sets_.begin(); 1063 condition_set_iter != url_matcher_condition_sets_.end(); 1064 ++condition_set_iter) { 1065 const URLMatcherConditionSet::Conditions& conditions = 1066 condition_set_iter->second->conditions(); 1067 for (URLMatcherConditionSet::Conditions::const_iterator condition_iter = 1068 conditions.begin(); condition_iter != conditions.end(); 1069 ++condition_iter) { 1070 used_patterns.insert(condition_iter->string_pattern()->id()); 1071 } 1072 const URLMatcherConditionSet::QueryConditions& query_conditions = 1073 condition_set_iter->second->query_conditions(); 1074 for (URLMatcherConditionSet::QueryConditions::const_iterator 1075 query_condition_iter = query_conditions.begin(); 1076 query_condition_iter != query_conditions.end(); 1077 ++query_condition_iter) { 1078 used_patterns.insert(query_condition_iter->string_pattern()->id()); 1079 } 1080 } 1081 condition_factory_.ForgetUnusedPatterns(used_patterns); 1082} 1083 1084void URLMatcher::UpdateInternalDatastructures() { 1085 // TODO(vadimt): Remove ScopedProfile below once crbug.com/417106 is fixed. 1086 tracked_objects::ScopedProfile tracking_profile( 1087 FROM_HERE_WITH_EXPLICIT_FUNCTION( 1088 "URLMatcher_UpdateInternalDatastructures")); 1089 UpdateSubstringSetMatcher(false); 1090 UpdateSubstringSetMatcher(true); 1091 UpdateRegexSetMatcher(); 1092 UpdateTriggers(); 1093 UpdateConditionFactory(); 1094} 1095 1096} // namespace url_matcher 1097