1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "extensions/common/url_pattern.h"
6
7#include <ostream>
8
9#include "base/strings/string_number_conversions.h"
10#include "base/strings/string_piece.h"
11#include "base/strings/string_split.h"
12#include "base/strings/string_util.h"
13#include "base/strings/stringprintf.h"
14#include "content/public/common/url_constants.h"
15#include "extensions/common/constants.h"
16#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
17#include "url/gurl.h"
18#include "url/url_util.h"
19
// Special pattern string: Parse() treats a pattern equal to this value as
// "match every URL", and GetAsString() emits it for such patterns.
const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
21
22namespace {
23
24// TODO(aa): What about more obscure schemes like data: and javascript: ?
25// Note: keep this array in sync with kValidSchemeMasks.
26const char* kValidSchemes[] = {
27    url::kHttpScheme,
28    url::kHttpsScheme,
29    url::kFileScheme,
30    url::kFtpScheme,
31    content::kChromeUIScheme,
32    extensions::kExtensionScheme,
33    url::kFileSystemScheme,
34};
35
// URLPattern scheme bitmask for each entry of kValidSchemes, stored at the
// same index. IsValidScheme() tests these bits against valid_schemes_.
const int kValidSchemeMasks[] = {
  URLPattern::SCHEME_HTTP,
  URLPattern::SCHEME_HTTPS,
  URLPattern::SCHEME_FILE,
  URLPattern::SCHEME_FTP,
  URLPattern::SCHEME_CHROMEUI,
  URLPattern::SCHEME_EXTENSION,
  URLPattern::SCHEME_FILESYSTEM,
};

// Fails the build if the two parallel arrays ever differ in length.
COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
               must_keep_these_arrays_in_sync);
48
// Human-readable text for each parse outcome. These are collected, in order,
// into kParseResultMessages below.
const char kParseSuccess[] = "Success.";
const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
const char kParseErrorInvalidScheme[] = "Invalid scheme.";
const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
const char kParseErrorEmptyHost[] = "Host can not be empty.";
const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
const char kParseErrorEmptyPath[] = "Empty path.";
const char kParseErrorInvalidPort[] = "Invalid port.";
const char kParseErrorInvalidHost[] = "Invalid host.";
58
// Message explaining each URLPattern::ParseResult. GetParseResultString()
// indexes this table directly with the ParseResult value, so the entries must
// stay in the same order as the enum.
const char* const kParseResultMessages[] = {
  kParseSuccess,
  kParseErrorMissingSchemeSeparator,
  kParseErrorInvalidScheme,
  kParseErrorWrongSchemeType,
  kParseErrorEmptyHost,
  kParseErrorInvalidHostWildcard,
  kParseErrorEmptyPath,
  kParseErrorInvalidPort,
  kParseErrorInvalidHost,
};

// Fails the build if a ParseResult is added without a matching message.
COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
               must_add_message_for_each_parse_result);
74
// Separator Parse() searches for to find where the host ends and the path
// begins.
const char kPathSeparator[] = "/";
76
77bool IsStandardScheme(const std::string& scheme) {
78  // "*" gets the same treatment as a standard scheme.
79  if (scheme == "*")
80    return true;
81
82  return url::IsStandard(scheme.c_str(),
83                         url::Component(0, static_cast<int>(scheme.length())));
84}
85
86bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
87  if (port == "*")
88    return true;
89
90  // Only accept non-wildcard ports if the scheme uses ports.
91  if (url::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
92      url::PORT_UNSPECIFIED) {
93    return false;
94  }
95
96  int parsed_port = url::PORT_UNSPECIFIED;
97  if (!base::StringToInt(port, &parsed_port))
98    return false;
99  return (parsed_port >= 0) && (parsed_port < 65536);
100}
101
102// Returns |path| with the trailing wildcard stripped if one existed.
103//
104// The functions that rely on this (OverlapsWith and Contains) are only
105// called for the patterns inside URLPatternSet. In those cases, we know that
106// the path will have only a single wildcard at the end. This makes figuring
107// out overlap much easier. It seems like there is probably a computer-sciency
108// way to solve the general case, but we don't need that yet.
109std::string StripTrailingWildcard(const std::string& path) {
110  size_t wildcard_index = path.find('*');
111  size_t path_last = path.size() - 1;
112  DCHECK(wildcard_index == std::string::npos || wildcard_index == path_last);
113  return wildcard_index == path_last ? path.substr(0, path_last) : path;
114}
115
116}  // namespace
117
118// static
119bool URLPattern::IsValidSchemeForExtensions(const std::string& scheme) {
120  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
121    if (scheme == kValidSchemes[i])
122      return true;
123  }
124  return false;
125}
126
// Creates a pattern that allows no schemes (SCHEME_NONE), with both match
// flags off and the wildcard port.
URLPattern::URLPattern()
    : valid_schemes_(SCHEME_NONE),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {}
132
// Creates an unparsed pattern restricted to the |valid_schemes| bitmask, with
// both match flags off and the wildcard port.
URLPattern::URLPattern(int valid_schemes)
    : valid_schemes_(valid_schemes),
      match_all_urls_(false),
      match_subdomains_(false),
      port_("*") {}
138
139URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
140    // Strict error checking is used, because this constructor is only
141    // appropriate when we know |pattern| is valid.
142    : valid_schemes_(valid_schemes),
143      match_all_urls_(false),
144      match_subdomains_(false),
145      port_("*") {
146  ParseResult result = Parse(pattern);
147  if (PARSE_SUCCESS != result)
148    NOTREACHED() << "URLPattern invalid: " << pattern << " result " << result;
149}
150
// Nothing to release explicitly; the body is intentionally empty.
URLPattern::~URLPattern() {
}
153
154bool URLPattern::operator<(const URLPattern& other) const {
155  return GetAsString() < other.GetAsString();
156}
157
158bool URLPattern::operator>(const URLPattern& other) const {
159  return GetAsString() > other.GetAsString();
160}
161
162bool URLPattern::operator==(const URLPattern& other) const {
163  return GetAsString() == other.GetAsString();
164}
165
166std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern) {
167  return out << '"' << url_pattern.GetAsString() << '"';
168}
169
170URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
171  spec_.clear();
172  SetMatchAllURLs(false);
173  SetMatchSubdomains(false);
174  SetPort("*");
175
176  // Special case pattern to match every valid URL.
177  if (pattern == kAllUrlsPattern) {
178    SetMatchAllURLs(true);
179    return PARSE_SUCCESS;
180  }
181
182  // Parse out the scheme.
183  size_t scheme_end_pos = pattern.find(url::kStandardSchemeSeparator);
184  bool has_standard_scheme_separator = true;
185
186  // Some urls also use ':' alone as the scheme separator.
187  if (scheme_end_pos == std::string::npos) {
188    scheme_end_pos = pattern.find(':');
189    has_standard_scheme_separator = false;
190  }
191
192  if (scheme_end_pos == std::string::npos)
193    return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
194
195  if (!SetScheme(pattern.substr(0, scheme_end_pos)))
196    return PARSE_ERROR_INVALID_SCHEME;
197
198  bool standard_scheme = IsStandardScheme(scheme_);
199  if (standard_scheme != has_standard_scheme_separator)
200    return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
201
202  // Advance past the scheme separator.
203  scheme_end_pos +=
204      (standard_scheme ? strlen(url::kStandardSchemeSeparator) : 1);
205  if (scheme_end_pos >= pattern.size())
206    return PARSE_ERROR_EMPTY_HOST;
207
208  // Parse out the host and path.
209  size_t host_start_pos = scheme_end_pos;
210  size_t path_start_pos = 0;
211
212  if (!standard_scheme) {
213    path_start_pos = host_start_pos;
214  } else if (scheme_ == url::kFileScheme) {
215    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
216    if (host_end_pos == std::string::npos) {
217      // Allow hostname omission.
218      // e.g. file://* is interpreted as file:///*,
219      // file://foo* is interpreted as file:///foo*.
220      path_start_pos = host_start_pos - 1;
221    } else {
222      // Ignore hostname if scheme is file://.
223      // e.g. file://localhost/foo is equal to file:///foo.
224      path_start_pos = host_end_pos;
225    }
226  } else {
227    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
228
229    // Host is required.
230    if (host_start_pos == host_end_pos)
231      return PARSE_ERROR_EMPTY_HOST;
232
233    if (host_end_pos == std::string::npos)
234      return PARSE_ERROR_EMPTY_PATH;
235
236    host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
237
238    // The first component can optionally be '*' to match all subdomains.
239    std::vector<std::string> host_components;
240    base::SplitString(host_, '.', &host_components);
241
242    // Could be empty if the host only consists of whitespace characters.
243    if (host_components.empty())
244      return PARSE_ERROR_EMPTY_HOST;
245
246    if (host_components[0] == "*") {
247      match_subdomains_ = true;
248      host_components.erase(host_components.begin(),
249                            host_components.begin() + 1);
250    }
251    host_ = JoinString(host_components, '.');
252
253    path_start_pos = host_end_pos;
254  }
255
256  SetPath(pattern.substr(path_start_pos));
257
258  size_t port_pos = host_.find(':');
259  if (port_pos != std::string::npos) {
260    if (!SetPort(host_.substr(port_pos + 1)))
261      return PARSE_ERROR_INVALID_PORT;
262    host_ = host_.substr(0, port_pos);
263  }
264
265  // No other '*' can occur in the host, though. This isn't necessary, but is
266  // done as a convenience to developers who might otherwise be confused and
267  // think '*' works as a glob in the host.
268  if (host_.find('*') != std::string::npos)
269    return PARSE_ERROR_INVALID_HOST_WILDCARD;
270
271  // Null characters are not allowed in hosts.
272  if (host_.find('\0') != std::string::npos)
273    return PARSE_ERROR_INVALID_HOST;
274
275  return PARSE_SUCCESS;
276}
277
278void URLPattern::SetValidSchemes(int valid_schemes) {
279  spec_.clear();
280  valid_schemes_ = valid_schemes;
281}
282
283void URLPattern::SetHost(const std::string& host) {
284  spec_.clear();
285  host_ = host;
286}
287
288void URLPattern::SetMatchAllURLs(bool val) {
289  spec_.clear();
290  match_all_urls_ = val;
291
292  if (val) {
293    match_subdomains_ = true;
294    scheme_ = "*";
295    host_.clear();
296    SetPath("/*");
297  }
298}
299
300void URLPattern::SetMatchSubdomains(bool val) {
301  spec_.clear();
302  match_subdomains_ = val;
303}
304
305bool URLPattern::SetScheme(const std::string& scheme) {
306  spec_.clear();
307  scheme_ = scheme;
308  if (scheme_ == "*") {
309    valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
310  } else if (!IsValidScheme(scheme_)) {
311    return false;
312  }
313  return true;
314}
315
316bool URLPattern::IsValidScheme(const std::string& scheme) const {
317  if (valid_schemes_ == SCHEME_ALL)
318    return true;
319
320  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
321    if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
322      return true;
323  }
324
325  return false;
326}
327
328void URLPattern::SetPath(const std::string& path) {
329  spec_.clear();
330  path_ = path;
331  path_escaped_ = path_;
332  ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
333  ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
334}
335
336bool URLPattern::SetPort(const std::string& port) {
337  spec_.clear();
338  if (IsValidPortForScheme(scheme_, port)) {
339    port_ = port;
340    return true;
341  }
342  return false;
343}
344
345bool URLPattern::MatchesURL(const GURL& test) const {
346  const GURL* test_url = &test;
347  bool has_inner_url = test.inner_url() != NULL;
348
349  if (has_inner_url) {
350    if (!test.SchemeIsFileSystem())
351      return false;  // The only nested URLs we handle are filesystem URLs.
352    test_url = test.inner_url();
353  }
354
355  if (!MatchesScheme(test_url->scheme()))
356    return false;
357
358  if (match_all_urls_)
359    return true;
360
361  std::string path_for_request = test.PathForRequest();
362  if (has_inner_url)
363    path_for_request = test_url->path() + path_for_request;
364
365  return MatchesSecurityOriginHelper(*test_url) &&
366         MatchesPath(path_for_request);
367}
368
369bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
370  const GURL* test_url = &test;
371  bool has_inner_url = test.inner_url() != NULL;
372
373  if (has_inner_url) {
374    if (!test.SchemeIsFileSystem())
375      return false;  // The only nested URLs we handle are filesystem URLs.
376    test_url = test.inner_url();
377  }
378
379  if (!MatchesScheme(test_url->scheme()))
380    return false;
381
382  if (match_all_urls_)
383    return true;
384
385  return MatchesSecurityOriginHelper(*test_url);
386}
387
388bool URLPattern::MatchesScheme(const std::string& test) const {
389  if (!IsValidScheme(test))
390    return false;
391
392  return scheme_ == "*" || test == scheme_;
393}
394
395bool URLPattern::MatchesHost(const std::string& host) const {
396  std::string test(url::kHttpScheme);
397  test += url::kStandardSchemeSeparator;
398  test += host;
399  test += "/";
400  return MatchesHost(GURL(test));
401}
402
403bool URLPattern::MatchesHost(const GURL& test) const {
404  // If the hosts are exactly equal, we have a match.
405  if (test.host() == host_)
406    return true;
407
408  // If we're matching subdomains, and we have no host in the match pattern,
409  // that means that we're matching all hosts, which means we have a match no
410  // matter what the test host is.
411  if (match_subdomains_ && host_.empty())
412    return true;
413
414  // Otherwise, we can only match if our match pattern matches subdomains.
415  if (!match_subdomains_)
416    return false;
417
418  // We don't do subdomain matching against IP addresses, so we can give up now
419  // if the test host is an IP address.
420  if (test.HostIsIPAddress())
421    return false;
422
423  // Check if the test host is a subdomain of our host.
424  if (test.host().length() <= (host_.length() + 1))
425    return false;
426
427  if (test.host().compare(test.host().length() - host_.length(),
428                          host_.length(), host_) != 0)
429    return false;
430
431  return test.host()[test.host().length() - host_.length() - 1] == '.';
432}
433
434bool URLPattern::ImpliesAllHosts() const {
435  // Check if it matches all urls or is a pattern like http://*/*.
436  if (match_all_urls_ ||
437      (match_subdomains_ && host_.empty() && port_ == "*" && path_ == "/*")) {
438    return true;
439  }
440
441  // If this doesn't even match subdomains, it can't possibly imply all hosts.
442  if (!match_subdomains_)
443    return false;
444
445  // If |host_| is a recognized TLD, this will be 0. We don't include private
446  // TLDs, so that, e.g., *.appspot.com does not imply all hosts.
447  size_t registry_length = net::registry_controlled_domains::GetRegistryLength(
448      host_,
449      net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
450      net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
451  // If there was more than just a TLD in the host (e.g., *.foobar.com), it
452  // doesn't imply all hosts.
453  if (registry_length > 0)
454    return false;
455
456  // At this point the host could either be just a TLD ("com") or some unknown
457  // TLD-like string ("notatld"). To disambiguate between them construct a
458  // fake URL, and check the registry. This returns 0 if the TLD is
459  // unrecognized, or the length of the recognized TLD.
460  registry_length = net::registry_controlled_domains::GetRegistryLength(
461      base::StringPrintf("foo.%s", host_.c_str()),
462      net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
463      net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
464  // If we recognized this TLD, then this is a pattern like *.com, and it
465  // should imply all hosts. Otherwise, this doesn't imply all hosts.
466  return registry_length > 0;
467}
468
469bool URLPattern::MatchesSingleOrigin() const {
470  // Strictly speaking, the port is part of the origin, but in URLPattern it
471  // defaults to *. It's not very interesting anyway, so leave it out.
472  return !ImpliesAllHosts() && scheme_ != "*" && !match_subdomains_;
473}
474
475bool URLPattern::MatchesPath(const std::string& test) const {
476  // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
477  // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
478  if (test + "/*" == path_escaped_)
479    return true;
480
481  return MatchPattern(test, path_escaped_);
482}
483
484const std::string& URLPattern::GetAsString() const {
485  if (!spec_.empty())
486    return spec_;
487
488  if (match_all_urls_) {
489    spec_ = kAllUrlsPattern;
490    return spec_;
491  }
492
493  bool standard_scheme = IsStandardScheme(scheme_);
494
495  std::string spec = scheme_ +
496      (standard_scheme ? url::kStandardSchemeSeparator : ":");
497
498  if (scheme_ != url::kFileScheme && standard_scheme) {
499    if (match_subdomains_) {
500      spec += "*";
501      if (!host_.empty())
502        spec += ".";
503    }
504
505    if (!host_.empty())
506      spec += host_;
507
508    if (port_ != "*") {
509      spec += ":";
510      spec += port_;
511    }
512  }
513
514  if (!path_.empty())
515    spec += path_;
516
517  spec_ = spec;
518  return spec_;
519}
520
521bool URLPattern::OverlapsWith(const URLPattern& other) const {
522  if (match_all_urls() || other.match_all_urls())
523    return true;
524  return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
525          other.MatchesAnyScheme(GetExplicitSchemes()))
526      && (MatchesHost(other.host()) || other.MatchesHost(host()))
527      && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
528      && (MatchesPath(StripTrailingWildcard(other.path())) ||
529          other.MatchesPath(StripTrailingWildcard(path())));
530}
531
532bool URLPattern::Contains(const URLPattern& other) const {
533  if (match_all_urls())
534    return true;
535  return MatchesAllSchemes(other.GetExplicitSchemes())
536      && MatchesHost(other.host())
537      && MatchesPortPattern(other.port())
538      && MatchesPath(StripTrailingWildcard(other.path()));
539}
540
541bool URLPattern::MatchesAnyScheme(
542    const std::vector<std::string>& schemes) const {
543  for (std::vector<std::string>::const_iterator i = schemes.begin();
544       i != schemes.end(); ++i) {
545    if (MatchesScheme(*i))
546      return true;
547  }
548
549  return false;
550}
551
552bool URLPattern::MatchesAllSchemes(
553    const std::vector<std::string>& schemes) const {
554  for (std::vector<std::string>::const_iterator i = schemes.begin();
555       i != schemes.end(); ++i) {
556    if (!MatchesScheme(*i))
557      return false;
558  }
559
560  return true;
561}
562
563bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
564  // Ignore hostname if scheme is file://.
565  if (scheme_ != url::kFileScheme && !MatchesHost(test))
566    return false;
567
568  if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
569    return false;
570
571  return true;
572}
573
574bool URLPattern::MatchesPortPattern(const std::string& port) const {
575  return port_ == "*" || port_ == port;
576}
577
578std::vector<std::string> URLPattern::GetExplicitSchemes() const {
579  std::vector<std::string> result;
580
581  if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
582    result.push_back(scheme_);
583    return result;
584  }
585
586  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
587    if (MatchesScheme(kValidSchemes[i])) {
588      result.push_back(kValidSchemes[i]);
589    }
590  }
591
592  return result;
593}
594
595std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
596  std::vector<std::string> explicit_schemes = GetExplicitSchemes();
597  std::vector<URLPattern> result;
598
599  for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
600       i != explicit_schemes.end(); ++i) {
601    URLPattern temp = *this;
602    temp.SetScheme(*i);
603    temp.SetMatchAllURLs(false);
604    result.push_back(temp);
605  }
606
607  return result;
608}
609
610// static
611const char* URLPattern::GetParseResultString(
612    URLPattern::ParseResult parse_result) {
613  return kParseResultMessages[parse_result];
614}
615