url_pattern.cc revision a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7
1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "extensions/common/url_pattern.h"
6
7#include "base/strings/string_number_conversions.h"
8#include "base/strings/string_piece.h"
9#include "base/strings/string_split.h"
10#include "base/strings/string_util.h"
11#include "content/public/common/url_constants.h"
12#include "extensions/common/constants.h"
13#include "url/gurl.h"
14#include "url/url_util.h"
15
// Special pattern literal. Parse() treats a pattern that is exactly equal to
// this string as "match every URL" instead of parsing scheme/host/path.
const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
17
18namespace {
19
20// TODO(aa): What about more obscure schemes like data: and javascript: ?
21// Note: keep this array in sync with kValidSchemeMasks.
22const char* kValidSchemes[] = {
23  content::kHttpScheme,
24  content::kHttpsScheme,
25  chrome::kFileScheme,
26  content::kFtpScheme,
27  chrome::kChromeUIScheme,
28  extensions::kExtensionScheme,
29  chrome::kFileSystemScheme,
30};
31
32const int kValidSchemeMasks[] = {
33  URLPattern::SCHEME_HTTP,
34  URLPattern::SCHEME_HTTPS,
35  URLPattern::SCHEME_FILE,
36  URLPattern::SCHEME_FTP,
37  URLPattern::SCHEME_CHROMEUI,
38  URLPattern::SCHEME_EXTENSION,
39  URLPattern::SCHEME_FILESYSTEM,
40};
41
42COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
43               must_keep_these_arrays_in_sync);
44
45const char kParseSuccess[] = "Success.";
46const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
47const char kParseErrorInvalidScheme[] = "Invalid scheme.";
48const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
49const char kParseErrorEmptyHost[] = "Host can not be empty.";
50const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
51const char kParseErrorEmptyPath[] = "Empty path.";
52const char kParseErrorInvalidPort[] = "Invalid port.";
53
54// Message explaining each URLPattern::ParseResult.
55const char* const kParseResultMessages[] = {
56  kParseSuccess,
57  kParseErrorMissingSchemeSeparator,
58  kParseErrorInvalidScheme,
59  kParseErrorWrongSchemeType,
60  kParseErrorEmptyHost,
61  kParseErrorInvalidHostWildcard,
62  kParseErrorEmptyPath,
63  kParseErrorInvalidPort,
64};
65
66COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
67               must_add_message_for_each_parse_result);
68
// Character sequence that ends the host part of a pattern; Parse() searches
// for it to locate where the path begins.
const char kPathSeparator[] = "/";
70
71bool IsStandardScheme(const std::string& scheme) {
72  // "*" gets the same treatment as a standard scheme.
73  if (scheme == "*")
74    return true;
75
76  return url_util::IsStandard(scheme.c_str(),
77      url_parse::Component(0, static_cast<int>(scheme.length())));
78}
79
80bool IsValidPortForScheme(const std::string& scheme, const std::string& port) {
81  if (port == "*")
82    return true;
83
84  // Only accept non-wildcard ports if the scheme uses ports.
85  if (url_canon::DefaultPortForScheme(scheme.c_str(), scheme.length()) ==
86      url_parse::PORT_UNSPECIFIED) {
87    return false;
88  }
89
90  int parsed_port = url_parse::PORT_UNSPECIFIED;
91  if (!base::StringToInt(port, &parsed_port))
92    return false;
93  return (parsed_port >= 0) && (parsed_port < 65536);
94}
95
// Returns a copy of |path| without its trailing '*', if it has one.
//
// Only OverlapsWith() and Contains() depend on this, and they are only
// called for the patterns inside URLPatternSet. Those paths carry at most one
// wildcard, and only as the final character, which keeps the overlap check
// simple. A general solution for arbitrary wildcards is not needed yet.
//
// A path whose first '*' is not the last character violates that assumption:
// it trips the DCHECK and is returned unchanged.
std::string StripTrailingWildcard(const std::string& path) {
  const size_t first_wildcard = path.find('*');
  if (first_wildcard == std::string::npos)
    return path;  // Nothing to strip (this also covers the empty path).

  const bool wildcard_is_last = (first_wildcard + 1 == path.size());
  DCHECK(wildcard_is_last);
  if (!wildcard_is_last)
    return path;

  return path.substr(0, first_wildcard);
}
109
110}  // namespace
111
112URLPattern::URLPattern()
113    : valid_schemes_(SCHEME_NONE),
114      match_all_urls_(false),
115      match_subdomains_(false),
116      port_("*") {}
117
118URLPattern::URLPattern(int valid_schemes)
119    : valid_schemes_(valid_schemes),
120      match_all_urls_(false),
121      match_subdomains_(false),
122      port_("*") {}
123
124URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
125    // Strict error checking is used, because this constructor is only
126    // appropriate when we know |pattern| is valid.
127    : valid_schemes_(valid_schemes),
128      match_all_urls_(false),
129      match_subdomains_(false),
130      port_("*") {
131  if (PARSE_SUCCESS != Parse(pattern))
132    NOTREACHED() << "URLPattern is invalid: " << pattern;
133}
134
135URLPattern::~URLPattern() {
136}
137
138bool URLPattern::operator<(const URLPattern& other) const {
139  return GetAsString() < other.GetAsString();
140}
141
142bool URLPattern::operator>(const URLPattern& other) const {
143  return GetAsString() > other.GetAsString();
144}
145
146bool URLPattern::operator==(const URLPattern& other) const {
147  return GetAsString() == other.GetAsString();
148}
149
150URLPattern::ParseResult URLPattern::Parse(const std::string& pattern) {
151  spec_.clear();
152  SetMatchAllURLs(false);
153  SetMatchSubdomains(false);
154  SetPort("*");
155
156  // Special case pattern to match every valid URL.
157  if (pattern == kAllUrlsPattern) {
158    SetMatchAllURLs(true);
159    return PARSE_SUCCESS;
160  }
161
162  // Parse out the scheme.
163  size_t scheme_end_pos = pattern.find(content::kStandardSchemeSeparator);
164  bool has_standard_scheme_separator = true;
165
166  // Some urls also use ':' alone as the scheme separator.
167  if (scheme_end_pos == std::string::npos) {
168    scheme_end_pos = pattern.find(':');
169    has_standard_scheme_separator = false;
170  }
171
172  if (scheme_end_pos == std::string::npos)
173    return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
174
175  if (!SetScheme(pattern.substr(0, scheme_end_pos)))
176    return PARSE_ERROR_INVALID_SCHEME;
177
178  bool standard_scheme = IsStandardScheme(scheme_);
179  if (standard_scheme != has_standard_scheme_separator)
180    return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
181
182  // Advance past the scheme separator.
183  scheme_end_pos +=
184      (standard_scheme ? strlen(content::kStandardSchemeSeparator) : 1);
185  if (scheme_end_pos >= pattern.size())
186    return PARSE_ERROR_EMPTY_HOST;
187
188  // Parse out the host and path.
189  size_t host_start_pos = scheme_end_pos;
190  size_t path_start_pos = 0;
191
192  if (!standard_scheme) {
193    path_start_pos = host_start_pos;
194  } else if (scheme_ == chrome::kFileScheme) {
195    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
196    if (host_end_pos == std::string::npos) {
197      // Allow hostname omission.
198      // e.g. file://* is interpreted as file:///*,
199      // file://foo* is interpreted as file:///foo*.
200      path_start_pos = host_start_pos - 1;
201    } else {
202      // Ignore hostname if scheme is file://.
203      // e.g. file://localhost/foo is equal to file:///foo.
204      path_start_pos = host_end_pos;
205    }
206  } else {
207    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
208
209    // Host is required.
210    if (host_start_pos == host_end_pos)
211      return PARSE_ERROR_EMPTY_HOST;
212
213    if (host_end_pos == std::string::npos)
214      return PARSE_ERROR_EMPTY_PATH;
215
216    host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
217
218    // The first component can optionally be '*' to match all subdomains.
219    std::vector<std::string> host_components;
220    base::SplitString(host_, '.', &host_components);
221    if (host_components[0] == "*") {
222      match_subdomains_ = true;
223      host_components.erase(host_components.begin(),
224                            host_components.begin() + 1);
225    }
226    host_ = JoinString(host_components, '.');
227
228    path_start_pos = host_end_pos;
229  }
230
231  SetPath(pattern.substr(path_start_pos));
232
233  size_t port_pos = host_.find(':');
234  if (port_pos != std::string::npos) {
235    if (!SetPort(host_.substr(port_pos + 1)))
236      return PARSE_ERROR_INVALID_PORT;
237    host_ = host_.substr(0, port_pos);
238  }
239
240  // No other '*' can occur in the host, though. This isn't necessary, but is
241  // done as a convenience to developers who might otherwise be confused and
242  // think '*' works as a glob in the host.
243  if (host_.find('*') != std::string::npos)
244    return PARSE_ERROR_INVALID_HOST_WILDCARD;
245
246  return PARSE_SUCCESS;
247}
248
249void URLPattern::SetValidSchemes(int valid_schemes) {
250  spec_.clear();
251  valid_schemes_ = valid_schemes;
252}
253
254void URLPattern::SetHost(const std::string& host) {
255  spec_.clear();
256  host_ = host;
257}
258
259void URLPattern::SetMatchAllURLs(bool val) {
260  spec_.clear();
261  match_all_urls_ = val;
262
263  if (val) {
264    match_subdomains_ = true;
265    scheme_ = "*";
266    host_.clear();
267    SetPath("/*");
268  }
269}
270
271void URLPattern::SetMatchSubdomains(bool val) {
272  spec_.clear();
273  match_subdomains_ = val;
274}
275
276bool URLPattern::SetScheme(const std::string& scheme) {
277  spec_.clear();
278  scheme_ = scheme;
279  if (scheme_ == "*") {
280    valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
281  } else if (!IsValidScheme(scheme_)) {
282    return false;
283  }
284  return true;
285}
286
287bool URLPattern::IsValidScheme(const std::string& scheme) const {
288  if (valid_schemes_ == SCHEME_ALL)
289    return true;
290
291  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
292    if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
293      return true;
294  }
295
296  return false;
297}
298
299void URLPattern::SetPath(const std::string& path) {
300  spec_.clear();
301  path_ = path;
302  path_escaped_ = path_;
303  ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
304  ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
305}
306
307bool URLPattern::SetPort(const std::string& port) {
308  spec_.clear();
309  if (IsValidPortForScheme(scheme_, port)) {
310    port_ = port;
311    return true;
312  }
313  return false;
314}
315
316bool URLPattern::MatchesURL(const GURL& test) const {
317  const GURL* test_url = &test;
318  bool has_inner_url = test.inner_url() != NULL;
319
320  if (has_inner_url) {
321    if (!test.SchemeIsFileSystem())
322      return false;  // The only nested URLs we handle are filesystem URLs.
323    test_url = test.inner_url();
324  }
325
326  if (!MatchesScheme(test_url->scheme()))
327    return false;
328
329  if (match_all_urls_)
330    return true;
331
332  std::string path_for_request = test.PathForRequest();
333  if (has_inner_url)
334    path_for_request = test_url->path() + path_for_request;
335
336  return MatchesSecurityOriginHelper(*test_url) &&
337         MatchesPath(path_for_request);
338}
339
340bool URLPattern::MatchesSecurityOrigin(const GURL& test) const {
341  const GURL* test_url = &test;
342  bool has_inner_url = test.inner_url() != NULL;
343
344  if (has_inner_url) {
345    if (!test.SchemeIsFileSystem())
346      return false;  // The only nested URLs we handle are filesystem URLs.
347    test_url = test.inner_url();
348  }
349
350  if (!MatchesScheme(test_url->scheme()))
351    return false;
352
353  if (match_all_urls_)
354    return true;
355
356  return MatchesSecurityOriginHelper(*test_url);
357}
358
359bool URLPattern::MatchesScheme(const std::string& test) const {
360  if (!IsValidScheme(test))
361    return false;
362
363  return scheme_ == "*" || test == scheme_;
364}
365
366bool URLPattern::MatchesHost(const std::string& host) const {
367  std::string test(content::kHttpScheme);
368  test += content::kStandardSchemeSeparator;
369  test += host;
370  test += "/";
371  return MatchesHost(GURL(test));
372}
373
374bool URLPattern::MatchesHost(const GURL& test) const {
375  // If the hosts are exactly equal, we have a match.
376  if (test.host() == host_)
377    return true;
378
379  // If we're matching subdomains, and we have no host in the match pattern,
380  // that means that we're matching all hosts, which means we have a match no
381  // matter what the test host is.
382  if (match_subdomains_ && host_.empty())
383    return true;
384
385  // Otherwise, we can only match if our match pattern matches subdomains.
386  if (!match_subdomains_)
387    return false;
388
389  // We don't do subdomain matching against IP addresses, so we can give up now
390  // if the test host is an IP address.
391  if (test.HostIsIPAddress())
392    return false;
393
394  // Check if the test host is a subdomain of our host.
395  if (test.host().length() <= (host_.length() + 1))
396    return false;
397
398  if (test.host().compare(test.host().length() - host_.length(),
399                          host_.length(), host_) != 0)
400    return false;
401
402  return test.host()[test.host().length() - host_.length() - 1] == '.';
403}
404
405bool URLPattern::MatchesPath(const std::string& test) const {
406  // Make the behaviour of OverlapsWith consistent with MatchesURL, which is
407  // need to match hosted apps on e.g. 'google.com' also run on 'google.com/'.
408  if (test + "/*" == path_escaped_)
409    return true;
410
411  return MatchPattern(test, path_escaped_);
412}
413
414const std::string& URLPattern::GetAsString() const {
415  if (!spec_.empty())
416    return spec_;
417
418  if (match_all_urls_) {
419    spec_ = kAllUrlsPattern;
420    return spec_;
421  }
422
423  bool standard_scheme = IsStandardScheme(scheme_);
424
425  std::string spec = scheme_ +
426      (standard_scheme ? content::kStandardSchemeSeparator : ":");
427
428  if (scheme_ != chrome::kFileScheme && standard_scheme) {
429    if (match_subdomains_) {
430      spec += "*";
431      if (!host_.empty())
432        spec += ".";
433    }
434
435    if (!host_.empty())
436      spec += host_;
437
438    if (port_ != "*") {
439      spec += ":";
440      spec += port_;
441    }
442  }
443
444  if (!path_.empty())
445    spec += path_;
446
447  spec_ = spec;
448  return spec_;
449}
450
451bool URLPattern::OverlapsWith(const URLPattern& other) const {
452  if (match_all_urls() || other.match_all_urls())
453    return true;
454  return (MatchesAnyScheme(other.GetExplicitSchemes()) ||
455          other.MatchesAnyScheme(GetExplicitSchemes()))
456      && (MatchesHost(other.host()) || other.MatchesHost(host()))
457      && (MatchesPortPattern(other.port()) || other.MatchesPortPattern(port()))
458      && (MatchesPath(StripTrailingWildcard(other.path())) ||
459          other.MatchesPath(StripTrailingWildcard(path())));
460}
461
462bool URLPattern::Contains(const URLPattern& other) const {
463  if (match_all_urls())
464    return true;
465  return MatchesAllSchemes(other.GetExplicitSchemes())
466      && MatchesHost(other.host())
467      && MatchesPortPattern(other.port())
468      && MatchesPath(StripTrailingWildcard(other.path()));
469}
470
471bool URLPattern::MatchesAnyScheme(
472    const std::vector<std::string>& schemes) const {
473  for (std::vector<std::string>::const_iterator i = schemes.begin();
474       i != schemes.end(); ++i) {
475    if (MatchesScheme(*i))
476      return true;
477  }
478
479  return false;
480}
481
482bool URLPattern::MatchesAllSchemes(
483    const std::vector<std::string>& schemes) const {
484  for (std::vector<std::string>::const_iterator i = schemes.begin();
485       i != schemes.end(); ++i) {
486    if (!MatchesScheme(*i))
487      return false;
488  }
489
490  return true;
491}
492
493bool URLPattern::MatchesSecurityOriginHelper(const GURL& test) const {
494  // Ignore hostname if scheme is file://.
495  if (scheme_ != chrome::kFileScheme && !MatchesHost(test))
496    return false;
497
498  if (!MatchesPortPattern(base::IntToString(test.EffectiveIntPort())))
499    return false;
500
501  return true;
502}
503
504bool URLPattern::MatchesPortPattern(const std::string& port) const {
505  return port_ == "*" || port_ == port;
506}
507
508std::vector<std::string> URLPattern::GetExplicitSchemes() const {
509  std::vector<std::string> result;
510
511  if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
512    result.push_back(scheme_);
513    return result;
514  }
515
516  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
517    if (MatchesScheme(kValidSchemes[i])) {
518      result.push_back(kValidSchemes[i]);
519    }
520  }
521
522  return result;
523}
524
525std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
526  std::vector<std::string> explicit_schemes = GetExplicitSchemes();
527  std::vector<URLPattern> result;
528
529  for (std::vector<std::string>::const_iterator i = explicit_schemes.begin();
530       i != explicit_schemes.end(); ++i) {
531    URLPattern temp = *this;
532    temp.SetScheme(*i);
533    temp.SetMatchAllURLs(false);
534    result.push_back(temp);
535  }
536
537  return result;
538}
539
540// static
541const char* URLPattern::GetParseResultString(
542    URLPattern::ParseResult parse_result) {
543  return kParseResultMessages[parse_result];
544}
545