1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/common/extensions/url_pattern.h"
6
7#include "base/string_piece.h"
8#include "base/string_split.h"
9#include "base/string_util.h"
10#include "chrome/common/url_constants.h"
11#include "googleurl/src/gurl.h"
12#include "googleurl/src/url_util.h"
13
// Special pattern string. Parse() recognizes it and marks the pattern as
// matching all URLs; MatchesUrl() still requires the scheme check to pass.
const char URLPattern::kAllUrlsPattern[] = "<all_urls>";
15
16namespace {
17
18// TODO(aa): Consider adding chrome-extension? What about more obscure ones
19// like data: and javascript: ?
20// Note: keep this array in sync with kValidSchemeMasks.
21const char* kValidSchemes[] = {
22  chrome::kHttpScheme,
23  chrome::kHttpsScheme,
24  chrome::kFileScheme,
25  chrome::kFtpScheme,
26  chrome::kChromeUIScheme,
27  chrome::kFileSystemScheme,
28};
29
30const int kValidSchemeMasks[] = {
31  URLPattern::SCHEME_HTTP,
32  URLPattern::SCHEME_HTTPS,
33  URLPattern::SCHEME_FILE,
34  URLPattern::SCHEME_FTP,
35  URLPattern::SCHEME_CHROMEUI,
36  URLPattern::SCHEME_FILESYSTEM,
37};
38
39COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
40               must_keep_these_arrays_in_sync);
41
// Human-readable text for each URLPattern::ParseResult. These are character
// arrays rather than mutable pointers so that the table below is built from
// address constants and needs no run-time initialization.
const char kParseSuccess[] = "Success.";
const char kParseErrorMissingSchemeSeparator[] = "Missing scheme separator.";
const char kParseErrorInvalidScheme[] = "Invalid scheme.";
const char kParseErrorWrongSchemeType[] = "Wrong scheme type.";
const char kParseErrorEmptyHost[] = "Host can not be empty.";
const char kParseErrorInvalidHostWildcard[] = "Invalid host wildcard.";
const char kParseErrorEmptyPath[] = "Empty path.";
const char kParseErrorHasColon[] =
    "Ports are not supported in URL patterns. ':' may not be used in a host.";

// Message explaining each URLPattern::ParseResult. The table is indexed by the
// ParseResult value, so the order of the entries matters. It is read-only:
// neither the strings nor the slots may be modified.
const char* const kParseResultMessages[] = {
  kParseSuccess,
  kParseErrorMissingSchemeSeparator,
  kParseErrorInvalidScheme,
  kParseErrorWrongSchemeType,
  kParseErrorEmptyHost,
  kParseErrorInvalidHostWildcard,
  kParseErrorEmptyPath,
  kParseErrorHasColon
};
63
// kParseResultMessages is indexed by ParseResult, so it needs exactly one
// entry per enum value. This breaks the build if the two get out of step.
COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
               must_add_message_for_each_parse_result);
66
// Parse() searches for this after the scheme separator to find where the host
// ends and the path begins.
const char kPathSeparator[] = "/";
68
69bool IsStandardScheme(const std::string& scheme) {
70  // "*" gets the same treatment as a standard scheme.
71  if (scheme == "*")
72    return true;
73
74  return url_util::IsStandard(scheme.c_str(),
75      url_parse::Component(0, static_cast<int>(scheme.length())));
76}
77
78}  // namespace
79
80URLPattern::URLPattern()
81    : valid_schemes_(SCHEME_NONE),
82      match_all_urls_(false),
83      match_subdomains_(false) {}
84
85URLPattern::URLPattern(int valid_schemes)
86    : valid_schemes_(valid_schemes), match_all_urls_(false),
87      match_subdomains_(false) {}
88
89URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
90    : valid_schemes_(valid_schemes), match_all_urls_(false),
91      match_subdomains_(false) {
92
93  // Strict error checking is used, because this constructor is only
94  // appropriate when we know |pattern| is valid.
95  if (PARSE_SUCCESS != Parse(pattern, PARSE_STRICT))
96    NOTREACHED() << "URLPattern is invalid: " << pattern;
97}
98
99URLPattern::~URLPattern() {
100}
101
102URLPattern::ParseResult URLPattern::Parse(const std::string& pattern,
103                                          ParseOption strictness) {
104  CHECK(strictness == PARSE_LENIENT ||
105        strictness == PARSE_STRICT);
106
107  // Special case pattern to match every valid URL.
108  if (pattern == kAllUrlsPattern) {
109    match_all_urls_ = true;
110    match_subdomains_ = true;
111    scheme_ = "*";
112    host_.clear();
113    SetPath("/*");
114    return PARSE_SUCCESS;
115  }
116
117  // Parse out the scheme.
118  size_t scheme_end_pos = pattern.find(chrome::kStandardSchemeSeparator);
119  bool has_standard_scheme_separator = true;
120
121  // Some urls also use ':' alone as the scheme separator.
122  if (scheme_end_pos == std::string::npos) {
123    scheme_end_pos = pattern.find(':');
124    has_standard_scheme_separator = false;
125  }
126
127  if (scheme_end_pos == std::string::npos)
128    return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;
129
130  if (!SetScheme(pattern.substr(0, scheme_end_pos)))
131    return PARSE_ERROR_INVALID_SCHEME;
132
133  bool standard_scheme = IsStandardScheme(scheme_);
134  if (standard_scheme != has_standard_scheme_separator)
135    return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;
136
137  // Advance past the scheme separator.
138  scheme_end_pos +=
139      (standard_scheme ? strlen(chrome::kStandardSchemeSeparator) : 1);
140  if (scheme_end_pos >= pattern.size())
141    return PARSE_ERROR_EMPTY_HOST;
142
143  // Parse out the host and path.
144  size_t host_start_pos = scheme_end_pos;
145  size_t path_start_pos = 0;
146
147  // File URLs are special because they have no host.
148  if (scheme_ == chrome::kFileScheme || !standard_scheme) {
149    path_start_pos = host_start_pos;
150  } else {
151    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);
152
153    // Host is required.
154    if (host_start_pos == host_end_pos)
155      return PARSE_ERROR_EMPTY_HOST;
156
157    if (host_end_pos == std::string::npos)
158      return PARSE_ERROR_EMPTY_PATH;
159
160    host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);
161
162    // The first component can optionally be '*' to match all subdomains.
163    std::vector<std::string> host_components;
164    base::SplitString(host_, '.', &host_components);
165    if (host_components[0] == "*") {
166      match_subdomains_ = true;
167      host_components.erase(host_components.begin(),
168                            host_components.begin() + 1);
169    }
170    host_ = JoinString(host_components, '.');
171
172    // No other '*' can occur in the host, though. This isn't necessary, but is
173    // done as a convenience to developers who might otherwise be confused and
174    // think '*' works as a glob in the host.
175    if (host_.find('*') != std::string::npos)
176      return PARSE_ERROR_INVALID_HOST_WILDCARD;
177
178    path_start_pos = host_end_pos;
179  }
180
181  SetPath(pattern.substr(path_start_pos));
182
183  if (strictness == PARSE_STRICT && host_.find(':') != std::string::npos)
184    return PARSE_ERROR_HAS_COLON;
185
186  return PARSE_SUCCESS;
187}
188
189bool URLPattern::SetScheme(const std::string& scheme) {
190  scheme_ = scheme;
191  if (scheme_ == "*") {
192    valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
193  } else if (!IsValidScheme(scheme_)) {
194    return false;
195  }
196  return true;
197}
198
199bool URLPattern::IsValidScheme(const std::string& scheme) const {
200  if (valid_schemes_ == SCHEME_ALL)
201    return true;
202
203  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
204    if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
205      return true;
206  }
207
208  return false;
209}
210
211void URLPattern::SetPath(const std::string& path) {
212  path_ = path;
213  path_escaped_ = path_;
214  ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
215  ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
216}
217
218bool URLPattern::MatchesUrl(const GURL &test) const {
219  if (!MatchesScheme(test.scheme()))
220    return false;
221
222  if (match_all_urls_)
223    return true;
224
225  if (!MatchesHost(test))
226    return false;
227
228  if (!MatchesPath(test.PathForRequest()))
229    return false;
230
231  return true;
232}
233
234bool URLPattern::MatchesScheme(const std::string& test) const {
235  if (!IsValidScheme(test))
236    return false;
237
238  return scheme_ == "*" || test == scheme_;
239}
240
241bool URLPattern::MatchesHost(const std::string& host) const {
242  std::string test(chrome::kHttpScheme);
243  test += chrome::kStandardSchemeSeparator;
244  test += host;
245  test += "/";
246  return MatchesHost(GURL(test));
247}
248
249bool URLPattern::MatchesHost(const GURL& test) const {
250  // If the hosts are exactly equal, we have a match.
251  if (test.host() == host_)
252    return true;
253
254  // If we're matching subdomains, and we have no host in the match pattern,
255  // that means that we're matching all hosts, which means we have a match no
256  // matter what the test host is.
257  if (match_subdomains_ && host_.empty())
258    return true;
259
260  // Otherwise, we can only match if our match pattern matches subdomains.
261  if (!match_subdomains_)
262    return false;
263
264  // We don't do subdomain matching against IP addresses, so we can give up now
265  // if the test host is an IP address.
266  if (test.HostIsIPAddress())
267    return false;
268
269  // Check if the test host is a subdomain of our host.
270  if (test.host().length() <= (host_.length() + 1))
271    return false;
272
273  if (test.host().compare(test.host().length() - host_.length(),
274                          host_.length(), host_) != 0)
275    return false;
276
277  return test.host()[test.host().length() - host_.length() - 1] == '.';
278}
279
280bool URLPattern::MatchesPath(const std::string& test) const {
281  if (!MatchPattern(test, path_escaped_))
282    return false;
283
284  return true;
285}
286
287std::string URLPattern::GetAsString() const {
288  if (match_all_urls_)
289    return kAllUrlsPattern;
290
291  bool standard_scheme = IsStandardScheme(scheme_);
292
293  std::string spec = scheme_ +
294      (standard_scheme ? chrome::kStandardSchemeSeparator : ":");
295
296  if (scheme_ != chrome::kFileScheme && standard_scheme) {
297    if (match_subdomains_) {
298      spec += "*";
299      if (!host_.empty())
300        spec += ".";
301    }
302
303    if (!host_.empty())
304      spec += host_;
305  }
306
307  if (!path_.empty())
308    spec += path_;
309
310  return spec;
311}
312
313bool URLPattern::OverlapsWith(const URLPattern& other) const {
314  if (!MatchesScheme(other.scheme_) && !other.MatchesScheme(scheme_))
315    return false;
316
317  if (!MatchesHost(other.host()) && !other.MatchesHost(host_))
318    return false;
319
320  // We currently only use OverlapsWith() for the patterns inside
321  // ExtensionExtent. In those cases, we know that the path will have only a
322  // single wildcard at the end. This makes figuring out overlap much easier. It
323  // seems like there is probably a computer-sciency way to solve the general
324  // case, but we don't need that yet.
325  DCHECK(path_.find('*') == path_.size() - 1);
326  DCHECK(other.path().find('*') == other.path().size() - 1);
327
328  if (!MatchesPath(other.path().substr(0, other.path().size() - 1)) &&
329      !other.MatchesPath(path_.substr(0, path_.size() - 1)))
330    return false;
331
332  return true;
333}
334
335std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
336  std::vector<URLPattern> result;
337
338  if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
339    result.push_back(*this);
340    return result;
341  }
342
343  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
344    if (MatchesScheme(kValidSchemes[i])) {
345      URLPattern temp = *this;
346      temp.SetScheme(kValidSchemes[i]);
347      temp.set_match_all_urls(false);
348      result.push_back(temp);
349    }
350  }
351
352  return result;
353}
354
355// static
356const char* URLPattern::GetParseResultString(
357    URLPattern::ParseResult parse_result) {
358  return kParseResultMessages[parse_result];
359}
360