1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4#ifndef CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
5#define CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
6#pragma once
7
8#include <functional>
9#include <string>
10#include <vector>
11
12class GURL;
13
14// A pattern that can be used to match URLs. A URLPattern is a very restricted
15// subset of URL syntax:
16//
17// <url-pattern> := <scheme>://<host><path> | '<all_urls>'
18// <scheme> := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome'
// <host> := '*' | '*.' <anychar except '/' and '*'>+ |
//           <anychar except '/' and '*'>+
20// <path> := '/' <any chars>
21//
22// * Host is not used when the scheme is 'file'.
23// * The path can have embedded '*' characters which act as glob wildcards.
24// * '<all_urls>' is a special pattern that matches any URL that contains a
25//   valid scheme (as specified by valid_schemes_).
26// * The '*' scheme pattern excludes file URLs.
27//
28// Examples of valid patterns:
29// - http://*/*
30// - http://*/foo*
31// - https://*.google.com/foo*bar
32// - file://monkey*
33// - http://127.0.0.1/*
34//
35// Examples of invalid patterns:
36// - http://* -- path not specified
37// - http://*foo/bar -- * not allowed as substring of host component
38// - http://foo.*.bar/baz -- * must be first component
39// - http:/bar -- scheme separator not found
40// - foo://* -- invalid scheme
41// - chrome:// -- we don't support chrome internal URLs
42//
43// Design rationale:
44// * We need to be able to tell users what 'sites' a given URLPattern will
//   affect. For example: "This extension will interact with the site
//   'www.google.com'."
47// * We'd like to be able to convert as many existing Greasemonkey @include
48//   patterns to URLPatterns as possible. Greasemonkey @include patterns are
49//   simple globs, so this won't be perfect.
50// * Although we would like to support any scheme, it isn't clear what to tell
51//   users about URLPatterns that affect data or javascript URLs, so those are
52//   left out for now.
53//
54// From a 2008-ish crawl of userscripts.org, the following patterns were found
55// in @include lines:
56// - total lines                    : 24471
57// - @include *                     :   919
58// - @include http://[^\*]+?/       : 11128 (no star in host)
59// - @include http://\*\.[^\*]+?/   :  2325 (host prefixed by *.)
60// - @include http://\*[^\.][^\*]+?/:  1524 (host prefixed by *, no dot -- many
61//                                           appear to only need subdomain
62//                                           matching, not real prefix matching)
63// - @include http://[^\*/]+\*/     :   320 (host suffixed by *)
64// - @include contains .tld         :   297 (host suffixed by .tld -- a special
65//                                           Greasemonkey domain component that
66//                                           tries to match all valid registry-
67//                                           controlled suffixes)
68// - @include http://\*/            :   228 (host is * exactly, but there is
69//                                           more to the pattern)
70//
71// So, we can support at least half of current @include lines without supporting
72// subdomain matching. We can pick up at least another 10% by supporting
73// subdomain matching. It is probably possible to coerce more of the existing
74// patterns to URLPattern, but the resulting pattern will be more restrictive
75// than the original glob, which is probably better than nothing.
76class URLPattern {
77 public:
78  // A collection of scheme bitmasks for use with valid_schemes.
79  enum SchemeMasks {
80    SCHEME_NONE       = 0,
81    SCHEME_HTTP       = 1 << 0,
82    SCHEME_HTTPS      = 1 << 1,
83    SCHEME_FILE       = 1 << 2,
84    SCHEME_FTP        = 1 << 3,
85    SCHEME_CHROMEUI   = 1 << 4,
86    SCHEME_FILESYSTEM = 1 << 5,
87    // SCHEME_ALL will match every scheme, including chrome://, chrome-
88    // extension://, about:, etc. Because this has lots of security
89    // implications, third-party extensions should never be able to get access
90    // to URL patterns initialized this way. It should only be used for internal
91    // Chrome code.
92    SCHEME_ALL      = -1,
93  };
94
95  // Options for URLPattern::Parse().
96  enum ParseOption {
97    PARSE_LENIENT,
98    PARSE_STRICT
99  };
100
101  // Error codes returned from Parse().
102  enum ParseResult {
103    PARSE_SUCCESS = 0,
104    PARSE_ERROR_MISSING_SCHEME_SEPARATOR,
105    PARSE_ERROR_INVALID_SCHEME,
106    PARSE_ERROR_WRONG_SCHEME_SEPARATOR,
107    PARSE_ERROR_EMPTY_HOST,
108    PARSE_ERROR_INVALID_HOST_WILDCARD,
109    PARSE_ERROR_EMPTY_PATH,
110    PARSE_ERROR_HAS_COLON,  // Only checked when strict checks are enabled.
111    NUM_PARSE_RESULTS
112  };
113
114  // The <all_urls> string pattern.
115  static const char kAllUrlsPattern[];
116
117  // Construct an URLPattern with the given set of allowable schemes. See
118  // valid_schemes_ for more info.
119  explicit URLPattern(int valid_schemes);
120
121  // Convenience to construct a URLPattern from a string. The string is expected
122  // to be a valid pattern. If the string is not known ahead of time, use
123  // Parse() instead, which returns success or failure.
124  URLPattern(int valid_schemes, const std::string& pattern);
125
126#if defined(_MSC_VER) && _MSC_VER >= 1600
127  // Note: don't use this directly. This exists so URLPattern can be used
128  // with STL containers.  Starting with Visual Studio 2010, we can't have this
129  // method private and use "friend class std::vector<URLPattern>;" as we used
130  // to do.
131  URLPattern();
132#endif
133
134  ~URLPattern();
135
136  // Gets the bitmask of valid schemes.
137  int valid_schemes() const { return valid_schemes_; }
138  void set_valid_schemes(int valid_schemes) { valid_schemes_ = valid_schemes; }
139
140  // Gets the host the pattern matches. This can be an empty string if the
141  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
142  const std::string& host() const { return host_; }
143  void set_host(const std::string& host) { host_ = host; }
144
145  // Gets whether to match subdomains of host().
146  bool match_subdomains() const { return match_subdomains_; }
147  void set_match_subdomains(bool val) { match_subdomains_ = val; }
148
149  // Gets the path the pattern matches with the leading slash. This can have
150  // embedded asterisks which are interpreted using glob rules.
151  const std::string& path() const { return path_; }
152  void SetPath(const std::string& path);
153
154  // Returns true if this pattern matches all urls.
155  bool match_all_urls() const { return match_all_urls_; }
156  void set_match_all_urls(bool val) { match_all_urls_ = val; }
157
158  // Initializes this instance by parsing the provided string. Returns
159  // URLPattern::PARSE_SUCCESS on success, or an error code otherwise. On
160  // failure, this instance will have some intermediate values and is in an
161  // invalid state.  Adding error checks to URLPattern::Parse() can cause
162  // patterns in installed extensions to fail.  If an installed extension
163  // uses a pattern that was valid but fails a new error check, the
164  // extension will fail to load when chrome is auto-updated.  To avoid
165  // this, new parse checks are enabled only when |strictness| is
166  // OPTION_STRICT.  OPTION_STRICT should be used when loading in developer
167  // mode, or when an extension's patterns are controlled by chrome (such
168  // as component extensions).
169  ParseResult Parse(const std::string& pattern_str,
170                    ParseOption strictness);
171
172  // Sets the scheme for pattern matches. This can be a single '*' if the
173  // pattern matches all valid schemes (as defined by the valid_schemes_
174  // property). Returns false on failure (if the scheme is not valid).
175  bool SetScheme(const std::string& scheme);
176  // Note: You should use MatchesScheme() instead of this getter unless you
177  // absolutely need the exact scheme. This is exposed for testing.
178  const std::string& scheme() const { return scheme_; }
179
180  // Returns true if the specified scheme can be used in this URL pattern, and
181  // false otherwise. Uses valid_schemes_ to determine validity.
182  bool IsValidScheme(const std::string& scheme) const;
183
184  // Returns true if this instance matches the specified URL.
185  bool MatchesUrl(const GURL& url) const;
186
187  // Returns true if |test| matches our scheme.
188  bool MatchesScheme(const std::string& test) const;
189
190  // Returns true if |test| matches our host.
191  bool MatchesHost(const std::string& test) const;
192  bool MatchesHost(const GURL& test) const;
193
194  // Returns true if |test| matches our path.
195  bool MatchesPath(const std::string& test) const;
196
197  // Returns a string representing this instance.
198  std::string GetAsString() const;
199
200  // Determine whether there is a URL that would match this instance and another
201  // instance. This method is symmetrical: Calling other.OverlapsWith(this)
202  // would result in the same answer.
203  bool OverlapsWith(const URLPattern& other) const;
204
205  // Convert this URLPattern into an equivalent set of URLPatterns that don't
206  // use a wildcard in the scheme component. If this URLPattern doesn't use a
207  // wildcard scheme, then the returned set will contain one element that is
208  // equivalent to this instance.
209  std::vector<URLPattern> ConvertToExplicitSchemes() const;
210
211  static bool EffectiveHostCompare(const URLPattern& a, const URLPattern& b) {
212    if (a.match_all_urls_ && b.match_all_urls_)
213      return false;
214    return a.host_.compare(b.host_) < 0;
215  };
216
217  // Used for origin comparisons in a std::set.
218  class EffectiveHostCompareFunctor {
219   public:
220    bool operator()(const URLPattern& a, const URLPattern& b) const {
221      return EffectiveHostCompare(a, b);
222    };
223  };
224
225  // Get an error string for a ParseResult.
226  static const char* GetParseResultString(URLPattern::ParseResult parse_result);
227
228 private:
229#if !(defined(_MSC_VER) && _MSC_VER >= 1600)
230  friend class std::vector<URLPattern>;
231
232  // Note: don't use this directly. This exists so URLPattern can be used
233  // with STL containers.
234  URLPattern();
235#endif
236
237  // A bitmask containing the schemes which are considered valid for this
238  // pattern. Parse() uses this to decide whether a pattern contains a valid
239  // scheme. MatchesScheme uses this to decide whether a wildcard scheme_
240  // matches a given test scheme.
241  int valid_schemes_;
242
243  // True if this is a special-case "<all_urls>" pattern.
244  bool match_all_urls_;
245
246  // The scheme for the pattern.
247  std::string scheme_;
248
249  // The host without any leading "*" components.
250  std::string host_;
251
252  // Whether we should match subdomains of the host. This is true if the first
253  // component of the pattern's host was "*".
254  bool match_subdomains_;
255
256  // The path to match. This is everything after the host of the URL, or
257  // everything after the scheme in the case of file:// URLs.
258  std::string path_;
259
260  // The path with "?" and "\" characters escaped for use with the
261  // MatchPattern() function.
262  std::string path_escaped_;
263};
264
// Convenience alias for a std::vector holding URLPattern objects.
typedef std::vector<URLPattern> URLPatternList;
266
267#endif  // CHROME_COMMON_EXTENSIONS_URL_PATTERN_H_
268