common/extensions/url_pattern.cc

// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chrome/common/extensions/url_pattern.h"

#include "base/string_piece.h"
#include "base/string_split.h"
#include "base/string_util.h"
#include "chrome/common/url_constants.h"
#include "googleurl/src/gurl.h"
#include "googleurl/src/url_util.h"

const char URLPattern::kAllUrlsPattern[] = "<all_urls>";

namespace {

// TODO(aa): Consider adding chrome-extension? What about more obscure ones
// like data: and javascript: ?
// Note: keep this array in sync with kValidSchemeMasks.
const char* kValidSchemes[] = {
  chrome::kHttpScheme,
  chrome::kHttpsScheme,
  chrome::kFileScheme,
  chrome::kFtpScheme,
  chrome::kChromeUIScheme,
  chrome::kFileSystemScheme,
};

const int kValidSchemeMasks[] = {
  URLPattern::SCHEME_HTTP,
  URLPattern::SCHEME_HTTPS,
  URLPattern::SCHEME_FILE,
  URLPattern::SCHEME_FTP,
  URLPattern::SCHEME_CHROMEUI,
  URLPattern::SCHEME_FILESYSTEM,
};

COMPILE_ASSERT(arraysize(kValidSchemes) == arraysize(kValidSchemeMasks),
               must_keep_these_arrays_in_sync);

const char* kParseSuccess = "Success.";
const char* kParseErrorMissingSchemeSeparator = "Missing scheme separator.";
const char* kParseErrorInvalidScheme = "Invalid scheme.";
const char* kParseErrorWrongSchemeType = "Wrong scheme type.";
const char* kParseErrorEmptyHost = "Host can not be empty.";
const char* kParseErrorInvalidHostWildcard = "Invalid host wildcard.";
const char* kParseErrorEmptyPath = "Empty path.";
const char* kParseErrorHasColon =
    "Ports are not supported in URL patterns. ':' may not be used in a host.";

// Message explaining each URLPattern::ParseResult.
const char* kParseResultMessages[] = {
  kParseSuccess,
  kParseErrorMissingSchemeSeparator,
  kParseErrorInvalidScheme,
  kParseErrorWrongSchemeType,
  kParseErrorEmptyHost,
  kParseErrorInvalidHostWildcard,
  kParseErrorEmptyPath,
  kParseErrorHasColon
};

COMPILE_ASSERT(URLPattern::NUM_PARSE_RESULTS == arraysize(kParseResultMessages),
               must_add_message_for_each_parse_result);

const char kPathSeparator[] = "/";

bool IsStandardScheme(const std::string& scheme) {
  // "*" gets the same treatment as a standard scheme.
  if (scheme == "*")
    return true;

  return url_util::IsStandard(scheme.c_str(),
      url_parse::Component(0, static_cast<int>(scheme.length())));
}

}  // namespace

URLPattern::URLPattern()
    : valid_schemes_(SCHEME_NONE),
      match_all_urls_(false),
      match_subdomains_(false) {}

URLPattern::URLPattern(int valid_schemes)
    : valid_schemes_(valid_schemes), match_all_urls_(false),
      match_subdomains_(false) {}

URLPattern::URLPattern(int valid_schemes, const std::string& pattern)
    : valid_schemes_(valid_schemes), match_all_urls_(false),
      match_subdomains_(false) {

  // Strict error checking is used, because this constructor is only
  // appropriate when we know |pattern| is valid.
  if (PARSE_SUCCESS != Parse(pattern, PARSE_STRICT))
    NOTREACHED() << "URLPattern is invalid: " << pattern;
}

URLPattern::~URLPattern() {
}

URLPattern::ParseResult URLPattern::Parse(const std::string& pattern,
                                          ParseOption strictness) {
  CHECK(strictness == PARSE_LENIENT ||
        strictness == PARSE_STRICT);

  // Special case pattern to match every valid URL.
  if (pattern == kAllUrlsPattern) {
    match_all_urls_ = true;
    match_subdomains_ = true;
    scheme_ = "*";
    host_.clear();
    SetPath("/*");
    return PARSE_SUCCESS;
  }

  // Parse out the scheme.
  size_t scheme_end_pos = pattern.find(chrome::kStandardSchemeSeparator);
  bool has_standard_scheme_separator = true;

  // Some urls also use ':' alone as the scheme separator.
  if (scheme_end_pos == std::string::npos) {
    scheme_end_pos = pattern.find(':');
    has_standard_scheme_separator = false;
  }

  if (scheme_end_pos == std::string::npos)
    return PARSE_ERROR_MISSING_SCHEME_SEPARATOR;

  if (!SetScheme(pattern.substr(0, scheme_end_pos)))
    return PARSE_ERROR_INVALID_SCHEME;

  bool standard_scheme = IsStandardScheme(scheme_);
  if (standard_scheme != has_standard_scheme_separator)
    return PARSE_ERROR_WRONG_SCHEME_SEPARATOR;

  // Advance past the scheme separator.
  scheme_end_pos +=
      (standard_scheme ? strlen(chrome::kStandardSchemeSeparator) : 1);
  if (scheme_end_pos >= pattern.size())
    return PARSE_ERROR_EMPTY_HOST;

  // Parse out the host and path.
  size_t host_start_pos = scheme_end_pos;
  size_t path_start_pos = 0;

  // File URLs are special because they have no host.
  if (scheme_ == chrome::kFileScheme || !standard_scheme) {
    path_start_pos = host_start_pos;
  } else {
    size_t host_end_pos = pattern.find(kPathSeparator, host_start_pos);

    // Host is required.
    if (host_start_pos == host_end_pos)
      return PARSE_ERROR_EMPTY_HOST;

    if (host_end_pos == std::string::npos)
      return PARSE_ERROR_EMPTY_PATH;

    host_ = pattern.substr(host_start_pos, host_end_pos - host_start_pos);

    // The first component can optionally be '*' to match all subdomains.
    std::vector<std::string> host_components;
    base::SplitString(host_, '.', &host_components);
    if (host_components[0] == "*") {
      match_subdomains_ = true;
      host_components.erase(host_components.begin(),
                            host_components.begin() + 1);
    }
    host_ = JoinString(host_components, '.');

    // No other '*' can occur in the host, though. This isn't necessary, but is
    // done as a convenience to developers who might otherwise be confused and
    // think '*' works as a glob in the host.
    if (host_.find('*') != std::string::npos)
      return PARSE_ERROR_INVALID_HOST_WILDCARD;

    path_start_pos = host_end_pos;
  }

  SetPath(pattern.substr(path_start_pos));

  if (strictness == PARSE_STRICT && host_.find(':') != std::string::npos)
    return PARSE_ERROR_HAS_COLON;

  return PARSE_SUCCESS;
}

bool URLPattern::SetScheme(const std::string& scheme) {
  scheme_ = scheme;
  if (scheme_ == "*") {
    valid_schemes_ &= (SCHEME_HTTP | SCHEME_HTTPS);
  } else if (!IsValidScheme(scheme_)) {
    return false;
  }
  return true;
}

bool URLPattern::IsValidScheme(const std::string& scheme) const {
  if (valid_schemes_ == SCHEME_ALL)
    return true;

  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    if (scheme == kValidSchemes[i] && (valid_schemes_ & kValidSchemeMasks[i]))
      return true;
  }

  return false;
}

void URLPattern::SetPath(const std::string& path) {
  path_ = path;
  path_escaped_ = path_;
  ReplaceSubstringsAfterOffset(&path_escaped_, 0, "\\", "\\\\");
  ReplaceSubstringsAfterOffset(&path_escaped_, 0, "?", "\\?");
}

bool URLPattern::MatchesUrl(const GURL &test) const {
  if (!MatchesScheme(test.scheme()))
    return false;

  if (match_all_urls_)
    return true;

  if (!MatchesHost(test))
    return false;

  if (!MatchesPath(test.PathForRequest()))
    return false;

  return true;
}

bool URLPattern::MatchesScheme(const std::string& test) const {
  if (!IsValidScheme(test))
    return false;

  return scheme_ == "*" || test == scheme_;
}

bool URLPattern::MatchesHost(const std::string& host) const {
  std::string test(chrome::kHttpScheme);
  test += chrome::kStandardSchemeSeparator;
  test += host;
  test += "/";
  return MatchesHost(GURL(test));
}

bool URLPattern::MatchesHost(const GURL& test) const {
  // If the hosts are exactly equal, we have a match.
  if (test.host() == host_)
    return true;

  // If we're matching subdomains, and we have no host in the match pattern,
  // that means that we're matching all hosts, which means we have a match no
  // matter what the test host is.
  if (match_subdomains_ && host_.empty())
    return true;

  // Otherwise, we can only match if our match pattern matches subdomains.
  if (!match_subdomains_)
    return false;

  // We don't do subdomain matching against IP addresses, so we can give up now
  // if the test host is an IP address.
  if (test.HostIsIPAddress())
    return false;

  // Check if the test host is a subdomain of our host.
  if (test.host().length() <= (host_.length() + 1))
    return false;

  if (test.host().compare(test.host().length() - host_.length(),
                          host_.length(), host_) != 0)
    return false;

  return test.host()[test.host().length() - host_.length() - 1] == '.';
}

bool URLPattern::MatchesPath(const std::string& test) const {
  if (!MatchPattern(test, path_escaped_))
    return false;

  return true;
}

std::string URLPattern::GetAsString() const {
  if (match_all_urls_)
    return kAllUrlsPattern;

  bool standard_scheme = IsStandardScheme(scheme_);

  std::string spec = scheme_ +
      (standard_scheme ? chrome::kStandardSchemeSeparator : ":");

  if (scheme_ != chrome::kFileScheme && standard_scheme) {
    if (match_subdomains_) {
      spec += "*";
      if (!host_.empty())
        spec += ".";
    }

    if (!host_.empty())
      spec += host_;
  }

  if (!path_.empty())
    spec += path_;

  return spec;
}

bool URLPattern::OverlapsWith(const URLPattern& other) const {
  if (!MatchesScheme(other.scheme_) && !other.MatchesScheme(scheme_))
    return false;

  if (!MatchesHost(other.host()) && !other.MatchesHost(host_))
    return false;

  // We currently only use OverlapsWith() for the patterns inside
  // ExtensionExtent. In those cases, we know that the path will have only a
  // single wildcard at the end. This makes figuring out overlap much easier. It
  // seems like there is probably a computer-sciency way to solve the general
  // case, but we don't need that yet.
  DCHECK(path_.find('*') == path_.size() - 1);
  DCHECK(other.path().find('*') == other.path().size() - 1);

  if (!MatchesPath(other.path().substr(0, other.path().size() - 1)) &&
      !other.MatchesPath(path_.substr(0, path_.size() - 1)))
    return false;

  return true;
}

std::vector<URLPattern> URLPattern::ConvertToExplicitSchemes() const {
  std::vector<URLPattern> result;

  if (scheme_ != "*" && !match_all_urls_ && IsValidScheme(scheme_)) {
    result.push_back(*this);
    return result;
  }

  for (size_t i = 0; i < arraysize(kValidSchemes); ++i) {
    if (MatchesScheme(kValidSchemes[i])) {
      URLPattern temp = *this;
      temp.SetScheme(kValidSchemes[i]);
      temp.set_match_all_urls(false);
      result.push_back(temp);
    }
  }

  return result;
}

// static
const char* URLPattern::GetParseResultString(
    URLPattern::ParseResult parse_result) {
  return kParseResultMessages[parse_result];
}