// gurl.cc revision 3551c9c881056c480085172ff9840cab31610854
1// Copyright 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#ifdef WIN32 6#include <windows.h> 7#else 8#include <pthread.h> 9#endif 10 11#include <algorithm> 12#include <ostream> 13 14#include "url/gurl.h" 15 16#include "base/logging.h" 17#include "url/url_canon_stdstring.h" 18#include "url/url_util.h" 19 20namespace { 21 22// External template that can handle initialization of either character type. 23// The input spec is given, and the canonical version will be placed in 24// |*canonical|, along with the parsing of the canonical spec in |*parsed|. 25template<typename STR> 26bool InitCanonical(const STR& input_spec, 27 std::string* canonical, 28 url_parse::Parsed* parsed) { 29 // Reserve enough room in the output for the input, plus some extra so that 30 // we have room if we have to escape a few things without reallocating. 31 canonical->reserve(input_spec.size() + 32); 32 url_canon::StdStringCanonOutput output(canonical); 33 bool success = url_util::Canonicalize( 34 input_spec.data(), static_cast<int>(input_spec.length()), 35 NULL, &output, parsed); 36 37 output.Complete(); // Must be done before using string. 38 return success; 39} 40 41static std::string* empty_string = NULL; 42static GURL* empty_gurl = NULL; 43 44#ifdef WIN32 45 46// Returns a static reference to an empty string for returning a reference 47// when there is no underlying string. 48const std::string& EmptyStringForGURL() { 49 // Avoid static object construction/destruction on startup/shutdown. 50 if (!empty_string) { 51 // Create the string. Be careful that we don't break in the case that this 52 // is being called from multiple threads. Statics are not threadsafe. 
53 std::string* new_empty_string = new std::string; 54 if (InterlockedCompareExchangePointer( 55 reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) { 56 // The old value was non-NULL, so no replacement was done. Another 57 // thread did the initialization out from under us. 58 delete new_empty_string; 59 } 60 } 61 return *empty_string; 62} 63 64#else 65 66static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT; 67static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT; 68 69void EmptyStringForGURLOnce(void) { 70 empty_string = new std::string; 71} 72 73const std::string& EmptyStringForGURL() { 74 // Avoid static object construction/destruction on startup/shutdown. 75 pthread_once(&empty_string_once, EmptyStringForGURLOnce); 76 return *empty_string; 77} 78 79#endif // WIN32 80 81} // namespace 82 83GURL::GURL() : is_valid_(false), inner_url_(NULL) { 84} 85 86GURL::GURL(const GURL& other) 87 : spec_(other.spec_), 88 is_valid_(other.is_valid_), 89 parsed_(other.parsed_), 90 inner_url_(NULL) { 91 if (other.inner_url_) 92 inner_url_ = new GURL(*other.inner_url_); 93 // Valid filesystem urls should always have an inner_url_. 
94 DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_); 95} 96 97GURL::GURL(const std::string& url_string) : inner_url_(NULL) { 98 is_valid_ = InitCanonical(url_string, &spec_, &parsed_); 99 if (is_valid_ && SchemeIsFileSystem()) { 100 inner_url_ = 101 new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true); 102 } 103} 104 105GURL::GURL(const base::string16& url_string) : inner_url_(NULL) { 106 is_valid_ = InitCanonical(url_string, &spec_, &parsed_); 107 if (is_valid_ && SchemeIsFileSystem()) { 108 inner_url_ = 109 new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true); 110 } 111} 112 113GURL::GURL(const char* canonical_spec, size_t canonical_spec_len, 114 const url_parse::Parsed& parsed, bool is_valid) 115 : spec_(canonical_spec, canonical_spec_len), 116 is_valid_(is_valid), 117 parsed_(parsed), 118 inner_url_(NULL) { 119 InitializeFromCanonicalSpec(); 120} 121 122GURL::GURL(std::string canonical_spec, 123 const url_parse::Parsed& parsed, bool is_valid) 124 : is_valid_(is_valid), 125 parsed_(parsed), 126 inner_url_(NULL) { 127 spec_.swap(canonical_spec); 128 InitializeFromCanonicalSpec(); 129} 130 131void GURL::InitializeFromCanonicalSpec() { 132 if (is_valid_ && SchemeIsFileSystem()) { 133 inner_url_ = 134 new GURL(spec_.data(), parsed_.Length(), *parsed_.inner_parsed(), true); 135 } 136 137#ifndef NDEBUG 138 // For testing purposes, check that the parsed canonical URL is identical to 139 // what we would have produced. Skip checking for invalid URLs have no meaning 140 // and we can't always canonicalize then reproducabely. 141 if (is_valid_) { 142 url_parse::Component scheme; 143 if (!url_util::FindAndCompareScheme(spec_.data(), spec_.length(), 144 "filesystem", &scheme) || 145 scheme.begin == parsed_.scheme.begin) { 146 // We can't do this check on the inner_url of a filesystem URL, as 147 // canonical_spec actually points to the start of the outer URL, so we'd 148 // end up with infinite recursion in this constructor. 
149 GURL test_url(spec_); 150 151 DCHECK(test_url.is_valid_ == is_valid_); 152 DCHECK(test_url.spec_ == spec_); 153 154 DCHECK(test_url.parsed_.scheme == parsed_.scheme); 155 DCHECK(test_url.parsed_.username == parsed_.username); 156 DCHECK(test_url.parsed_.password == parsed_.password); 157 DCHECK(test_url.parsed_.host == parsed_.host); 158 DCHECK(test_url.parsed_.port == parsed_.port); 159 DCHECK(test_url.parsed_.path == parsed_.path); 160 DCHECK(test_url.parsed_.query == parsed_.query); 161 DCHECK(test_url.parsed_.ref == parsed_.ref); 162 } 163 } 164#endif 165} 166 167GURL::~GURL() { 168 delete inner_url_; 169} 170 171GURL& GURL::operator=(const GURL& other) { 172 spec_ = other.spec_; 173 is_valid_ = other.is_valid_; 174 parsed_ = other.parsed_; 175 delete inner_url_; 176 inner_url_ = NULL; 177 if (other.inner_url_) 178 inner_url_ = new GURL(*other.inner_url_); 179 // Valid filesystem urls should always have an inner_url_. 180 DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_); 181 return *this; 182} 183 184const std::string& GURL::spec() const { 185 if (is_valid_ || spec_.empty()) 186 return spec_; 187 188 DCHECK(false) << "Trying to get the spec of an invalid URL!"; 189 return EmptyStringForGURL(); 190} 191 192GURL GURL::Resolve(const std::string& relative) const { 193 return ResolveWithCharsetConverter(relative, NULL); 194} 195GURL GURL::Resolve(const base::string16& relative) const { 196 return ResolveWithCharsetConverter(relative, NULL); 197} 198 199// Note: code duplicated below (it's inconvenient to use a template here). 200GURL GURL::ResolveWithCharsetConverter( 201 const std::string& relative, 202 url_canon::CharsetConverter* charset_converter) const { 203 // Not allowed for invalid URLs. 204 if (!is_valid_) 205 return GURL(); 206 207 GURL result; 208 209 // Reserve enough room in the output for the input, plus some extra so that 210 // we have room if we have to escape a few things without reallocating. 
211 result.spec_.reserve(spec_.size() + 32); 212 url_canon::StdStringCanonOutput output(&result.spec_); 213 214 if (!url_util::ResolveRelative( 215 spec_.data(), static_cast<int>(spec_.length()), parsed_, 216 relative.data(), static_cast<int>(relative.length()), 217 charset_converter, &output, &result.parsed_)) { 218 // Error resolving, return an empty URL. 219 return GURL(); 220 } 221 222 output.Complete(); 223 result.is_valid_ = true; 224 if (result.SchemeIsFileSystem()) { 225 result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(), 226 *result.parsed_.inner_parsed(), true); 227 } 228 return result; 229} 230 231// Note: code duplicated above (it's inconvenient to use a template here). 232GURL GURL::ResolveWithCharsetConverter( 233 const base::string16& relative, 234 url_canon::CharsetConverter* charset_converter) const { 235 // Not allowed for invalid URLs. 236 if (!is_valid_) 237 return GURL(); 238 239 GURL result; 240 241 // Reserve enough room in the output for the input, plus some extra so that 242 // we have room if we have to escape a few things without reallocating. 243 result.spec_.reserve(spec_.size() + 32); 244 url_canon::StdStringCanonOutput output(&result.spec_); 245 246 if (!url_util::ResolveRelative( 247 spec_.data(), static_cast<int>(spec_.length()), parsed_, 248 relative.data(), static_cast<int>(relative.length()), 249 charset_converter, &output, &result.parsed_)) { 250 // Error resolving, return an empty URL. 251 return GURL(); 252 } 253 254 output.Complete(); 255 result.is_valid_ = true; 256 if (result.SchemeIsFileSystem()) { 257 result.inner_url_ = new GURL(result.spec_.data(), result.parsed_.Length(), 258 *result.parsed_.inner_parsed(), true); 259 } 260 return result; 261} 262 263// Note: code duplicated below (it's inconvenient to use a template here). 264GURL GURL::ReplaceComponents( 265 const url_canon::Replacements<char>& replacements) const { 266 GURL result; 267 268 // Not allowed for invalid URLs. 
269 if (!is_valid_) 270 return GURL(); 271 272 // Reserve enough room in the output for the input, plus some extra so that 273 // we have room if we have to escape a few things without reallocating. 274 result.spec_.reserve(spec_.size() + 32); 275 url_canon::StdStringCanonOutput output(&result.spec_); 276 277 result.is_valid_ = url_util::ReplaceComponents( 278 spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, 279 NULL, &output, &result.parsed_); 280 281 output.Complete(); 282 if (result.is_valid_ && result.SchemeIsFileSystem()) { 283 result.inner_url_ = new GURL(spec_.data(), result.parsed_.Length(), 284 *result.parsed_.inner_parsed(), true); 285 } 286 return result; 287} 288 289// Note: code duplicated above (it's inconvenient to use a template here). 290GURL GURL::ReplaceComponents( 291 const url_canon::Replacements<base::char16>& replacements) const { 292 GURL result; 293 294 // Not allowed for invalid URLs. 295 if (!is_valid_) 296 return GURL(); 297 298 // Reserve enough room in the output for the input, plus some extra so that 299 // we have room if we have to escape a few things without reallocating. 
300 result.spec_.reserve(spec_.size() + 32); 301 url_canon::StdStringCanonOutput output(&result.spec_); 302 303 result.is_valid_ = url_util::ReplaceComponents( 304 spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements, 305 NULL, &output, &result.parsed_); 306 307 output.Complete(); 308 if (result.is_valid_ && result.SchemeIsFileSystem()) { 309 result.inner_url_ = new GURL(spec_.data(), result.parsed_.Length(), 310 *result.parsed_.inner_parsed(), true); 311 } 312 return result; 313} 314 315GURL GURL::GetOrigin() const { 316 // This doesn't make sense for invalid or nonstandard URLs, so return 317 // the empty URL 318 if (!is_valid_ || !IsStandard()) 319 return GURL(); 320 321 if (SchemeIsFileSystem()) 322 return inner_url_->GetOrigin(); 323 324 url_canon::Replacements<char> replacements; 325 replacements.ClearUsername(); 326 replacements.ClearPassword(); 327 replacements.ClearPath(); 328 replacements.ClearQuery(); 329 replacements.ClearRef(); 330 331 return ReplaceComponents(replacements); 332} 333 334GURL GURL::GetWithEmptyPath() const { 335 // This doesn't make sense for invalid or nonstandard URLs, so return 336 // the empty URL. 337 if (!is_valid_ || !IsStandard()) 338 return GURL(); 339 340 // We could optimize this since we know that the URL is canonical, and we are 341 // appending a canonical path, so avoiding re-parsing. 342 GURL other(*this); 343 if (parsed_.path.len == 0) 344 return other; 345 346 // Clear everything after the path. 347 other.parsed_.query.reset(); 348 other.parsed_.ref.reset(); 349 350 // Set the path, since the path is longer than one, we can just set the 351 // first character and resize. 
352 other.spec_[other.parsed_.path.begin] = '/'; 353 other.parsed_.path.len = 1; 354 other.spec_.resize(other.parsed_.path.begin + 1); 355 return other; 356} 357 358bool GURL::IsStandard() const { 359 return url_util::IsStandard(spec_.data(), parsed_.scheme); 360} 361 362bool GURL::SchemeIs(const char* lower_ascii_scheme) const { 363 if (parsed_.scheme.len <= 0) 364 return lower_ascii_scheme == NULL; 365 return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin, 366 spec_.data() + parsed_.scheme.end(), 367 lower_ascii_scheme); 368} 369 370bool GURL::SchemeIsHTTPOrHTTPS() const { 371 return SchemeIs("http") || SchemeIs("https"); 372} 373 374int GURL::IntPort() const { 375 if (parsed_.port.is_nonempty()) 376 return url_parse::ParsePort(spec_.data(), parsed_.port); 377 return url_parse::PORT_UNSPECIFIED; 378} 379 380int GURL::EffectiveIntPort() const { 381 int int_port = IntPort(); 382 if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard()) 383 return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin, 384 parsed_.scheme.len); 385 return int_port; 386} 387 388std::string GURL::ExtractFileName() const { 389 url_parse::Component file_component; 390 url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component); 391 return ComponentString(file_component); 392} 393 394std::string GURL::PathForRequest() const { 395 DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty"; 396 if (parsed_.ref.len >= 0) { 397 // Clip off the reference when it exists. The reference starts after the # 398 // sign, so we have to subtract one to also remove it. 399 return std::string(spec_, parsed_.path.begin, 400 parsed_.ref.begin - parsed_.path.begin - 1); 401 } 402 // Compute the actual path length, rather than depending on the spec's 403 // terminator. If we're an inner_url, our spec continues on into our outer 404 // url's path/query/ref. 
405 int path_len = parsed_.path.len; 406 if (parsed_.query.is_valid()) 407 path_len = parsed_.query.end() - parsed_.path.begin; 408 409 return std::string(spec_, parsed_.path.begin, path_len); 410} 411 412std::string GURL::HostNoBrackets() const { 413 // If host looks like an IPv6 literal, strip the square brackets. 414 url_parse::Component h(parsed_.host); 415 if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') { 416 h.begin++; 417 h.len -= 2; 418 } 419 return ComponentString(h); 420} 421 422bool GURL::HostIsIPAddress() const { 423 if (!is_valid_ || spec_.empty()) 424 return false; 425 426 url_canon::RawCanonOutputT<char, 128> ignored_output; 427 url_canon::CanonHostInfo host_info; 428 url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host, 429 &ignored_output, &host_info); 430 return host_info.IsIPAddress(); 431} 432 433#ifdef WIN32 434 435const GURL& GURL::EmptyGURL() { 436 // Avoid static object construction/destruction on startup/shutdown. 437 if (!empty_gurl) { 438 // Create the string. Be careful that we don't break in the case that this 439 // is being called from multiple threads. 440 GURL* new_empty_gurl = new GURL; 441 if (InterlockedCompareExchangePointer( 442 reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) { 443 // The old value was non-NULL, so no replacement was done. Another 444 // thread did the initialization out from under us. 445 delete new_empty_gurl; 446 } 447 } 448 return *empty_gurl; 449} 450 451#else 452 453void EmptyGURLOnce(void) { 454 empty_gurl = new GURL; 455} 456 457const GURL& GURL::EmptyGURL() { 458 // Avoid static object construction/destruction on startup/shutdown. 459 pthread_once(&empty_gurl_once, EmptyGURLOnce); 460 return *empty_gurl; 461} 462 463#endif // WIN32 464 465bool GURL::DomainIs(const char* lower_ascii_domain, 466 int domain_len) const { 467 // Return false if this URL is not valid or domain is empty. 
468 if (!is_valid_ || !domain_len) 469 return false; 470 471 // FileSystem URLs have empty parsed_.host, so check this first. 472 if (SchemeIsFileSystem() && inner_url_) 473 return inner_url_->DomainIs(lower_ascii_domain, domain_len); 474 475 if (!parsed_.host.is_nonempty()) 476 return false; 477 478 // Check whether the host name is end with a dot. If yes, treat it 479 // the same as no-dot unless the input comparison domain is end 480 // with dot. 481 const char* last_pos = spec_.data() + parsed_.host.end() - 1; 482 int host_len = parsed_.host.len; 483 if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) { 484 last_pos--; 485 host_len--; 486 } 487 488 // Return false if host's length is less than domain's length. 489 if (host_len < domain_len) 490 return false; 491 492 // Compare this url whether belong specific domain. 493 const char* start_pos = spec_.data() + parsed_.host.begin + 494 host_len - domain_len; 495 496 if (!url_util::LowerCaseEqualsASCII(start_pos, 497 last_pos + 1, 498 lower_ascii_domain, 499 lower_ascii_domain + domain_len)) 500 return false; 501 502 // Check whether host has right domain start with dot, make sure we got 503 // right domain range. For example www.google.com has domain 504 // "google.com" but www.iamnotgoogle.com does not. 505 if ('.' != lower_ascii_domain[0] && host_len > domain_len && 506 '.' != *(start_pos - 1)) 507 return false; 508 509 return true; 510} 511 512void GURL::Swap(GURL* other) { 513 spec_.swap(other->spec_); 514 std::swap(is_valid_, other->is_valid_); 515 std::swap(parsed_, other->parsed_); 516 std::swap(inner_url_, other->inner_url_); 517} 518 519std::ostream& operator<<(std::ostream& out, const GURL& url) { 520 return out << url.possibly_invalid_spec(); 521} 522