1c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Use of this source code is governed by a BSD-style license that can be 3c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// found in the LICENSE file. 4c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 5c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/net/url_fixer_upper.h" 6c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 7c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <algorithm> 8c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 93345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#if defined(OS_POSIX) 103345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include "base/environment.h" 113345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#endif 12c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/file_util.h" 13c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/logging.h" 14c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/string_util.h" 15c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/utf_string_conversions.h" 16c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/common/url_constants.h" 17c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "googleurl/src/url_file.h" 18c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "googleurl/src/url_parse.h" 19c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "googleurl/src/url_util.h" 20c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "net/base/escape.h" 21c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "net/base/net_util.h" 22c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "net/base/registry_controlled_domain.h" 23c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 24c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochconst char* URLFixerUpper::home_directory_override = NULL; 25c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 26c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochnamespace { 27c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 28c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// TODO(estade): Remove these ugly, ugly functions. They are only used in 29c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// SegmentURL. A url_parse::Parsed object keeps track of a bunch of indices into 30c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// a url string, and these need to be updated when the URL is converted from 31c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// UTF8 to UTF16. Instead of this after-the-fact adjustment, we should parse it 32c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// in the correct string format to begin with. 3372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenurl_parse::Component UTF8ComponentToUTF16Component( 3472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen const std::string& text_utf8, 3572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen const url_parse::Component& component_utf8) { 3672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen if (component_utf8.len == -1) 3772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen return url_parse::Component(); 3872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen 3972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen std::string before_component_string = 4072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen text_utf8.substr(0, component_utf8.begin); 4172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen std::string component_string = text_utf8.substr(component_utf8.begin, 4272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen component_utf8.len); 4372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen string16 before_component_string_16 = UTF8ToUTF16(before_component_string); 4472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen string16 component_string_16 = UTF8ToUTF16(component_string); 4572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen url_parse::Component component_16(before_component_string_16.length(), 4672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen component_string_16.length()); 4772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen return component_16; 4872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen} 4972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen 5072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenvoid UTF8PartsToUTF16Parts(const std::string& text_utf8, 5172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen const url_parse::Parsed& parts_utf8, 5272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen url_parse::Parsed* parts) { 5372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen if (IsStringASCII(text_utf8)) { 5472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen *parts = parts_utf8; 5572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen return; 5672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen } 5772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen 5872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen parts->scheme = 5972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen UTF8ComponentToUTF16Component(text_utf8, parts_utf8.scheme); 6072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen parts ->username = 6172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen UTF8ComponentToUTF16Component(text_utf8, parts_utf8.username); 6272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen parts->password = 6372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen UTF8ComponentToUTF16Component(text_utf8, parts_utf8.password); 6472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen parts->host = 6572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen UTF8ComponentToUTF16Component(text_utf8, parts_utf8.host); 6672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen parts->port = 6772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen UTF8ComponentToUTF16Component(text_utf8, parts_utf8.port); 6872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen parts->path = 6972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen UTF8ComponentToUTF16Component(text_utf8, parts_utf8.path); 7072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen parts->query = 7172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen UTF8ComponentToUTF16Component(text_utf8, parts_utf8.query); 7272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen parts->ref = 7372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen UTF8ComponentToUTF16Component(text_utf8, parts_utf8.ref); 7472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen} 75c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 76c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochTrimPositions TrimWhitespaceUTF8(const std::string& input, 77c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch TrimPositions positions, 78c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* output) { 79c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // This implementation is not so fast since it converts the text encoding 80c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // twice. Please feel free to file a bug if this function hurts the 81c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // performance of Chrome. 82c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(IsStringUTF8(input)); 83c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::wstring input_wide = UTF8ToWide(input); 84c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::wstring output_wide; 85c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide); 86c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch *output = WideToUTF8(output_wide); 87c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return result; 88c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 89c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 90c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} // namespace 91c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 92c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// does some basic fixes for input that we want to test for file-ness 93c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic void PrepareStringForFileOps(const FilePath& text, 94c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath::StringType* output) { 95c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN) 96c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch TrimWhitespace(text.value(), TRIM_ALL, output); 97c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch replace(output->begin(), output->end(), '/', '\\'); 98c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#else 99c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch TrimWhitespaceUTF8(text.value(), TRIM_ALL, output); 100c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif 101c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 102c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 103c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Tries to create a full path from |text|. If the result is valid and the 104c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// file exists, returns true and sets |full_path| to the result. Otherwise, 105c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// returns false and leaves |full_path| unchanged. 106c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic bool ValidPathForFile(const FilePath::StringType& text, 107c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath* full_path) { 108c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath file_path(text); 109c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!file_util::AbsolutePath(&file_path)) 110c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 111c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 112c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!file_util::PathExists(file_path)) 113c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 114c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 115c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch *full_path = file_path; 116c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 117c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 118c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 119c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_POSIX) 120c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Given a path that starts with ~, return a path that starts with an 121c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// expanded-out /user/foobar directory. 122c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic std::string FixupHomedir(const std::string& text) { 123c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(text.length() > 0 && text[0] == '~'); 124c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 125c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (text.length() == 1 || text[1] == '/') { 126c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const char* home = getenv(base::env_vars::kHome); 127c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (URLFixerUpper::home_directory_override) 128c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch home = URLFixerUpper::home_directory_override; 129c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We'll probably break elsewhere if $HOME is undefined, but check here 130c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // just in case. 131c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!home) 132c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return text; 133c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return home + text.substr(1); 134c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 135c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 136c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Otherwise, this is a path like ~foobar/baz, where we must expand to 137c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // user foobar's home directory. Officially, we should use getpwent(), 138c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // but that is a nasty blocking call. 139c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 140c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_MACOSX) 141c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch static const char kHome[] = "/Users/"; 142c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#else 143c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch static const char kHome[] = "/home/"; 144c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif 145c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return kHome + text.substr(1); 146c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 147c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif 148c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 149c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Tries to create a file: URL from |text| if it looks like a filename, even if 150c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// it doesn't resolve as a valid path or to an existing file. Returns a 151c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// (possibly invalid) file: URL in |fixed_up_url| for input beginning 152c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// with a drive specifier or "\\". Returns the unchanged input in other cases 153c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// (including file: URLs: these don't look like filenames). 154c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic std::string FixupPath(const std::string& text) { 155c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(!text.empty()); 156c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 157c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath::StringType filename; 158c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN) 159c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath input_path(UTF8ToWide(text)); 160c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch PrepareStringForFileOps(input_path, &filename); 161c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 162c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Fixup Windows-style drive letters, where "C:" gets rewritten to "C|". 163c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (filename.length() > 1 && filename[1] == '|') 164c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch filename[1] = ':'; 165c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#elif defined(OS_POSIX) 166c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath input_path(text); 167c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch PrepareStringForFileOps(input_path, &filename); 168c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (filename.length() > 0 && filename[0] == '~') 169c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch filename = FixupHomedir(filename); 170c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif 171c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 172c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Here, we know the input looks like a file. 173c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch GURL file_url = net::FilePathToFileURL(FilePath(filename)); 174c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (file_url.is_valid()) { 1753345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick return UTF16ToUTF8(net::FormatUrl(file_url, std::string(), 176c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch net::kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, NULL, 177c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch NULL, NULL)); 178c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 179c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 180c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Invalid file URL, just return the input. 181c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return text; 182c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 183c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 184c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Checks |domain| to see if a valid TLD is already present. If not, appends 185c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// |desired_tld| to the domain, and prepends "www." unless it's already present. 186c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic void AddDesiredTLD(const std::string& desired_tld, 187c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* domain) { 188c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (desired_tld.empty() || domain->empty()) 189c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 190c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 191c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Check the TLD. If the return value is positive, we already have a TLD, so 192c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // abort. If the return value is std::string::npos, there's no valid host, 193c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // but we can try to append a TLD anyway, since the host may become valid once 194c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // the TLD is attached -- for example, "999999999999" is detected as a broken 195c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // IP address and marked invalid, but attaching ".com" makes it legal. When 196c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // the return value is 0, there's a valid host with no known TLD, so we can 197c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // definitely append the user's TLD. We disallow unknown registries here so 198c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // users can input "mail.yahoo" and hit ctrl-enter to get 199c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // "www.mail.yahoo.com". 200c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const size_t registry_length = 201c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch net::RegistryControlledDomainService::GetRegistryLength(*domain, false); 202c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if ((registry_length != 0) && (registry_length != std::string::npos)) 203c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 204c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 205c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Add the suffix at the end of the domain. 206c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const size_t domain_length(domain->length()); 207c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK_GT(domain_length, 0U); 208c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK_NE(desired_tld[0], '.'); 209c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if ((*domain)[domain_length - 1] != '.') 210c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch domain->push_back('.'); 211c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch domain->append(desired_tld); 212c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 213c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Now, if the domain begins with "www.", stop. 214c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::string prefix("www."); 215c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (domain->compare(0, prefix.length(), prefix) != 0) { 216c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Otherwise, add www. to the beginning of the URL. 217c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch domain->insert(0, prefix); 218c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 219c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 220c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 221c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupUsername(const std::string& text, 222c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const url_parse::Component& part, 223c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* url) { 224c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!part.is_valid()) 225c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 226c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 227c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We don't fix up the username at the moment. 228c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append(text, part.begin, part.len); 229c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Do not append the trailing '@' because we might need to include the user's 230c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // password. FixupURL itself will append the '@' for us. 231c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 232c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 233c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupPassword(const std::string& text, 234c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const url_parse::Component& part, 235c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* url) { 236c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!part.is_valid()) 237c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 238c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 239c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We don't fix up the password at the moment. 240c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append(":"); 241c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append(text, part.begin, part.len); 242c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 243c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 244c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic void FixupHost(const std::string& text, 245c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const url_parse::Component& part, 246c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bool has_scheme, 247c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::string& desired_tld, 248c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* url) { 249c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!part.is_valid()) 250c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 251c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 252c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Make domain valid. 253c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Strip all leading dots and all but one trailing dot, unless the user only 254c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // typed dots, in which case their input is totally invalid and we should just 255c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // leave it unchanged. 256c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string domain(text, part.begin, part.len); 257c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const size_t first_nondot(domain.find_first_not_of('.')); 258c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (first_nondot != std::string::npos) { 259c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch domain.erase(0, first_nondot); 260c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t last_nondot(domain.find_last_not_of('.')); 261c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK(last_nondot != std::string::npos); 262c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch last_nondot += 2; // Point at second period in ending string 263c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (last_nondot < domain.length()) 264c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch domain.erase(last_nondot); 265c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 266c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 267c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Add any user-specified TLD, if applicable. 268c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch AddDesiredTLD(desired_tld, &domain); 269c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 270c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append(domain); 271c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 272c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 273c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic void FixupPort(const std::string& text, 274c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const url_parse::Component& part, 275c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* url) { 276c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!part.is_valid()) 277c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 278c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 279c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We don't fix up the port at the moment. 280c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append(":"); 281c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append(text, part.begin, part.len); 282c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 283c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 284c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupPath(const std::string& text, 285c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const url_parse::Component& part, 286c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* url) { 287c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!part.is_valid() || part.len == 0) { 288c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We should always have a path. 289c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append("/"); 290c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 291c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 292c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 293c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Append the path as is. 294c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append(text, part.begin, part.len); 295c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 296c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 297c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupQuery(const std::string& text, 298c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const url_parse::Component& part, 299c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* url) { 300c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!part.is_valid()) 301c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 302c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 303c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We don't fix up the query at the moment. 304c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append("?"); 305c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append(text, part.begin, part.len); 306c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 307c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 308c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupRef(const std::string& text, 309c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const url_parse::Component& part, 310c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* url) { 311c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!part.is_valid()) 312c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return; 313c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 314c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We don't fix up the ref at the moment. 315c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append("#"); 316c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url->append(text, part.begin, part.len); 317c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 318c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 319c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic bool HasPort(const std::string& original_text, 320c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const url_parse::Component& scheme_component) { 321c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Find the range between the ":" and the "/". 322c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t port_start = scheme_component.end() + 1; 323c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch size_t port_end = port_start; 324c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch while ((port_end < original_text.length()) && 325c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch !url_parse::IsAuthorityTerminator(original_text[port_end])) 326c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ++port_end; 327c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (port_end == port_start) 328c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 329c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 330c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Scan the range to see if it is entirely digits. 331c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch for (size_t i = port_start; i < port_end; ++i) { 332c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!IsAsciiDigit(original_text[i])) 333c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 334c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 335c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 336c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 337c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 338c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 339c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Try to extract a valid scheme from the beginning of |text|. 340c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// If successful, set |scheme_component| to the text range where the scheme 341c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// was located, and fill |canon_scheme| with its canonicalized form. 342c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Otherwise, return false and leave the outputs in an indeterminate state. 343c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic bool GetValidScheme(const std::string &text, 344c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_parse::Component* scheme_component, 345c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string* canon_scheme) { 346c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Locate everything up to (but not including) the first ':' 347c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!url_parse::ExtractScheme(text.data(), static_cast<int>(text.length()), 348c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch scheme_component)) 349c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 350c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 351c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Make sure the scheme contains only valid characters, and convert 352c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // to lowercase. This also catches IPv6 literals like [::1], because 353c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // brackets are not in the whitelist. 354c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_canon::StdStringCanonOutput canon_scheme_output(canon_scheme); 355c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_parse::Component canon_scheme_component; 356c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!url_canon::CanonicalizeScheme(text.data(), *scheme_component, 357c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch &canon_scheme_output, 358c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch &canon_scheme_component)) 359c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 360c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 361c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Strip the ':', and any trailing buffer space. 362c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch DCHECK_EQ(0, canon_scheme_component.begin); 363c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch canon_scheme->erase(canon_scheme_component.len); 364c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 365c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We need to fix up the segmentation for "www.example.com:/". For this 366c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // case, we guess that schemes with a "." are not actually schemes. 367c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (canon_scheme->find('.') != std::string::npos) 368c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 369c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 370c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We need to fix up the segmentation for "www:123/". For this case, we 371c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // will add an HTTP scheme later and make the URL parser happy. 372c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // TODO(pkasting): Maybe we should try to use GURL's parser for this? 373c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (HasPort(text, *scheme_component)) 374c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return false; 375c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 376c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Everything checks out. 377c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return true; 378c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 379c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 380c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstd::string URLFixerUpper::SegmentURL(const std::string& text, 381c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_parse::Parsed* parts) { 382c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Initialize the result. 383c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch *parts = url_parse::Parsed(); 384c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 385c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string trimmed; 386c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch TrimWhitespaceUTF8(text, TRIM_ALL, &trimmed); 387c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (trimmed.empty()) 388c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return std::string(); // Nothing to segment. 389c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 390c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN) 391c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch int trimmed_length = static_cast<int>(trimmed.length()); 392c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (url_parse::DoesBeginWindowsDriveSpec(trimmed.data(), 0, trimmed_length) || 393c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_parse::DoesBeginUNCPath(trimmed.data(), 0, trimmed_length, true)) 394c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return "file"; 395c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#elif defined(OS_POSIX) 396c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (FilePath::IsSeparator(trimmed.data()[0]) || trimmed.data()[0] == '~') 397c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return "file"; 398c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif 399c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 400c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Otherwise, we need to look at things carefully. 401c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string scheme; 402c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!GetValidScheme(text, &parts->scheme, &scheme)) { 403c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Couldn't determine the scheme, so just pick one. 404c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch parts->scheme.reset(); 405c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch scheme.assign(StartsWithASCII(text, "ftp.", false) ? 406c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch chrome::kFtpScheme : chrome::kHttpScheme); 407c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 408c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 409c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Not segmenting file schemes or nonstandard schemes. 410c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if ((scheme == chrome::kFileScheme) || 411c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch !url_util::IsStandard(scheme.c_str(), 412c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_parse::Component(0, static_cast<int>(scheme.length())))) 413c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return scheme; 414c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 415c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (parts->scheme.is_valid()) { 416c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Have the GURL parser do the heavy lifting for us. 417c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_parse::ParseStandardURL(text.data(), static_cast<int>(text.length()), 418c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch parts); 419c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return scheme; 420c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 421c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 422c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We need to add a scheme in order for ParseStandardURL to be happy. 423c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Find the first non-whitespace character. 424c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string::const_iterator first_nonwhite = text.begin(); 425c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch while ((first_nonwhite != text.end()) && IsWhitespace(*first_nonwhite)) 426c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch ++first_nonwhite; 427c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 428c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Construct the text to parse by inserting the scheme. 429c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string inserted_text(scheme); 430c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch inserted_text.append("://"); 431c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string text_to_parse(text.begin(), first_nonwhite); 432c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch text_to_parse.append(inserted_text); 433c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch text_to_parse.append(first_nonwhite, text.end()); 434c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 435c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Have the GURL parser do the heavy lifting for us. 436c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_parse::ParseStandardURL(text_to_parse.data(), 437c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch static_cast<int>(text_to_parse.length()), 438c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch parts); 439c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 440c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Offset the results of the parse to match the original text. 441c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const int offset = -static_cast<int>(inserted_text.length()); 442c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch OffsetComponent(offset, &parts->scheme); 443c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch OffsetComponent(offset, &parts->username); 444c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch OffsetComponent(offset, &parts->password); 445c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch OffsetComponent(offset, &parts->host); 446c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch OffsetComponent(offset, &parts->port); 447c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch OffsetComponent(offset, &parts->path); 448c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch OffsetComponent(offset, &parts->query); 449c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch OffsetComponent(offset, &parts->ref); 450c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 451c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return scheme; 452c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 453c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 454c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochGURL URLFixerUpper::FixupURL(const std::string& text, 455c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const std::string& desired_tld) { 456c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string trimmed; 457c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch TrimWhitespaceUTF8(text, TRIM_ALL, &trimmed); 458c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (trimmed.empty()) 459c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return GURL(); // Nothing here. 460c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 461c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Segment the URL. 462c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_parse::Parsed parts; 463c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string scheme(SegmentURL(trimmed, &parts)); 464c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 465c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // For view-source: URLs, we strip "view-source:", do fixup, and stick it back 466c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // on. This allows us to handle things like "view-source:google.com". 467c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (scheme == chrome::kViewSourceScheme) { 468c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Reject "view-source:view-source:..." to avoid deep recursion. 469c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string view_source(chrome::kViewSourceScheme + std::string(":")); 470c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!StartsWithASCII(text, view_source + view_source, false)) { 471c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return GURL(chrome::kViewSourceScheme + std::string(":") + 472c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FixupURL(trimmed.substr(scheme.length() + 1), 473c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch desired_tld).possibly_invalid_spec()); 474c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 475c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 476c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 477c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We handle the file scheme separately. 478c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (scheme == chrome::kFileScheme) 479c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return GURL(parts.scheme.is_valid() ? text : FixupPath(text)); 480c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 481c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // For some schemes whose layouts we understand, we rebuild it. 482c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (url_util::IsStandard(scheme.c_str(), 483c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url_parse::Component(0, static_cast<int>(scheme.length())))) { 484c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string url(scheme); 485c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url.append("://"); 486c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 487c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // We need to check whether the |username| is valid because it is our 488c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // responsibility to append the '@' to delineate the user information from 489c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // the host portion of the URL. 490c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (parts.username.is_valid()) { 491c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FixupUsername(trimmed, parts.username, &url); 492c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FixupPassword(trimmed, parts.password, &url); 493c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch url.append("@"); 494c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 495c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 496c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FixupHost(trimmed, parts.host, parts.scheme.is_valid(), desired_tld, &url); 497c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FixupPort(trimmed, parts.port, &url); 498c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FixupPath(trimmed, parts.path, &url); 499c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FixupQuery(trimmed, parts.query, &url); 500c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FixupRef(trimmed, parts.ref, &url); 501c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 502c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return GURL(url); 503c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 504c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 505c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // In the worst-case, we insert a scheme if the URL lacks one. 506c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!parts.scheme.is_valid()) { 507c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string fixed_scheme(scheme); 508c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch fixed_scheme.append("://"); 509c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch trimmed.insert(0, fixed_scheme); 510c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 511c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 512c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return GURL(trimmed); 513c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 514c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 515c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// The rules are different here than for regular fixup, since we need to handle 516c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// input like "hello.html" and know to look in the current directory. Regular 517c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// fixup will look for cues that it is actually a file path before trying to 518c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// figure out what file it is. If our logic doesn't work, we will fall back on 519c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// regular fixup. 520c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochGURL URLFixerUpper::FixupRelativeFile(const FilePath& base_dir, 521c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch const FilePath& text) { 522c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath old_cur_directory; 523c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!base_dir.empty()) { 524c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Save the old current directory before we move to the new one. 525c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch file_util::GetCurrentDirectory(&old_cur_directory); 526c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch file_util::SetCurrentDirectory(base_dir); 527c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 528c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 529c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Allow funny input with extra whitespace and the wrong kind of slashes. 530c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath::StringType trimmed; 531c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch PrepareStringForFileOps(text, &trimmed); 532c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 533c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch bool is_file = true; 534c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch FilePath full_path; 535c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!ValidPathForFile(trimmed, &full_path)) { 536c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Not a path as entered, try unescaping it in case the user has 537c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // escaped things. We need to go through 8-bit since the escaped values 538c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // only represent 8-bit values. 539c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN) 540c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::wstring unescaped = UTF8ToWide(UnescapeURLComponent( 541c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch WideToUTF8(trimmed), 542c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS)); 543c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#elif defined(OS_POSIX) 544c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string unescaped = UnescapeURLComponent( 545c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch trimmed, 546c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS); 547c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif 548c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 549c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!ValidPathForFile(unescaped, &full_path)) 550c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch is_file = false; 551c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 552c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 553c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Put back the current directory if we saved it. 554c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (!base_dir.empty()) 555c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch file_util::SetCurrentDirectory(old_cur_directory); 556c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 557c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (is_file) { 558c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch GURL file_url = net::FilePathToFileURL(full_path); 559c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch if (file_url.is_valid()) 5603345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick return GURL(UTF16ToUTF8(net::FormatUrl(file_url, std::string(), 561c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch net::kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, NULL, 562c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch NULL, NULL))); 563c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Invalid files fall through to regular processing. 564c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch } 565c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 566c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch // Fall back on regular fixup for this input. 567c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN) 568c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string text_utf8 = WideToUTF8(text.value()); 569c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#elif defined(OS_POSIX) 570c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch std::string text_utf8 = text.value(); 571c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif 572c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch return FixupURL(text_utf8, std::string()); 573c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch} 574c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch 57572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 URLFixerUpper::SegmentURL(const string16& text, 57672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen url_parse::Parsed* parts) { 57772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen std::string text_utf8 = UTF16ToUTF8(text); 57872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen url_parse::Parsed parts_utf8; 57972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen std::string scheme_utf8 = SegmentURL(text_utf8, &parts_utf8); 58072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen UTF8PartsToUTF16Parts(text_utf8, parts_utf8, parts); 58172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen return UTF8ToUTF16(scheme_utf8); 58272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen} 58321d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen 58421d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsenvoid URLFixerUpper::OffsetComponent(int offset, url_parse::Component* part) { 58521d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen DCHECK(part); 58621d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen 58721d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen if (part->is_valid()) { 58821d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // Offset the location of this component. 58921d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen part->begin += offset; 59021d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen 59121d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen // This part might not have existed in the original text. 59221d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen if (part->begin < 0) 59321d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen part->reset(); 59421d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen } 59521d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen} 596