// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
5c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/browser/net/url_fixer_upper.h"
6c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
7c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include <algorithm>
8c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
93345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#if defined(OS_POSIX)
103345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#include "base/environment.h"
113345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick#endif
12c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/file_util.h"
13c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/logging.h"
14c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/string_util.h"
15c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "base/utf_string_conversions.h"
16c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "chrome/common/url_constants.h"
17c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "googleurl/src/url_file.h"
18c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "googleurl/src/url_parse.h"
19c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "googleurl/src/url_util.h"
20c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "net/base/escape.h"
21c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "net/base/net_util.h"
22c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#include "net/base/registry_controlled_domain.h"
23c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
// Optional replacement for the user's home directory.  When non-NULL,
// FixupHomedir() (POSIX builds) expands "~" and "~/..." to this value instead
// of the value read via getenv(base::env_vars::kHome).  Defaults to NULL.
24c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochconst char* URLFixerUpper::home_directory_override = NULL;
25c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
26c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochnamespace {
27c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
28c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// TODO(estade): Remove these ugly, ugly functions. They are only used in
29c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// SegmentURL. A url_parse::Parsed object keeps track of a bunch of indices into
30c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// a url string, and these need to be updated when the URL is converted from
31c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// UTF8 to UTF16. Instead of this after-the-fact adjustment, we should parse it
32c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// in the correct string format to begin with.
3372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenurl_parse::Component UTF8ComponentToUTF16Component(
3472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    const std::string& text_utf8,
3572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    const url_parse::Component& component_utf8) {
3672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  if (component_utf8.len == -1)
3772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    return url_parse::Component();
3872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
3972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  std::string before_component_string =
4072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      text_utf8.substr(0, component_utf8.begin);
4172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  std::string component_string = text_utf8.substr(component_utf8.begin,
4272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                                                  component_utf8.len);
4372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  string16 before_component_string_16 = UTF8ToUTF16(before_component_string);
4472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  string16 component_string_16 = UTF8ToUTF16(component_string);
4572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  url_parse::Component component_16(before_component_string_16.length(),
4672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                                    component_string_16.length());
4772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  return component_16;
4872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen}
4972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
5072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenvoid UTF8PartsToUTF16Parts(const std::string& text_utf8,
5172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                           const url_parse::Parsed& parts_utf8,
5272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                           url_parse::Parsed* parts) {
5372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  if (IsStringASCII(text_utf8)) {
5472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    *parts = parts_utf8;
5572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen    return;
5672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  }
5772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen
5872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  parts->scheme =
5972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      UTF8ComponentToUTF16Component(text_utf8, parts_utf8.scheme);
6072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  parts ->username =
6172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      UTF8ComponentToUTF16Component(text_utf8, parts_utf8.username);
6272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  parts->password =
6372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      UTF8ComponentToUTF16Component(text_utf8, parts_utf8.password);
6472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  parts->host =
6572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      UTF8ComponentToUTF16Component(text_utf8, parts_utf8.host);
6672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  parts->port =
6772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      UTF8ComponentToUTF16Component(text_utf8, parts_utf8.port);
6872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  parts->path =
6972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      UTF8ComponentToUTF16Component(text_utf8, parts_utf8.path);
7072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  parts->query =
7172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      UTF8ComponentToUTF16Component(text_utf8, parts_utf8.query);
7272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  parts->ref =
7372a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen      UTF8ComponentToUTF16Component(text_utf8, parts_utf8.ref);
7472a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen}
75c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
76c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochTrimPositions TrimWhitespaceUTF8(const std::string& input,
77c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 TrimPositions positions,
78c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 std::string* output) {
79c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // This implementation is not so fast since it converts the text encoding
80c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // twice. Please feel free to file a bug if this function hurts the
81c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // performance of Chrome.
82c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(IsStringUTF8(input));
83c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::wstring input_wide = UTF8ToWide(input);
84c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::wstring output_wide;
85c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide);
86c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  *output = WideToUTF8(output_wide);
87c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return result;
88c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
89c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
90c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}  // namespace
91c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
92c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// does some basic fixes for input that we want to test for file-ness
93c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic void PrepareStringForFileOps(const FilePath& text,
94c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                    FilePath::StringType* output) {
95c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN)
96c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  TrimWhitespace(text.value(), TRIM_ALL, output);
97c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  replace(output->begin(), output->end(), '/', '\\');
98c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#else
99c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  TrimWhitespaceUTF8(text.value(), TRIM_ALL, output);
100c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif
101c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
102c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
103c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Tries to create a full path from |text|.  If the result is valid and the
104c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// file exists, returns true and sets |full_path| to the result.  Otherwise,
105c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// returns false and leaves |full_path| unchanged.
106c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic bool ValidPathForFile(const FilePath::StringType& text,
107c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                             FilePath* full_path) {
108c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath file_path(text);
109c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!file_util::AbsolutePath(&file_path))
110c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
111c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
112c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!file_util::PathExists(file_path))
113c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
114c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
115c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  *full_path = file_path;
116c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return true;
117c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
118c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
119c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_POSIX)
120c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Given a path that starts with ~, return a path that starts with an
121c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// expanded-out /user/foobar directory.
122c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic std::string FixupHomedir(const std::string& text) {
123c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(text.length() > 0 && text[0] == '~');
124c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
125c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (text.length() == 1 || text[1] == '/') {
126c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    const char* home = getenv(base::env_vars::kHome);
127c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (URLFixerUpper::home_directory_override)
128c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      home = URLFixerUpper::home_directory_override;
129c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // We'll probably break elsewhere if $HOME is undefined, but check here
130c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // just in case.
131c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (!home)
132c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      return text;
133c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return home + text.substr(1);
134c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
135c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
136c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Otherwise, this is a path like ~foobar/baz, where we must expand to
137c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // user foobar's home directory.  Officially, we should use getpwent(),
138c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // but that is a nasty blocking call.
139c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
140c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_MACOSX)
141c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  static const char kHome[] = "/Users/";
142c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#else
143c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  static const char kHome[] = "/home/";
144c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif
145c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return kHome + text.substr(1);
146c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
147c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif
148c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
149c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Tries to create a file: URL from |text| if it looks like a filename, even if
150c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// it doesn't resolve as a valid path or to an existing file.  Returns a
151c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// (possibly invalid) file: URL in |fixed_up_url| for input beginning
152c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// with a drive specifier or "\\".  Returns the unchanged input in other cases
153c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// (including file: URLs: these don't look like filenames).
154c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic std::string FixupPath(const std::string& text) {
155c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK(!text.empty());
156c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
157c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath::StringType filename;
158c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN)
159c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath input_path(UTF8ToWide(text));
160c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  PrepareStringForFileOps(input_path, &filename);
161c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
162c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Fixup Windows-style drive letters, where "C:" gets rewritten to "C|".
163c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (filename.length() > 1 && filename[1] == '|')
164c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    filename[1] = ':';
165c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#elif defined(OS_POSIX)
166c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath input_path(text);
167c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  PrepareStringForFileOps(input_path, &filename);
168c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (filename.length() > 0 && filename[0] == '~')
169c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    filename = FixupHomedir(filename);
170c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif
171c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
172c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Here, we know the input looks like a file.
173c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  GURL file_url = net::FilePathToFileURL(FilePath(filename));
174c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (file_url.is_valid()) {
1753345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick    return UTF16ToUTF8(net::FormatUrl(file_url, std::string(),
176c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        net::kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, NULL,
177c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        NULL, NULL));
178c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
179c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
180c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Invalid file URL, just return the input.
181c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return text;
182c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
183c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
184c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Checks |domain| to see if a valid TLD is already present.  If not, appends
185c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// |desired_tld| to the domain, and prepends "www." unless it's already present.
186c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic void AddDesiredTLD(const std::string& desired_tld,
187c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                          std::string* domain) {
188c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (desired_tld.empty() || domain->empty())
189c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
190c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
191c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Check the TLD.  If the return value is positive, we already have a TLD, so
192c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // abort.  If the return value is std::string::npos, there's no valid host,
193c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // but we can try to append a TLD anyway, since the host may become valid once
194c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // the TLD is attached -- for example, "999999999999" is detected as a broken
195c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // IP address and marked invalid, but attaching ".com" makes it legal.  When
196c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // the return value is 0, there's a valid host with no known TLD, so we can
197c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // definitely append the user's TLD.  We disallow unknown registries here so
198c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // users can input "mail.yahoo" and hit ctrl-enter to get
199c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // "www.mail.yahoo.com".
200c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const size_t registry_length =
201c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      net::RegistryControlledDomainService::GetRegistryLength(*domain, false);
202c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if ((registry_length != 0) && (registry_length != std::string::npos))
203c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
204c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
205c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Add the suffix at the end of the domain.
206c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const size_t domain_length(domain->length());
207c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK_GT(domain_length, 0U);
208c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK_NE(desired_tld[0], '.');
209c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if ((*domain)[domain_length - 1] != '.')
210c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    domain->push_back('.');
211c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  domain->append(desired_tld);
212c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
213c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Now, if the domain begins with "www.", stop.
214c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const std::string prefix("www.");
215c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (domain->compare(0, prefix.length(), prefix) != 0) {
216c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Otherwise, add www. to the beginning of the URL.
217c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    domain->insert(0, prefix);
218c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
219c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
220c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
221c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupUsername(const std::string& text,
222c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 const url_parse::Component& part,
223c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 std::string* url) {
224c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!part.is_valid())
225c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
226c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
227c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We don't fix up the username at the moment.
228c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append(text, part.begin, part.len);
229c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Do not append the trailing '@' because we might need to include the user's
230c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // password.  FixupURL itself will append the '@' for us.
231c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
232c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
233c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupPassword(const std::string& text,
234c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 const url_parse::Component& part,
235c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                 std::string* url) {
236c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!part.is_valid())
237c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
238c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
239c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We don't fix up the password at the moment.
240c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append(":");
241c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append(text, part.begin, part.len);
242c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
243c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
244c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic void FixupHost(const std::string& text,
245c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                      const url_parse::Component& part,
246c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                      bool has_scheme,
247c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                      const std::string& desired_tld,
248c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                      std::string* url) {
249c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!part.is_valid())
250c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
251c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
252c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Make domain valid.
253c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Strip all leading dots and all but one trailing dot, unless the user only
254c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // typed dots, in which case their input is totally invalid and we should just
255c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // leave it unchanged.
256c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string domain(text, part.begin, part.len);
257c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const size_t first_nondot(domain.find_first_not_of('.'));
258c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (first_nondot != std::string::npos) {
259c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    domain.erase(0, first_nondot);
260c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    size_t last_nondot(domain.find_last_not_of('.'));
261c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    DCHECK(last_nondot != std::string::npos);
262c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    last_nondot += 2;  // Point at second period in ending string
263c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (last_nondot < domain.length())
264c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      domain.erase(last_nondot);
265c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
266c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
267c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Add any user-specified TLD, if applicable.
268c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  AddDesiredTLD(desired_tld, &domain);
269c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
270c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append(domain);
271c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
272c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
273c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic void FixupPort(const std::string& text,
274c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                      const url_parse::Component& part,
275c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                      std::string* url) {
276c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!part.is_valid())
277c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
278c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
279c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We don't fix up the port at the moment.
280c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append(":");
281c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append(text, part.begin, part.len);
282c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
283c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
284c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupPath(const std::string& text,
285c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                             const url_parse::Component& part,
286c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                             std::string* url) {
287c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!part.is_valid() || part.len == 0) {
288c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // We should always have a path.
289c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    url->append("/");
290c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
291c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
292c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
293c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Append the path as is.
294c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append(text, part.begin, part.len);
295c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
296c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
297c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupQuery(const std::string& text,
298c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                              const url_parse::Component& part,
299c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                              std::string* url) {
300c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!part.is_valid())
301c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
302c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
303c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We don't fix up the query at the moment.
304c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append("?");
305c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append(text, part.begin, part.len);
306c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
307c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
308c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic inline void FixupRef(const std::string& text,
309c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                            const url_parse::Component& part,
310c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                            std::string* url) {
311c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!part.is_valid())
312c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return;
313c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
314c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We don't fix up the ref at the moment.
315c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append("#");
316c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url->append(text, part.begin, part.len);
317c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
318c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
319c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic bool HasPort(const std::string& original_text,
320c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                    const url_parse::Component& scheme_component) {
321c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Find the range between the ":" and the "/".
322c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  size_t port_start = scheme_component.end() + 1;
323c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  size_t port_end = port_start;
324c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  while ((port_end < original_text.length()) &&
325c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch         !url_parse::IsAuthorityTerminator(original_text[port_end]))
326c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    ++port_end;
327c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (port_end == port_start)
328c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
329c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
330c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Scan the range to see if it is entirely digits.
331c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  for (size_t i = port_start; i < port_end; ++i) {
332c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (!IsAsciiDigit(original_text[i]))
333c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      return false;
334c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
335c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
336c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return true;
337c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
338c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
339c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Try to extract a valid scheme from the beginning of |text|.
340c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// If successful, set |scheme_component| to the text range where the scheme
341c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// was located, and fill |canon_scheme| with its canonicalized form.
342c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// Otherwise, return false and leave the outputs in an indeterminate state.
343c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstatic bool GetValidScheme(const std::string &text,
344c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                           url_parse::Component* scheme_component,
345c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                           std::string* canon_scheme) {
346c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Locate everything up to (but not including) the first ':'
347c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!url_parse::ExtractScheme(text.data(), static_cast<int>(text.length()),
348c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                scheme_component))
349c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
350c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
351c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Make sure the scheme contains only valid characters, and convert
352c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // to lowercase.  This also catches IPv6 literals like [::1], because
353c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // brackets are not in the whitelist.
354c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url_canon::StdStringCanonOutput canon_scheme_output(canon_scheme);
355c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url_parse::Component canon_scheme_component;
356c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!url_canon::CanonicalizeScheme(text.data(), *scheme_component,
357c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                     &canon_scheme_output,
358c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                     &canon_scheme_component))
359c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
360c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
361c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Strip the ':', and any trailing buffer space.
362c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  DCHECK_EQ(0, canon_scheme_component.begin);
363c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  canon_scheme->erase(canon_scheme_component.len);
364c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
365c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We need to fix up the segmentation for "www.example.com:/".  For this
366c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // case, we guess that schemes with a "." are not actually schemes.
367c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (canon_scheme->find('.') != std::string::npos)
368c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
369c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
370c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We need to fix up the segmentation for "www:123/".  For this case, we
371c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // will add an HTTP scheme later and make the URL parser happy.
372c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // TODO(pkasting): Maybe we should try to use GURL's parser for this?
373c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (HasPort(text, *scheme_component))
374c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return false;
375c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
376c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Everything checks out.
377c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return true;
378c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
379c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
380c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdochstd::string URLFixerUpper::SegmentURL(const std::string& text,
381c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                      url_parse::Parsed* parts) {
382c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Initialize the result.
383c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  *parts = url_parse::Parsed();
384c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
385c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string trimmed;
386c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  TrimWhitespaceUTF8(text, TRIM_ALL, &trimmed);
387c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (trimmed.empty())
388c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return std::string();  // Nothing to segment.
389c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
390c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN)
391c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  int trimmed_length = static_cast<int>(trimmed.length());
392c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (url_parse::DoesBeginWindowsDriveSpec(trimmed.data(), 0, trimmed_length) ||
393c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      url_parse::DoesBeginUNCPath(trimmed.data(), 0, trimmed_length, true))
394c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return "file";
395c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#elif defined(OS_POSIX)
396c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (FilePath::IsSeparator(trimmed.data()[0]) || trimmed.data()[0] == '~')
397c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return "file";
398c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif
399c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
400c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Otherwise, we need to look at things carefully.
401c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string scheme;
402c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!GetValidScheme(text, &parts->scheme, &scheme)) {
403c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Couldn't determine the scheme, so just pick one.
404c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    parts->scheme.reset();
405c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    scheme.assign(StartsWithASCII(text, "ftp.", false) ?
406c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        chrome::kFtpScheme : chrome::kHttpScheme);
407c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
408c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
409c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Not segmenting file schemes or nonstandard schemes.
410c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if ((scheme == chrome::kFileScheme) ||
411c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      !url_util::IsStandard(scheme.c_str(),
412c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      url_parse::Component(0, static_cast<int>(scheme.length()))))
413c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return scheme;
414c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
415c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (parts->scheme.is_valid()) {
416c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Have the GURL parser do the heavy lifting for us.
417c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    url_parse::ParseStandardURL(text.data(), static_cast<int>(text.length()),
418c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                parts);
419c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return scheme;
420c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
421c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
422c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We need to add a scheme in order for ParseStandardURL to be happy.
423c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Find the first non-whitespace character.
424c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string::const_iterator first_nonwhite = text.begin();
425c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  while ((first_nonwhite != text.end()) && IsWhitespace(*first_nonwhite))
426c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    ++first_nonwhite;
427c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
428c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Construct the text to parse by inserting the scheme.
429c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string inserted_text(scheme);
430c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  inserted_text.append("://");
431c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string text_to_parse(text.begin(), first_nonwhite);
432c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  text_to_parse.append(inserted_text);
433c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  text_to_parse.append(first_nonwhite, text.end());
434c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
435c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Have the GURL parser do the heavy lifting for us.
436c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url_parse::ParseStandardURL(text_to_parse.data(),
437c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                              static_cast<int>(text_to_parse.length()),
438c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                              parts);
439c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
440c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Offset the results of the parse to match the original text.
441c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  const int offset = -static_cast<int>(inserted_text.length());
442c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  OffsetComponent(offset, &parts->scheme);
443c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  OffsetComponent(offset, &parts->username);
444c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  OffsetComponent(offset, &parts->password);
445c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  OffsetComponent(offset, &parts->host);
446c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  OffsetComponent(offset, &parts->port);
447c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  OffsetComponent(offset, &parts->path);
448c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  OffsetComponent(offset, &parts->query);
449c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  OffsetComponent(offset, &parts->ref);
450c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
451c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return scheme;
452c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
453c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
454c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochGURL URLFixerUpper::FixupURL(const std::string& text,
455c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                             const std::string& desired_tld) {
456c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string trimmed;
457c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  TrimWhitespaceUTF8(text, TRIM_ALL, &trimmed);
458c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (trimmed.empty())
459c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return GURL();  // Nothing here.
460c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
461c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Segment the URL.
462c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  url_parse::Parsed parts;
463c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string scheme(SegmentURL(trimmed, &parts));
464c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
465c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // For view-source: URLs, we strip "view-source:", do fixup, and stick it back
466c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // on.  This allows us to handle things like "view-source:google.com".
467c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (scheme == chrome::kViewSourceScheme) {
468c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Reject "view-source:view-source:..." to avoid deep recursion.
469c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    std::string view_source(chrome::kViewSourceScheme + std::string(":"));
470c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (!StartsWithASCII(text, view_source + view_source, false)) {
471c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      return GURL(chrome::kViewSourceScheme + std::string(":") +
472c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          FixupURL(trimmed.substr(scheme.length() + 1),
473c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                   desired_tld).possibly_invalid_spec());
474c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
475c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
476c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
477c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // We handle the file scheme separately.
478c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (scheme == chrome::kFileScheme)
479c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return GURL(parts.scheme.is_valid() ? text : FixupPath(text));
480c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
481c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // For some schemes whose layouts we understand, we rebuild it.
482c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (url_util::IsStandard(scheme.c_str(),
483c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          url_parse::Component(0, static_cast<int>(scheme.length())))) {
484c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    std::string url(scheme);
485c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    url.append("://");
486c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
487c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // We need to check whether the |username| is valid because it is our
488c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // responsibility to append the '@' to delineate the user information from
489c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // the host portion of the URL.
490c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (parts.username.is_valid()) {
491c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      FixupUsername(trimmed, parts.username, &url);
492c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      FixupPassword(trimmed, parts.password, &url);
493c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      url.append("@");
494c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    }
495c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
496c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    FixupHost(trimmed, parts.host, parts.scheme.is_valid(), desired_tld, &url);
497c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    FixupPort(trimmed, parts.port, &url);
498c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    FixupPath(trimmed, parts.path, &url);
499c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    FixupQuery(trimmed, parts.query, &url);
500c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    FixupRef(trimmed, parts.ref, &url);
501c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
502c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    return GURL(url);
503c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
504c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
505c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // In the worst-case, we insert a scheme if the URL lacks one.
506c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!parts.scheme.is_valid()) {
507c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    std::string fixed_scheme(scheme);
508c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    fixed_scheme.append("://");
509c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    trimmed.insert(0, fixed_scheme);
510c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
511c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
512c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return GURL(trimmed);
513c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
514c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
515c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// The rules are different here than for regular fixup, since we need to handle
516c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// input like "hello.html" and know to look in the current directory.  Regular
517c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// fixup will look for cues that it is actually a file path before trying to
518c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// figure out what file it is.  If our logic doesn't work, we will fall back on
519c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch// regular fixup.
520c407dc5cd9bdc5668497f21b26b09d988ab439deBen MurdochGURL URLFixerUpper::FixupRelativeFile(const FilePath& base_dir,
521c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch                                      const FilePath& text) {
522c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath old_cur_directory;
523c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!base_dir.empty()) {
524c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Save the old current directory before we move to the new one.
525c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    file_util::GetCurrentDirectory(&old_cur_directory);
526c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    file_util::SetCurrentDirectory(base_dir);
527c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
528c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
529c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Allow funny input with extra whitespace and the wrong kind of slashes.
530c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath::StringType trimmed;
531c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  PrepareStringForFileOps(text, &trimmed);
532c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
533c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  bool is_file = true;
534c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  FilePath full_path;
535c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!ValidPathForFile(trimmed, &full_path)) {
536c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Not a path as entered, try unescaping it in case the user has
537c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // escaped things. We need to go through 8-bit since the escaped values
538c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // only represent 8-bit values.
539c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN)
540c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    std::wstring unescaped = UTF8ToWide(UnescapeURLComponent(
541c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        WideToUTF8(trimmed),
542c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS));
543c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#elif defined(OS_POSIX)
544c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    std::string unescaped = UnescapeURLComponent(
545c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        trimmed,
546c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch        UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
547c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif
548c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
549c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (!ValidPathForFile(unescaped, &full_path))
550c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch      is_file = false;
551c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
552c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
553c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Put back the current directory if we saved it.
554c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (!base_dir.empty())
555c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    file_util::SetCurrentDirectory(old_cur_directory);
556c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
557c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  if (is_file) {
558c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    GURL file_url = net::FilePathToFileURL(full_path);
559c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    if (file_url.is_valid())
5603345a6884c488ff3a535c2c9acdd33d74b37e311Iain Merrick      return GURL(UTF16ToUTF8(net::FormatUrl(file_url, std::string(),
561c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          net::kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, NULL,
562c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch          NULL, NULL)));
563c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch    // Invalid files fall through to regular processing.
564c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  }
565c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
566c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  // Fall back on regular fixup for this input.
567c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#if defined(OS_WIN)
568c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string text_utf8 = WideToUTF8(text.value());
569c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#elif defined(OS_POSIX)
570c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  std::string text_utf8 = text.value();
571c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch#endif
572c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch  return FixupURL(text_utf8, std::string());
573c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch}
574c407dc5cd9bdc5668497f21b26b09d988ab439deBen Murdoch
57572a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsenstring16 URLFixerUpper::SegmentURL(const string16& text,
57672a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen                                   url_parse::Parsed* parts) {
57772a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  std::string text_utf8 = UTF16ToUTF8(text);
57872a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  url_parse::Parsed parts_utf8;
57972a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  std::string scheme_utf8 = SegmentURL(text_utf8, &parts_utf8);
58072a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  UTF8PartsToUTF16Parts(text_utf8, parts_utf8, parts);
58172a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen  return UTF8ToUTF16(scheme_utf8);
58272a454cd3513ac24fbdd0e0cb9ad70b86a99b801Kristian Monsen}
58321d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen
58421d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsenvoid URLFixerUpper::OffsetComponent(int offset, url_parse::Component* part) {
58521d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen  DCHECK(part);
58621d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen
58721d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen  if (part->is_valid()) {
58821d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen    // Offset the location of this component.
58921d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen    part->begin += offset;
59021d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen
59121d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen    // This part might not have existed in the original text.
59221d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen    if (part->begin < 0)
59321d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen      part->reset();
59421d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen  }
59521d179b334e59e9a3bfcaed4c4430bef1bc5759dKristian Monsen}
596