1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/autocomplete/autocomplete_input.h"
6
7#include "base/strings/string_util.h"
8#include "base/strings/utf_string_conversions.h"
9#include "chrome/browser/external_protocol/external_protocol_handler.h"
10#include "chrome/browser/profiles/profile_io_data.h"
11#include "chrome/common/net/url_fixer_upper.h"
12#include "content/public/common/url_constants.h"
13#include "net/base/net_util.h"
14#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
15#include "url/url_canon_ip.h"
16#include "url/url_util.h"
17
18namespace {
19
20void AdjustCursorPositionIfNecessary(size_t num_leading_chars_removed,
21                                     size_t* cursor_position) {
22  if (*cursor_position == base::string16::npos)
23    return;
24  if (num_leading_chars_removed < *cursor_position)
25    *cursor_position -= num_leading_chars_removed;
26  else
27    *cursor_position = 0;
28}
29
30}  // namespace
31
32AutocompleteInput::AutocompleteInput()
33    : cursor_position_(base::string16::npos),
34      current_page_classification_(AutocompleteInput::INVALID_SPEC),
35      type_(INVALID),
36      prevent_inline_autocomplete_(false),
37      prefer_keyword_(false),
38      allow_exact_keyword_match_(true),
39      matches_requested_(ALL_MATCHES) {
40}
41
42AutocompleteInput::AutocompleteInput(
43    const base::string16& text,
44    size_t cursor_position,
45    const base::string16& desired_tld,
46    const GURL& current_url,
47    AutocompleteInput::PageClassification current_page_classification,
48    bool prevent_inline_autocomplete,
49    bool prefer_keyword,
50    bool allow_exact_keyword_match,
51    MatchesRequested matches_requested)
52    : cursor_position_(cursor_position),
53      current_url_(current_url),
54      current_page_classification_(current_page_classification),
55      prevent_inline_autocomplete_(prevent_inline_autocomplete),
56      prefer_keyword_(prefer_keyword),
57      allow_exact_keyword_match_(allow_exact_keyword_match),
58      matches_requested_(matches_requested) {
59  DCHECK(cursor_position <= text.length() ||
60         cursor_position == base::string16::npos)
61      << "Text: '" << text << "', cp: " << cursor_position;
62  // None of the providers care about leading white space so we always trim it.
63  // Providers that care about trailing white space handle trimming themselves.
64  if ((TrimWhitespace(text, TRIM_LEADING, &text_) & TRIM_LEADING) != 0)
65    AdjustCursorPositionIfNecessary(text.length() - text_.length(),
66                                    &cursor_position_);
67
68  GURL canonicalized_url;
69  type_ = Parse(text_, desired_tld, &parts_, &scheme_, &canonicalized_url);
70
71  if (type_ == INVALID)
72    return;
73
74  if (((type_ == UNKNOWN) || (type_ == URL)) &&
75      canonicalized_url.is_valid() &&
76      (!canonicalized_url.IsStandard() || canonicalized_url.SchemeIsFile() ||
77       canonicalized_url.SchemeIsFileSystem() ||
78       !canonicalized_url.host().empty()))
79    canonicalized_url_ = canonicalized_url;
80
81  size_t chars_removed = RemoveForcedQueryStringIfNecessary(type_, &text_);
82  AdjustCursorPositionIfNecessary(chars_removed, &cursor_position_);
83  if (chars_removed) {
84    // Remove spaces between opening question mark and first actual character.
85    base::string16 trimmed_text;
86    if ((TrimWhitespace(text_, TRIM_LEADING, &trimmed_text) & TRIM_LEADING) !=
87        0) {
88      AdjustCursorPositionIfNecessary(text_.length() - trimmed_text.length(),
89                                      &cursor_position_);
90      text_ = trimmed_text;
91    }
92  }
93}
94
95AutocompleteInput::~AutocompleteInput() {
96}
97
98// static
99size_t AutocompleteInput::RemoveForcedQueryStringIfNecessary(
100    Type type,
101    base::string16* text) {
102  if (type != FORCED_QUERY || text->empty() || (*text)[0] != L'?')
103    return 0;
104  // Drop the leading '?'.
105  text->erase(0, 1);
106  return 1;
107}
108
109// static
110std::string AutocompleteInput::TypeToString(Type type) {
111  switch (type) {
112    case INVALID:       return "invalid";
113    case UNKNOWN:       return "unknown";
114    case URL:           return "url";
115    case QUERY:         return "query";
116    case FORCED_QUERY:  return "forced-query";
117
118    default:
119      NOTREACHED();
120      return std::string();
121  }
122}
123
124// static
125AutocompleteInput::Type AutocompleteInput::Parse(
126    const base::string16& text,
127    const base::string16& desired_tld,
128    url_parse::Parsed* parts,
129    base::string16* scheme,
130    GURL* canonicalized_url) {
131  size_t first_non_white = text.find_first_not_of(base::kWhitespaceUTF16, 0);
132  if (first_non_white == base::string16::npos)
133    return INVALID;  // All whitespace.
134
135  if (text.at(first_non_white) == L'?') {
136    // If the first non-whitespace character is a '?', we magically treat this
137    // as a query.
138    return FORCED_QUERY;
139  }
140
141  // Ask our parsing back-end to help us understand what the user typed.  We
142  // use the URLFixerUpper here because we want to be smart about what we
143  // consider a scheme.  For example, we shouldn't consider www.google.com:80
144  // to have a scheme.
145  url_parse::Parsed local_parts;
146  if (!parts)
147    parts = &local_parts;
148  const base::string16 parsed_scheme(URLFixerUpper::SegmentURL(text, parts));
149  if (scheme)
150    *scheme = parsed_scheme;
151  if (canonicalized_url) {
152    *canonicalized_url = URLFixerUpper::FixupURL(UTF16ToUTF8(text),
153                                                 UTF16ToUTF8(desired_tld));
154  }
155
156  if (LowerCaseEqualsASCII(parsed_scheme, chrome::kFileScheme)) {
157    // A user might or might not type a scheme when entering a file URL.  In
158    // either case, |parsed_scheme| will tell us that this is a file URL, but
159    // |parts->scheme| might be empty, e.g. if the user typed "C:\foo".
160    return URL;
161  }
162
163  if (LowerCaseEqualsASCII(parsed_scheme, chrome::kFileSystemScheme)) {
164    // This could theoretically be a strange search, but let's check.
165    // If it's got an inner_url with a scheme, it's a URL, whether it's valid or
166    // not.
167    if (parts->inner_parsed() && parts->inner_parsed()->scheme.is_valid())
168      return URL;
169  }
170
171  // If the user typed a scheme, and it's HTTP or HTTPS, we know how to parse it
172  // well enough that we can fall through to the heuristics below.  If it's
173  // something else, we can just determine our action based on what we do with
174  // any input of this scheme.  In theory we could do better with some schemes
175  // (e.g. "ftp" or "view-source") but I'll wait to spend the effort on that
176  // until I run into some cases that really need it.
177  if (parts->scheme.is_nonempty() &&
178      !LowerCaseEqualsASCII(parsed_scheme, content::kHttpScheme) &&
179      !LowerCaseEqualsASCII(parsed_scheme, content::kHttpsScheme)) {
180    // See if we know how to handle the URL internally.
181    if (ProfileIOData::IsHandledProtocol(UTF16ToASCII(parsed_scheme)))
182      return URL;
183
184    // There are also some schemes that we convert to other things before they
185    // reach the renderer or else the renderer handles internally without
186    // reaching the net::URLRequest logic.  We thus won't catch these above, but
187    // we should still claim to handle them.
188    if (LowerCaseEqualsASCII(parsed_scheme, content::kViewSourceScheme) ||
189        LowerCaseEqualsASCII(parsed_scheme, content::kJavaScriptScheme) ||
190        LowerCaseEqualsASCII(parsed_scheme, chrome::kDataScheme))
191      return URL;
192
193    // Finally, check and see if the user has explicitly opened this scheme as
194    // a URL before, or if the "scheme" is actually a username.  We need to do
195    // this last because some schemes (e.g. "javascript") may be treated as
196    // "blocked" by the external protocol handler because we don't want pages to
197    // open them, but users still can.
198    // TODO(viettrungluu): get rid of conversion.
199    ExternalProtocolHandler::BlockState block_state =
200        ExternalProtocolHandler::GetBlockState(UTF16ToUTF8(parsed_scheme));
201    switch (block_state) {
202      case ExternalProtocolHandler::DONT_BLOCK:
203        return URL;
204
205      case ExternalProtocolHandler::BLOCK:
206        // If we don't want the user to open the URL, don't let it be navigated
207        // to at all.
208        return QUERY;
209
210      default: {
211        // We don't know about this scheme.  It might be that the user typed a
212        // URL of the form "username:password@foo.com".
213        const base::string16 http_scheme_prefix =
214            ASCIIToUTF16(std::string(content::kHttpScheme) +
215                         content::kStandardSchemeSeparator);
216        url_parse::Parsed http_parts;
217        base::string16 http_scheme;
218        GURL http_canonicalized_url;
219        Type http_type = Parse(http_scheme_prefix + text, desired_tld,
220                               &http_parts, &http_scheme,
221                               &http_canonicalized_url);
222        DCHECK_EQ(std::string(content::kHttpScheme), UTF16ToUTF8(http_scheme));
223
224        if (http_type == URL &&
225            http_parts.username.is_nonempty() &&
226            http_parts.password.is_nonempty()) {
227          // Manually re-jigger the parsed parts to match |text| (without the
228          // http scheme added).
229          http_parts.scheme.reset();
230          url_parse::Component* components[] = {
231            &http_parts.username,
232            &http_parts.password,
233            &http_parts.host,
234            &http_parts.port,
235            &http_parts.path,
236            &http_parts.query,
237            &http_parts.ref,
238          };
239          for (size_t i = 0; i < arraysize(components); ++i) {
240            URLFixerUpper::OffsetComponent(
241                -static_cast<int>(http_scheme_prefix.length()), components[i]);
242          }
243
244          *parts = http_parts;
245          if (scheme)
246            scheme->clear();
247          if (canonicalized_url)
248            *canonicalized_url = http_canonicalized_url;
249
250          return http_type;
251        }
252
253        // We don't know about this scheme and it doesn't look like the user
254        // typed a username and password.  It's likely to be a search operator
255        // like "site:" or "link:".  We classify it as UNKNOWN so the user has
256        // the option of treating it as a URL if we're wrong.
257        // Note that SegmentURL() is smart so we aren't tricked by "c:\foo" or
258        // "www.example.com:81" in this case.
259        return UNKNOWN;
260      }
261    }
262  }
263
264  // Either the user didn't type a scheme, in which case we need to distinguish
265  // between an HTTP URL and a query, or the scheme is HTTP or HTTPS, in which
266  // case we should reject invalid formulations.
267
268  // If we have an empty host it can't be a URL.
269  if (!parts->host.is_nonempty())
270    return QUERY;
271
272  // Likewise, the RCDS can reject certain obviously-invalid hosts.  (We also
273  // use the registry length later below.)
274  const base::string16 host(text.substr(parts->host.begin, parts->host.len));
275  const size_t registry_length =
276      net::registry_controlled_domains::GetRegistryLength(
277          UTF16ToUTF8(host),
278          net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
279          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
280  if (registry_length == std::string::npos) {
281    // Try to append the desired_tld.
282    if (!desired_tld.empty()) {
283      base::string16 host_with_tld(host);
284      if (host[host.length() - 1] != '.')
285        host_with_tld += '.';
286      host_with_tld += desired_tld;
287      const size_t tld_length =
288          net::registry_controlled_domains::GetRegistryLength(
289              UTF16ToUTF8(host_with_tld),
290              net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
291              net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
292      if (tld_length != std::string::npos)
293        return URL;  // Something like "99999999999" that looks like a bad IP
294                     // address, but becomes valid on attaching a TLD.
295    }
296    return QUERY;  // Could be a broken IP address, etc.
297  }
298
299
300  // See if the hostname is valid.  While IE and GURL allow hostnames to contain
301  // many other characters (perhaps for weird intranet machines), it's extremely
302  // unlikely that a user would be trying to type those in for anything other
303  // than a search query.
304  url_canon::CanonHostInfo host_info;
305  const std::string canonicalized_host(net::CanonicalizeHost(UTF16ToUTF8(host),
306                                                             &host_info));
307  if ((host_info.family == url_canon::CanonHostInfo::NEUTRAL) &&
308      !net::IsCanonicalizedHostCompliant(canonicalized_host,
309                                         UTF16ToUTF8(desired_tld))) {
310    // Invalid hostname.  There are several possible cases:
311    // * Our checker is too strict and the user pasted in a real-world URL
312    //   that's "invalid" but resolves.  To catch these, we return UNKNOWN when
313    //   the user explicitly typed a scheme, so we'll still search by default
314    //   but we'll show the accidental search infobar if necessary.
315    // * The user is typing a multi-word query.  If we see a space anywhere in
316    //   the hostname we assume this is a search and return QUERY.
317    // * Our checker is too strict and the user is typing a real-world hostname
318    //   that's "invalid" but resolves.  We return UNKNOWN if the TLD is known.
319    //   Note that we explicitly excluded hosts with spaces above so that
320    //   "toys at amazon.com" will be treated as a search.
321    // * The user is typing some garbage string.  Return QUERY.
322    //
323    // Thus we fall down in the following cases:
324    // * Trying to navigate to a hostname with spaces
325    // * Trying to navigate to a hostname with invalid characters and an unknown
326    //   TLD
327    // These are rare, though probably possible in intranets.
328    return (parts->scheme.is_nonempty() ||
329           ((registry_length != 0) &&
330            (host.find(' ') == base::string16::npos))) ? UNKNOWN : QUERY;
331  }
332
333  // A port number is a good indicator that this is a URL.  However, it might
334  // also be a query like "1.66:1" that looks kind of like an IP address and
335  // port number. So here we only check for "port numbers" that are illegal and
336  // thus mean this can't be navigated to (e.g. "1.2.3.4:garbage"), and we save
337  // handling legal port numbers until after the "IP address" determination
338  // below.
339  if (url_parse::ParsePort(text.c_str(), parts->port) ==
340      url_parse::PORT_INVALID)
341    return QUERY;
342
343  // Now that we've ruled out all schemes other than http or https and done a
344  // little more sanity checking, the presence of a scheme means this is likely
345  // a URL.
346  if (parts->scheme.is_nonempty())
347    return URL;
348
349  // See if the host is an IP address.
350  if (host_info.family == url_canon::CanonHostInfo::IPV6)
351    return URL;
352  // If the user originally typed a host that looks like an IP address (a
353  // dotted quad), they probably want to open it.  If the original input was
354  // something else (like a single number), they probably wanted to search for
355  // it, unless they explicitly typed a scheme.  This is true even if the URL
356  // appears to have a path: "1.2/45" is more likely a search (for the answer
357  // to a math problem) than a URL.  However, if there are more non-host
358  // components, then maybe this really was intended to be a navigation.  For
359  // this reason we only check the dotted-quad case here, and save the "other
360  // IP addresses" case for after we check the number of non-host components
361  // below.
362  if ((host_info.family == url_canon::CanonHostInfo::IPV4) &&
363      (host_info.num_ipv4_components == 4))
364    return URL;
365
366  // Presence of a password means this is likely a URL.  Note that unless the
367  // user has typed an explicit "http://" or similar, we'll probably think that
368  // the username is some unknown scheme, and bail out in the scheme-handling
369  // code above.
370  if (parts->password.is_nonempty())
371    return URL;
372
373  // Trailing slashes force the input to be treated as a URL.
374  if (parts->path.is_nonempty()) {
375    char c = text[parts->path.end() - 1];
376    if ((c == '\\') || (c == '/'))
377      return URL;
378  }
379
380  // If there is more than one recognized non-host component, this is likely to
381  // be a URL, even if the TLD is unknown (in which case this is likely an
382  // intranet URL).
383  if (NumNonHostComponents(*parts) > 1)
384    return URL;
385
386  // If the host has a known TLD or a port, it's probably a URL, with the
387  // following exceptions:
388  // * Any "IP addresses" that make it here are more likely searches
389  //   (see above).
390  // * If we reach here with a username, our input looks like "user@host[.tld]".
391  //   Because there is no scheme explicitly specified, we think this is more
392  //   likely an email address than an HTTP auth attempt.  Hence, we search by
393  //   default and let users correct us on a case-by-case basis.
394  // Note that we special-case "localhost" as a known hostname.
395  if ((host_info.family != url_canon::CanonHostInfo::IPV4) &&
396      ((registry_length != 0) || (host == ASCIIToUTF16("localhost") ||
397       parts->port.is_nonempty())))
398    return parts->username.is_nonempty() ? UNKNOWN : URL;
399
400  // If we reach this point, we know there's no known TLD on the input, so if
401  // the user wishes to add a desired_tld, the fixup code will oblige; thus this
402  // is a URL.
403  if (!desired_tld.empty())
404    return URL;
405
406  // No scheme, password, port, path, and no known TLD on the host.
407  // This could be:
408  // * An "incomplete IP address"; likely a search (see above).
409  // * An email-like input like "user@host", where "host" has no known TLD.
410  //   It's not clear what the user means here and searching seems reasonable.
411  // * A single word "foo"; possibly an intranet site, but more likely a search.
412  //   This is ideally an UNKNOWN, and we can let the Alternate Nav URL code
413  //   catch our mistakes.
414  // * A URL with a valid TLD we don't know about yet.  If e.g. a registrar adds
415  //   "xxx" as a TLD, then until we add it to our data file, Chrome won't know
416  //   "foo.xxx" is a real URL.  So ideally this is a URL, but we can't really
417  //   distinguish this case from:
418  // * A "URL-like" string that's not really a URL (like
419  //   "browser.tabs.closeButtons" or "java.awt.event.*").  This is ideally a
420  //   QUERY.  Since this is indistinguishable from the case above, and this
421  //   case is much more likely, claim these are UNKNOWN, which should default
422  //   to the right thing and let users correct us on a case-by-case basis.
423  return UNKNOWN;
424}
425
426// static
427void AutocompleteInput::ParseForEmphasizeComponents(
428    const base::string16& text,
429    url_parse::Component* scheme,
430    url_parse::Component* host) {
431  url_parse::Parsed parts;
432  base::string16 scheme_str;
433  Parse(text, base::string16(), &parts, &scheme_str, NULL);
434
435  *scheme = parts.scheme;
436  *host = parts.host;
437
438  int after_scheme_and_colon = parts.scheme.end() + 1;
439  // For the view-source scheme, we should emphasize the scheme and host of the
440  // URL qualified by the view-source prefix.
441  if (LowerCaseEqualsASCII(scheme_str, content::kViewSourceScheme) &&
442      (static_cast<int>(text.length()) > after_scheme_and_colon)) {
443    // Obtain the URL prefixed by view-source and parse it.
444    base::string16 real_url(text.substr(after_scheme_and_colon));
445    url_parse::Parsed real_parts;
446    AutocompleteInput::Parse(real_url, base::string16(), &real_parts, NULL, NULL);
447    if (real_parts.scheme.is_nonempty() || real_parts.host.is_nonempty()) {
448      if (real_parts.scheme.is_nonempty()) {
449        *scheme = url_parse::Component(
450            after_scheme_and_colon + real_parts.scheme.begin,
451            real_parts.scheme.len);
452      } else {
453        scheme->reset();
454      }
455      if (real_parts.host.is_nonempty()) {
456        *host = url_parse::Component(
457            after_scheme_and_colon + real_parts.host.begin,
458            real_parts.host.len);
459      } else {
460        host->reset();
461      }
462    }
463  } else if (LowerCaseEqualsASCII(scheme_str, chrome::kFileSystemScheme) &&
464             parts.inner_parsed() && parts.inner_parsed()->scheme.is_valid()) {
465    *host = parts.inner_parsed()->host;
466  }
467}
468
469// static
470base::string16 AutocompleteInput::FormattedStringWithEquivalentMeaning(
471    const GURL& url,
472    const base::string16& formatted_url) {
473  if (!net::CanStripTrailingSlash(url))
474    return formatted_url;
475  const base::string16 url_with_path(formatted_url + char16('/'));
476  return (AutocompleteInput::Parse(formatted_url, base::string16(), NULL, NULL,
477                                   NULL) ==
478          AutocompleteInput::Parse(url_with_path, base::string16(), NULL, NULL,
479                                   NULL)) ?
480      formatted_url : url_with_path;
481}
482
483// static
484int AutocompleteInput::NumNonHostComponents(const url_parse::Parsed& parts) {
485  int num_nonhost_components = 0;
486  if (parts.scheme.is_nonempty())
487    ++num_nonhost_components;
488  if (parts.username.is_nonempty())
489    ++num_nonhost_components;
490  if (parts.password.is_nonempty())
491    ++num_nonhost_components;
492  if (parts.port.is_nonempty())
493    ++num_nonhost_components;
494  if (parts.path.is_nonempty())
495    ++num_nonhost_components;
496  if (parts.query.is_nonempty())
497    ++num_nonhost_components;
498  if (parts.ref.is_nonempty())
499    ++num_nonhost_components;
500  return num_nonhost_components;
501}
502
503// static
504bool AutocompleteInput::HasHTTPScheme(const base::string16& input) {
505  std::string utf8_input(UTF16ToUTF8(input));
506  url_parse::Component scheme;
507  if (url_util::FindAndCompareScheme(utf8_input, content::kViewSourceScheme,
508                                     &scheme))
509    utf8_input.erase(0, scheme.end() + 1);
510  return url_util::FindAndCompareScheme(utf8_input, content::kHttpScheme, NULL);
511}
512
513void AutocompleteInput::UpdateText(const base::string16& text,
514                                   size_t cursor_position,
515                                   const url_parse::Parsed& parts) {
516  DCHECK(cursor_position <= text.length() ||
517         cursor_position == base::string16::npos)
518      << "Text: '" << text << "', cp: " << cursor_position;
519  text_ = text;
520  cursor_position_ = cursor_position;
521  parts_ = parts;
522}
523
524void AutocompleteInput::Clear() {
525  text_.clear();
526  cursor_position_ = base::string16::npos;
527  current_url_ = GURL();
528  current_page_classification_ = AutocompleteInput::INVALID_SPEC;
529  type_ = INVALID;
530  parts_ = url_parse::Parsed();
531  scheme_.clear();
532  canonicalized_url_ = GURL();
533  prevent_inline_autocomplete_ = false;
534  prefer_keyword_ = false;
535  allow_exact_keyword_match_ = false;
536  matches_requested_ = ALL_MATCHES;
537}
538