1116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch// Copyright 2014 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#include "components/omnibox/autocomplete_input.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
7868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_util.h"
8868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/utf_string_conversions.h"
96d86b77056ed63eb6871182f42a9fd5f07550f90Torne (Richard Coles)#include "components/metrics/proto/omnibox_event.pb.h"
105f1c94371a64b3196d4be9466099bb892df9b88eTorne (Richard Coles)#include "components/omnibox/autocomplete_scheme_classifier.h"
11f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)#include "components/url_fixer/url_fixer.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "net/base/net_util.h"
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
14eb525c5499e34cc9c4b825d6d9e75bb07cc06aceBen Murdoch#include "url/url_canon_ip.h"
150f1bc08d4cfcc34181b0b5cbf065c40f687bf740Torne (Richard Coles)#include "url/url_util.h"
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)namespace {
182a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
19116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch// Hardcode constant to avoid any dependencies on content/.
20116680a4aac90f2aa7413d9095a592090648e557Ben Murdochconst char kViewSourceScheme[] = "view-source";
21116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
222a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)void AdjustCursorPositionIfNecessary(size_t num_leading_chars_removed,
232a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)                                     size_t* cursor_position) {
24a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  if (*cursor_position == base::string16::npos)
252a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    return;
262a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  if (num_leading_chars_removed < *cursor_position)
272a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    *cursor_position -= num_leading_chars_removed;
282a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  else
292a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    *cursor_position = 0;
302a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)}
312a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
322a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)}  // namespace
332a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)AutocompleteInput::AutocompleteInput()
35a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    : cursor_position_(base::string16::npos),
366d86b77056ed63eb6871182f42a9fd5f07550f90Torne (Richard Coles)      current_page_classification_(metrics::OmniboxEventProto::INVALID_SPEC),
37f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      type_(metrics::OmniboxInputType::INVALID),
382a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      prevent_inline_autocomplete_(false),
392a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      prefer_keyword_(false),
402a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      allow_exact_keyword_match_(true),
410529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch      want_asynchronous_matches_(true) {
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
44d3868032626d59662ff73b372b5d584c1d144c53Ben MurdochAutocompleteInput::AutocompleteInput(
45a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    const base::string16& text,
46d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    size_t cursor_position,
47a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    const base::string16& desired_tld,
48d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    const GURL& current_url,
496d86b77056ed63eb6871182f42a9fd5f07550f90Torne (Richard Coles)    metrics::OmniboxEventProto::PageClassification current_page_classification,
50d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    bool prevent_inline_autocomplete,
51d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    bool prefer_keyword,
52d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    bool allow_exact_keyword_match,
53116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    bool want_asynchronous_matches,
54116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    const AutocompleteSchemeClassifier& scheme_classifier)
552a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    : cursor_position_(cursor_position),
562a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      current_url_(current_url),
57d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      current_page_classification_(current_page_classification),
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      prevent_inline_autocomplete_(prevent_inline_autocomplete),
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      prefer_keyword_(prefer_keyword),
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      allow_exact_keyword_match_(allow_exact_keyword_match),
610529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch      want_asynchronous_matches_(want_asynchronous_matches) {
62a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  DCHECK(cursor_position <= text.length() ||
63a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)         cursor_position == base::string16::npos)
642a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      << "Text: '" << text << "', cp: " << cursor_position;
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // None of the providers care about leading white space so we always trim it.
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Providers that care about trailing white space handle trimming themselves.
67a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)  if ((base::TrimWhitespace(text, base::TRIM_LEADING, &text_) &
68a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)       base::TRIM_LEADING) != 0)
692a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    AdjustCursorPositionIfNecessary(text.length() - text_.length(),
702a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)                                    &cursor_position_);
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  GURL canonicalized_url;
73116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  type_ = Parse(text_, desired_tld, scheme_classifier, &parts_, &scheme_,
74116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch                &canonicalized_url);
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
76f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  if (type_ == metrics::OmniboxInputType::INVALID)
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return;
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
79f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  if (((type_ == metrics::OmniboxInputType::UNKNOWN) ||
80f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)       (type_ == metrics::OmniboxInputType::URL)) &&
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      canonicalized_url.is_valid() &&
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      (!canonicalized_url.IsStandard() || canonicalized_url.SchemeIsFile() ||
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       canonicalized_url.SchemeIsFileSystem() ||
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)       !canonicalized_url.host().empty()))
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    canonicalized_url_ = canonicalized_url;
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
872a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  size_t chars_removed = RemoveForcedQueryStringIfNecessary(type_, &text_);
882a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  AdjustCursorPositionIfNecessary(chars_removed, &cursor_position_);
892a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  if (chars_removed) {
902a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    // Remove spaces between opening question mark and first actual character.
91a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    base::string16 trimmed_text;
92a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    if ((base::TrimWhitespace(text_, base::TRIM_LEADING, &trimmed_text) &
93a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)         base::TRIM_LEADING) != 0) {
942a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      AdjustCursorPositionIfNecessary(text_.length() - trimmed_text.length(),
952a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)                                      &cursor_position_);
962a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      text_ = trimmed_text;
972a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    }
982a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  }
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)AutocompleteInput::~AutocompleteInput() {
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static
105a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)size_t AutocompleteInput::RemoveForcedQueryStringIfNecessary(
1066d86b77056ed63eb6871182f42a9fd5f07550f90Torne (Richard Coles)    metrics::OmniboxInputType::Type type,
107a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    base::string16* text) {
108f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  if ((type != metrics::OmniboxInputType::FORCED_QUERY) || text->empty() ||
109f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      (*text)[0] != L'?')
1102a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)    return 0;
1112a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  // Drop the leading '?'.
1122a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  text->erase(0, 1);
1132a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  return 1;
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static
1176d86b77056ed63eb6871182f42a9fd5f07550f90Torne (Richard Coles)std::string AutocompleteInput::TypeToString(
1186d86b77056ed63eb6871182f42a9fd5f07550f90Torne (Richard Coles)    metrics::OmniboxInputType::Type type) {
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  switch (type) {
120f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case metrics::OmniboxInputType::INVALID:      return "invalid";
121f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case metrics::OmniboxInputType::UNKNOWN:      return "unknown";
122f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case metrics::OmniboxInputType::DEPRECATED_REQUESTED_URL:
123f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      return "deprecated-requested-url";
124f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case metrics::OmniboxInputType::URL:          return "url";
125f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case metrics::OmniboxInputType::QUERY:        return "query";
126f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    case metrics::OmniboxInputType::FORCED_QUERY: return "forced-query";
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
128f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  return std::string();
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static
1326d86b77056ed63eb6871182f42a9fd5f07550f90Torne (Richard Coles)metrics::OmniboxInputType::Type AutocompleteInput::Parse(
133a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    const base::string16& text,
134a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    const base::string16& desired_tld,
135116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    const AutocompleteSchemeClassifier& scheme_classifier,
1365c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu    url::Parsed* parts,
137a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    base::string16* scheme,
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    GURL* canonicalized_url) {
139a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  size_t first_non_white = text.find_first_not_of(base::kWhitespaceUTF16, 0);
140a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  if (first_non_white == base::string16::npos)
141f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::INVALID;  // All whitespace.
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
143cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  if (text[first_non_white] == L'?') {
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // If the first non-whitespace character is a '?', we magically treat this
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // as a query.
146f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::FORCED_QUERY;
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Ask our parsing back-end to help us understand what the user typed.  We
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // use the URLFixerUpper here because we want to be smart about what we
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // consider a scheme.  For example, we shouldn't consider www.google.com:80
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to have a scheme.
1535c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  url::Parsed local_parts;
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!parts)
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    parts = &local_parts;
156f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  const base::string16 parsed_scheme(url_fixer::SegmentURL(text, parts));
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (scheme)
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    *scheme = parsed_scheme;
159116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  const std::string parsed_scheme_utf8(base::UTF16ToUTF8(parsed_scheme));
160a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)
1615c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // If we can't canonicalize the user's input, the rest of the autocomplete
1625c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // system isn't going to be able to produce a navigable URL match for it.
1635c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // So we just return QUERY immediately in these cases.
164a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)  GURL placeholder_canonicalized_url;
165a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)  if (!canonicalized_url)
166a1401311d1ab56c4ed0a474bd38c108f75cb0cd9Torne (Richard Coles)    canonicalized_url = &placeholder_canonicalized_url;
167f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  *canonicalized_url = url_fixer::FixupURL(base::UTF16ToUTF8(text),
168f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                                           base::UTF16ToUTF8(desired_tld));
1695c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  if (!canonicalized_url->is_valid())
170f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::QUERY;
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
172116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  if (LowerCaseEqualsASCII(parsed_scheme_utf8, url::kFileScheme)) {
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // A user might or might not type a scheme when entering a file URL.  In
174116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // either case, |parsed_scheme_utf8| will tell us that this is a file URL,
175116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // but |parts->scheme| might be empty, e.g. if the user typed "C:\foo".
176f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::URL;
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If the user typed a scheme, and it's HTTP or HTTPS, we know how to parse it
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // well enough that we can fall through to the heuristics below.  If it's
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // something else, we can just determine our action based on what we do with
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // any input of this scheme.  In theory we could do better with some schemes
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // (e.g. "ftp" or "view-source") but I'll wait to spend the effort on that
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // until I run into some cases that really need it.
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts->scheme.is_nonempty() &&
186116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      !LowerCaseEqualsASCII(parsed_scheme_utf8, url::kHttpScheme) &&
187116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      !LowerCaseEqualsASCII(parsed_scheme_utf8, url::kHttpsScheme)) {
188116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    metrics::OmniboxInputType::Type type =
189116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        scheme_classifier.GetInputTypeForScheme(parsed_scheme_utf8);
190116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    if (type != metrics::OmniboxInputType::INVALID)
191116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      return type;
192116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
193116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // We don't know about this scheme.  It might be that the user typed a
194116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // URL of the form "username:password@foo.com".
195116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    const base::string16 http_scheme_prefix =
196116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        base::ASCIIToUTF16(std::string(url::kHttpScheme) +
197116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch                           url::kStandardSchemeSeparator);
198116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    url::Parsed http_parts;
199116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    base::string16 http_scheme;
200116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    GURL http_canonicalized_url;
201116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    metrics::OmniboxInputType::Type http_type =
202116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        Parse(http_scheme_prefix + text, desired_tld, scheme_classifier,
203116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch              &http_parts, &http_scheme, &http_canonicalized_url);
204116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    DCHECK_EQ(std::string(url::kHttpScheme),
205116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch              base::UTF16ToUTF8(http_scheme));
206116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
207116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    if ((http_type == metrics::OmniboxInputType::URL) &&
208116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        http_parts.username.is_nonempty() &&
209116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        http_parts.password.is_nonempty()) {
210116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      // Manually re-jigger the parsed parts to match |text| (without the
211116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      // http scheme added).
212116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      http_parts.scheme.reset();
213116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      url::Component* components[] = {
214116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        &http_parts.username,
215116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        &http_parts.password,
216116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        &http_parts.host,
217116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        &http_parts.port,
218116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        &http_parts.path,
219116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        &http_parts.query,
220116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        &http_parts.ref,
221116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      };
222116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      for (size_t i = 0; i < arraysize(components); ++i) {
223116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        url_fixer::OffsetComponent(
224116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch            -static_cast<int>(http_scheme_prefix.length()), components[i]);
225116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      }
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
227116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      *parts = http_parts;
228116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      if (scheme)
229116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch        scheme->clear();
230116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      *canonicalized_url = http_canonicalized_url;
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
232116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch      return metrics::OmniboxInputType::URL;
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
234116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch
235116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // We don't know about this scheme and it doesn't look like the user
236116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // typed a username and password.  It's likely to be a search operator
237116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // like "site:" or "link:".  We classify it as UNKNOWN so the user has
238116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // the option of treating it as a URL if we're wrong.
239116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // Note that SegmentURL() is smart so we aren't tricked by "c:\foo" or
240116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    // "www.example.com:81" in this case.
241116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    return metrics::OmniboxInputType::UNKNOWN;
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Either the user didn't type a scheme, in which case we need to distinguish
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // between an HTTP URL and a query, or the scheme is HTTP or HTTPS, in which
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // case we should reject invalid formulations.
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2485c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // If we have an empty host it can't be a valid HTTP[S] URL.  (This should
2495c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // only trigger for input that begins with a colon, which GURL will parse as a
2505c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // valid, non-standard URL; for standard URLs, an empty host would have
2515c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // resulted in an invalid |canonicalized_url| above.)
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!parts->host.is_nonempty())
253f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::QUERY;
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2555c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // Sanity-check: GURL should have failed to canonicalize this URL if it had an
2565c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  // invalid port.
2575c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  DCHECK_NE(url::PORT_INVALID, url::ParsePort(text.c_str(), parts->port));
2585c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Likewise, the RCDS can reject certain obviously-invalid hosts.  (We also
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // use the registry length later below.)
261a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  const base::string16 host(text.substr(parts->host.begin, parts->host.len));
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const size_t registry_length =
263a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      net::registry_controlled_domains::GetRegistryLength(
2645d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)          base::UTF16ToUTF8(host),
265a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
266a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (registry_length == std::string::npos) {
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Try to append the desired_tld.
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (!desired_tld.empty()) {
270a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)      base::string16 host_with_tld(host);
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (host[host.length() - 1] != '.')
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        host_with_tld += '.';
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      host_with_tld += desired_tld;
274a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)      const size_t tld_length =
275a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)          net::registry_controlled_domains::GetRegistryLength(
2765d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)              base::UTF16ToUTF8(host_with_tld),
277a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)              net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
278a93a17c8d99d686bd4a1511e5504e5e6cc9fcadfTorne (Richard Coles)              net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
279f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      if (tld_length != std::string::npos) {
280f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        // Something like "99999999999" that looks like a bad IP
281f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        // address, but becomes valid on attaching a TLD.
282f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        return metrics::OmniboxInputType::URL;
283f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      }
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
285f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    // Could be a broken IP address, etc.
286f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::QUERY;
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // See if the hostname is valid.  While IE and GURL allow hostnames to contain
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // many other characters (perhaps for weird intranet machines), it's extremely
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // unlikely that a user would be trying to type those in for anything other
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // than a search query.
2945c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  url::CanonHostInfo host_info;
2955d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  const std::string canonicalized_host(net::CanonicalizeHost(
2965d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      base::UTF16ToUTF8(host), &host_info));
2975c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  if ((host_info.family == url::CanonHostInfo::NEUTRAL) &&
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      !net::IsCanonicalizedHostCompliant(canonicalized_host,
2995d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)                                         base::UTF16ToUTF8(desired_tld))) {
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Invalid hostname.  There are several possible cases:
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // * Our checker is too strict and the user pasted in a real-world URL
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //   that's "invalid" but resolves.  To catch these, we return UNKNOWN when
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //   the user explicitly typed a scheme, so we'll still search by default
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //   but we'll show the accidental search infobar if necessary.
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // * The user is typing a multi-word query.  If we see a space anywhere in
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //   the hostname we assume this is a search and return QUERY.
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // * Our checker is too strict and the user is typing a real-world hostname
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //   that's "invalid" but resolves.  We return UNKNOWN if the TLD is known.
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //   Note that we explicitly excluded hosts with spaces above so that
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //   "toys at amazon.com" will be treated as a search.
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // * The user is typing some garbage string.  Return QUERY.
3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Thus we fall down in the following cases:
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // * Trying to navigate to a hostname with spaces
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // * Trying to navigate to a hostname with invalid characters and an unknown
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    //   TLD
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // These are rare, though probably possible in intranets.
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return (parts->scheme.is_nonempty() ||
319a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)           ((registry_length != 0) &&
320f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)            (host.find(' ') == base::string16::npos))) ?
321f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)        metrics::OmniboxInputType::UNKNOWN : metrics::OmniboxInputType::QUERY;
3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Now that we've ruled out all schemes other than http or https and done a
3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // little more sanity checking, the presence of a scheme means this is likely
3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // a URL.
3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts->scheme.is_nonempty())
328f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::URL;
3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // See if the host is an IP address.
3315c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  if (host_info.family == url::CanonHostInfo::IPV6)
332f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::URL;
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If the user originally typed a host that looks like an IP address (a
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // dotted quad), they probably want to open it.  If the original input was
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // something else (like a single number), they probably wanted to search for
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // it, unless they explicitly typed a scheme.  This is true even if the URL
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // appears to have a path: "1.2/45" is more likely a search (for the answer
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // to a math problem) than a URL.  However, if there are more non-host
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // components, then maybe this really was intended to be a navigation.  For
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // this reason we only check the dotted-quad case here, and save the "other
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // IP addresses" case for after we check the number of non-host components
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // below.
3435c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  if ((host_info.family == url::CanonHostInfo::IPV4) &&
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      (host_info.num_ipv4_components == 4))
345f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::URL;
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Presence of a password means this is likely a URL.  Note that unless the
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // user has typed an explicit "http://" or similar, we'll probably think that
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the username is some unknown scheme, and bail out in the scheme-handling
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // code above.
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts->password.is_nonempty())
352f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::URL;
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Trailing slashes force the input to be treated as a URL.
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts->path.is_nonempty()) {
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    char c = text[parts->path.end() - 1];
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if ((c == '\\') || (c == '/'))
358f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)      return metrics::OmniboxInputType::URL;
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If there is more than one recognized non-host component, this is likely to
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // be a URL, even if the TLD is unknown (in which case this is likely an
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // intranet URL).
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (NumNonHostComponents(*parts) > 1)
365f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::URL;
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If the host has a known TLD or a port, it's probably a URL, with the
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // following exceptions:
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * Any "IP addresses" that make it here are more likely searches
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   (see above).
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * If we reach here with a username, our input looks like "user@host[.tld]".
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   Because there is no scheme explicitly specified, we think this is more
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   likely an email address than an HTTP auth attempt.  Hence, we search by
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   default and let users correct us on a case-by-case basis.
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Note that we special-case "localhost" as a known hostname.
3765c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  if ((host_info.family != url::CanonHostInfo::IPV4) &&
3775d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)      ((registry_length != 0) || (host == base::ASCIIToUTF16("localhost") ||
378f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)       parts->port.is_nonempty()))) {
379f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return parts->username.is_nonempty() ? metrics::OmniboxInputType::UNKNOWN :
380f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)                                           metrics::OmniboxInputType::URL;
381f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  }
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If we reach this point, we know there's no known TLD on the input, so if
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the user wishes to add a desired_tld, the fixup code will oblige; thus this
3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // is a URL.
3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!desired_tld.empty())
387f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)    return metrics::OmniboxInputType::URL;
3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // No scheme, password, port, path, and no known TLD on the host.
3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // This could be:
3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * An "incomplete IP address"; likely a search (see above).
3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * An email-like input like "user@host", where "host" has no known TLD.
3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   It's not clear what the user means here and searching seems reasonable.
3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * A single word "foo"; possibly an intranet site, but more likely a search.
3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   This is ideally an UNKNOWN, and we can let the Alternate Nav URL code
3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   catch our mistakes.
3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * A URL with a valid TLD we don't know about yet.  If e.g. a registrar adds
3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   "xxx" as a TLD, then until we add it to our data file, Chrome won't know
3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   "foo.xxx" is a real URL.  So ideally this is a URL, but we can't really
4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   distinguish this case from:
4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // * A "URL-like" string that's not really a URL (like
4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   "browser.tabs.closeButtons" or "java.awt.event.*").  This is ideally a
4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   QUERY.  Since this is indistinguishable from the case above, and this
4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   case is much more likely, claim these are UNKNOWN, which should default
4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   to the right thing and let users correct us on a case-by-case basis.
406f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  return metrics::OmniboxInputType::UNKNOWN;
4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static
410116680a4aac90f2aa7413d9095a592090648e557Ben Murdochvoid AutocompleteInput::ParseForEmphasizeComponents(
411116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    const base::string16& text,
412116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    const AutocompleteSchemeClassifier& scheme_classifier,
413116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    url::Component* scheme,
414116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    url::Component* host) {
4155c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  url::Parsed parts;
416a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  base::string16 scheme_str;
417116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  Parse(text, base::string16(), scheme_classifier, &parts, &scheme_str, NULL);
4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *scheme = parts.scheme;
4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  *host = parts.host;
4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int after_scheme_and_colon = parts.scheme.end() + 1;
4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // For the view-source scheme, we should emphasize the scheme and host of the
4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // URL qualified by the view-source prefix.
425116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  if (LowerCaseEqualsASCII(scheme_str, kViewSourceScheme) &&
4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      (static_cast<int>(text.length()) > after_scheme_and_colon)) {
4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Obtain the URL prefixed by view-source and parse it.
428a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)    base::string16 real_url(text.substr(after_scheme_and_colon));
4295c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu    url::Parsed real_parts;
430116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    AutocompleteInput::Parse(real_url, base::string16(), scheme_classifier,
431116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch                             &real_parts, NULL, NULL);
4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    if (real_parts.scheme.is_nonempty() || real_parts.host.is_nonempty()) {
4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (real_parts.scheme.is_nonempty()) {
4345c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu        *scheme = url::Component(
4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            after_scheme_and_colon + real_parts.scheme.begin,
4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)            real_parts.scheme.len);
4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      } else {
4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        scheme->reset();
4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      if (real_parts.host.is_nonempty()) {
4415c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu        *host = url::Component(after_scheme_and_colon + real_parts.host.begin,
4425c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu                               real_parts.host.len);
4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      } else {
4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        host->reset();
4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      }
4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    }
447cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  } else if (LowerCaseEqualsASCII(scheme_str, url::kFileSystemScheme) &&
4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)             parts.inner_parsed() && parts.inner_parsed()->scheme.is_valid()) {
4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    *host = parts.inner_parsed()->host;
4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static
454a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)base::string16 AutocompleteInput::FormattedStringWithEquivalentMeaning(
4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const GURL& url,
456116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    const base::string16& formatted_url,
457116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch    const AutocompleteSchemeClassifier& scheme_classifier) {
4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!net::CanStripTrailingSlash(url))
4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return formatted_url;
4605d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  const base::string16 url_with_path(formatted_url + base::char16('/'));
461116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  return (AutocompleteInput::Parse(formatted_url, base::string16(),
462116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch                                   scheme_classifier, NULL, NULL, NULL) ==
463116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch          AutocompleteInput::Parse(url_with_path, base::string16(),
464116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch                                   scheme_classifier, NULL, NULL, NULL)) ?
4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      formatted_url : url_with_path;
4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// static
4695c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liuint AutocompleteInput::NumNonHostComponents(const url::Parsed& parts) {
4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_nonhost_components = 0;
4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts.scheme.is_nonempty())
4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++num_nonhost_components;
4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts.username.is_nonempty())
4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++num_nonhost_components;
4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts.password.is_nonempty())
4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++num_nonhost_components;
4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts.port.is_nonempty())
4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++num_nonhost_components;
4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts.path.is_nonempty())
4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++num_nonhost_components;
4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts.query.is_nonempty())
4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++num_nonhost_components;
4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (parts.ref.is_nonempty())
4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ++num_nonhost_components;
4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return num_nonhost_components;
4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
4880f1bc08d4cfcc34181b0b5cbf065c40f687bf740Torne (Richard Coles)// static
489a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)bool AutocompleteInput::HasHTTPScheme(const base::string16& input) {
4905d1f7b1de12d16ceb2c938c56701a3e8bfa558f7Torne (Richard Coles)  std::string utf8_input(base::UTF16ToUTF8(input));
4915c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  url::Component scheme;
492116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  if (url::FindAndCompareScheme(utf8_input, kViewSourceScheme, &scheme)) {
4930f1bc08d4cfcc34181b0b5cbf065c40f687bf740Torne (Richard Coles)    utf8_input.erase(0, scheme.end() + 1);
4945c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  }
495010d83a9304c5a91596085d917d248abff47903aTorne (Richard Coles)  return url::FindAndCompareScheme(utf8_input, url::kHttpScheme, NULL);
4960f1bc08d4cfcc34181b0b5cbf065c40f687bf740Torne (Richard Coles)}
4970f1bc08d4cfcc34181b0b5cbf065c40f687bf740Torne (Richard Coles)
498a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)void AutocompleteInput::UpdateText(const base::string16& text,
4992a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)                                   size_t cursor_position,
5005c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu                                   const url::Parsed& parts) {
501a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  DCHECK(cursor_position <= text.length() ||
502a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)         cursor_position == base::string16::npos)
5032a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)      << "Text: '" << text << "', cp: " << cursor_position;
5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  text_ = text;
5052a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  cursor_position_ = cursor_position;
5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  parts_ = parts;
5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void AutocompleteInput::Clear() {
5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  text_.clear();
511a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7Torne (Richard Coles)  cursor_position_ = base::string16::npos;
5122a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  current_url_ = GURL();
5136d86b77056ed63eb6871182f42a9fd5f07550f90Torne (Richard Coles)  current_page_classification_ = metrics::OmniboxEventProto::INVALID_SPEC;
514f8ee788a64d60abd8f2d742a5fdedde054ecd910Torne (Richard Coles)  type_ = metrics::OmniboxInputType::INVALID;
5155c02ac1a9c1b504631c0a3d2b6e737b5d738bae1Bo Liu  parts_ = url::Parsed();
5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  scheme_.clear();
5172a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  canonicalized_url_ = GURL();
5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prevent_inline_autocomplete_ = false;
5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  prefer_keyword_ = false;
5202a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)  allow_exact_keyword_match_ = false;
5210529e5d033099cbfc42635f6f6183833b09dff6eBen Murdoch  want_asynchronous_matches_ = true;
5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
523