1c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Copyright 2007, Google Inc.
2c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// All rights reserved.
3c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
4c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Redistribution and use in source and binary forms, with or without
5c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// modification, are permitted provided that the following conditions are
6c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// met:
7c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
8c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//     * Redistributions of source code must retain the above copyright
9c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// notice, this list of conditions and the following disclaimer.
10c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//     * Redistributions in binary form must reproduce the above
11c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// copyright notice, this list of conditions and the following disclaimer
12c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// in the documentation and/or other materials provided with the
13c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// distribution.
14c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//     * Neither the name of Google Inc. nor the names of its
15c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// contributors may be used to endorse or promote products derived from
16c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// this software without specific prior written permission.
17c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
18c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
30c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "base/logging.h"
31c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "googleurl/src/url_canon.h"
32c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott#include "googleurl/src/url_canon_internal.h"
33c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
34c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottnamespace url_canon {
35c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
36c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottnamespace {
37c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
38c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// For reference, here's what IE supports:
39c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Key: 0 (disallowed: failure if present in the input)
40c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//      + (allowed either escaped or unescaped, and unmodified)
41c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//      U (allowed escaped or unescaped but always unescaped if present in
42c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//         escaped form)
43c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//      E (allowed escaped or unescaped but always escaped if present in
44c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//         unescaped form)
45c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//      % (only allowed escaped in the input, will be unmodified).
//      Alphanumeric characters are left blank in the table below.
47c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
48c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
49c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//    -----------------------------------------------
50c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
51c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
52c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
53c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
54c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 4   %
55c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
56c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 6   E                                               <-- That's  `
57c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
58c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
59c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// NOTE: I didn't actually test all the control characters. Some may be
60c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// disallowed in the input, but they are all accepted escaped except for 0.
61c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// I also didn't test if characters affecting HTML parsing are allowed
62c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// unescaped, eg. (") or (#), which would indicate the beginning of the path.
63c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Surprisingly, space is accepted in the input and always escaped.
64c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
// Canonicalization table for 7-bit host characters, indexed by the input
// character. Each entry is one of:
//   0     - the character is not allowed in a host name,
//   kEsc  - the character is allowed but is always written percent-escaped,
//   other - the canonical form to emit (upper-case letters map to lower case).
// This policy is a little more restrictive than IE and less restrictive than
// Firefox.
//
// The '%' character itself maps to 0. A percent sign that is part of an escape
// sequence is still accepted, but this rejects "%25". IE allows it; allowing
// it here would make validity depend on how many times the canonicalizer is
// run: an invalid escape sequence such as "%zz" is emitted as "%25zz" and
// reported as a failure, and if percents were allowed, canonicalizing that
// output again would succeed. We prefer to always report the same validity,
// so it is rejected.
const unsigned char kEsc = 0xff;
const unsigned char kHostCharLookup[0x80] = {
    // 0x00 - 0x1f: all are invalid.
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20 - 0x27: space ! " # $ % & '
    kEsc, kEsc, kEsc, kEsc, kEsc, 0, kEsc, kEsc,
    // 0x28 - 0x2f: ( ) * + , - . /
    kEsc, kEsc, kEsc, '+', kEsc, '-', '.', 0,
    // 0x30 - 0x37: 0 1 2 3 4 5 6 7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 0x38 - 0x3f: 8 9 : ; < = > ?
    '8', '9', ':', 0, kEsc, kEsc, kEsc, 0,
    // 0x40 - 0x47: @ A B C D E F G
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x48 - 0x4f: H I J K L M N O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50 - 0x57: P Q R S T U V W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x58 - 0x5f: X Y Z [ (backslash) ] ^ _
    'x', 'y', 'z', '[', 0, ']', 0, '_',
    // 0x60 - 0x67: ` a b c d e f g
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x68 - 0x6f: h i j k l m n o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70 - 0x77: p q r s t u v w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x78 - 0x7f: x y z { | } ~ (0x7f)
    'x', 'y', 'z', kEsc, kEsc, kEsc, 0, 0,
};
94c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
95c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottconst int kTempHostBufferLen = 1024;
96c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttypedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
97c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttypedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;
98c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
99c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Scans a host name and fills in the output flags according to what we find.
100c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// |has_non_ascii| will be true if there are any non-7-bit characters, and
101c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// |has_escaped| will be true if there is a percent sign.
102c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttemplate<typename CHAR, typename UCHAR>
103c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid ScanHostname(const CHAR* spec, const url_parse::Component& host,
104c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                  bool* has_non_ascii, bool* has_escaped) {
105c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  int end = host.end();
106c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  *has_non_ascii = false;
107c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  *has_escaped = false;
108c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  for (int i = host.begin; i < end; i++) {
109c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (static_cast<UCHAR>(spec[i]) >= 0x80)
110c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      *has_non_ascii = true;
111c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    else if (spec[i] == '%')
112c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      *has_escaped = true;
113c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
114c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
115c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
116c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Canonicalizes a host name that is entirely 8-bit characters (even though
117c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// the type holding them may be 16 bits. Escaped characters will be unescaped.
118c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
119c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
120c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
121c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// the output.
122c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
123c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// This function is used in two situations:
124c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
125c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//  * When the caller knows there is no non-ASCII or percent escaped
126c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//    characters. This is what DoHost does. The result will be a completely
127c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//    canonicalized host since we know nothing weird can happen (escaped
128c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//    characters could be unescaped to non-7-bit, so they have to be treated
129c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//    with suspicion at this point). It does not use the |has_non_ascii| flag.
130c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
131c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//  * When the caller has an 8-bit string that may need unescaping.
132c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//    DoComplexHost calls us this situation to do unescaping and validation.
133c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//    After this, it may do other IDN operations depending on the value of the
134c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//    |*has_non_ascii| flag.
135c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott//
136c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// The return value indicates if the output is a potentially valid host name.
137c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttemplate<typename INCHAR, typename OUTCHAR>
138c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool DoSimpleHost(const INCHAR* host,
139c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                  int host_len,
140c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                  CanonOutputT<OUTCHAR>* output,
141c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                  bool* has_non_ascii) {
142c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  *has_non_ascii = false;
143c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
144c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  bool success = true;
145c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  for (int i = 0; i < host_len; ++i) {
146c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    unsigned int source = host[i];
147c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (source == '%') {
148c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // Unescape first, if possible.
149c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // Source will be used only if decode operation was successful.
150c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      if (!DecodeEscaped(host, &i, host_len,
151c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                         reinterpret_cast<unsigned char*>(&source))) {
152c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // Invalid escaped character. There is nothing that can make this
153c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // host valid. We append an escaped percent so the URL looks reasonable
154c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // and mark as failed.
155c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        AppendEscapedChar('%', output);
156c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        success = false;
157c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        continue;
158c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      }
159c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
160c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
161c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (source < 0x80) {
162c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // We have ASCII input, we can use our lookup table.
163c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      unsigned char replacement = kHostCharLookup[source];
164c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      if (!replacement) {
165c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // Invalid character, add it as percent-escaped and mark as failed.
166c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        AppendEscapedChar(source, output);
167c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        success = false;
168c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      } else if (replacement == kEsc) {
169c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // This character is valid but should be escaped.
170c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        AppendEscapedChar(source, output);
171c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      } else {
172c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // Common case, the given character is valid in a hostname, the lookup
173c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // table tells us the canonical representation of that character (lower
174c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        // cased).
175c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott        output->push_back(replacement);
176c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      }
177c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    } else {
178c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // It's a non-ascii char. Just push it to the output.
179c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // In case where we have char16 input, and char output it's safe to
180c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // cast char16->char only if input string was converted to ASCII.
181c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      output->push_back(static_cast<OUTCHAR>(source));
182c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      *has_non_ascii = true;
183c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
184c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
185c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
186c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return success;
187c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
188c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
189c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// Canonicalizes a host that requires IDN conversion. Returns true on success
190c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
191c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // We need to escape URL before doing IDN conversion, since punicode strings
192c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // cannot be escaped after they are created.
193c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
194c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  bool has_non_ascii;
195c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
196c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
197c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  StackBufferW wide_output;
198c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!IDNToASCII(url_escaped_host.data(),
199c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                  url_escaped_host.length(),
200c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                  &wide_output)) {
201c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Some error, give up. This will write some reasonable looking
202c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // representation of the string to the output.
203c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    AppendInvalidNarrowString(src, 0, src_len, output);
204c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return false;
205c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
206c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
207c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Now we check the ASCII output like a normal host. It will also handle
208c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // unescaping. Although we unescaped everything before this function call, if
209c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // somebody does %00 as fullwidth, ICU will convert this to ASCII.
210c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  bool success = DoSimpleHost(wide_output.data(),
211c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                              wide_output.length(),
212c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                              output, &has_non_ascii);
213c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  DCHECK(!has_non_ascii);
214c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return success;
215c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
216c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
217c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
218c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// UTF-16. The has_escaped flag should be set if the input string requires
219c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// unescaping.
220c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool DoComplexHost(const char* host, int host_len,
221c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
222c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Save the current position in the output. We may write stuff and rewind it
223c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // below, so we need to know where to rewind to.
224c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  int begin_length = output->length();
225c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
226c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Points to the UTF-8 data we want to convert. This will either be the
227c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // input or the unescaped version written to |*output| if necessary.
228c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  const char* utf8_source;
229c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  int utf8_source_len;
230c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (has_escaped) {
231c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Unescape before converting to UTF-16 for IDN. We write this into the
232c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // output because it most likely does not require IDNization, and we can
233c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // save another huge stack buffer. It will be replaced below if it requires
234c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // IDN. This will also update our non-ASCII flag so we know whether the
235c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // unescaped input requires IDN.
236c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
237c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // Error with some escape sequence. We'll call the current output
238c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      // complete. DoSimpleHost will have written some "reasonable" output.
239c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      return false;
240c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
241c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
242c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Unescaping may have left us with ASCII input, in which case the
243c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // unescaped version we wrote to output is complete.
244c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (!has_non_ascii) {
245c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      return true;
246c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
247c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
248c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Save the pointer into the data was just converted (it may be appended to
249c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // other data in the output buffer).
250c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    utf8_source = &output->data()[begin_length];
251c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    utf8_source_len = output->length() - begin_length;
252c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  } else {
253c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // We don't need to unescape, use input for IDNization later. (We know the
254c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // input has non-ASCII, or the simple version would have been called
255c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // instead of us.)
256c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    utf8_source = host;
257c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    utf8_source_len = host_len;
258c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
259c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
260c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
261c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Above, we may have used the output to write the unescaped values to, so
262c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // we have to rewind it to where we started after we convert it to UTF-16.
263c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  StackBufferW utf16;
264c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
265c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // In this error case, the input may or may not be the output.
266c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    StackBuffer utf8;
267c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    for (int i = 0; i < utf8_source_len; i++)
268c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      utf8.push_back(utf8_source[i]);
269c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    output->set_length(begin_length);
270c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
271c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return false;
272c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
273c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  output->set_length(begin_length);
274c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
275c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // This will call DoSimpleHost which will do normal ASCII canonicalization
276c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // and also check for IP addresses in the outpt.
277c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return DoIDNHost(utf16.data(), utf16.length(), output);
278c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
279c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
280c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// UTF-16 convert host to its ASCII version. The set up is already ready for
281c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// the backend, so we just pass through. The has_escaped flag should be set if
282c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott// the input string requires unescaping.
283c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool DoComplexHost(const char16* host, int host_len,
284c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
285c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (has_escaped) {
286c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Yikes, we have escaped characters with wide input. The escaped
287c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // characters should be interpreted as UTF-8. To solve this problem,
288c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
289c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    //
290c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // We don't bother to optimize the conversion in the ASCII case (which
291c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // *could* just be a copy) and use the UTF-8 path, because it should be
292c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // very rare that host names have escaped characters, and it is relatively
293c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // fast to do the conversion anyway.
294c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    StackBuffer utf8;
295c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
296c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      AppendInvalidNarrowString(host, 0, host_len, output);
297c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      return false;
298c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
299c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
300c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Once we convert to UTF-8, we can use the 8-bit version of the complex
301c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // host handling code above.
302c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
303c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                         has_escaped, output);
304c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
305c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
306c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // No unescaping necessary, we can safely pass the input to ICU. This
307c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // function will only get called if we either have escaped or non-ascii
308c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // input, so it's safe to just use ICU now. Even if the input is ASCII,
309c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // this function will do the right thing (just slower than we could).
310c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return DoIDNHost(host, host_len, output);
311c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
312c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
313c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scotttemplate<typename CHAR, typename UCHAR>
314c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid DoHost(const CHAR* spec,
315c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            const url_parse::Component& host,
316c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            CanonOutput* output,
317c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott            CanonHostInfo* host_info) {
318c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (host.len <= 0) {
319c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Empty hosts don't need anything.
320c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    host_info->family = CanonHostInfo::NEUTRAL;
321c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    host_info->out_host = url_parse::Component();
322c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    return;
323c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
324c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
325c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  bool has_non_ascii, has_escaped;
326c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
327c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
328c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  // Keep track of output's initial length, so we can rewind later.
329c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  const int output_begin = output->length();
330c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
331c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  bool success;
332c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!has_non_ascii && !has_escaped) {
333c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    success = DoSimpleHost(&spec[host.begin], host.len,
334c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                           output, &has_non_ascii);
335c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    DCHECK(!has_non_ascii);
336c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  } else {
337c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    success = DoComplexHost(&spec[host.begin], host.len,
338c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                            has_non_ascii, has_escaped, output);
339c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
340c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
341c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  if (!success) {
342c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // Canonicalization failed.  Set BROKEN to notify the caller.
343c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    host_info->family = CanonHostInfo::BROKEN;
344c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  } else {
345c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // After all the other canonicalization, check if we ended up with an IP
346c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // address.  IP addresses are small, so writing into this temporary buffer
347c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // should not cause an allocation.
348c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    RawCanonOutput<64> canon_ip;
349c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    CanonicalizeIPAddress(output->data(),
350c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                          url_parse::MakeRange(output_begin, output->length()),
351c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                          &canon_ip, host_info);
352c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
353c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // If we got an IPv4/IPv6 address, copy the canonical form back to the
354c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // real buffer.  Otherwise, it's a hostname or broken IP, in which case
355c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    // we just leave it in place.
356c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    if (host_info->IsIPAddress()) {
357c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      output->set_length(output_begin);
358c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott      output->Append(canon_ip.data(), canon_ip.length());
359c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott    }
360c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  }
361c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
362c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  host_info->out_host = url_parse::MakeRange(output_begin, output->length());
363c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
364c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
365c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}  // namespace
366c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
367c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool CanonicalizeHost(const char* spec,
368c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      const url_parse::Component& host,
369c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      CanonOutput* output,
370c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      url_parse::Component* out_host) {
371c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  CanonHostInfo host_info;
372c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  DoHost<char, unsigned char>(spec, host, output, &host_info);
373c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  *out_host = host_info.out_host;
374c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return (host_info.family != CanonHostInfo::BROKEN);
375c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
376c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
377c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottbool CanonicalizeHost(const char16* spec,
378c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      const url_parse::Component& host,
379c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      CanonOutput* output,
380c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                      url_parse::Component* out_host) {
381c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  CanonHostInfo host_info;
382c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  DoHost<char16, char16>(spec, host, output, &host_info);
383c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  *out_host = host_info.out_host;
384c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  return (host_info.family != CanonHostInfo::BROKEN);
385c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
386c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
387c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid CanonicalizeHostVerbose(const char* spec,
388c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                             const url_parse::Component& host,
389c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                             CanonOutput* output,
390c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                             CanonHostInfo *host_info) {
391c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  DoHost<char, unsigned char>(spec, host, output, host_info);
392c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
393c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
394c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scottvoid CanonicalizeHostVerbose(const char16* spec,
395c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                             const url_parse::Component& host,
396c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                             CanonOutput* output,
397c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott                             CanonHostInfo *host_info) {
398c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott  DoHost<char16, char16>(spec, host, output, host_info);
399c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}
400c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott
401c7f5f8508d98d5952d42ed7648c2a8f30a4da156Patrick Scott}  // namespace url_canon
402