1// Copyright 2007, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30#include "base/logging.h"
31#include "googleurl/src/url_canon.h"
32#include "googleurl/src/url_canon_internal.h"
33
34namespace url_canon {
35
36namespace {
37
38// For reference, here's what IE supports:
39// Key: 0 (disallowed: failure if present in the input)
40//      + (allowed either escaped or unescaped, and unmodified)
41//      U (allowed escaped or unescaped but always unescaped if present in
42//         escaped form)
43//      E (allowed escaped or unescaped but always escaped if present in
44//         unescaped form)
45//      % (only allowed escaped in the input, will be unmodified).
//      Alphanumeric characters are left blank in the table below.
47//
48//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
49//    -----------------------------------------------
50// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
51// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
52// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
53// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
54// 4   %
55// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
56// 6   E                                               <-- That's  `
57// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
58//
59// NOTE: I didn't actually test all the control characters. Some may be
60// disallowed in the input, but they are all accepted escaped except for 0.
61// I also didn't test if characters affecting HTML parsing are allowed
62// unescaped, eg. (") or (#), which would indicate the beginning of the path.
63// Surprisingly, space is accepted in the input and always escaped.
64
// Lookup table giving the canonical form of every 7-bit character that may
// show up in a host name, indexed by the character's code. An entry is one of:
//   0     - the character is not allowed in a host at all,
//   kEsc  - the character is allowed but must be written percent-escaped,
//   other - the canonical character to emit (upper-case letters map to
//           their lower-case form).
// We are a little more restrictive than IE, but less restrictive than
// Firefox.
//
// Note that the % character is disallowed. It is of course accepted as the
// start of an escape sequence, but this means "%25" is rejected. IE allows
// it, but doing so would put us in a funny state: given an invalid escape
// sequence such as "%zz" we add "%25zz" to the output and fail. If percents
// were allowed, canonicalizing that output again would succeed, so validity
// would depend on how many times the canonicalizer was run. We prefer to
// always report the same validity, so we reject it.
const unsigned char kEsc = 0xFF;
const unsigned char kHostCharLookup[128] = {
  // 0x00 - 0x1f: control characters, all invalid.
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x27: (space) ! " # $ % & '
  kEsc, kEsc, kEsc, kEsc, kEsc, 0, kEsc, kEsc,
  // 0x28 - 0x2f: ( ) * + , - . /
  kEsc, kEsc, kEsc, '+', kEsc, '-', '.', 0,
  // 0x30 - 0x37: 0 1 2 3 4 5 6 7
  '0', '1', '2', '3', '4', '5', '6', '7',
  // 0x38 - 0x3f: 8 9 : ; < = > ?
  '8', '9', ':', 0, kEsc, kEsc, kEsc, 0,
  // 0x40 - 0x47: @ A B C D E F G
  kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
  // 0x48 - 0x4f: H I J K L M N O
  'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  // 0x50 - 0x57: P Q R S T U V W
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
  // 0x58 - 0x5f: X Y Z [ (backslash) ] ^ _
  'x', 'y', 'z', '[', 0, ']', 0, '_',
  // 0x60 - 0x67: ` a b c d e f g
  kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
  // 0x68 - 0x6f: h i j k l m n o
  'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  // 0x70 - 0x77: p q r s t u v w
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
  // 0x78 - 0x7f: x y z { | } ~ (DEL)
  'x', 'y', 'z', kEsc, kEsc, kEsc, 0, 0
};
94
95const int kTempHostBufferLen = 1024;
96typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
97typedef RawCanonOutputT<char16, kTempHostBufferLen> StackBufferW;
98
99// Scans a host name and fills in the output flags according to what we find.
100// |has_non_ascii| will be true if there are any non-7-bit characters, and
101// |has_escaped| will be true if there is a percent sign.
102template<typename CHAR, typename UCHAR>
103void ScanHostname(const CHAR* spec, const url_parse::Component& host,
104                  bool* has_non_ascii, bool* has_escaped) {
105  int end = host.end();
106  *has_non_ascii = false;
107  *has_escaped = false;
108  for (int i = host.begin; i < end; i++) {
109    if (static_cast<UCHAR>(spec[i]) >= 0x80)
110      *has_non_ascii = true;
111    else if (spec[i] == '%')
112      *has_escaped = true;
113  }
114}
115
116// Canonicalizes a host name that is entirely 8-bit characters (even though
117// the type holding them may be 16 bits. Escaped characters will be unescaped.
118// Non-7-bit characters (for example, UTF-8) will be passed unchanged.
119//
120// The |*has_non_ascii| flag will be true if there are non-7-bit characters in
121// the output.
122//
123// This function is used in two situations:
124//
125//  * When the caller knows there is no non-ASCII or percent escaped
126//    characters. This is what DoHost does. The result will be a completely
127//    canonicalized host since we know nothing weird can happen (escaped
128//    characters could be unescaped to non-7-bit, so they have to be treated
129//    with suspicion at this point). It does not use the |has_non_ascii| flag.
130//
131//  * When the caller has an 8-bit string that may need unescaping.
132//    DoComplexHost calls us this situation to do unescaping and validation.
133//    After this, it may do other IDN operations depending on the value of the
134//    |*has_non_ascii| flag.
135//
136// The return value indicates if the output is a potentially valid host name.
137template<typename INCHAR, typename OUTCHAR>
138bool DoSimpleHost(const INCHAR* host,
139                  int host_len,
140                  CanonOutputT<OUTCHAR>* output,
141                  bool* has_non_ascii) {
142  *has_non_ascii = false;
143
144  bool success = true;
145  for (int i = 0; i < host_len; ++i) {
146    unsigned int source = host[i];
147    if (source == '%') {
148      // Unescape first, if possible.
149      // Source will be used only if decode operation was successful.
150      if (!DecodeEscaped(host, &i, host_len,
151                         reinterpret_cast<unsigned char*>(&source))) {
152        // Invalid escaped character. There is nothing that can make this
153        // host valid. We append an escaped percent so the URL looks reasonable
154        // and mark as failed.
155        AppendEscapedChar('%', output);
156        success = false;
157        continue;
158      }
159    }
160
161    if (source < 0x80) {
162      // We have ASCII input, we can use our lookup table.
163      unsigned char replacement = kHostCharLookup[source];
164      if (!replacement) {
165        // Invalid character, add it as percent-escaped and mark as failed.
166        AppendEscapedChar(source, output);
167        success = false;
168      } else if (replacement == kEsc) {
169        // This character is valid but should be escaped.
170        AppendEscapedChar(source, output);
171      } else {
172        // Common case, the given character is valid in a hostname, the lookup
173        // table tells us the canonical representation of that character (lower
174        // cased).
175        output->push_back(replacement);
176      }
177    } else {
178      // It's a non-ascii char. Just push it to the output.
179      // In case where we have char16 input, and char output it's safe to
180      // cast char16->char only if input string was converted to ASCII.
181      output->push_back(static_cast<OUTCHAR>(source));
182      *has_non_ascii = true;
183    }
184  }
185
186  return success;
187}
188
189// Canonicalizes a host that requires IDN conversion. Returns true on success
190bool DoIDNHost(const char16* src, int src_len, CanonOutput* output) {
191  // We need to escape URL before doing IDN conversion, since punicode strings
192  // cannot be escaped after they are created.
193  RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
194  bool has_non_ascii;
195  DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
196
197  StackBufferW wide_output;
198  if (!IDNToASCII(url_escaped_host.data(),
199                  url_escaped_host.length(),
200                  &wide_output)) {
201    // Some error, give up. This will write some reasonable looking
202    // representation of the string to the output.
203    AppendInvalidNarrowString(src, 0, src_len, output);
204    return false;
205  }
206
207  // Now we check the ASCII output like a normal host. It will also handle
208  // unescaping. Although we unescaped everything before this function call, if
209  // somebody does %00 as fullwidth, ICU will convert this to ASCII.
210  bool success = DoSimpleHost(wide_output.data(),
211                              wide_output.length(),
212                              output, &has_non_ascii);
213  DCHECK(!has_non_ascii);
214  return success;
215}
216
217// 8-bit convert host to its ASCII version: this converts the UTF-8 input to
218// UTF-16. The has_escaped flag should be set if the input string requires
219// unescaping.
220bool DoComplexHost(const char* host, int host_len,
221                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
222  // Save the current position in the output. We may write stuff and rewind it
223  // below, so we need to know where to rewind to.
224  int begin_length = output->length();
225
226  // Points to the UTF-8 data we want to convert. This will either be the
227  // input or the unescaped version written to |*output| if necessary.
228  const char* utf8_source;
229  int utf8_source_len;
230  if (has_escaped) {
231    // Unescape before converting to UTF-16 for IDN. We write this into the
232    // output because it most likely does not require IDNization, and we can
233    // save another huge stack buffer. It will be replaced below if it requires
234    // IDN. This will also update our non-ASCII flag so we know whether the
235    // unescaped input requires IDN.
236    if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
237      // Error with some escape sequence. We'll call the current output
238      // complete. DoSimpleHost will have written some "reasonable" output.
239      return false;
240    }
241
242    // Unescaping may have left us with ASCII input, in which case the
243    // unescaped version we wrote to output is complete.
244    if (!has_non_ascii) {
245      return true;
246    }
247
248    // Save the pointer into the data was just converted (it may be appended to
249    // other data in the output buffer).
250    utf8_source = &output->data()[begin_length];
251    utf8_source_len = output->length() - begin_length;
252  } else {
253    // We don't need to unescape, use input for IDNization later. (We know the
254    // input has non-ASCII, or the simple version would have been called
255    // instead of us.)
256    utf8_source = host;
257    utf8_source_len = host_len;
258  }
259
260  // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
261  // Above, we may have used the output to write the unescaped values to, so
262  // we have to rewind it to where we started after we convert it to UTF-16.
263  StackBufferW utf16;
264  if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
265    // In this error case, the input may or may not be the output.
266    StackBuffer utf8;
267    for (int i = 0; i < utf8_source_len; i++)
268      utf8.push_back(utf8_source[i]);
269    output->set_length(begin_length);
270    AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
271    return false;
272  }
273  output->set_length(begin_length);
274
275  // This will call DoSimpleHost which will do normal ASCII canonicalization
276  // and also check for IP addresses in the outpt.
277  return DoIDNHost(utf16.data(), utf16.length(), output);
278}
279
280// UTF-16 convert host to its ASCII version. The set up is already ready for
281// the backend, so we just pass through. The has_escaped flag should be set if
282// the input string requires unescaping.
283bool DoComplexHost(const char16* host, int host_len,
284                   bool has_non_ascii, bool has_escaped, CanonOutput* output) {
285  if (has_escaped) {
286    // Yikes, we have escaped characters with wide input. The escaped
287    // characters should be interpreted as UTF-8. To solve this problem,
288    // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
289    //
290    // We don't bother to optimize the conversion in the ASCII case (which
291    // *could* just be a copy) and use the UTF-8 path, because it should be
292    // very rare that host names have escaped characters, and it is relatively
293    // fast to do the conversion anyway.
294    StackBuffer utf8;
295    if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
296      AppendInvalidNarrowString(host, 0, host_len, output);
297      return false;
298    }
299
300    // Once we convert to UTF-8, we can use the 8-bit version of the complex
301    // host handling code above.
302    return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
303                         has_escaped, output);
304  }
305
306  // No unescaping necessary, we can safely pass the input to ICU. This
307  // function will only get called if we either have escaped or non-ascii
308  // input, so it's safe to just use ICU now. Even if the input is ASCII,
309  // this function will do the right thing (just slower than we could).
310  return DoIDNHost(host, host_len, output);
311}
312
313template<typename CHAR, typename UCHAR>
314void DoHost(const CHAR* spec,
315            const url_parse::Component& host,
316            CanonOutput* output,
317            CanonHostInfo* host_info) {
318  if (host.len <= 0) {
319    // Empty hosts don't need anything.
320    host_info->family = CanonHostInfo::NEUTRAL;
321    host_info->out_host = url_parse::Component();
322    return;
323  }
324
325  bool has_non_ascii, has_escaped;
326  ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
327
328  // Keep track of output's initial length, so we can rewind later.
329  const int output_begin = output->length();
330
331  bool success;
332  if (!has_non_ascii && !has_escaped) {
333    success = DoSimpleHost(&spec[host.begin], host.len,
334                           output, &has_non_ascii);
335    DCHECK(!has_non_ascii);
336  } else {
337    success = DoComplexHost(&spec[host.begin], host.len,
338                            has_non_ascii, has_escaped, output);
339  }
340
341  if (!success) {
342    // Canonicalization failed.  Set BROKEN to notify the caller.
343    host_info->family = CanonHostInfo::BROKEN;
344  } else {
345    // After all the other canonicalization, check if we ended up with an IP
346    // address.  IP addresses are small, so writing into this temporary buffer
347    // should not cause an allocation.
348    RawCanonOutput<64> canon_ip;
349    CanonicalizeIPAddress(output->data(),
350                          url_parse::MakeRange(output_begin, output->length()),
351                          &canon_ip, host_info);
352
353    // If we got an IPv4/IPv6 address, copy the canonical form back to the
354    // real buffer.  Otherwise, it's a hostname or broken IP, in which case
355    // we just leave it in place.
356    if (host_info->IsIPAddress()) {
357      output->set_length(output_begin);
358      output->Append(canon_ip.data(), canon_ip.length());
359    }
360  }
361
362  host_info->out_host = url_parse::MakeRange(output_begin, output->length());
363}
364
365}  // namespace
366
367bool CanonicalizeHost(const char* spec,
368                      const url_parse::Component& host,
369                      CanonOutput* output,
370                      url_parse::Component* out_host) {
371  CanonHostInfo host_info;
372  DoHost<char, unsigned char>(spec, host, output, &host_info);
373  *out_host = host_info.out_host;
374  return (host_info.family != CanonHostInfo::BROKEN);
375}
376
377bool CanonicalizeHost(const char16* spec,
378                      const url_parse::Component& host,
379                      CanonOutput* output,
380                      url_parse::Component* out_host) {
381  CanonHostInfo host_info;
382  DoHost<char16, char16>(spec, host, output, &host_info);
383  *out_host = host_info.out_host;
384  return (host_info.family != CanonHostInfo::BROKEN);
385}
386
387void CanonicalizeHostVerbose(const char* spec,
388                             const url_parse::Component& host,
389                             CanonOutput* output,
390                             CanonHostInfo *host_info) {
391  DoHost<char, unsigned char>(spec, host, output, host_info);
392}
393
394void CanonicalizeHostVerbose(const char16* spec,
395                             const url_parse::Component& host,
396                             CanonOutput* output,
397                             CanonHostInfo *host_info) {
398  DoHost<char16, char16>(spec, host, output, host_info);
399}
400
401}  // namespace url_canon
402