1// Copyright 2007, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30// Canonicalizers for random bits that aren't big enough for their own files.
31
32#include <string.h>
33
34#include "googleurl/src/url_canon.h"
35#include "googleurl/src/url_canon_internal.h"
36
37namespace url_canon {
38
39namespace {
40
// Reports whether |ch| is one of the characters (tab, line feed, carriage
// return) that get stripped out of the middle of a URL.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
46
47// Backend for RemoveURLWhitespace (see declaration in url_canon.h).
48// It sucks that we have to do this, since this takes about 13% of the total URL
49// canonicalization time.
50template<typename CHAR>
51const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
52                                  CanonOutputT<CHAR>* buffer,
53                                  int* output_len) {
54  // Fast verification that there's nothing that needs removal. This is the 99%
55  // case, so we want it to be fast and don't care about impacting the speed
56  // when we do find whitespace.
57  int found_whitespace = false;
58  for (int i = 0; i < input_len; i++) {
59    if (!IsRemovableURLWhitespace(input[i]))
60      continue;
61    found_whitespace = true;
62    break;
63  }
64
65  if (!found_whitespace) {
66    // Didn't find any whitespace, we don't need to do anything. We can just
67    // return the input as the output.
68    *output_len = input_len;
69    return input;
70  }
71
72  // Remove the whitespace into the new buffer and return it.
73  for (int i = 0; i < input_len; i++) {
74    if (!IsRemovableURLWhitespace(input[i]))
75      buffer->push_back(input[i]);
76  }
77  *output_len = buffer->length();
78  return buffer->data();
79}
80
// Lookup table indexed by a 7-bit ASCII code. Each entry holds the canonical
// (lower-cased) form of that character when it appears in a scheme, or 0 when
// the character is not permitted in a scheme at all.
const char kSchemeCanonical[0x80] = {
  // 0x00 - 0x0f: control characters, all invalid.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x10 - 0x1f: control characters, all invalid.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x2f: space ! " # $ % & ' ( ) * + , - . /
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0,
  // 0x30 - 0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0, 0, 0, 0, 0, 0,
  // 0x40 - 0x4f: @ A B C D E F G H I J K L M N O
  0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  // 0x50 - 0x5f: P Q R S T U V W X Y Z [ backslash ] ^ _
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0, 0,
  // 0x60 - 0x6f: ` a b c d e f g h i j k l m n o
  0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  // 0x70 - 0x7f: p q r s t u v w x y z { | } ~ DEL
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0, 0
};
100
// Reports whether |c| is an ASCII letter, which is what the first character
// of a scheme must be. This could have been folded into the lookup table by
// flagging valid entries with the high bit, but the check only runs once per
// URL and keeping it separate leaves the table easier to read.
inline bool IsSchemeFirstChar(unsigned char c) {
  // Setting bit 0x20 maps 'A'-'Z' onto 'a'-'z' and leaves 'a'-'z' alone; no
  // other byte value lands in the lower-case range after that.
  const unsigned char folded = static_cast<unsigned char>(c | 0x20);
  return folded >= 'a' && folded <= 'z';
}
107
108template<typename CHAR, typename UCHAR>
109bool DoScheme(const CHAR* spec,
110              const url_parse::Component& scheme,
111              CanonOutput* output,
112              url_parse::Component* out_scheme) {
113  if (scheme.len <= 0) {
114    // Scheme is unspecified or empty, convert to empty by appending a colon.
115    *out_scheme = url_parse::Component(output->length(), 0);
116    output->push_back(':');
117    return true;
118  }
119
120  // The output scheme starts from the current position.
121  out_scheme->begin = output->length();
122
123  bool success = true;
124  int end = scheme.end();
125  for (int i = scheme.begin; i < end; i++) {
126    UCHAR ch = static_cast<UCHAR>(spec[i]);
127    char replacement = 0;
128    if (ch < 0x80) {
129      if (i == scheme.begin) {
130        // Need to do a special check for the first letter of the scheme.
131        if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
132          replacement = kSchemeCanonical[ch];
133      } else {
134        replacement = kSchemeCanonical[ch];
135      }
136    }
137
138    if (replacement) {
139      output->push_back(replacement);
140    } else if (ch == '%') {
141      // Canonicalizing the scheme multiple times should lead to the same
142      // result. Since invalid characters will be escaped, we need to preserve
143      // the percent to avoid multiple escaping. The scheme will be invalid.
144      success = false;
145      output->push_back('%');
146    } else {
147      // Invalid character, store it but mark this scheme as invalid.
148      success = false;
149
150      // This will escape the output and also handle encoding issues.
151      // Ignore the return value since we already failed.
152      AppendUTF8EscapedChar(spec, &i, end, output);
153    }
154  }
155
156  // The output scheme ends with the the current position, before appending
157  // the colon.
158  out_scheme->len = output->length() - out_scheme->begin;
159  output->push_back(':');
160  return success;
161}
162
163// The username and password components reference ranges in the corresponding
164// *_spec strings. Typically, these specs will be the same (we're
165// canonicalizing a single source string), but may be different when
166// replacing components.
167template<typename CHAR, typename UCHAR>
168bool DoUserInfo(const CHAR* username_spec,
169                const url_parse::Component& username,
170                const CHAR* password_spec,
171                const url_parse::Component& password,
172                CanonOutput* output,
173                url_parse::Component* out_username,
174                url_parse::Component* out_password) {
175  if (username.len <= 0 && password.len <= 0) {
176    // Common case: no user info. We strip empty username/passwords.
177    *out_username = url_parse::Component();
178    *out_password = url_parse::Component();
179    return true;
180  }
181
182  // Write the username.
183  out_username->begin = output->length();
184  if (username.len > 0) {
185    // This will escape characters not valid for the username.
186    AppendStringOfType(&username_spec[username.begin], username.len,
187                       CHAR_USERINFO, output);
188  }
189  out_username->len = output->length() - out_username->begin;
190
191  // When there is a password, we need the separator. Note that we strip
192  // empty but specified passwords.
193  if (password.len > 0) {
194    output->push_back(':');
195    out_password->begin = output->length();
196    AppendStringOfType(&password_spec[password.begin], password.len,
197                       CHAR_USERINFO, output);
198    out_password->len = output->length() - out_password->begin;
199  } else {
200    *out_password = url_parse::Component();
201  }
202
203  output->push_back('@');
204  return true;
205}
206
// Helper functions for converting port integers to strings.
//
// Writes the decimal representation of |port| into |output| as a
// NUL-terminated string. |output_len| is the capacity of |output| in
// characters, including the terminator. When the text does not fit, |output|
// is set to the empty string. Nothing is written when |output| is NULL or
// |output_len| is not positive.
//
// The conversion is done by hand rather than with _itoa_s so that this helper
// does not depend on a compiler-specific runtime function.
inline void WritePortInt(char* output, int output_len, int port) {
  if (!output || output_len <= 0)
    return;

  // Work on the magnitude as an unsigned value so that negating the most
  // negative int is well defined.
  unsigned int magnitude = port < 0 ? 0u - static_cast<unsigned int>(port)
                                    : static_cast<unsigned int>(port);

  // Produce the digits least-significant first. Three characters per byte
  // plus a sign is a safe upper bound on the decimal length of an int.
  char reversed[sizeof(int) * 3 + 2];
  int count = 0;
  do {
    reversed[count++] = static_cast<char>('0' + magnitude % 10);
    magnitude /= 10;
  } while (magnitude != 0);
  if (port < 0)
    reversed[count++] = '-';

  // The text plus its terminator must fit in the caller's buffer.
  if (count >= output_len) {
    output[0] = 0;
    return;
  }

  for (int i = 0; i < count; i++)
    output[i] = reversed[count - 1 - i];
  output[count] = 0;
}
211inline void WritePortInt(char16* output, int output_len, int port) {
212  _itow_s(port, output, output_len, 10);
213}
214
215// This function will prepend the colon if there will be a port.
216template<typename CHAR, typename UCHAR>
217bool DoPort(const CHAR* spec,
218            const url_parse::Component& port,
219            int default_port_for_scheme,
220            CanonOutput* output,
221            url_parse::Component* out_port) {
222  int port_num = url_parse::ParsePort(spec, port);
223  if (port_num == url_parse::PORT_UNSPECIFIED ||
224      port_num == default_port_for_scheme) {
225    *out_port = url_parse::Component();
226    return true;  // Leave port empty.
227  }
228
229  if (port_num == url_parse::PORT_INVALID) {
230    // Invalid port: We'll copy the text from the input so the user can see
231    // what the error was, and mark the URL as invalid by returning false.
232    output->push_back(':');
233    out_port->begin = output->length();
234    AppendInvalidNarrowString(spec, port.begin, port.end(), output);
235    out_port->len = output->length() - out_port->begin;
236    return false;
237  }
238
239  // Convert port number back to an integer. Max port value is 5 digits, and
240  // the Parsed::ExtractPort will have made sure the integer is in range.
241  const int buf_size = 6;
242  char buf[buf_size];
243  WritePortInt(buf, buf_size, port_num);
244
245  // Append the port number to the output, preceeded by a colon.
246  output->push_back(':');
247  out_port->begin = output->length();
248  for (int i = 0; i < buf_size && buf[i]; i++)
249    output->push_back(buf[i]);
250
251  out_port->len = output->length() - out_port->begin;
252  return true;
253}
254
255template<typename CHAR, typename UCHAR>
256void DoCanonicalizeRef(const CHAR* spec,
257                       const url_parse::Component& ref,
258                       CanonOutput* output,
259                       url_parse::Component* out_ref) {
260  if (ref.len < 0) {
261    // Common case of no ref.
262    *out_ref = url_parse::Component();
263    return;
264  }
265
266  // Append the ref separator. Note that we need to do this even when the ref
267  // is empty but present.
268  output->push_back('#');
269  out_ref->begin = output->length();
270
271  // Now iterate through all the characters, converting to UTF-8 and validating.
272  int end = ref.end();
273  for (int i = ref.begin; i < end; i++) {
274    if (spec[i] == 0) {
275      // IE just strips NULLs, so we do too.
276      continue;
277    } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
278      // Unline IE seems to, we escape control characters. This will probably
279      // make the reference fragment unusable on a web page, but people
280      // shouldn't be using control characters in their anchor names.
281      AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
282    } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
283      // Normal ASCII characters are just appended.
284      output->push_back(static_cast<char>(spec[i]));
285    } else {
286      // Non-ASCII characters are appended unescaped, but only when they are
287      // valid. Invalid Unicode characters are replaced with the "invalid
288      // character" as IE seems to.
289      unsigned code_point;
290      if (!ReadUTFChar(spec, &i, end, &code_point))
291        AppendUTF8Value(kUnicodeReplacementCharacter, output);
292      else
293        AppendUTF8Value(code_point, output);
294    }
295  }
296
297  out_ref->len = output->length() - out_ref->begin;
298}
299
300}  // namespace
301
302const char* RemoveURLWhitespace(const char* input, int input_len,
303                                CanonOutputT<char>* buffer,
304                                int* output_len) {
305  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
306}
307
308const char16* RemoveURLWhitespace(const char16* input, int input_len,
309                                  CanonOutputT<char16>* buffer,
310                                  int* output_len) {
311  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
312}
313
314char CanonicalSchemeChar(char16 ch) {
315  if (ch >= 0x80)
316    return 0;  // Non-ASCII is not supported by schemes.
317  return kSchemeCanonical[ch];
318}
319
320bool CanonicalizeScheme(const char* spec,
321                        const url_parse::Component& scheme,
322                        CanonOutput* output,
323                        url_parse::Component* out_scheme) {
324  return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
325}
326
327bool CanonicalizeScheme(const char16* spec,
328                        const url_parse::Component& scheme,
329                        CanonOutput* output,
330                        url_parse::Component* out_scheme) {
331  return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
332}
333
334bool CanonicalizeUserInfo(const char* username_source,
335                          const url_parse::Component& username,
336                          const char* password_source,
337                          const url_parse::Component& password,
338                          CanonOutput* output,
339                          url_parse::Component* out_username,
340                          url_parse::Component* out_password) {
341  return DoUserInfo<char, unsigned char>(
342      username_source, username, password_source, password,
343      output, out_username, out_password);
344}
345
346bool CanonicalizeUserInfo(const char16* username_source,
347                          const url_parse::Component& username,
348                          const char16* password_source,
349                          const url_parse::Component& password,
350                          CanonOutput* output,
351                          url_parse::Component* out_username,
352                          url_parse::Component* out_password) {
353  return DoUserInfo<char16, char16>(
354      username_source, username, password_source, password,
355      output, out_username, out_password);
356}
357
358bool CanonicalizePort(const char* spec,
359                      const url_parse::Component& port,
360                      int default_port_for_scheme,
361                      CanonOutput* output,
362                      url_parse::Component* out_port) {
363  return DoPort<char, unsigned char>(spec, port,
364                                     default_port_for_scheme,
365                                     output, out_port);
366}
367
368bool CanonicalizePort(const char16* spec,
369                      const url_parse::Component& port,
370                      int default_port_for_scheme,
371                      CanonOutput* output,
372                      url_parse::Component* out_port) {
373  return DoPort<char16, char16>(spec, port, default_port_for_scheme,
374                                      output, out_port);
375}
376
377void CanonicalizeRef(const char* spec,
378                     const url_parse::Component& ref,
379                     CanonOutput* output,
380                     url_parse::Component* out_ref) {
381  DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
382}
383
384void CanonicalizeRef(const char16* spec,
385                     const url_parse::Component& ref,
386                     CanonOutput* output,
387                     url_parse::Component* out_ref) {
388  DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
389}
390
391}  // namespace url_canon
392