1// Copyright 2007, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30// Canonicalizers for random bits that aren't big enough for their own files.
31
#include <stdio.h>
#include <string.h>

#include "googleurl/src/url_canon.h"
#include "googleurl/src/url_canon_internal.h"
36
37namespace url_canon {
38
39namespace {
40
// Tells whether |ch| is one of the characters that get dropped when they
// appear inside a URL: carriage return, line feed, or horizontal tab.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
46
47// Backend for RemoveURLWhitespace (see declaration in url_canon.h).
48// It sucks that we have to do this, since this takes about 13% of the total URL
49// canonicalization time.
50template<typename CHAR>
51const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
52                                  CanonOutputT<CHAR>* buffer,
53                                  int* output_len) {
54  // Fast verification that there's nothing that needs removal. This is the 99%
55  // case, so we want it to be fast and don't care about impacting the speed
56  // when we do find whitespace.
57  int found_whitespace = false;
58  for (int i = 0; i < input_len; i++) {
59    if (!IsRemovableURLWhitespace(input[i]))
60      continue;
61    found_whitespace = true;
62    break;
63  }
64
65  if (!found_whitespace) {
66    // Didn't find any whitespace, we don't need to do anything. We can just
67    // return the input as the output.
68    *output_len = input_len;
69    return input;
70  }
71
72  // Remove the whitespace into the new buffer and return it.
73  for (int i = 0; i < input_len; i++) {
74    if (!IsRemovableURLWhitespace(input[i]))
75      buffer->push_back(input[i]);
76  }
77  *output_len = buffer->length();
78  return buffer->data();
79}
80
// Lookup table indexed by a 7-bit ASCII code (0x00-0x7f) that yields the
// canonical version of each possible input letter in the scheme (basically,
// lower-cased). The corresponding entry will be 0 if the letter is not
// allowed in a scheme. The only non-zero entries are '+', '-', '.', the
// digits, and the letters (upper-case letters map to their lower-case form).
// Callers must check that the index is below 0x80 before using the table.
const char kSchemeCanonical[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  '+',  0,  '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',  0 ,  0 ,  0 ,  0 ,  0 ,  0 ,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0,   0 ,  0,   0 ,  0,
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     0 , 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',  0 ,  0 ,  0 ,  0 ,  0 };
100
// Tells whether |c| may start a scheme, i.e. whether it is an ASCII letter of
// either case.
//
// This could be folded into the lookup table by setting the high bit for each
// valid character, but it is only called once per URL, and keeping it separate
// leaves the table easier to read.
inline bool IsSchemeFirstChar(unsigned char c) {
  const bool is_lower_alpha = c >= 'a' && c <= 'z';
  const bool is_upper_alpha = c >= 'A' && c <= 'Z';
  return is_lower_alpha || is_upper_alpha;
}
107
108template<typename CHAR, typename UCHAR>
109bool DoScheme(const CHAR* spec,
110              const url_parse::Component& scheme,
111              CanonOutput* output,
112              url_parse::Component* out_scheme) {
113  if (scheme.len <= 0) {
114    // Scheme is unspecified or empty, convert to empty by appending a colon.
115    *out_scheme = url_parse::Component(output->length(), 0);
116    output->push_back(':');
117    return true;
118  }
119
120  // The output scheme starts from the current position.
121  out_scheme->begin = output->length();
122
123  // Danger: it's important that this code does not strip any characters: it
124  // only emits the canonical version (be it valid or escaped) of each of
125  // the input characters. Stripping would put it out of sync with
126  // url_util::FindAndCompareScheme, which could cause some security checks on
127  // schemes to be incorrect.
128  bool success = true;
129  int end = scheme.end();
130  for (int i = scheme.begin; i < end; i++) {
131    UCHAR ch = static_cast<UCHAR>(spec[i]);
132    char replacement = 0;
133    if (ch < 0x80) {
134      if (i == scheme.begin) {
135        // Need to do a special check for the first letter of the scheme.
136        if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
137          replacement = kSchemeCanonical[ch];
138      } else {
139        replacement = kSchemeCanonical[ch];
140      }
141    }
142
143    if (replacement) {
144      output->push_back(replacement);
145    } else if (ch == '%') {
146      // Canonicalizing the scheme multiple times should lead to the same
147      // result. Since invalid characters will be escaped, we need to preserve
148      // the percent to avoid multiple escaping. The scheme will be invalid.
149      success = false;
150      output->push_back('%');
151    } else {
152      // Invalid character, store it but mark this scheme as invalid.
153      success = false;
154
155      // This will escape the output and also handle encoding issues.
156      // Ignore the return value since we already failed.
157      AppendUTF8EscapedChar(spec, &i, end, output);
158    }
159  }
160
161  // The output scheme ends with the the current position, before appending
162  // the colon.
163  out_scheme->len = output->length() - out_scheme->begin;
164  output->push_back(':');
165  return success;
166}
167
168// The username and password components reference ranges in the corresponding
169// *_spec strings. Typically, these specs will be the same (we're
170// canonicalizing a single source string), but may be different when
171// replacing components.
172template<typename CHAR, typename UCHAR>
173bool DoUserInfo(const CHAR* username_spec,
174                const url_parse::Component& username,
175                const CHAR* password_spec,
176                const url_parse::Component& password,
177                CanonOutput* output,
178                url_parse::Component* out_username,
179                url_parse::Component* out_password) {
180  if (username.len <= 0 && password.len <= 0) {
181    // Common case: no user info. We strip empty username/passwords.
182    *out_username = url_parse::Component();
183    *out_password = url_parse::Component();
184    return true;
185  }
186
187  // Write the username.
188  out_username->begin = output->length();
189  if (username.len > 0) {
190    // This will escape characters not valid for the username.
191    AppendStringOfType(&username_spec[username.begin], username.len,
192                       CHAR_USERINFO, output);
193  }
194  out_username->len = output->length() - out_username->begin;
195
196  // When there is a password, we need the separator. Note that we strip
197  // empty but specified passwords.
198  if (password.len > 0) {
199    output->push_back(':');
200    out_password->begin = output->length();
201    AppendStringOfType(&password_spec[password.begin], password.len,
202                       CHAR_USERINFO, output);
203    out_password->len = output->length() - out_password->begin;
204  } else {
205    *out_password = url_parse::Component();
206  }
207
208  output->push_back('@');
209  return true;
210}
211
// Helper for converting a port integer to its decimal string form.
//
// Writes the base-10 text of |port| into |output| as a NUL-terminated string.
// |output_len| is the capacity of |output| in chars, terminator included.
//
// Uses the standard snprintf() rather than _itoa_s(), which is a Microsoft CRT
// extension, so this helper does not rely on a platform-specific function. If
// the text does not fit it is truncated but still NUL-terminated. Nothing is
// written when |output| is null or |output_len| is not positive.
inline void WritePortInt(char* output, int output_len, int port) {
  if (!output || output_len <= 0)
    return;  // No room for even the terminator.
  snprintf(output, static_cast<size_t>(output_len), "%d", port);
}
216
217// This function will prepend the colon if there will be a port.
218template<typename CHAR, typename UCHAR>
219bool DoPort(const CHAR* spec,
220            const url_parse::Component& port,
221            int default_port_for_scheme,
222            CanonOutput* output,
223            url_parse::Component* out_port) {
224  int port_num = url_parse::ParsePort(spec, port);
225  if (port_num == url_parse::PORT_UNSPECIFIED ||
226      port_num == default_port_for_scheme) {
227    *out_port = url_parse::Component();
228    return true;  // Leave port empty.
229  }
230
231  if (port_num == url_parse::PORT_INVALID) {
232    // Invalid port: We'll copy the text from the input so the user can see
233    // what the error was, and mark the URL as invalid by returning false.
234    output->push_back(':');
235    out_port->begin = output->length();
236    AppendInvalidNarrowString(spec, port.begin, port.end(), output);
237    out_port->len = output->length() - out_port->begin;
238    return false;
239  }
240
241  // Convert port number back to an integer. Max port value is 5 digits, and
242  // the Parsed::ExtractPort will have made sure the integer is in range.
243  const int buf_size = 6;
244  char buf[buf_size];
245  WritePortInt(buf, buf_size, port_num);
246
247  // Append the port number to the output, preceeded by a colon.
248  output->push_back(':');
249  out_port->begin = output->length();
250  for (int i = 0; i < buf_size && buf[i]; i++)
251    output->push_back(buf[i]);
252
253  out_port->len = output->length() - out_port->begin;
254  return true;
255}
256
257template<typename CHAR, typename UCHAR>
258void DoCanonicalizeRef(const CHAR* spec,
259                       const url_parse::Component& ref,
260                       CanonOutput* output,
261                       url_parse::Component* out_ref) {
262  if (ref.len < 0) {
263    // Common case of no ref.
264    *out_ref = url_parse::Component();
265    return;
266  }
267
268  // Append the ref separator. Note that we need to do this even when the ref
269  // is empty but present.
270  output->push_back('#');
271  out_ref->begin = output->length();
272
273  // Now iterate through all the characters, converting to UTF-8 and validating.
274  int end = ref.end();
275  for (int i = ref.begin; i < end; i++) {
276    if (spec[i] == 0) {
277      // IE just strips NULLs, so we do too.
278      continue;
279    } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
280      // Unline IE seems to, we escape control characters. This will probably
281      // make the reference fragment unusable on a web page, but people
282      // shouldn't be using control characters in their anchor names.
283      AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
284    } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
285      // Normal ASCII characters are just appended.
286      output->push_back(static_cast<char>(spec[i]));
287    } else {
288      // Non-ASCII characters are appended unescaped, but only when they are
289      // valid. Invalid Unicode characters are replaced with the "invalid
290      // character" as IE seems to (ReadUTFChar puts the unicode replacement
291      // character in the output on failure for us).
292      unsigned code_point;
293      ReadUTFChar(spec, &i, end, &code_point);
294      AppendUTF8Value(code_point, output);
295    }
296  }
297
298  out_ref->len = output->length() - out_ref->begin;
299}
300
301}  // namespace
302
303const char* RemoveURLWhitespace(const char* input, int input_len,
304                                CanonOutputT<char>* buffer,
305                                int* output_len) {
306  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
307}
308
309const char16* RemoveURLWhitespace(const char16* input, int input_len,
310                                  CanonOutputT<char16>* buffer,
311                                  int* output_len) {
312  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
313}
314
315char CanonicalSchemeChar(char16 ch) {
316  if (ch >= 0x80)
317    return 0;  // Non-ASCII is not supported by schemes.
318  return kSchemeCanonical[ch];
319}
320
321bool CanonicalizeScheme(const char* spec,
322                        const url_parse::Component& scheme,
323                        CanonOutput* output,
324                        url_parse::Component* out_scheme) {
325  return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
326}
327
328bool CanonicalizeScheme(const char16* spec,
329                        const url_parse::Component& scheme,
330                        CanonOutput* output,
331                        url_parse::Component* out_scheme) {
332  return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
333}
334
335bool CanonicalizeUserInfo(const char* username_source,
336                          const url_parse::Component& username,
337                          const char* password_source,
338                          const url_parse::Component& password,
339                          CanonOutput* output,
340                          url_parse::Component* out_username,
341                          url_parse::Component* out_password) {
342  return DoUserInfo<char, unsigned char>(
343      username_source, username, password_source, password,
344      output, out_username, out_password);
345}
346
347bool CanonicalizeUserInfo(const char16* username_source,
348                          const url_parse::Component& username,
349                          const char16* password_source,
350                          const url_parse::Component& password,
351                          CanonOutput* output,
352                          url_parse::Component* out_username,
353                          url_parse::Component* out_password) {
354  return DoUserInfo<char16, char16>(
355      username_source, username, password_source, password,
356      output, out_username, out_password);
357}
358
359bool CanonicalizePort(const char* spec,
360                      const url_parse::Component& port,
361                      int default_port_for_scheme,
362                      CanonOutput* output,
363                      url_parse::Component* out_port) {
364  return DoPort<char, unsigned char>(spec, port,
365                                     default_port_for_scheme,
366                                     output, out_port);
367}
368
369bool CanonicalizePort(const char16* spec,
370                      const url_parse::Component& port,
371                      int default_port_for_scheme,
372                      CanonOutput* output,
373                      url_parse::Component* out_port) {
374  return DoPort<char16, char16>(spec, port, default_port_for_scheme,
375                                      output, out_port);
376}
377
378void CanonicalizeRef(const char* spec,
379                     const url_parse::Component& ref,
380                     CanonOutput* output,
381                     url_parse::Component* out_ref) {
382  DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
383}
384
385void CanonicalizeRef(const char16* spec,
386                     const url_parse::Component& ref,
387                     CanonOutput* output,
388                     url_parse::Component* out_ref) {
389  DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
390}
391
392}  // namespace url_canon
393