1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Canonicalizers for random bits that aren't big enough for their own files.
6
7#include <string.h>
8
9#include "url/url_canon.h"
10#include "url/url_canon_internal.h"
11
12namespace url {
13
14namespace {
15
// Tells whether |ch| is one of the characters that get stripped out of the
// middle of a URL: carriage return, line feed, or horizontal tab.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\r':
    case '\n':
    case '\t':
      return true;
    default:
      return false;
  }
}
21
22// Backend for RemoveURLWhitespace (see declaration in url_canon.h).
23// It sucks that we have to do this, since this takes about 13% of the total URL
24// canonicalization time.
25template<typename CHAR>
26const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
27                                  CanonOutputT<CHAR>* buffer,
28                                  int* output_len) {
29  // Fast verification that there's nothing that needs removal. This is the 99%
30  // case, so we want it to be fast and don't care about impacting the speed
31  // when we do find whitespace.
32  int found_whitespace = false;
33  for (int i = 0; i < input_len; i++) {
34    if (!IsRemovableURLWhitespace(input[i]))
35      continue;
36    found_whitespace = true;
37    break;
38  }
39
40  if (!found_whitespace) {
41    // Didn't find any whitespace, we don't need to do anything. We can just
42    // return the input as the output.
43    *output_len = input_len;
44    return input;
45  }
46
47  // Remove the whitespace into the new buffer and return it.
48  for (int i = 0; i < input_len; i++) {
49    if (!IsRemovableURLWhitespace(input[i]))
50      buffer->push_back(input[i]);
51  }
52  *output_len = buffer->length();
53  return buffer->data();
54}
55
// Lookup table, indexed by 7-bit ASCII code, giving the canonical form of a
// character inside a scheme. Letters map to their lower-case form; digits and
// '+', '-', '.' map to themselves. An entry of 0 means the character is not
// allowed in a scheme.
const char kSchemeCanonical[0x80] = {
    // 0x00-0x1f: control characters, all invalid.
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20-0x27: ' ' ! " # $ % & '
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x28-0x2f: ( ) * + , - . /
    0, 0, 0, '+', 0, '-', '.', 0,
    // 0x30-0x37: 0 1 2 3 4 5 6 7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 0x38-0x3f: 8 9 : ; < = > ?
    '8', '9', 0, 0, 0, 0, 0, 0,
    // 0x40-0x47: @ A B C D E F G
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x48-0x4f: H I J K L M N O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50-0x57: P Q R S T U V W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x58-0x5f: X Y Z [ \ ] ^ _
    'x', 'y', 'z', 0, 0, 0, 0, 0,
    // 0x60-0x67: ` a b c d e f g
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x68-0x6f: h i j k l m n o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70-0x77: p q r s t u v w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x78-0x7f: x y z { | } ~ DEL
    'x', 'y', 'z', 0, 0, 0, 0, 0,
};
75
// Tells whether |c| may begin a scheme, i.e. whether it is an ASCII letter.
// This could have been folded into the lookup table by flagging valid entries
// with the high bit, but it runs only once per URL and keeping it separate
// leaves the table easier to read.
inline bool IsSchemeFirstChar(unsigned char c) {
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
82
83template<typename CHAR, typename UCHAR>
84bool DoScheme(const CHAR* spec,
85              const Component& scheme,
86              CanonOutput* output,
87              Component* out_scheme) {
88  if (scheme.len <= 0) {
89    // Scheme is unspecified or empty, convert to empty by appending a colon.
90    *out_scheme = Component(output->length(), 0);
91    output->push_back(':');
92    return true;
93  }
94
95  // The output scheme starts from the current position.
96  out_scheme->begin = output->length();
97
98  // Danger: it's important that this code does not strip any characters: it
99  // only emits the canonical version (be it valid or escaped) of each of
100  // the input characters. Stripping would put it out of sync with
101  // FindAndCompareScheme, which could cause some security checks on
102  // schemes to be incorrect.
103  bool success = true;
104  int end = scheme.end();
105  for (int i = scheme.begin; i < end; i++) {
106    UCHAR ch = static_cast<UCHAR>(spec[i]);
107    char replacement = 0;
108    if (ch < 0x80) {
109      if (i == scheme.begin) {
110        // Need to do a special check for the first letter of the scheme.
111        if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
112          replacement = kSchemeCanonical[ch];
113      } else {
114        replacement = kSchemeCanonical[ch];
115      }
116    }
117
118    if (replacement) {
119      output->push_back(replacement);
120    } else if (ch == '%') {
121      // Canonicalizing the scheme multiple times should lead to the same
122      // result. Since invalid characters will be escaped, we need to preserve
123      // the percent to avoid multiple escaping. The scheme will be invalid.
124      success = false;
125      output->push_back('%');
126    } else {
127      // Invalid character, store it but mark this scheme as invalid.
128      success = false;
129
130      // This will escape the output and also handle encoding issues.
131      // Ignore the return value since we already failed.
132      AppendUTF8EscapedChar(spec, &i, end, output);
133    }
134  }
135
136  // The output scheme ends with the the current position, before appending
137  // the colon.
138  out_scheme->len = output->length() - out_scheme->begin;
139  output->push_back(':');
140  return success;
141}
142
143// The username and password components reference ranges in the corresponding
144// *_spec strings. Typically, these specs will be the same (we're
145// canonicalizing a single source string), but may be different when
146// replacing components.
147template<typename CHAR, typename UCHAR>
148bool DoUserInfo(const CHAR* username_spec,
149                const Component& username,
150                const CHAR* password_spec,
151                const Component& password,
152                CanonOutput* output,
153                Component* out_username,
154                Component* out_password) {
155  if (username.len <= 0 && password.len <= 0) {
156    // Common case: no user info. We strip empty username/passwords.
157    *out_username = Component();
158    *out_password = Component();
159    return true;
160  }
161
162  // Write the username.
163  out_username->begin = output->length();
164  if (username.len > 0) {
165    // This will escape characters not valid for the username.
166    AppendStringOfType(&username_spec[username.begin], username.len,
167                       CHAR_USERINFO, output);
168  }
169  out_username->len = output->length() - out_username->begin;
170
171  // When there is a password, we need the separator. Note that we strip
172  // empty but specified passwords.
173  if (password.len > 0) {
174    output->push_back(':');
175    out_password->begin = output->length();
176    AppendStringOfType(&password_spec[password.begin], password.len,
177                       CHAR_USERINFO, output);
178    out_password->len = output->length() - out_password->begin;
179  } else {
180    *out_password = Component();
181  }
182
183  output->push_back('@');
184  return true;
185}
186
187// Helper functions for converting port integers to strings.
188inline void WritePortInt(char* output, int output_len, int port) {
189  _itoa_s(port, output, output_len, 10);
190}
191
192// This function will prepend the colon if there will be a port.
193template<typename CHAR, typename UCHAR>
194bool DoPort(const CHAR* spec,
195            const Component& port,
196            int default_port_for_scheme,
197            CanonOutput* output,
198            Component* out_port) {
199  int port_num = ParsePort(spec, port);
200  if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
201    *out_port = Component();
202    return true;  // Leave port empty.
203  }
204
205  if (port_num == PORT_INVALID) {
206    // Invalid port: We'll copy the text from the input so the user can see
207    // what the error was, and mark the URL as invalid by returning false.
208    output->push_back(':');
209    out_port->begin = output->length();
210    AppendInvalidNarrowString(spec, port.begin, port.end(), output);
211    out_port->len = output->length() - out_port->begin;
212    return false;
213  }
214
215  // Convert port number back to an integer. Max port value is 5 digits, and
216  // the Parsed::ExtractPort will have made sure the integer is in range.
217  const int buf_size = 6;
218  char buf[buf_size];
219  WritePortInt(buf, buf_size, port_num);
220
221  // Append the port number to the output, preceeded by a colon.
222  output->push_back(':');
223  out_port->begin = output->length();
224  for (int i = 0; i < buf_size && buf[i]; i++)
225    output->push_back(buf[i]);
226
227  out_port->len = output->length() - out_port->begin;
228  return true;
229}
230
231template<typename CHAR, typename UCHAR>
232void DoCanonicalizeRef(const CHAR* spec,
233                       const Component& ref,
234                       CanonOutput* output,
235                       Component* out_ref) {
236  if (ref.len < 0) {
237    // Common case of no ref.
238    *out_ref = Component();
239    return;
240  }
241
242  // Append the ref separator. Note that we need to do this even when the ref
243  // is empty but present.
244  output->push_back('#');
245  out_ref->begin = output->length();
246
247  // Now iterate through all the characters, converting to UTF-8 and validating.
248  int end = ref.end();
249  for (int i = ref.begin; i < end; i++) {
250    if (spec[i] == 0) {
251      // IE just strips NULLs, so we do too.
252      continue;
253    } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
254      // Unline IE seems to, we escape control characters. This will probably
255      // make the reference fragment unusable on a web page, but people
256      // shouldn't be using control characters in their anchor names.
257      AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
258    } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
259      // Normal ASCII characters are just appended.
260      output->push_back(static_cast<char>(spec[i]));
261    } else {
262      // Non-ASCII characters are appended unescaped, but only when they are
263      // valid. Invalid Unicode characters are replaced with the "invalid
264      // character" as IE seems to (ReadUTFChar puts the unicode replacement
265      // character in the output on failure for us).
266      unsigned code_point;
267      ReadUTFChar(spec, &i, end, &code_point);
268      AppendUTF8Value(code_point, output);
269    }
270  }
271
272  out_ref->len = output->length() - out_ref->begin;
273}
274
275}  // namespace
276
277const char* RemoveURLWhitespace(const char* input, int input_len,
278                                CanonOutputT<char>* buffer,
279                                int* output_len) {
280  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
281}
282
283const base::char16* RemoveURLWhitespace(const base::char16* input,
284                                        int input_len,
285                                        CanonOutputT<base::char16>* buffer,
286                                        int* output_len) {
287  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
288}
289
290char CanonicalSchemeChar(base::char16 ch) {
291  if (ch >= 0x80)
292    return 0;  // Non-ASCII is not supported by schemes.
293  return kSchemeCanonical[ch];
294}
295
296bool CanonicalizeScheme(const char* spec,
297                        const Component& scheme,
298                        CanonOutput* output,
299                        Component* out_scheme) {
300  return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
301}
302
303bool CanonicalizeScheme(const base::char16* spec,
304                        const Component& scheme,
305                        CanonOutput* output,
306                        Component* out_scheme) {
307  return DoScheme<base::char16, base::char16>(spec, scheme, output, out_scheme);
308}
309
310bool CanonicalizeUserInfo(const char* username_source,
311                          const Component& username,
312                          const char* password_source,
313                          const Component& password,
314                          CanonOutput* output,
315                          Component* out_username,
316                          Component* out_password) {
317  return DoUserInfo<char, unsigned char>(
318      username_source, username, password_source, password,
319      output, out_username, out_password);
320}
321
322bool CanonicalizeUserInfo(const base::char16* username_source,
323                          const Component& username,
324                          const base::char16* password_source,
325                          const Component& password,
326                          CanonOutput* output,
327                          Component* out_username,
328                          Component* out_password) {
329  return DoUserInfo<base::char16, base::char16>(
330      username_source, username, password_source, password,
331      output, out_username, out_password);
332}
333
334bool CanonicalizePort(const char* spec,
335                      const Component& port,
336                      int default_port_for_scheme,
337                      CanonOutput* output,
338                      Component* out_port) {
339  return DoPort<char, unsigned char>(spec, port,
340                                     default_port_for_scheme,
341                                     output, out_port);
342}
343
344bool CanonicalizePort(const base::char16* spec,
345                      const Component& port,
346                      int default_port_for_scheme,
347                      CanonOutput* output,
348                      Component* out_port) {
349  return DoPort<base::char16, base::char16>(spec, port, default_port_for_scheme,
350                                            output, out_port);
351}
352
353void CanonicalizeRef(const char* spec,
354                     const Component& ref,
355                     CanonOutput* output,
356                     Component* out_ref) {
357  DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
358}
359
360void CanonicalizeRef(const base::char16* spec,
361                     const Component& ref,
362                     CanonOutput* output,
363                     Component* out_ref) {
364  DoCanonicalizeRef<base::char16, base::char16>(spec, ref, output, out_ref);
365}
366
367}  // namespace url
368