1// Copyright 2007, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30#include <cstdio>
31#include <errno.h>
32#include <stdlib.h>
33#include <string>
34
35#include "googleurl/src/url_canon_internal.h"
36
37namespace url_canon {
38
39namespace {
40
41template<typename CHAR, typename UCHAR>
42void DoAppendStringOfType(const CHAR* source, int length,
43                          SharedCharTypes type,
44                          CanonOutput* output) {
45  for (int i = 0; i < length; i++) {
46    if (static_cast<UCHAR>(source[i]) >= 0x80) {
47      // ReadChar will fill the code point with kUnicodeReplacementCharacter
48      // when the input is invalid, which is what we want.
49      unsigned code_point;
50      ReadUTFChar(source, &i, length, &code_point);
51      AppendUTF8EscapedValue(code_point, output);
52    } else {
53      // Just append the 7-bit character, possibly escaping it.
54      unsigned char uch = static_cast<unsigned char>(source[i]);
55      if (!IsCharOfType(uch, type))
56        AppendEscapedChar(uch, output);
57      else
58        output->push_back(uch);
59    }
60  }
61}
62
63// This function assumes the input values are all contained in 8-bit,
64// although it allows any type. Returns true if input is valid, false if not.
65template<typename CHAR, typename UCHAR>
66void DoAppendInvalidNarrowString(const CHAR* spec, int begin, int end,
67                                 CanonOutput* output) {
68  for (int i = begin; i < end; i++) {
69    UCHAR uch = static_cast<UCHAR>(spec[i]);
70    if (uch >= 0x80) {
71      // Handle UTF-8/16 encodings. This call will correctly handle the error
72      // case by appending the invalid character.
73      AppendUTF8EscapedChar(spec, &i, end, output);
74    } else if (uch <= ' ' || uch == 0x7f) {
75      // This function is for error handling, so we escape all control
76      // characters and spaces, but not anything else since we lack
77      // context to do something more specific.
78      AppendEscapedChar(static_cast<unsigned char>(uch), output);
79    } else {
80      output->push_back(static_cast<char>(uch));
81    }
82  }
83}
84
85// Overrides one component, see the url_canon::Replacements structure for
86// what the various combionations of source pointer and component mean.
87void DoOverrideComponent(const char* override_source,
88                         const url_parse::Component& override_component,
89                         const char** dest,
90                         url_parse::Component* dest_component) {
91  if (override_source) {
92    *dest = override_source;
93    *dest_component = override_component;
94  }
95}
96
97// Similar to DoOverrideComponent except that it takes a UTF-16 input and does
98// not actually set the output character pointer.
99//
100// The input is converted to UTF-8 at the end of the given buffer as a temporary
101// holding place. The component indentifying the portion of the buffer used in
102// the |utf8_buffer| will be specified in |*dest_component|.
103//
104// This will not actually set any |dest| pointer like DoOverrideComponent
105// does because all of the pointers will point into the |utf8_buffer|, which
106// may get resized while we're overriding a subsequent component. Instead, the
107// caller should use the beginning of the |utf8_buffer| as the string pointer
108// for all components once all overrides have been prepared.
109bool PrepareUTF16OverrideComponent(
110    const char16* override_source,
111    const url_parse::Component& override_component,
112    CanonOutput* utf8_buffer,
113    url_parse::Component* dest_component) {
114  bool success = true;
115  if (override_source) {
116    if (!override_component.is_valid()) {
117      // Non-"valid" component (means delete), so we need to preserve that.
118      *dest_component = url_parse::Component();
119    } else {
120      // Convert to UTF-8.
121      dest_component->begin = utf8_buffer->length();
122      success = ConvertUTF16ToUTF8(&override_source[override_component.begin],
123                                   override_component.len, utf8_buffer);
124      dest_component->len = utf8_buffer->length() - dest_component->begin;
125    }
126  }
127  return success;
128}
129
130}  // namespace
131
132// See the header file for this array's declaration.
133const unsigned char kSharedCharTypeTable[0x100] = {
134    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x00 - 0x0f
135    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x10 - 0x1f
136    0,                           // 0x20  ' ' (escape spaces in queries)
137    CHAR_QUERY | CHAR_USERINFO,  // 0x21  !
138    0,                           // 0x22  "
139    0,                           // 0x23  #  (invalid in query since it marks the ref)
140    CHAR_QUERY | CHAR_USERINFO,  // 0x24  $
141    CHAR_QUERY | CHAR_USERINFO,  // 0x25  %
142    CHAR_QUERY | CHAR_USERINFO,  // 0x26  &
143    CHAR_QUERY | CHAR_USERINFO,  // 0x27  '
144    CHAR_QUERY | CHAR_USERINFO,  // 0x28  (
145    CHAR_QUERY | CHAR_USERINFO,  // 0x29  )
146    CHAR_QUERY | CHAR_USERINFO,  // 0x2a  *
147    CHAR_QUERY | CHAR_USERINFO,  // 0x2b  +
148    CHAR_QUERY | CHAR_USERINFO,  // 0x2c  ,
149    CHAR_QUERY | CHAR_USERINFO,  // 0x2d  -
150    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4,  // 0x2e  .
151    CHAR_QUERY,                              // 0x2f  /
152    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x30  0
153    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x31  1
154    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x32  2
155    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x33  3
156    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x34  4
157    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x35  5
158    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x36  6
159    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC | CHAR_OCT,  // 0x37  7
160    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC,             // 0x38  8
161    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX | CHAR_DEC,             // 0x39  9
162    CHAR_QUERY,  // 0x3a  :
163    CHAR_QUERY,  // 0x3b  ;
164    0,           // 0x3c  <  (Try to prevent certain types of XSS.)
165    CHAR_QUERY,  // 0x3d  =
166    0,           // 0x3e  >  (Try to prevent certain types of XSS.)
167    CHAR_QUERY,  // 0x3f  ?
168    CHAR_QUERY,  // 0x40  @
169    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x41  A
170    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x42  B
171    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x43  C
172    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x44  D
173    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x45  E
174    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x46  F
175    CHAR_QUERY | CHAR_USERINFO,  // 0x47  G
176    CHAR_QUERY | CHAR_USERINFO,  // 0x48  H
177    CHAR_QUERY | CHAR_USERINFO,  // 0x49  I
178    CHAR_QUERY | CHAR_USERINFO,  // 0x4a  J
179    CHAR_QUERY | CHAR_USERINFO,  // 0x4b  K
180    CHAR_QUERY | CHAR_USERINFO,  // 0x4c  L
181    CHAR_QUERY | CHAR_USERINFO,  // 0x4d  M
182    CHAR_QUERY | CHAR_USERINFO,  // 0x4e  N
183    CHAR_QUERY | CHAR_USERINFO,  // 0x4f  O
184    CHAR_QUERY | CHAR_USERINFO,  // 0x50  P
185    CHAR_QUERY | CHAR_USERINFO,  // 0x51  Q
186    CHAR_QUERY | CHAR_USERINFO,  // 0x52  R
187    CHAR_QUERY | CHAR_USERINFO,  // 0x53  S
188    CHAR_QUERY | CHAR_USERINFO,  // 0x54  T
189    CHAR_QUERY | CHAR_USERINFO,  // 0x55  U
190    CHAR_QUERY | CHAR_USERINFO,  // 0x56  V
191    CHAR_QUERY | CHAR_USERINFO,  // 0x57  W
192    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4, // 0x58  X
193    CHAR_QUERY | CHAR_USERINFO,  // 0x59  Y
194    CHAR_QUERY | CHAR_USERINFO,  // 0x5a  Z
195    CHAR_QUERY,  // 0x5b  [
196    CHAR_QUERY,  // 0x5c  '\'
197    CHAR_QUERY,  // 0x5d  ]
198    CHAR_QUERY,  // 0x5e  ^
199    CHAR_QUERY | CHAR_USERINFO,  // 0x5f  _
200    CHAR_QUERY,  // 0x60  `
201    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x61  a
202    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x62  b
203    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x63  c
204    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x64  d
205    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x65  e
206    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4 | CHAR_HEX,  // 0x66  f
207    CHAR_QUERY | CHAR_USERINFO,  // 0x67  g
208    CHAR_QUERY | CHAR_USERINFO,  // 0x68  h
209    CHAR_QUERY | CHAR_USERINFO,  // 0x69  i
210    CHAR_QUERY | CHAR_USERINFO,  // 0x6a  j
211    CHAR_QUERY | CHAR_USERINFO,  // 0x6b  k
212    CHAR_QUERY | CHAR_USERINFO,  // 0x6c  l
213    CHAR_QUERY | CHAR_USERINFO,  // 0x6d  m
214    CHAR_QUERY | CHAR_USERINFO,  // 0x6e  n
215    CHAR_QUERY | CHAR_USERINFO,  // 0x6f  o
216    CHAR_QUERY | CHAR_USERINFO,  // 0x70  p
217    CHAR_QUERY | CHAR_USERINFO,  // 0x71  q
218    CHAR_QUERY | CHAR_USERINFO,  // 0x72  r
219    CHAR_QUERY | CHAR_USERINFO,  // 0x73  s
220    CHAR_QUERY | CHAR_USERINFO,  // 0x74  t
221    CHAR_QUERY | CHAR_USERINFO,  // 0x75  u
222    CHAR_QUERY | CHAR_USERINFO,  // 0x76  v
223    CHAR_QUERY | CHAR_USERINFO,  // 0x77  w
224    CHAR_QUERY | CHAR_USERINFO | CHAR_IPV4,  // 0x78  x
225    CHAR_QUERY | CHAR_USERINFO,  // 0x79  y
226    CHAR_QUERY | CHAR_USERINFO,  // 0x7a  z
227    CHAR_QUERY,  // 0x7b  {
228    CHAR_QUERY,  // 0x7c  |
229    CHAR_QUERY,  // 0x7d  }
230    CHAR_QUERY | CHAR_USERINFO,  // 0x7e  ~
231    0,           // 0x7f
232    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x80 - 0x8f
233    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0x90 - 0x9f
234    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xa0 - 0xaf
235    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xb0 - 0xbf
236    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xc0 - 0xcf
237    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xd0 - 0xdf
238    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xe0 - 0xef
239    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 0xf0 - 0xff
240};
241
// Maps a value in the range 0 - 15 to its hexadecimal digit. The letter
// digits are uppercase.
const char kHexCharLookup[0x10] = {
    '0', '1', '2', '3',
    '4', '5', '6', '7',
    '8', '9', 'A', 'B',
    'C', 'D', 'E', 'F',
};
246
// Per-range offsets for turning a hex digit character into its value. Each
// entry covers a block of 0x20 consecutive character values; subtracting the
// entry for a hex digit's block from the digit yields its numeric value
// (for example 'A' - ('A' - 10) == 10). Blocks that contain no hex digits
// hold 0.
const char kCharToHexLookup[8] = {
    0,         // [0] 0x00 - 0x1f: no hex digits
    '0',       // [1] 0x20 - 0x3f: '0' - '9' live at 0x30 - 0x39
    'A' - 10,  // [2] 0x40 - 0x5f: 'A' - 'F' live at 0x41 - 0x46
    'a' - 10,  // [3] 0x60 - 0x7f: 'a' - 'f' live at 0x61 - 0x66
    0,         // [4] 0x80 - 0x9f: no hex digits
    0,         // [5] 0xa0 - 0xbf: no hex digits
    0,         // [6] 0xc0 - 0xdf: no hex digits
    0,         // [7] 0xe0 - 0xff: no hex digits
};
257
258const char16 kUnicodeReplacementCharacter = 0xfffd;
259
260void AppendStringOfType(const char* source, int length,
261                        SharedCharTypes type,
262                        CanonOutput* output) {
263  DoAppendStringOfType<char, unsigned char>(source, length, type, output);
264}
265
266void AppendStringOfType(const char16* source, int length,
267                        SharedCharTypes type,
268                        CanonOutput* output) {
269  DoAppendStringOfType<char16, char16>(source, length, type, output);
270}
271
272void AppendInvalidNarrowString(const char* spec, int begin, int end,
273                               CanonOutput* output) {
274  DoAppendInvalidNarrowString<char, unsigned char>(spec, begin, end, output);
275}
276
277void AppendInvalidNarrowString(const char16* spec, int begin, int end,
278                               CanonOutput* output) {
279  DoAppendInvalidNarrowString<char16, char16>(spec, begin, end, output);
280}
281
282bool ConvertUTF16ToUTF8(const char16* input, int input_len,
283                        CanonOutput* output) {
284  bool success = true;
285  for (int i = 0; i < input_len; i++) {
286    unsigned code_point;
287    success &= ReadUTFChar(input, &i, input_len, &code_point);
288    AppendUTF8Value(code_point, output);
289  }
290  return success;
291}
292
293bool ConvertUTF8ToUTF16(const char* input, int input_len,
294                        CanonOutputT<char16>* output) {
295  bool success = true;
296  for (int i = 0; i < input_len; i++) {
297    unsigned code_point;
298    success &= ReadUTFChar(input, &i, input_len, &code_point);
299    AppendUTF16Value(code_point, output);
300  }
301  return success;
302}
303
304void SetupOverrideComponents(const char* base,
305                             const Replacements<char>& repl,
306                             URLComponentSource<char>* source,
307                             url_parse::Parsed* parsed) {
308  // Get the source and parsed structures of the things we are replacing.
309  const URLComponentSource<char>& repl_source = repl.sources();
310  const url_parse::Parsed& repl_parsed = repl.components();
311
312  DoOverrideComponent(repl_source.scheme, repl_parsed.scheme,
313                      &source->scheme, &parsed->scheme);
314  DoOverrideComponent(repl_source.username, repl_parsed.username,
315                      &source->username, &parsed->username);
316  DoOverrideComponent(repl_source.password, repl_parsed.password,
317                      &source->password, &parsed->password);
318
319  // Our host should be empty if not present, so override the default setup.
320  DoOverrideComponent(repl_source.host, repl_parsed.host,
321                      &source->host, &parsed->host);
322  if (parsed->host.len == -1)
323    parsed->host.len = 0;
324
325  DoOverrideComponent(repl_source.port, repl_parsed.port,
326                      &source->port, &parsed->port);
327  DoOverrideComponent(repl_source.path, repl_parsed.path,
328                      &source->path, &parsed->path);
329  DoOverrideComponent(repl_source.query, repl_parsed.query,
330                      &source->query, &parsed->query);
331  DoOverrideComponent(repl_source.ref, repl_parsed.ref,
332                      &source->ref, &parsed->ref);
333}
334
335bool SetupUTF16OverrideComponents(const char* base,
336                                  const Replacements<char16>& repl,
337                                  CanonOutput* utf8_buffer,
338                                  URLComponentSource<char>* source,
339                                  url_parse::Parsed* parsed) {
340  bool success = true;
341
342  // Get the source and parsed structures of the things we are replacing.
343  const URLComponentSource<char16>& repl_source = repl.sources();
344  const url_parse::Parsed& repl_parsed = repl.components();
345
346  success &= PrepareUTF16OverrideComponent(
347      repl_source.scheme, repl_parsed.scheme,
348      utf8_buffer, &parsed->scheme);
349  success &= PrepareUTF16OverrideComponent(
350      repl_source.username, repl_parsed.username,
351      utf8_buffer, &parsed->username);
352  success &= PrepareUTF16OverrideComponent(
353      repl_source.password, repl_parsed.password,
354      utf8_buffer, &parsed->password);
355  success &= PrepareUTF16OverrideComponent(
356      repl_source.host, repl_parsed.host,
357      utf8_buffer, &parsed->host);
358  success &= PrepareUTF16OverrideComponent(
359      repl_source.port, repl_parsed.port,
360      utf8_buffer, &parsed->port);
361  success &= PrepareUTF16OverrideComponent(
362      repl_source.path, repl_parsed.path,
363      utf8_buffer, &parsed->path);
364  success &= PrepareUTF16OverrideComponent(
365      repl_source.query, repl_parsed.query,
366      utf8_buffer, &parsed->query);
367  success &= PrepareUTF16OverrideComponent(
368      repl_source.ref, repl_parsed.ref,
369      utf8_buffer, &parsed->ref);
370
371  // PrepareUTF16OverrideComponent will not have set the data pointer since the
372  // buffer could be resized, invalidating the pointers. We set the data
373  // pointers for affected components now that the buffer is finalized.
374  if (repl_source.scheme)   source->scheme = utf8_buffer->data();
375  if (repl_source.username) source->username = utf8_buffer->data();
376  if (repl_source.password) source->password = utf8_buffer->data();
377  if (repl_source.host)     source->host = utf8_buffer->data();
378  if (repl_source.port)     source->port = utf8_buffer->data();
379  if (repl_source.path)     source->path = utf8_buffer->data();
380  if (repl_source.query)    source->query = utf8_buffer->data();
381  if (repl_source.ref)      source->ref = utf8_buffer->data();
382
383  return success;
384}
385
386#ifndef WIN32
387
// Stand-in for the Microsoft CRT function of the same name, compiled only
// when WIN32 is not defined. Writes |value| as a NUL-terminated string into
// |buffer|, which has room for |size_in_chars| characters including the
// terminator.
//
// Only radix 10 (signed decimal) and radix 16 (lowercase hex of the value's
// unsigned bit pattern) are supported.
//
// Returns 0 on success. Returns EINVAL if |buffer| is NULL, if |radix| is
// unsupported, or if the text plus its terminator does not fit in
// |size_in_chars|. On a size failure |buffer| may hold truncated output.
int _itoa_s(int value, char* buffer, size_t size_in_chars, int radix) {
  if (!buffer)
    return EINVAL;

  // Each radix gets its own call with a literal format string so the compiler
  // can type-check the argument against the conversion specifier.
  int written;
  if (radix == 10) {
    written = snprintf(buffer, size_in_chars, "%d", value);
  } else if (radix == 16) {
    // %x expects an unsigned argument; convert explicitly instead of passing
    // a signed int through the varargs.
    written = snprintf(buffer, size_in_chars, "%x",
                       static_cast<unsigned int>(value));
  } else {
    return EINVAL;
  }

  // A negative result is an output error; a result >= the buffer size means
  // the text (plus terminator) did not fit and was truncated.
  if (written < 0 || static_cast<size_t>(written) >= size_in_chars)
    return EINVAL;
  return 0;
}
404
405int _itow_s(int value, char16* buffer, size_t size_in_chars, int radix) {
406  if (radix != 10)
407    return EINVAL;
408
409  // No more than 12 characters will be required for a 32-bit integer.
410  // Add an extra byte for the terminating null.
411  char temp[13];
412  int written = snprintf(temp, sizeof(temp), "%d", value);
413  if (static_cast<size_t>(written) >= size_in_chars) {
414    // Output was truncated, or written was negative.
415    return EINVAL;
416  }
417
418  for (int i = 0; i < written; ++i) {
419    buffer[i] = static_cast<char16>(temp[i]);
420  }
421  buffer[written] = '\0';
422  return 0;
423}
424
425#endif  // !WIN32
426
427}  // namespace url_canon
428